I'm having trouble with using pandas.DataFrame's constructor and using the dtype argument. I'd like to preserve string values, but the following snippets always convert to a numeric type and then yield NaNs.
from __future__ import unicode_literals
from __future__ import print_function
import numpy as np
import pandas as pd
def main():
columns = ['great', 'good', 'average', 'bad', 'horrible']
# minimal example, dates are coming (as strings) from some
# non-file source.
example_data = {
'alice': ['', '', '', '2016-05-24', ''],
'bob': ['', '2015-01-02', '', '', '2012-09-15'],
'eve': ['2011-12-31', '', '1998-08-13', '', ''],
}
# first pass, yields dataframe full of NaNs
df = pd.DataFrame(data=example_data, index=example_data.keys(),
columns=columns, dtype=str) #or string, 'str', 'string', 'object'
print(df.dtypes)
print(df)
print()
# based on https://github.com/pydata/pandas/blob/master/pandas/core/frame.py
# and https://github.com/pydata/pandas/blob/37f95cef85834207db0930e863341efb285e38a2/pandas/types/common.py
# we're ultimately feeding dtype to numpy's dtype, so let's just use that:
# (using np.dtype('S10') and converting to str doesn't work either)
df = pd.DataFrame(data=example_data, index=example_data.keys(),
columns=columns, dtype=np.dtype('U'))
print(df.dtypes)
print(df) # still full of NaNs... =(
if __name__ == '__main__':
main()
What value(s) of dtypes will preserve strings in the data frame?
for reference:
$ python --version
2.7.12
$ pip2 list | grep pandas
pandas (0.18.1)
$ pip2 list | grep numpy
numpy (1.11.1)