From 78c0af5bc707b6fc6013e505af968b46f8f6fe52 Mon Sep 17 00:00:00 2001
From: "Matthew R. Becker"
Date: Thu, 6 Apr 2017 12:54:55 -0500
Subject: [PATCH 1/9] ENH adding `fill_value` keyword to pd.get_dummies
 function, deprecating filling with zeros by default

---
 doc/source/whatsnew/v0.20.0.txt |  13 ++++
 pandas/core/reshape.py          | 130 +++++++++++++++++++++++++------
 pandas/tests/test_reshape.py    | 134 +++++++++++++++++++++++++-------
 3 files changed, 224 insertions(+), 53 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 0b98e57c606a3..41716e0e26ea0 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -372,6 +372,7 @@ Other Enhancements
 - :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels `. (:issue:`15694`)
 - ``pd.read_csv()`` will now raise a ``ParserError`` error whenever any parsing error occurs (:issue:`15913`, :issue:`15925`)
 - ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`)
+- ``pd.get_dummies()`` now accepts the ``fill_value`` keyword which specifies how to fill missing values in the dummy variables. (:issue:`15923`)

 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

@@ -382,6 +383,18 @@ Other Enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+Deprecate Automatic Zero Filling of Missing Values in .get_dummies
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``pd.get_dummies`` function currently fills missing values with zero by default. This behavior is in conflict with the rest of the pandas API since missing values should be filled with ``fillna`` and missing values should be propagated through pandas transformations. In the future, ``get_dummies`` will propagate missing values by default. The recommended way to reproduce the current behavior of filling with zeros with the new, upcoming API is
+
+.. ipython:: python
+
+   df = pd.get_dummies(df).fillna(0)
+
+For now, the current behavior of filling with zeros by default has been kept, but not specifying a fill value with the ``fill_value`` keyword will raise a ``DeprecationWarning`` with the example above.
+
+
 .. _whatsnew_0200.api_breaking.deprecate_ix:

 Deprecate .ix
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index b03c3d77928c7..f2792e554daa1 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -4,6 +4,7 @@
 from pandas import compat
 import itertools
 import re
+import warnings

 import numpy as np

@@ -1059,7 +1060,8 @@ def melt_stub(df, stub, i, j, value_vars, sep):


 def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
-                columns=None, sparse=False, drop_first=False):
+                fill_value=None, columns=None, sparse=False,
+                drop_first=False):
     """
     Convert categorical variable into dummy/indicator variables

@@ -1075,7 +1077,15 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
         If appending prefix, separator/delimiter to use. Or pass a
         list or dictionary as with `prefix.`
     dummy_na : bool, default False
-        Add a column to indicate NaNs, if False NaNs are ignored.
+        Add a column to indicate NaNs if True.
+    fill_value : scalar, default None
+        Value to fill NaNs with. The default of `None` will fill with
+        zeros. To do no filling of NaNs, specify `fill_value=np.nan`.
+        The default behavior of filling with zeros will be deprecated
+        in the future and using this default will not raise a
+        `DeprecationWarning`.
+
+        .. 
versionadded:: 0.20.0 columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with @@ -1121,6 +1131,18 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, 1 0 1 0 2 0 0 1 + >>> pd.get_dummies(s1, fill_value=np.nan) + a b + 0 1 0 + 1 0 1 + 2 NaN NaN + + >>> pd.get_dummies(s1, fill_value=np.nan, dummy_na=True) + a b NaN + 0 1 0 0 + 1 0 1 0 + 2 NaN NaN 1 + >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]}) @@ -1153,6 +1175,17 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, from pandas.tools.concat import concat from itertools import cycle + # Deprecate filling missing values with zeros, GH15926 + # When this is finally deprecated, simply remove this block + # of code and change the default to np.nan in the function signature + # of `get_dummies`. + if fill_value is None: + warnings.warn('The default behavior of filling missing values ' + 'with zeros will be deprecated. Use ' + '`df = pd.get_dummies(df).fillna(0)` to reproduce ' + 'this behavior', DeprecationWarning) + fill_value = 0.0 + if isinstance(data, DataFrame): # determine columns being encoded @@ -1197,17 +1230,19 @@ def check_len(item, name): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, - drop_first=drop_first) + drop_first=drop_first, + fill_value=fill_value) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, - sparse=sparse, drop_first=drop_first) + sparse=sparse, drop_first=drop_first, + fill_value=fill_value) return result def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, - sparse=False, drop_first=False): + fill_value=np.nan, sparse=False, drop_first=False): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) @@ -1221,17 +1256,22 @@ def get_empty_Frame(data, sparse): else: return SparseDataFrame(index=index, default_fill_value=0) - # if all NaN - if not dummy_na and len(levels) == 0: + # If we get all NaN and are not making a dummy col, then just return. + # GH15826 + if len(levels) == 0 and not dummy_na: return get_empty_Frame(data, sparse) + # Record missing values before we munge the codes, GH15826 + missing_codes_msk = codes == -1 codes = codes.copy() if dummy_na: - codes[codes == -1] = len(levels) + codes[missing_codes_msk] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again - if drop_first and len(levels) == 1: + # test for length of levels was changed to `<=` from `==` to cover + # all NaN inputs, GH15826 + if drop_first and len(levels) <= 1: return get_empty_Frame(data, sparse) number_of_cols = len(levels) @@ -1250,11 +1290,36 @@ def get_empty_Frame(data, sparse): sparse_series = {} N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] - for ndx, code in enumerate(codes): - if code == -1: - # Blank entries if not dummy_na and code == -1, #GH4446 - continue - sp_indices[code].append(ndx) + for ndx, code, missing in zip( + range(len(codes)), codes, missing_codes_msk): + if missing: + # For missing values, we have to decide what to do. + # GH15926 + + if dummy_na: + # Then we need a one in the last column. + # GH15926 + sp_indices[code].append(ndx) + + if fill_value != 0: + # Then we need to mark these locations to put back another + # fill value later. (Zero fill values will be filled by the + # sparse array implicitly). 
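+                    # For example, a NaN at row 0 is stored as -1 and
+                    # a NaN at row 3 as -4 (ndx -> -ndx - 1); the
+                    # write-out loop below sets those entries to
+                    # `fill_value` and then restores the row numbers
+                    # with `ixs[ixs < 0] += 1` and `np.abs(ixs)`.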
+ # Use a negative index here to code NaNs. + # Offset by -1 to account for zero. + # Have to add to ALL columns, except the + # last one if dummy_na. + # GH15926 + if dummy_na: + _num_cols = len(dummy_cols) - 1 + else: + _num_cols = len(dummy_cols) + for _code in range(_num_cols): + sp_indices[_code].append(-ndx - 1) + else: + # Value is not missing so do as normal. + # GH15926 + sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity @@ -1262,28 +1327,47 @@ def get_empty_Frame(data, sparse): sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): - sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8), - sparse_index=IntIndex(N, ixs), fill_value=0, - dtype=np.uint8) + sarr = np.ones(len(ixs), dtype=np.float32) + + # NaNs are marked by a negative index. + # Only need to set for sparse output if + # fill_value != 0. + # Ditto for any negative indexes generated above. + # GH15926 + if fill_value != 0: + ixs = np.array(ixs) + sarr[ixs < 0] = fill_value + ixs[ixs < 0] += 1 # undo the offset + ixs = np.abs(ixs) # set index back to positive. + + sarr = SparseArray( + sarr, + sparse_index=IntIndex(N, ixs), + fill_value=0, + dtype=np.float32) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, - dtype=np.uint8) + dtype=np.float32) return out else: - dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0) + dummy_mat = np.eye( + number_of_cols, dtype=np.float32).take(codes, axis=0) - if not dummy_na: - # reset NaN GH4446 - dummy_mat[codes == -1] = 0 + # user specified fill value via `fill_value` GH15926 + if dummy_na: + dummy_mat[missing_codes_msk, :-1] = fill_value + else: + dummy_mat[missing_codes_msk] = fill_value if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] - return DataFrame(dummy_mat, index=index, columns=dummy_cols) + return DataFrame( + dummy_mat, index=index, columns=dummy_cols, dtype=np.float32) def make_axis_dummies(frame, axis='minor', transform=None): diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index ee255c1863b41..453d2621cc86f 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -243,7 +243,7 @@ def test_basic(self): 2: 0}, 'c': {0: 0, 1: 0, - 2: 1}}, dtype=np.uint8) + 2: 1}}, dtype=np.float32) assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected) assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected) @@ -262,7 +262,7 @@ def test_basic_types(self): expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, - dtype='uint8', + dtype='float32', columns=list('abc')) if not self.sparse: compare = tm.assert_frame_equal @@ -278,10 +278,11 @@ def test_basic_types(self): result = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns) tm.assert_series_equal(result.get_dtype_counts(), - Series({'uint8': 8})) + Series({'float32': 8})) result = get_dummies(s_df, sparse=self.sparse, columns=['a']) - expected = Series({'uint8': 3, 'int64': 1, 'object': 1}).sort_values() + expected = Series( + {'float32': 3, 'int64': 1, 'object': 1}).sort_values() tm.assert_series_equal(result.get_dtype_counts().sort_values(), expected) @@ -307,7 +308,7 @@ def test_include_na(self): s = ['a', 'b', np.nan] res = get_dummies(s, sparse=self.sparse) exp = DataFrame({'a': {0: 1, 1: 0, 2: 0}, - 'b': {0: 0, 1: 1, 2: 0}}, dtype=np.uint8) + 'b': {0: 0, 1: 1, 2: 0}}, 
dtype=np.float32) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 @@ -315,7 +316,7 @@ def test_include_na(self): exp_na = DataFrame({nan: {0: 0, 1: 0, 2: 1}, 'a': {0: 1, 1: 0, 2: 0}, 'b': {0: 0, 1: 1, 2: 0}}, - dtype=np.uint8) + dtype=np.float32) exp_na = exp_na.reindex_axis(['a', 'b', nan], 1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns @@ -323,9 +324,28 @@ def test_include_na(self): res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse) exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], - dtype=np.uint8) + dtype=np.float32) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) + # Add `fill_value` keyword GH15926 + def test_fill_value_na(self): + s = ['a', 'b', np.nan] + res_na = get_dummies( + s, dummy_na=True, fill_value=np.nan, sparse=self.sparse) + exp_na = DataFrame({'a': [1, 0, np.nan], + 'b': [0, 1, np.nan], + np.nan: [0, 0, 1]}, + dtype=np.float32) + exp_na = exp_na.reindex_axis(['a', 'b', np.nan], 1) + assert_frame_equal(res_na, exp_na) + + res_just_na = get_dummies( + [nan], dummy_na=True, fill_value=np.nan, sparse=self.sparse) + exp_just_na = DataFrame([[1]], + columns=[np.nan], + dtype=np.float32) + assert_frame_equal(res_just_na, exp_just_na) + def test_unicode(self ): # See GH 6885 - get_dummies chokes on unicode values import unicodedata @@ -339,7 +359,7 @@ def test_unicode(self u('letter_%s') % eacute: {0: 0, 1: 1, 2: 1}}, - dtype=np.uint8) + dtype=np.float32) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self): @@ -348,7 +368,7 @@ def test_dataframe_dummies_all_obj(self): expected = DataFrame({'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}, dtype=np.uint8) + 'B_c': [0, 0, 1]}, dtype=np.float32) assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self): @@ -360,7 +380,7 @@ def test_dataframe_dummies_mix_default(self): 'B_b': [1, 1, 0], 'B_c': [0, 0, 1]}) cols = ['A_a', 'A_b', 'B_b', 'B_c'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) @@ -376,7 +396,7 @@ def test_dataframe_dummies_prefix_list(self): 'from_B_b': [1, 1, 0], 'from_B_c': [0, 0, 1]}) cols = expected.columns[1:] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']] assert_frame_equal(result, expected) @@ -389,7 +409,7 @@ def test_dataframe_dummies_prefix_str(self): [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'], - dtype=np.uint8) + dtype=np.float32) expected = expected.astype({"C": np.int64}) assert_frame_equal(result, expected) @@ -402,7 +422,7 @@ def test_dataframe_dummies_subset(self): 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) cols = ['from_A_a', 'from_A_b'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self): @@ -415,7 +435,7 @@ def test_dataframe_dummies_prefix_sep(self): 'B..c': [0, 0, 1]}) expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] cols = expected.columns[1:] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) assert_frame_equal(result, expected) result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse) @@ 
-446,11 +466,11 @@ def test_dataframe_dummies_prefix_dict(self): 'from_B_c': [0, 0, 1], 'C': [1, 2, 3]}) cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self): - df = self.df + df = self.df.copy() df.loc[3, :] = [np.nan, np.nan, np.nan] result = get_dummies(df, dummy_na=True, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3, np.nan], @@ -461,7 +481,7 @@ def test_dataframe_dummies_with_na(self): 'B_c': [0, 0, 1, 0], 'B_nan': [0, 0, 0, 1]}) cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']] assert_frame_equal(result, expected) @@ -470,6 +490,22 @@ def test_dataframe_dummies_with_na(self): expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) + result = get_dummies( + df, dummy_na=True, fill_value=np.nan, sparse=self.sparse) + expected = DataFrame({'C': [1, 2, 3, np.nan], + 'A_a': [1, 0, 1, np.nan], + 'A_b': [0, 1, 0, np.nan], + 'A_nan': [0, 0, 0, 1], + 'B_b': [1, 1, 0, np.nan], + 'B_c': [0, 0, 1, np.nan], + 'B_nan': [0, 0, 0, 1]}, + dtype=np.float64) + cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] + expected[cols] = expected[cols].astype(np.float32) + expected = expected[ + ['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']] + assert_frame_equal(result, expected) + def test_dataframe_dummies_with_categorical(self): df = self.df df['cat'] = pd.Categorical(['x', 'y', 'y']) @@ -482,7 +518,7 @@ def test_dataframe_dummies_with_categorical(self): 'cat_x': [1, 0, 0], 'cat_y': [0, 1, 1]}) cols = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']] assert_frame_equal(result, expected) @@ -499,7 +535,7 @@ def test_basic_drop_first(self): 2: 0}, 'c': {0: 0, 1: 0, - 2: 1}}, dtype=np.uint8) + 2: 1}}, dtype=np.float32) result = get_dummies(s_list, sparse=self.sparse, drop_first=True) assert_frame_equal(result, expected) @@ -537,7 +573,7 @@ def test_basic_drop_first_NA(self): res = get_dummies(s_NA, sparse=self.sparse, drop_first=True) exp = DataFrame({'b': {0: 0, 1: 1, - 2: 0}}, dtype=np.uint8) + 2: 0}}, dtype=np.float32) assert_frame_equal(res, exp) res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, @@ -547,20 +583,30 @@ def test_basic_drop_first_NA(self): 2: 0}, nan: {0: 0, 1: 0, - 2: 1}}, dtype=np.uint8).reindex_axis( + 2: 1}}, dtype=np.float32).reindex_axis( ['b', nan], 1) assert_frame_equal(res_na, exp_na) + res_na = get_dummies(s_NA, fill_value=np.nan, sparse=self.sparse, + drop_first=True) + exp_na = DataFrame({'b': [0, 1, np.nan]}, dtype=np.float32) + assert_frame_equal(res_na, exp_na) + res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse, drop_first=True) - exp_just_na = DataFrame(index=np.arange(1)) + exp_just_na = DataFrame(index=range(1)) + assert_frame_equal(res_just_na, exp_just_na) + + res_just_na = get_dummies([nan], sparse=self.sparse, + drop_first=True) + exp_just_na = DataFrame(index=range(1)) assert_frame_equal(res_just_na, exp_just_na) def test_dataframe_dummies_drop_first(self): df = self.df[['A', 'B']] result = get_dummies(df, sparse=self.sparse, drop_first=True) expected = 
DataFrame({'A_b': [0, 1, 0], - 'B_c': [0, 0, 1]}, dtype=np.uint8) + 'B_c': [0, 0, 1]}, dtype=np.float32) assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self): @@ -572,12 +618,12 @@ def test_dataframe_dummies_drop_first_with_categorical(self): 'B_c': [0, 0, 1], 'cat_y': [0, 1, 1]}) cols = ['A_b', 'B_c', 'cat_y'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'A_b', 'B_c', 'cat_y']] assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_na(self): - df = self.df + df = self.df.copy() df.loc[3, :] = [np.nan, np.nan, np.nan] result = get_dummies(df, dummy_na=True, sparse=self.sparse, drop_first=True) @@ -587,13 +633,41 @@ def test_dataframe_dummies_drop_first_with_na(self): 'B_c': [0, 0, 1, 0], 'B_nan': [0, 0, 0, 1]}) cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) + expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] + assert_frame_equal(result, expected) + result = get_dummies(df, dummy_na=True, fill_value=np.nan, + sparse=self.sparse, drop_first=True) + expected = DataFrame({'C': [1, 2, 3, np.nan], + 'A_b': [0, 1, 0, np.nan], + 'A_nan': [0, 0, 0, 1], + 'B_c': [0, 0, 1, np.nan], + 'B_nan': [0, 0, 0, 1]}, dtype=np.float64) + cols = ['A_b', 'B_c', 'A_nan', 'B_nan'] + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] assert_frame_equal(result, expected) result = get_dummies(df, dummy_na=False, sparse=self.sparse, drop_first=True) + expected = DataFrame({'C': [1, 2, 3, np.nan], + 'A_b': [0, 1, 0, 0], + 'B_c': [0, 0, 1, 0]}, + dtype=np.float64) + cols = ['A_b', 'B_c'] + expected[cols] = expected[cols].astype(np.float32) + expected = expected[['C', 'A_b', 'B_c']] + assert_frame_equal(result, expected) + + result = get_dummies(df, dummy_na=False, sparse=self.sparse, + drop_first=True, fill_value=np.nan) + expected = DataFrame({'C': [1, 2, 3, np.nan], + 'A_b': [0, 1, 0, np.nan], + 'B_c': [0, 0, 1, np.nan]}, + dtype=np.float64) + cols = ['A_b', 'B_c'] + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) @@ -601,14 +675,14 @@ def test_int_int(self): data = Series([1, 2, 1]) result = pd.get_dummies(data) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], - dtype=np.uint8) + dtype=np.float32) tm.assert_frame_equal(result, expected) data = Series(pd.Categorical(['a', 'b', 'a'])) result = pd.get_dummies(data) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(['a', 'b']), - dtype=np.uint8) + dtype=np.float32) tm.assert_frame_equal(result, expected) def test_int_df(self): @@ -625,7 +699,7 @@ def test_int_df(self): [2, 2., 0, 1, 0, 1], [1, 1., 1, 0, 1, 0] ], columns=columns) - expected[columns[2:]] = expected[columns[2:]].astype(np.uint8) + expected[columns[2:]] = expected[columns[2:]].astype(np.float32) result = pd.get_dummies(data, columns=['A', 'B']) tm.assert_frame_equal(result, expected) @@ -636,7 +710,7 @@ def test_dataframe_dummies_preserve_categorical_dtype(self): ordered=ordered) result = get_dummies(cat) - data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.uint8) + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32) cols = pd.CategoricalIndex(cat.categories, categories=cat.categories, ordered=ordered) From 0304ede1a0ab7aa97b93d58b00bd04bf0a8235e0 Mon Sep 17 00:00:00 2001 From: 
beckermr
Date: Fri, 7 Apr 2017 16:42:38 -0500
Subject: [PATCH 2/9] DOC update whatsnew

---
 doc/source/whatsnew/v0.20.0.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 41716e0e26ea0..c8780961b764d 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -383,8 +383,8 @@ Other Enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Deprecate Automatic Zero Filling of Missing Values in .get_dummies
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Deprecate Automatic Zero Filling of Missing Values in pd.get_dummies
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 The ``pd.get_dummies`` function currently fills missing values with zero by default. This behavior is in conflict with the rest of the pandas API since missing values should be filled with ``fillna`` and missing values should be propagated through pandas transformations. In the future, ``get_dummies`` will propagate missing values by default. The recommended way to reproduce the current behavior of filling with zeros with the new, upcoming API is

From 22bf6ce49e22eb09954f6aeea5791d1e952e89a1 Mon Sep 17 00:00:00 2001
From: beckermr
Date: Sat, 8 Apr 2017 08:40:39 -0500
Subject: [PATCH 3/9] ENH respond to CR

---
 doc/source/whatsnew/v0.20.0.txt | 15 ++++++++-------
 pandas/core/reshape.py          | 30 +++++++++++++++---------------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index c8780961b764d..127a041952f7e 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -372,7 +372,7 @@ Other Enhancements
 - :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels `. (:issue:`15694`)
 - ``pd.read_csv()`` will now raise a ``ParserError`` error whenever any parsing error occurs (:issue:`15913`, :issue:`15925`)
 - ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`)
-- ``pd.get_dummies()`` now accepts the ``fill_value`` keyword which specifies how to fill missing values in the dummy variables. (:issue:`15923`)
+- ``pd.get_dummies()`` now accepts the ``fill_value`` keyword which specifies how to fill NaN values in the dummy variables. (:issue:`15923`)

 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

@@ -383,16 +383,17 @@ Other Enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Deprecate Automatic Zero Filling of Missing Values in pd.get_dummies
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Deprecate Automatic Zero Filling of Missing Values in ``pd.get_dummies``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-The ``pd.get_dummies`` function currently fills missing values with zero by default. This behavior is in conflict with the rest of the pandas API since missing values should be filled with ``fillna`` and missing values should be propagated through pandas transformations. In the future, ``get_dummies`` will propagate missing values by default. The recommended way to reproduce the current behavior of filling with zeros with the new, upcoming API is
+The :func:`get_dummies` function currently fills NaN values with zero by default. This behavior is in conflict with the rest of the pandas API since NaN values should be filled with ``fillna`` or a ``fill_value`` keyword, and NaN values should be propagated through pandas transformations. In the future, :func:`get_dummies` will propagate NaN values by default. (:issue:`15923`)

-.. ipython:: python
-
-   df = pd.get_dummies(df).fillna(0)
+The recommended way to reproduce the current behavior of filling NaN values with zeros with the new, upcoming API is
+
+.. ipython:: python

-For now, the current behavior of filling with zeros by default has been kept, but not specifying a fill value with the ``fill_value`` keyword will raise a ``DeprecationWarning`` with the example above.
+   df = pd.get_dummies(df, fill_value=0)


 .. _whatsnew_0200.api_breaking.deprecate_ix:
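As a minimal sketch of the options the new keyword opens up (mirroring the docstring examples added in this series; `s` is illustrative):

    import numpy as np
    import pandas as pd

    s = pd.Series(['a', 'b', np.nan])

    pd.get_dummies(s)                     # current default: NaN row filled with zeros (warns)
    pd.get_dummies(s, fill_value=0)       # explicit zero fill, no warning
    pd.get_dummies(s, fill_value=np.nan)  # propagate the missing value
    pd.get_dummies(s, fill_value=np.nan, dummy_na=True)  # also add a NaN indicator column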
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index f2792e554daa1..b55073ea4d67f 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -1060,8 +1060,8 @@ def melt_stub(df, stub, i, j, value_vars, sep):

 def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
-                fill_value=None, columns=None, sparse=False,
-                drop_first=False):
+                columns=None, sparse=False, drop_first=False,
+                fill_value=None):
     """
     Convert categorical variable into dummy/indicator variables

@@ -1078,14 +1078,6 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
         list or dictionary as with `prefix.`
     dummy_na : bool, default False
         Add a column to indicate NaNs if True.
-    fill_value : scalar, default None
-        Value to fill NaNs with. The default of `None` will fill with
-        zeros. To do no filling of NaNs, specify `fill_value=np.nan`.
-        The default behavior of filling with zeros will be deprecated
-        in the future and using this default will not raise a
-        `DeprecationWarning`.
-
-        .. versionadded:: 0.20.0
     columns : list-like, default None
         Column names in the DataFrame to be encoded.
         If `columns` is None then all the columns with
@@ -1101,6 +1093,14 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
         first level.

         .. versionadded:: 0.18.0
+    fill_value : scalar, default None
+        Value to fill NaNs with. The default of `None` will fill with
+        zeros. To do no filling of NaNs, specify `fill_value=np.nan`.
+        The default behavior of filling with zeros will be deprecated
+        in the future and using this default will not raise a
+        `FutureWarning`.
+
+        .. versionadded:: 0.20.0
     Returns
     -------
     dummies : DataFrame or SparseDataFrame
@@ -1175,16 +1175,16 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
     from pandas.tools.concat import concat
     from itertools import cycle

-    # Deprecate filling missing values with zeros, GH15926
+    # Deprecate filling NaN values with zeros, GH15926
     # When this is finally deprecated, simply remove this block
     # of code and change the default to np.nan in the function signature
     # of `get_dummies`.
     if fill_value is None:
-        warnings.warn('The default behavior of filling missing values '
+        warnings.warn('The default behavior of filling NaN values '
                       'with zeros will be deprecated. 
Use ' - '`df = pd.get_dummies(df).fillna(0)` to reproduce ' - 'this behavior', DeprecationWarning) - fill_value = 0.0 + '`df = pd.get_dummies(df, fill_value=0)` to reproduce ' + 'this behavior', FutureWarning, stack_level=3) + fill_value = 0 if isinstance(data, DataFrame): # determine columns being encoded From 766e094737034c95dfa247e308e0a1effb11c5fa Mon Sep 17 00:00:00 2001 From: beckermr Date: Sat, 8 Apr 2017 09:33:08 -0500 Subject: [PATCH 4/9] ENH added type downcasting, still needs to be refactored --- pandas/core/reshape.py | 66 +++++++++++++++++++++++++++--------- pandas/tests/test_reshape.py | 56 +++++++++++++++--------------- 2 files changed, 78 insertions(+), 44 deletions(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index b55073ea4d67f..9d580a53c5c0a 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -11,7 +11,7 @@ from pandas.types.common import (_ensure_platform_int, is_list_like, is_bool_dtype, needs_i8_conversion) -from pandas.types.cast import maybe_promote +from pandas.types.cast import maybe_promote, infer_dtype_from_scalar from pandas.types.missing import notnull import pandas.types.concat as _concat @@ -1183,9 +1183,40 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, warnings.warn('The default behavior of filling NaN values ' 'with zeros will be deprecated. Use ' '`df = pd.get_dummies(df, fill_value=0)` to reproduce ' - 'this behavior', FutureWarning, stack_level=3) + 'this behavior', FutureWarning, 3) fill_value = 0 + # Infer the proper output dtype. + # GH15926 + try: + if np.all(np.isfinite(data.values if hasattr(data, 'values') else data)): + any_null = True + else: + any_null = False + except TypeError: + any_null = False + + if any_null: + output_dtype = np.uint8 + else: + fill_value_dtype, fill_value = infer_dtype_from_scalar(fill_value) + + if 'int' in str(fill_value_dtype): + if fill_value >= 0: + if fill_value <= np.iinfo(np.uint8).max: + output_dtype = np.uint8 + else: + output_dtype = np.uint64 + else: + if fill_value >= np.iinfo(np.int8).min and fill_value <= np.iinfo(np.int8).max: + output_dtype = np.int8 + else: + output_dtype = np.int64 + elif 'float' in str(fill_value_dtype): + output_dtype = np.float32 + else: + raise ValueError('`fill_value` must be `np.nan`, an int or a float type!') + if isinstance(data, DataFrame): # determine columns being encoded @@ -1231,18 +1262,21 @@ def check_len(item, name): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first, - fill_value=fill_value) + fill_value=fill_value, + output_dtype=output_dtype) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, sparse=sparse, drop_first=drop_first, - fill_value=fill_value) + fill_value=fill_value, + output_dtype=output_dtype) return result def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, - fill_value=np.nan, sparse=False, drop_first=False): + fill_value=np.nan, sparse=False, drop_first=False, + output_dtype=np.uint8): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) @@ -1261,11 +1295,11 @@ def get_empty_Frame(data, sparse): if len(levels) == 0 and not dummy_na: return get_empty_Frame(data, sparse) - # Record missing values before we munge the codes, GH15826 - missing_codes_msk = codes == -1 + # Record NaN values before we munge the codes, GH15826 + nan_codes_msk = codes == -1 codes = codes.copy() if dummy_na: - 
codes[missing_codes_msk] = len(levels) + codes[nan_codes_msk] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again @@ -1291,7 +1325,7 @@ def get_empty_Frame(data, sparse): N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] for ndx, code, missing in zip( - range(len(codes)), codes, missing_codes_msk): + range(len(codes)), codes, nan_codes_msk): if missing: # For missing values, we have to decide what to do. # GH15926 @@ -1327,7 +1361,7 @@ def get_empty_Frame(data, sparse): sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): - sarr = np.ones(len(ixs), dtype=np.float32) + sarr = np.ones(len(ixs), dtype=output_dtype) # NaNs are marked by a negative index. # Only need to set for sparse output if @@ -1344,30 +1378,30 @@ def get_empty_Frame(data, sparse): sarr, sparse_index=IntIndex(N, ixs), fill_value=0, - dtype=np.float32) + dtype=output_dtype) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, - dtype=np.float32) + dtype=output_dtype) return out else: dummy_mat = np.eye( - number_of_cols, dtype=np.float32).take(codes, axis=0) + number_of_cols, dtype=output_dtype).take(codes, axis=0) # user specified fill value via `fill_value` GH15926 if dummy_na: - dummy_mat[missing_codes_msk, :-1] = fill_value + dummy_mat[nan_codes_msk, :-1] = fill_value else: - dummy_mat[missing_codes_msk] = fill_value + dummy_mat[nan_codes_msk] = fill_value if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame( - dummy_mat, index=index, columns=dummy_cols, dtype=np.float32) + dummy_mat, index=index, columns=dummy_cols, dtype=output_dtype) def make_axis_dummies(frame, axis='minor', transform=None): diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 453d2621cc86f..e3a09d7065664 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -243,7 +243,7 @@ def test_basic(self): 2: 0}, 'c': {0: 0, 1: 0, - 2: 1}}, dtype=np.float32) + 2: 1}}, dtype=np.uint8) assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected) assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected) @@ -262,7 +262,7 @@ def test_basic_types(self): expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, - dtype='float32', + dtype='uint8', columns=list('abc')) if not self.sparse: compare = tm.assert_frame_equal @@ -278,11 +278,11 @@ def test_basic_types(self): result = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns) tm.assert_series_equal(result.get_dtype_counts(), - Series({'float32': 8})) + Series({'uint8': 8})) result = get_dummies(s_df, sparse=self.sparse, columns=['a']) expected = Series( - {'float32': 3, 'int64': 1, 'object': 1}).sort_values() + {'uint8': 3, 'int64': 1, 'object': 1}).sort_values() tm.assert_series_equal(result.get_dtype_counts().sort_values(), expected) @@ -308,7 +308,7 @@ def test_include_na(self): s = ['a', 'b', np.nan] res = get_dummies(s, sparse=self.sparse) exp = DataFrame({'a': {0: 1, 1: 0, 2: 0}, - 'b': {0: 0, 1: 1, 2: 0}}, dtype=np.float32) + 'b': {0: 0, 1: 1, 2: 0}}, dtype=np.uint8) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 @@ -316,7 +316,7 @@ def test_include_na(self): exp_na = DataFrame({nan: {0: 0, 1: 0, 2: 1}, 'a': {0: 1, 1: 0, 2: 0}, 'b': {0: 0, 1: 1, 2: 0}}, - dtype=np.float32) + 
dtype=np.uint8) exp_na = exp_na.reindex_axis(['a', 'b', nan], 1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns @@ -324,7 +324,7 @@ def test_include_na(self): res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse) exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], - dtype=np.float32) + dtype=np.uint8) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) # Add `fill_value` keyword GH15926 @@ -359,7 +359,7 @@ def test_unicode(self u('letter_%s') % eacute: {0: 0, 1: 1, 2: 1}}, - dtype=np.float32) + dtype=np.uint8) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self): @@ -368,7 +368,7 @@ def test_dataframe_dummies_all_obj(self): expected = DataFrame({'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}, dtype=np.float32) + 'B_c': [0, 0, 1]}, dtype=np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self): @@ -380,7 +380,7 @@ def test_dataframe_dummies_mix_default(self): 'B_b': [1, 1, 0], 'B_c': [0, 0, 1]}) cols = ['A_a', 'A_b', 'B_b', 'B_c'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) @@ -396,7 +396,7 @@ def test_dataframe_dummies_prefix_list(self): 'from_B_b': [1, 1, 0], 'from_B_c': [0, 0, 1]}) cols = expected.columns[1:] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']] assert_frame_equal(result, expected) @@ -409,7 +409,7 @@ def test_dataframe_dummies_prefix_str(self): [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'], - dtype=np.float32) + dtype=np.uint8) expected = expected.astype({"C": np.int64}) assert_frame_equal(result, expected) @@ -422,7 +422,7 @@ def test_dataframe_dummies_subset(self): 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) cols = ['from_A_a', 'from_A_b'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self): @@ -435,7 +435,7 @@ def test_dataframe_dummies_prefix_sep(self): 'B..c': [0, 0, 1]}) expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] cols = expected.columns[1:] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse) @@ -466,7 +466,7 @@ def test_dataframe_dummies_prefix_dict(self): 'from_B_c': [0, 0, 1], 'C': [1, 2, 3]}) cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self): @@ -481,7 +481,7 @@ def test_dataframe_dummies_with_na(self): 'B_c': [0, 0, 1, 0], 'B_nan': [0, 0, 0, 1]}) cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']] assert_frame_equal(result, expected) @@ -518,7 +518,7 @@ def test_dataframe_dummies_with_categorical(self): 'cat_x': [1, 0, 0], 'cat_y': [0, 1, 1]}) cols = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] - expected[cols] = expected[cols].astype(np.float32) 
+ expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']] assert_frame_equal(result, expected) @@ -535,7 +535,7 @@ def test_basic_drop_first(self): 2: 0}, 'c': {0: 0, 1: 0, - 2: 1}}, dtype=np.float32) + 2: 1}}, dtype=np.uint8) result = get_dummies(s_list, sparse=self.sparse, drop_first=True) assert_frame_equal(result, expected) @@ -573,7 +573,7 @@ def test_basic_drop_first_NA(self): res = get_dummies(s_NA, sparse=self.sparse, drop_first=True) exp = DataFrame({'b': {0: 0, 1: 1, - 2: 0}}, dtype=np.float32) + 2: 0}}, dtype=np.uint8) assert_frame_equal(res, exp) res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, @@ -583,7 +583,7 @@ def test_basic_drop_first_NA(self): 2: 0}, nan: {0: 0, 1: 0, - 2: 1}}, dtype=np.float32).reindex_axis( + 2: 1}}, dtype=np.uint8).reindex_axis( ['b', nan], 1) assert_frame_equal(res_na, exp_na) @@ -606,7 +606,7 @@ def test_dataframe_dummies_drop_first(self): df = self.df[['A', 'B']] result = get_dummies(df, sparse=self.sparse, drop_first=True) expected = DataFrame({'A_b': [0, 1, 0], - 'B_c': [0, 0, 1]}, dtype=np.float32) + 'B_c': [0, 0, 1]}, dtype=np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self): @@ -618,7 +618,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self): 'B_c': [0, 0, 1], 'cat_y': [0, 1, 1]}) cols = ['A_b', 'B_c', 'cat_y'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_b', 'B_c', 'cat_y']] assert_frame_equal(result, expected) @@ -633,7 +633,7 @@ def test_dataframe_dummies_drop_first_with_na(self): 'B_c': [0, 0, 1, 0], 'B_nan': [0, 0, 0, 1]}) cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] assert_frame_equal(result, expected) @@ -656,7 +656,7 @@ def test_dataframe_dummies_drop_first_with_na(self): 'B_c': [0, 0, 1, 0]}, dtype=np.float64) cols = ['A_b', 'B_c'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) @@ -675,14 +675,14 @@ def test_int_int(self): data = Series([1, 2, 1]) result = pd.get_dummies(data) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], - dtype=np.float32) + dtype=np.uint8) tm.assert_frame_equal(result, expected) data = Series(pd.Categorical(['a', 'b', 'a'])) result = pd.get_dummies(data) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(['a', 'b']), - dtype=np.float32) + dtype=np.uint8) tm.assert_frame_equal(result, expected) def test_int_df(self): @@ -699,7 +699,7 @@ def test_int_df(self): [2, 2., 0, 1, 0, 1], [1, 1., 1, 0, 1, 0] ], columns=columns) - expected[columns[2:]] = expected[columns[2:]].astype(np.float32) + expected[columns[2:]] = expected[columns[2:]].astype(np.uint8) result = pd.get_dummies(data, columns=['A', 'B']) tm.assert_frame_equal(result, expected) @@ -710,7 +710,7 @@ def test_dataframe_dummies_preserve_categorical_dtype(self): ordered=ordered) result = get_dummies(cat) - data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32) + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.uint8) cols = pd.CategoricalIndex(cat.categories, categories=cat.categories, ordered=ordered) From f5490f8f1ea2a70ad387d69237f4ae028c74d1c0 Mon Sep 17 00:00:00 2001 From: beckermr Date: Sat, 8 Apr 
2017 14:54:21 -0500 Subject: [PATCH 5/9] ENH reworked new get_dummies --- pandas/core/reshape.py | 112 ++++++++++++-------------------- pandas/tests/test_reshape.py | 2 +- pandas/tests/types/test_cast.py | 93 ++++++++++++++++++++++++++ pandas/types/cast.py | 55 +++++++++++++++- 4 files changed, 188 insertions(+), 74 deletions(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 9d580a53c5c0a..2871d5760d0f1 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -1188,34 +1188,20 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, # Infer the proper output dtype. # GH15926 - try: - if np.all(np.isfinite(data.values if hasattr(data, 'values') else data)): - any_null = True - else: - any_null = False - except TypeError: - any_null = False - - if any_null: - output_dtype = np.uint8 - else: - fill_value_dtype, fill_value = infer_dtype_from_scalar(fill_value) - - if 'int' in str(fill_value_dtype): - if fill_value >= 0: - if fill_value <= np.iinfo(np.uint8).max: - output_dtype = np.uint8 - else: - output_dtype = np.uint64 - else: - if fill_value >= np.iinfo(np.int8).min and fill_value <= np.iinfo(np.int8).max: - output_dtype = np.int8 - else: - output_dtype = np.int64 - elif 'float' in str(fill_value_dtype): + vals = data.values.ravel() if hasattr(data, 'values') else data + isnotfinite = [] + for v in vals: + try: + isnotfinite.append(~np.isfinite(v)) + except TypeError: + isnotfinite.append(False) + if np.any(isnotfinite): + output_dtype, fill_value = infer_dtype_from_scalar( + fill_value, downcast=True, allow_uint=True) + if output_dtype == np.float16: output_dtype = np.float32 - else: - raise ValueError('`fill_value` must be `np.nan`, an int or a float type!') + else: + output_dtype = np.uint8 if isinstance(data, DataFrame): # determine columns being encoded @@ -1297,9 +1283,10 @@ def get_empty_Frame(data, sparse): # Record NaN values before we munge the codes, GH15826 nan_codes_msk = codes == -1 + num_orig_levels = len(levels) codes = codes.copy() if dummy_na: - codes[nan_codes_msk] = len(levels) + codes[nan_codes_msk] = num_orig_levels levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again @@ -1323,57 +1310,38 @@ def get_empty_Frame(data, sparse): if sparse: sparse_series = {} N = len(data) - sp_indices = [[] for _ in range(len(dummy_cols))] - for ndx, code, missing in zip( - range(len(codes)), codes, nan_codes_msk): - if missing: - # For missing values, we have to decide what to do. - # GH15926 - - if dummy_na: - # Then we need a one in the last column. - # GH15926 - sp_indices[code].append(ndx) - - if fill_value != 0: - # Then we need to mark these locations to put back another - # fill value later. (Zero fill values will be filled by the - # sparse array implicitly). - # Use a negative index here to code NaNs. - # Offset by -1 to account for zero. - # Have to add to ALL columns, except the - # last one if dummy_na. - # GH15926 - if dummy_na: - _num_cols = len(dummy_cols) - 1 - else: - _num_cols = len(dummy_cols) - for _code in range(_num_cols): - sp_indices[_code].append(-ndx - 1) - else: - # Value is not missing so do as normal. - # GH15926 - sp_indices[code].append(ndx) + # Construct lists of inds and if the value is NaN. 
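+        # For example, with dummy_na=False, a non-zero fill_value and
+        # codes == [0, 1, -1]: column 0 collects rows [0, 2], column 1
+        # collects rows [1, 2], and sp_fill marks row 2 in each column
+        # so its provisional one is replaced by fill_value below.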
+ # GH15926 + sp_indices = [None] * len(dummy_cols) + sp_fill = [None] * len(dummy_cols) + for code in np.unique(codes[codes != -1]): + # Non-zero value in sparse array if value is of the level + # or the value is NaN and it is filled non-zero and + # and it is not the dummy column for NaNs. + # GH15926 + sp_indices[code] = sorted( + np.where((codes == code) | + ((fill_value != 0) & + (code < num_orig_levels) & + nan_codes_msk))[0].tolist()) + + # Value is filled with `fill_value` if it is NaN + # and not in dummy col and fill value is non-zero. + # GH15926 + sp_fill[code] = (nan_codes_msk[sp_indices[code]] & + (fill_value != 0) & + (code < num_orig_levels)) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] - for col, ixs in zip(dummy_cols, sp_indices): - sarr = np.ones(len(ixs), dtype=output_dtype) - - # NaNs are marked by a negative index. - # Only need to set for sparse output if - # fill_value != 0. - # Ditto for any negative indexes generated above. - # GH15926 - if fill_value != 0: - ixs = np.array(ixs) - sarr[ixs < 0] = fill_value - ixs[ixs < 0] += 1 # undo the offset - ixs = np.abs(ixs) # set index back to positive. + sp_fill = sp_fill[1:] + for col, ixs, fill in zip(dummy_cols, sp_indices, sp_fill): + sarr = np.ones(len(ixs), dtype=output_dtype) + sarr[fill] = fill_value # Fill with `fill_value`, GH15926 sarr = SparseArray( sarr, sparse_index=IntIndex(N, ixs), diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index e3a09d7065664..831edf697b2ad 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -327,8 +327,8 @@ def test_include_na(self): dtype=np.uint8) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) - # Add `fill_value` keyword GH15926 def test_fill_value_na(self): + # Add `fill_value` keyword GH15926 s = ['a', 'b', np.nan] res_na = get_dummies( s, dummy_na=True, fill_value=np.nan, sparse=self.sparse) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index de6ef7af9d7f9..713ca2747de7e 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -86,6 +86,99 @@ def test_datetime_with_timezone(self): class TestInferDtype(object): + def test_infer_dtype_from_scalar_downcast_basic(self): + # Make sure downcasting works. 
GH15926 + + for dtypec in [np.int8, np.int16, np.int32, np.int64]: + data = dtypec(12) + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=False) + assert dtype == np.int8 + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == np.uint8 + + data = dtypec(-12) + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=False) + assert dtype == np.int8 + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == np.int8 + + for dtypec in [np.uint8, np.uint16, np.uint32, np.uint64]: + data = dtypec(12) + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=False) + assert dtype == np.uint8 + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == np.uint8 + + data = 12 + dtype, val = infer_dtype_from_scalar( + data, downcast=True) + assert dtype == np.int8 + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == np.uint8 + + data = -12 + dtype, val = infer_dtype_from_scalar( + data, downcast=True) + assert dtype == np.int8 + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == np.int8 + + + for dtypec in [np.float16, np.float32, np.float64]: + data = dtypec(12) + dtype, val = infer_dtype_from_scalar(data, downcast=True) + assert dtype == np.float16 + + data = np.float(12) + dtype, val = infer_dtype_from_scalar(data, downcast=True) + assert dtype == np.float16 + + def test_infer_dtype_from_scalar_downcast_bounds(self): + # Make sure downcasting works at bounds. GH15926 + + for dtypec, dtypec_up in [(np.uint8, np.uint16), + (np.uint16, np.uint32), + (np.uint32, np.uint64)]: + val = dtypec(np.iinfo(dtypec).max) + + data = dtypec(val - 1) + dtype, _ = infer_dtype_from_scalar( + data, downcast=True, allow_uint=False) + assert dtype == dtypec + dtype, _ = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == dtypec + + data = dtypec_up(val + 1) + dtype, _ = infer_dtype_from_scalar( + data, downcast=True, allow_uint=False) + assert dtype == dtypec_up + dtype, _ = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == dtypec_up + + for dtypec, dtypec_up in [(np.float16, np.float32), + (np.float32, np.float64)]: + data = dtypec(np.finfo(dtypec).min) + dtype, _ = infer_dtype_from_scalar(data, downcast=True) + assert dtype == dtypec_up + dtype, _ = infer_dtype_from_scalar(data, downcast=True) + assert dtype == dtypec_up + + data = dtypec(np.finfo(dtypec).max) + dtype, _ = infer_dtype_from_scalar(data, downcast=True) + assert dtype == dtypec_up + dtype, _ = infer_dtype_from_scalar(data, downcast=True) + assert dtype == dtypec_up + def test_infer_dtype_from_scalar(self): # Test that _infer_dtype_from_scalar is returning correct dtype for int # and float. diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 580ce12de3333..77c59417090b5 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -312,7 +312,10 @@ def maybe_promote(dtype, fill_value=np.nan): return dtype, fill_value -def infer_dtype_from_scalar(val, pandas_dtype=False): +def infer_dtype_from_scalar(val, + pandas_dtype=False, + downcast=False, + allow_uint=False): """ interpret the dtype from a scalar @@ -322,8 +325,52 @@ def infer_dtype_from_scalar(val, pandas_dtype=False): whether to infer dtype including pandas extension types. 
If False, scalar belongs to pandas extension types is inferred as object + downcast : bool, default False + If True, downcast float and integer types to the smallest width + type that can hold `val`. + + .. versionadded:: 0.20.0 + allow_uint : bool, default False + If True and `downcast` is True, non-negative integers will be + downcast to smallest width unsigned integer type that can hold + them. Otherwise, signed types are always downcast to signed types + and the same for unsigned types. + + .. versionadded:: 0.20.0 """ + def _downcast_dtype(dtype, val): + if 'float' in str(dtype): + if ((val > np.finfo(np.float16).min and + val < np.finfo(np.float16).max) or val is np.nan): + return np.float16 + elif (val > np.finfo(np.float32).min and + val < np.finfo(np.float32).max): + return np.float32 + else: + return np.float64 + elif 'uint' in str(dtype) or (val >= 0 and allow_uint): + if val < np.iinfo(np.uint8).max: + return np.uint8 + elif val < np.iinfo(np.uint16).max: + return np.uint16 + elif val < np.iinfo(np.uint32).max: + return np.uint32 + else: + return np.uint64 + elif 'int' in str(dtype): + if (val > np.iinfo(np.int8).min and + val < np.iinfo(np.int8).max): + return np.int8 + elif (val > np.iinfo(np.int16).min and + val < np.iinfo(np.int16).max): + return np.int16 + elif (val > np.iinfo(np.int32).min and + val < np.iinfo(np.int32).max): + return np.int32 + else: + return np.int64 + dtype = np.object_ # a 1-element ndarray @@ -335,6 +382,8 @@ def infer_dtype_from_scalar(val, pandas_dtype=False): dtype = val.dtype val = val.item() + dtype = _downcast_dtype(dtype, val) if downcast else dtype + elif isinstance(val, string_types): # If we create an empty array using a string to infer @@ -370,12 +419,16 @@ def infer_dtype_from_scalar(val, pandas_dtype=False): else: dtype = np.int64 + dtype = _downcast_dtype(dtype, val) if downcast else dtype + elif is_float(val): if isinstance(val, np.floating): dtype = type(val) else: dtype = np.float64 + dtype = _downcast_dtype(dtype, val) if downcast else dtype + elif is_complex(val): dtype = np.complex_ From 3e932a8709200264a5db85667c4f593eadccf6a2 Mon Sep 17 00:00:00 2001 From: beckermr Date: Sat, 8 Apr 2017 14:56:00 -0500 Subject: [PATCH 6/9] STY removed an extra blank line --- pandas/tests/types/test_cast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index 713ca2747de7e..f0cca3803eea8 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -131,7 +131,6 @@ def test_infer_dtype_from_scalar_downcast_basic(self): data, downcast=True, allow_uint=True) assert dtype == np.int8 - for dtypec in [np.float16, np.float32, np.float64]: data = dtypec(12) dtype, val = infer_dtype_from_scalar(data, downcast=True) From 1ac101fa33da437eab4e87524f8340ae51ab9eb8 Mon Sep 17 00:00:00 2001 From: beckermr Date: Tue, 11 Apr 2017 12:50:17 -0500 Subject: [PATCH 7/9] ENH added general routine for downcasting types, refactored in other spots --- pandas/core/reshape.py | 39 +++-- pandas/tests/types/test_cast.py | 245 +++++++++++++++++++++----------- pandas/tools/util.py | 31 +--- pandas/types/cast.py | 123 +++++++++------- 4 files changed, 262 insertions(+), 176 deletions(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 2871d5760d0f1..b9090eb98e754 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -11,7 +11,8 @@ from pandas.types.common import (_ensure_platform_int, is_list_like, is_bool_dtype, needs_i8_conversion) -from pandas.types.cast 
import maybe_promote, infer_dtype_from_scalar +from pandas.types.cast import (maybe_promote, infer_dtype_from_scalar, + maybe_downcast_itemsize) from pandas.types.missing import notnull import pandas.types.concat as _concat @@ -1077,7 +1078,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix.` dummy_na : bool, default False - Add a column to indicate NaNs if True. + If True, add an extra dummy column to indicate NaNs, otherwise + no extra column is added. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with @@ -1094,11 +1096,16 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, .. versionadded:: 0.18.0 fill_value : scalar, default None - Value to fill NaNs with. The default of `None` will fill with - zeros. To do no filling of NaNs, specify `fill_value=np.nan`. - The default behavior of filling with zeros will be deprecated - in the future and using this default will not raise a - `FutureWarning`. + Value to fill NaNs with. If no missing values are found or NaN is not + used to fill them, the returned data type will be the smallest + width type that can represent the returned values. See + pandas.types.cast.maybe_downcast_itemsize for details. If NaNs are + present and NaN is used to fill them, then the smallest floating + point type (typically `np.float32`) will be used. Currently, the + default of `None` will fill with zeros. To do no filling of NaNs, + specify `fill_value=np.nan`. The default behavior of filling with + zeros will be deprecated in the future and using this default will + now raise a `FutureWarning`. .. versionadded:: 0.20.0 Returns @@ -1196,10 +1203,20 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, except TypeError: isnotfinite.append(False) if np.any(isnotfinite): - output_dtype, fill_value = infer_dtype_from_scalar( - fill_value, downcast=True, allow_uint=True) - if output_dtype == np.float16: - output_dtype = np.float32 + output_dtype, fill_value = infer_dtype_from_scalar(fill_value) + # `maybe_downcast_itemsize` only accepts arrays, so make a one + # element array and then extract the value back out. GH15926 + if 'float' in str(output_dtype) or fill_value is np.nan: + output_dtype, fill_value = maybe_downcast_itemsize( + np.array([np.float64(fill_value)]), 'float') + elif 'int' in str(output_dtype): + if fill_value >= 0: + fill_value = np.uint64(fill_value) + else: + fill_value = np.int64(fill_value) + output_dtype, fill_value \ + = maybe_downcast_itemsize(np.array([fill_value]), 'unsigned') + fill_value = output_dtype(fill_value[0]) else: output_dtype = np.uint8 diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index f0cca3803eea8..0ffdac1e878e3 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -16,7 +16,8 @@ infer_dtype_from_array, maybe_convert_string_to_object, maybe_convert_scalar, - find_common_type) + find_common_type, + maybe_downcast_itemsize) from pandas.types.dtypes import (CategoricalDtype, DatetimeTZDtype, PeriodDtype) from pandas.util import testing as tm @@ -84,99 +85,181 @@ def test_datetime_with_timezone(self): tm.assert_index_equal(res, exp) -class TestInferDtype(object): - - def test_infer_dtype_from_scalar_downcast_basic(self): - # Make sure downcasting works. 
-
-        for dtypec in [np.int8, np.int16, np.int32, np.int64]:
-            data = dtypec(12)
-            dtype, val = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=False)
-            assert dtype == np.int8
-            dtype, val = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=True)
-            assert dtype == np.uint8
+class TestMaybeDowncastItemSize(object):

-            data = dtypec(-12)
-            dtype, val = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=False)
-            assert dtype == np.int8
-            dtype, val = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=True)
-            assert dtype == np.int8
-
-        for dtypec in [np.uint8, np.uint16, np.uint32, np.uint64]:
-            data = dtypec(12)
-            dtype, val = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=False)
-            assert dtype == np.uint8
-            dtype, val = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=True)
-            assert dtype == np.uint8
-
-        data = 12
-        dtype, val = infer_dtype_from_scalar(
-            data, downcast=True)
-        assert dtype == np.int8
-        dtype, val = infer_dtype_from_scalar(
-            data, downcast=True, allow_uint=True)
-        assert dtype == np.uint8
+    @pytest.mark.parametrize(
+        "dtypec",
+        [np.float16, np.float32, np.float64])
+    def test_maybe_downcast_itemsize_float(self, dtypec):
+        # Make sure downcasting works for floats. GH15926
+
+        data = np.array([12], dtype=dtypec)
+        dtype, val = maybe_downcast_itemsize(data, 'float')
+        if np.dtype(dtypec).itemsize >= 4:
+            assert dtype == np.float32
+        else:
+            assert dtype == dtypec

-        data = -12
-        dtype, val = infer_dtype_from_scalar(
-            data, downcast=True)
+    @pytest.mark.parametrize(
+        "data, dtypec",
+        [(12, np.int8),
+         (12, np.int16),
+         (12, np.int32),
+         (12, np.int64),
+         (12, np.uint8),
+         (12, np.uint16),
+         (12, np.uint32),
+         (12, np.uint64),
+         (-12, np.int8),
+         (-12, np.int16),
+         (-12, np.int32),
+         (-12, np.int64)])
+    def test_maybe_downcast_itemsize_int(self, data, dtypec):
+        # Make sure downcasting works for ints. GH15926
+
+        data = np.array([data], dtype=dtypec)
+        dtype, val = maybe_downcast_itemsize(
+            data, downcast='integer')
         assert dtype == np.int8
-        dtype, val = infer_dtype_from_scalar(
-            data, downcast=True, allow_uint=True)
+        dtype, val = maybe_downcast_itemsize(
+            data, downcast='signed')
         assert dtype == np.int8
+        dtype, val = maybe_downcast_itemsize(
+            data, downcast='unsigned')
+        if val >= 0:
+            assert dtype == np.uint8
+        else:
+            assert dtype == dtypec
+        dtype, val = maybe_downcast_itemsize(
+            data, downcast='float')
+        if np.dtype(dtypec).itemsize >= 4:
+            assert dtype == np.float32
+        else:
+            assert dtype == dtypec

-        for dtypec in [np.float16, np.float32, np.float64]:
-            data = dtypec(12)
-            dtype, val = infer_dtype_from_scalar(data, downcast=True)
-            assert dtype == np.float16
-
-            data = np.float(12)
-            dtype, val = infer_dtype_from_scalar(data, downcast=True)
-            assert dtype == np.float16
+    @pytest.mark.parametrize(
+        "dtypec, dtypec_up",
+        [(np.uint8, np.uint16),
+         (np.uint16, np.uint32),
+         (np.uint32, np.uint64)])
+    def test_maybe_downcast_itemsize_uint_bounds(self, dtypec, dtypec_up):
+        # Make sure downcasting works at bounds for uint. GH15926
+
+        val = np.array([np.iinfo(dtypec).max], dtype=dtypec)
+
+        data = val - 1
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'unsigned')
+        assert dtype == dtypec
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'integer')
+        assert dtype == dtypec
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'signed')
+        assert dtype == dtypec
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'float')
+        if np.dtype(dtypec).itemsize >= 4:
+            assert dtype == np.float32
+        else:
+            assert dtype == dtypec

-    def test_infer_dtype_from_scalar_downcast_bounds(self):
-        # Make sure downcasting works at bounds. GH15926
+        data = val.astype(dtypec_up) + 1
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'unsigned')
+        assert dtype == dtypec_up
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'integer')
+        assert dtype \
+            == getattr(np, str(np.dtype(dtypec_up)).replace('uint', 'int'))
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'signed')
+        assert dtype \
+            == getattr(np, str(np.dtype(dtypec_up)).replace('uint', 'int'))
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'float')
+        if np.dtype(dtypec_up).itemsize >= 4:
+            assert dtype == np.float32
+        else:
+            assert dtype == dtypec_up

-        for dtypec, dtypec_up in [(np.uint8, np.uint16),
-                                  (np.uint16, np.uint32),
-                                  (np.uint32, np.uint64)]:
-            val = dtypec(np.iinfo(dtypec).max)
+    @pytest.mark.parametrize(
+        "dtypec, dtypec_up",
+        [(np.float16, np.float32),
+         (np.float32, np.float64)])
+    def test_maybe_downcast_itemsize_float_bounds(self, dtypec, dtypec_up):
+        # Make sure downcasting works at bounds for float. GH15926
+
+        data = np.array(
+            [float(np.finfo(dtypec).min) * 2.0], dtype=dtypec_up)
+        dtype, val = maybe_downcast_itemsize(data, 'float')
+        assert dtype == dtypec_up
+
+        data = np.array(
+            [float(np.finfo(dtypec).max) * 2.0], dtype=dtypec_up)
+        dtype, _ = maybe_downcast_itemsize(data, 'float')
+        assert dtype == dtypec_up
+
+        data = np.array(
+            [float(np.finfo(dtypec).min) * 0.5], dtype=dtypec)
+        dtype, val = maybe_downcast_itemsize(data, 'float')
+        assert dtype == dtypec
+
+        data = np.array(
+            [float(np.finfo(dtypec).max) * 0.5], dtype=dtypec)
+        dtype, _ = maybe_downcast_itemsize(data, 'float')
+        assert dtype == dtypec

-            data = dtypec(val - 1)
-            dtype, _ = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=False)
-            assert dtype == dtypec
-            dtype, _ = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=True)
+    @pytest.mark.parametrize(
+        "dtypec, dtypec_up",
+        [(np.int8, np.int16),
+         (np.int16, np.int32),
+         (np.int32, np.int64)])
+    def test_maybe_downcast_itemsize_int_bounds(self, dtypec, dtypec_up):
+        # Make sure downcasting works at bounds for int. GH15926
+
+        val = np.array([np.iinfo(dtypec).max], dtype=dtypec)
+
+        data = val - 1
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'unsigned')
+        assert dtype \
+            == getattr(np, str(np.dtype(dtypec)).replace('int', 'uint'))
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'integer')
+        assert dtype == dtypec
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'signed')
+        assert dtype == dtypec
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'float')
+        if np.dtype(dtypec).itemsize >= 4:
+            assert dtype == np.float32
+        else:
             assert dtype == dtypec

-            data = dtypec_up(val + 1)
-            dtype, _ = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=False)
-            assert dtype == dtypec_up
-            dtype, _ = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=True)
+        data = val.astype(dtypec_up) + 1
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'unsigned')
+        assert dtype \
+            == getattr(np, str(np.dtype(dtypec)).replace('int', 'uint'))
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'integer')
+        assert dtype \
+            == getattr(np, str(np.dtype(dtypec_up)).replace('uint', 'int'))
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'signed')
+        assert dtype \
+            == getattr(np, str(np.dtype(dtypec_up)).replace('uint', 'int'))
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'float')
+        if np.dtype(dtypec_up).itemsize >= 4:
+            assert dtype == np.float32
+        else:
             assert dtype == dtypec_up

-        for dtypec, dtypec_up in [(np.float16, np.float32),
-                                  (np.float32, np.float64)]:
-            data = dtypec(np.finfo(dtypec).min)
-            dtype, _ = infer_dtype_from_scalar(data, downcast=True)
-            assert dtype == dtypec_up
-            dtype, _ = infer_dtype_from_scalar(data, downcast=True)
-            assert dtype == dtypec_up
-            data = dtypec(np.finfo(dtypec).max)
-            dtype, _ = infer_dtype_from_scalar(data, downcast=True)
-            assert dtype == dtypec_up
-            dtype, _ = infer_dtype_from_scalar(data, downcast=True)
-            assert dtype == dtypec_up
+class TestInferDtype(object):

     def test_infer_dtype_from_scalar(self):
         # Test that _infer_dtype_from_scalar is returning correct dtype for int

diff --git a/pandas/tools/util.py b/pandas/tools/util.py
index 263d2f16a4216..4f2c6bbd23951 100644
--- a/pandas/tools/util.py
+++ b/pandas/tools/util.py
@@ -9,7 +9,7 @@
                            is_decimal,
                            is_scalar as isscalar)

-from pandas.types.cast import maybe_downcast_to_dtype
+from pandas.types.cast import maybe_downcast_itemsize
 import pandas as pd
 from pandas.compat import reduce
@@ -159,9 +159,6 @@ def to_numeric(arg, errors='raise', downcast=None):
     3   -3.0
     dtype: float64
     """
-    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
-        raise ValueError('invalid downcasting method provided')
-
     is_series = False
     is_index = False
     is_scalar = False
@@ -206,31 +203,7 @@ def to_numeric(arg, errors='raise', downcast=None):
     # attempt downcast only if the data has been successfully converted
     # to a numerical dtype and if a downcast method has been specified
     if downcast is not None and is_numeric_dtype(values):
-        typecodes = None
-
-        if downcast in ('integer', 'signed'):
-            typecodes = np.typecodes['Integer']
-        elif downcast == 'unsigned' and np.min(values) >= 0:
-            typecodes = np.typecodes['UnsignedInteger']
-        elif downcast == 'float':
-            typecodes = np.typecodes['Float']
-
-            # pandas support goes only to np.float32,
-            # as float dtypes smaller than that are
-            # extremely rare and not well supported
-            float_32_char = np.dtype(np.float32).char
-            float_32_ind = typecodes.index(float_32_char)
-            typecodes = typecodes[float_32_ind:]
-
-        if typecodes is not None:
-            # from smallest to largest
-            for dtype in typecodes:
-                if np.dtype(dtype).itemsize <= values.dtype.itemsize:
-                    values = maybe_downcast_to_dtype(values, dtype)
-
-                    # successful conversion
-                    if values.dtype == dtype:
-                        break
+        _, values = maybe_downcast_itemsize(values, downcast)

     if is_series:
         return pd.Series(values, index=arg.index, name=arg.name)

diff --git a/pandas/types/cast.py b/pandas/types/cast.py
index 77c59417090b5..b724a1711f448 100644
--- a/pandas/types/cast.py
+++ b/pandas/types/cast.py
@@ -90,7 +90,11 @@ def trans(x):  # noqa
             return result

         if issubclass(dtype.type, np.floating):
-            return result.astype(dtype)
+            if np.allclose(result, trans(result).astype(dtype)):
+                return result.astype(dtype)
+            else:
+                return result
+
         elif is_bool_dtype(dtype) or is_integer_dtype(dtype):

             # if we don't have any elements, just astype it
@@ -312,10 +316,70 @@ def maybe_promote(dtype, fill_value=np.nan):
     return dtype, fill_value


-def infer_dtype_from_scalar(val,
-                            pandas_dtype=False,
-                            downcast=False,
-                            allow_uint=False):
+def maybe_downcast_itemsize(val, downcast):
+    """maybe downcast a numeric array to a smaller itemsize
+
+    Parameters
+    ----------
+    val : ndarray
+        Numeric array that may be downcast.
+    downcast : str, one of {'integer', 'signed', 'unsigned', 'float'}
+        Downcast the data to the smallest numerical dtype
+        possible according to the following rules:
+
+        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
+        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
+        - 'float': smallest float dtype (min.: np.float32)
+
+        Downcasting will only occur if the size
+        of the data's dtype is strictly larger than
+        the dtype it is to be cast to, so if none of the dtypes
+        checked satisfy that specification, no downcasting will be
+        performed on the data.
+
+        Data with dtypes smaller than the minimums above will be
+        returned as is.
+
+        .. versionadded:: 0.20.0
+
+    Returns
+    -------
+    dtype : a numpy scalar type
+    val : the maybe-downcast array
+    """
+
+    if downcast not in ('integer', 'signed', 'unsigned', 'float'):
+        raise ValueError('invalid downcasting method provided')
+
+    typecodes = None
+
+    if downcast in ('integer', 'signed'):
+        typecodes = np.typecodes['Integer']
+    elif downcast == 'unsigned' and np.min(val) >= 0:
+        typecodes = np.typecodes['UnsignedInteger']
+    elif downcast == 'float':
+        typecodes = np.typecodes['Float']
+
+        # pandas support goes only to np.float32,
+        # as float dtypes smaller than that are
+        # extremely rare and not well supported
+        float_32_char = np.dtype(np.float32).char
+        float_32_ind = typecodes.index(float_32_char)
+        typecodes = typecodes[float_32_ind:]
+
+    if typecodes is not None:
+        # from smallest to largest
+        for dtype in typecodes:
+            if np.dtype(dtype).itemsize <= val.dtype.itemsize:
+                val = maybe_downcast_to_dtype(val, dtype)
+
+                # successful conversion
+                if val.dtype == dtype:
+                    break
+
+    return val.dtype.type, val
+
+
+def infer_dtype_from_scalar(val, pandas_dtype=False):
     """
     interpret the dtype from a scalar

@@ -325,52 +388,8 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
        whether to infer dtype including pandas extension types.
        If False, scalar belongs to pandas extension types is inferred as
        object
-    downcast : bool, default False
-        If True, downcast float and integer types to the smallest width
-        type that can hold `val`.
-
-        .. versionadded:: 0.20.0
-    allow_uint : bool, default False
-        If True and `downcast` is True, non-negative integers will be
-        downcast to the smallest width unsigned integer type that can hold
-        them. Otherwise, signed types are always downcast to signed types
-        and the same for unsigned types.
-
-        .. versionadded:: 0.20.0
     """
-    def _downcast_dtype(dtype, val):
-        if 'float' in str(dtype):
-            if ((val > np.finfo(np.float16).min and
-                 val < np.finfo(np.float16).max) or val is np.nan):
-                return np.float16
-            elif (val > np.finfo(np.float32).min and
-                  val < np.finfo(np.float32).max):
-                return np.float32
-            else:
-                return np.float64
-        elif 'uint' in str(dtype) or (val >= 0 and allow_uint):
-            if val < np.iinfo(np.uint8).max:
-                return np.uint8
-            elif val < np.iinfo(np.uint16).max:
-                return np.uint16
-            elif val < np.iinfo(np.uint32).max:
-                return np.uint32
-            else:
-                return np.uint64
-        elif 'int' in str(dtype):
-            if (val > np.iinfo(np.int8).min and
-                    val < np.iinfo(np.int8).max):
-                return np.int8
-            elif (val > np.iinfo(np.int16).min and
-                  val < np.iinfo(np.int16).max):
-                return np.int16
-            elif (val > np.iinfo(np.int32).min and
-                  val < np.iinfo(np.int32).max):
-                return np.int32
-            else:
-                return np.int64
-
     dtype = np.object_

     # a 1-element ndarray
@@ -382,8 +401,6 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
         dtype = val.dtype
         val = val.item()

-        dtype = _downcast_dtype(dtype, val) if downcast else dtype
-
     elif isinstance(val, string_types):

         # If we create an empty array using a string to infer
@@ -419,16 +436,12 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
         else:
             dtype = np.int64

-        dtype = _downcast_dtype(dtype, val) if downcast else dtype
-
     elif is_float(val):
         if isinstance(val, np.floating):
             dtype = type(val)
         else:
             dtype = np.float64

-        dtype = _downcast_dtype(dtype, val) if downcast else dtype
-
     elif is_complex(val):
         dtype = np.complex_
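Reviewer note: a usage sketch of the `maybe_downcast_itemsize` routine introduced in this patch. It assumes a pandas build with the series applied; the import path matches the diff above.

```python
import numpy as np
from pandas.types.cast import maybe_downcast_itemsize

arr = np.array([1, 2, 300], dtype=np.int64)

# 300 does not fit in int8, so the smallest signed dtype is int16.
dtype, vals = maybe_downcast_itemsize(arr, 'integer')
print(vals.dtype)  # int16

# All values are non-negative, so 'unsigned' can shrink to uint16.
dtype, vals = maybe_downcast_itemsize(arr, 'unsigned')
print(vals.dtype)  # uint16

# 'float' stops at float32; pandas does not downcast below that.
dtype, vals = maybe_downcast_itemsize(arr.astype(np.float64), 'float')
print(vals.dtype)  # float32
```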
From 3111ed7f53c03a7e6e81ebd3113eb6330bbd519e Mon Sep 17 00:00:00 2001
From: beckermr
Date: Tue, 11 Apr 2017 12:54:13 -0500
Subject: [PATCH 8/9] DOC added docs for new function

---
 doc/source/whatsnew/v0.20.0.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 127a041952f7e..4db70ef7825ea 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -373,6 +373,7 @@ Other Enhancements
 - ``pd.read_csv()`` will now raise a ``ParserError`` error whenever any parsing error occurs (:issue:`15913`, :issue:`15925`)
 - ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`)
 - ``pd.get_dummies()`` now accepts the ``fill_value`` keyword which specifies how to fill NaN values in the dummy variables. (:issue:`15923`)
+- ``pd.types.cast`` has a new function ``maybe_downcast_itemsize`` which can be used to reduce the width of numeric types. (:issue:`15923`)

 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

From f7ee8f56f7ca9fab271889081806bcb4d7c67875 Mon Sep 17 00:00:00 2001
From: beckermr
Date: Tue, 11 Apr 2017 13:53:42 -0500
Subject: [PATCH 9/9] TST updated tests for new routine

---
 pandas/tests/test_reshape.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py
index 831edf697b2ad..10ecef6fe5a48 100644
--- a/pandas/tests/test_reshape.py
+++ b/pandas/tests/test_reshape.py
@@ -335,7 +335,7 @@ def test_fill_value_na(self):
         exp_na = DataFrame({'a': [1, 0, np.nan],
                             'b': [0, 1, np.nan],
                             np.nan: [0, 0, 1]},
-                           dtype=np.float32)
+                           dtype=np.float64)
         exp_na = exp_na.reindex_axis(['a', 'b', np.nan], 1)
         assert_frame_equal(res_na, exp_na)

         res_just_na = get_dummies(
                                  [nan], dummy_na=True, fill_value=np.nan,
                                  sparse=self.sparse)
         exp_just_na = DataFrame([[1]], columns=[np.nan],
-                                dtype=np.float32)
+                                dtype=np.float64)
         assert_frame_equal(res_just_na, exp_just_na)

     def test_unicode(self):
@@ -501,7 +501,7 @@ def test_dataframe_dummies_with_na(self):
                               'B_nan': [0, 0, 0, 1]}, dtype=np.float64)
         cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']
-        expected[cols] = expected[cols].astype(np.float32)
+        expected[cols] = expected[cols].astype(np.float64)
         expected = expected[
             ['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']]
         assert_frame_equal(result, expected)
@@ -589,7 +589,7 @@ def test_basic_drop_first_NA(self):
         res_na = get_dummies(s_NA, fill_value=np.nan, sparse=self.sparse,
                              drop_first=True)
-        exp_na = DataFrame({'b': [0, 1, np.nan]}, dtype=np.float32)
+        exp_na = DataFrame({'b': [0, 1, np.nan]}, dtype=np.float64)
         assert_frame_equal(res_na, exp_na)

         res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse,
@@ -645,7 +645,7 @@ def test_dataframe_dummies_drop_first_with_na(self):
                               'B_c': [0, 0, 1, np.nan],
                               'B_nan': [0, 0, 0, 1]}, dtype=np.float64)
         cols = ['A_b', 'B_c', 'A_nan', 'B_nan']
-        expected[cols] = expected[cols].astype(np.float32)
+        expected[cols] = expected[cols].astype(np.float64)
         expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']]
         assert_frame_equal(result, expected)

@@ -667,7 +667,7 @@ def test_dataframe_dummies_drop_first_with_na(self):
                               'B_c': [0, 0, 1, np.nan]}, dtype=np.float64)
         cols = ['A_b', 'B_c']
-        expected[cols] = expected[cols].astype(np.float32)
+        expected[cols] = expected[cols].astype(np.float64)
         expected = expected[['C', 'A_b', 'B_c']]
         assert_frame_equal(result, expected)
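Reviewer note: the end-to-end behavior of the finished series, mirrored from the updated tests above (assumes a pandas build with all nine patches applied):

```python
import numpy as np
import pandas as pd

s = pd.Series(['a', 'b', np.nan])

# Propagate missing values instead of zero-filling them; as in
# test_fill_value_na the resulting dummy columns are float64.
res_na = pd.get_dummies(s, dummy_na=True, fill_value=np.nan)

# Explicit zero filling keeps the compact uint8 dummies and, per the
# deprecation plan, avoids the FutureWarning raised when fill_value
# is left at its default.
res_zero = pd.get_dummies(s, fill_value=0)
```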