From 78c0af5bc707b6fc6013e505af968b46f8f6fe52 Mon Sep 17 00:00:00 2001
From: "Matthew R. Becker"
Date: Thu, 6 Apr 2017 12:54:55 -0500
Subject: [PATCH 1/9] ENH adding `fill_value` keyword to pd.get_dummies
 function, deprecating filling with zeros by default

---
 doc/source/whatsnew/v0.20.0.txt |  13 ++++
 pandas/core/reshape.py          | 130 +++++++++++++++++++++++++------
 pandas/tests/test_reshape.py    | 134 +++++++++++++++++++++++++-------
 3 files changed, 224 insertions(+), 53 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 0b98e57c606a3..41716e0e26ea0 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -372,6 +372,7 @@ Other Enhancements
 - :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels `. (:issue:`15694`)
 - ``pd.read_csv()`` will now raise a ``ParserError`` error whenever any parsing error occurs (:issue:`15913`, :issue:`15925`)
 - ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`)
+- ``pd.get_dummies()`` now accepts the ``fill_value`` keyword which specifies how to fill missing values in the dummy variables. (:issue:`15923`)

 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

@@ -382,6 +383,18 @@ Other Enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+Deprecate Automatic Zero Filling of Missing Values in .get_dummies
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``pd.get_dummies`` function currently fills missing values with zero by default. This behavior is in conflict with the rest of the pandas API since missing values should be filled with ``fillna`` and missing values should be propagated through pandas transformations. In the future, ``get_dummies`` will propagate missing values by default. The recommended way to reproduce the current behavior of filling with zeros with the new, upcoming API is
+
+.. ipython:: python
+
+   df = pd.get_dummies(df).fillna(0)
+
+For now, the current behavior of filling with zeros by default has been kept, but not specifying a fill value with the ``fill_value`` keyword will raise a ``DeprecationWarning`` with the example above.
+
+
 .. _whatsnew_0200.api_breaking.deprecate_ix:

 Deprecate .ix
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index b03c3d77928c7..f2792e554daa1 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -4,6 +4,7 @@
 from pandas import compat
 import itertools
 import re
+import warnings

 import numpy as np

@@ -1059,7 +1060,8 @@ def melt_stub(df, stub, i, j, value_vars, sep):


 def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
-                columns=None, sparse=False, drop_first=False):
+                fill_value=None, columns=None, sparse=False,
+                drop_first=False):
     """
     Convert categorical variable into dummy/indicator variables

@@ -1075,7 +1077,15 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
         If appending prefix, separator/delimiter to use. Or pass a
         list or dictionary as with `prefix.`
     dummy_na : bool, default False
-        Add a column to indicate NaNs, if False NaNs are ignored.
+        Add a column to indicate NaNs if True.
+    fill_value : scalar, default None
+        Value to fill NaNs with. The default of `None` will fill with
+        zeros. To do no filling of NaNs, specify `fill_value=np.nan`.
+        The default behavior of filling with zeros will be deprecated
+        in the future and using this default will not raise a
+        `DeprecationWarning`.
+
+        .. 
versionadded:: 0.20.0 columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with @@ -1121,6 +1131,18 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, 1 0 1 0 2 0 0 1 + >>> pd.get_dummies(s1, fill_value=np.nan) + a b + 0 1 0 + 1 0 1 + 2 NaN NaN + + >>> pd.get_dummies(s1, fill_value=np.nan, dummy_na=True) + a b NaN + 0 1 0 0 + 1 0 1 0 + 2 NaN NaN 1 + >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]}) @@ -1153,6 +1175,17 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, from pandas.tools.concat import concat from itertools import cycle + # Deprecate filling missing values with zeros, GH15926 + # When this is finally deprecated, simply remove this block + # of code and change the default to np.nan in the function signature + # of `get_dummies`. + if fill_value is None: + warnings.warn('The default behavior of filling missing values ' + 'with zeros will be deprecated. Use ' + '`df = pd.get_dummies(df).fillna(0)` to reproduce ' + 'this behavior', DeprecationWarning) + fill_value = 0.0 + if isinstance(data, DataFrame): # determine columns being encoded @@ -1197,17 +1230,19 @@ def check_len(item, name): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, - drop_first=drop_first) + drop_first=drop_first, + fill_value=fill_value) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, - sparse=sparse, drop_first=drop_first) + sparse=sparse, drop_first=drop_first, + fill_value=fill_value) return result def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, - sparse=False, drop_first=False): + fill_value=np.nan, sparse=False, drop_first=False): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) @@ -1221,17 +1256,22 @@ def get_empty_Frame(data, sparse): else: return SparseDataFrame(index=index, default_fill_value=0) - # if all NaN - if not dummy_na and len(levels) == 0: + # If we get all NaN and are not making a dummy col, then just return. + # GH15826 + if len(levels) == 0 and not dummy_na: return get_empty_Frame(data, sparse) + # Record missing values before we munge the codes, GH15826 + missing_codes_msk = codes == -1 codes = codes.copy() if dummy_na: - codes[codes == -1] = len(levels) + codes[missing_codes_msk] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again - if drop_first and len(levels) == 1: + # test for length of levels was changed to `<=` from `==` to cover + # all NaN inputs, GH15826 + if drop_first and len(levels) <= 1: return get_empty_Frame(data, sparse) number_of_cols = len(levels) @@ -1250,11 +1290,36 @@ def get_empty_Frame(data, sparse): sparse_series = {} N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] - for ndx, code in enumerate(codes): - if code == -1: - # Blank entries if not dummy_na and code == -1, #GH4446 - continue - sp_indices[code].append(ndx) + for ndx, code, missing in zip( + range(len(codes)), codes, missing_codes_msk): + if missing: + # For missing values, we have to decide what to do. + # GH15926 + + if dummy_na: + # Then we need a one in the last column. + # GH15926 + sp_indices[code].append(ndx) + + if fill_value != 0: + # Then we need to mark these locations to put back another + # fill value later. (Zero fill values will be filled by the + # sparse array implicitly). 
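+                    # For example, a NaN at row 0 is stored as -1 and
+                    # a NaN at row 3 as -4 (ndx -> -ndx - 1); the
+                    # write-out loop below sets those entries to
+                    # `fill_value` and then restores the row numbers
+                    # with `ixs[ixs < 0] += 1` and `np.abs(ixs)`.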
+ # Use a negative index here to code NaNs. + # Offset by -1 to account for zero. + # Have to add to ALL columns, except the + # last one if dummy_na. + # GH15926 + if dummy_na: + _num_cols = len(dummy_cols) - 1 + else: + _num_cols = len(dummy_cols) + for _code in range(_num_cols): + sp_indices[_code].append(-ndx - 1) + else: + # Value is not missing so do as normal. + # GH15926 + sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity @@ -1262,28 +1327,47 @@ def get_empty_Frame(data, sparse): sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): - sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8), - sparse_index=IntIndex(N, ixs), fill_value=0, - dtype=np.uint8) + sarr = np.ones(len(ixs), dtype=np.float32) + + # NaNs are marked by a negative index. + # Only need to set for sparse output if + # fill_value != 0. + # Ditto for any negative indexes generated above. + # GH15926 + if fill_value != 0: + ixs = np.array(ixs) + sarr[ixs < 0] = fill_value + ixs[ixs < 0] += 1 # undo the offset + ixs = np.abs(ixs) # set index back to positive. + + sarr = SparseArray( + sarr, + sparse_index=IntIndex(N, ixs), + fill_value=0, + dtype=np.float32) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, - dtype=np.uint8) + dtype=np.float32) return out else: - dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0) + dummy_mat = np.eye( + number_of_cols, dtype=np.float32).take(codes, axis=0) - if not dummy_na: - # reset NaN GH4446 - dummy_mat[codes == -1] = 0 + # user specified fill value via `fill_value` GH15926 + if dummy_na: + dummy_mat[missing_codes_msk, :-1] = fill_value + else: + dummy_mat[missing_codes_msk] = fill_value if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] - return DataFrame(dummy_mat, index=index, columns=dummy_cols) + return DataFrame( + dummy_mat, index=index, columns=dummy_cols, dtype=np.float32) def make_axis_dummies(frame, axis='minor', transform=None): diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index ee255c1863b41..453d2621cc86f 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -243,7 +243,7 @@ def test_basic(self): 2: 0}, 'c': {0: 0, 1: 0, - 2: 1}}, dtype=np.uint8) + 2: 1}}, dtype=np.float32) assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected) assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected) @@ -262,7 +262,7 @@ def test_basic_types(self): expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, - dtype='uint8', + dtype='float32', columns=list('abc')) if not self.sparse: compare = tm.assert_frame_equal @@ -278,10 +278,11 @@ def test_basic_types(self): result = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns) tm.assert_series_equal(result.get_dtype_counts(), - Series({'uint8': 8})) + Series({'float32': 8})) result = get_dummies(s_df, sparse=self.sparse, columns=['a']) - expected = Series({'uint8': 3, 'int64': 1, 'object': 1}).sort_values() + expected = Series( + {'float32': 3, 'int64': 1, 'object': 1}).sort_values() tm.assert_series_equal(result.get_dtype_counts().sort_values(), expected) @@ -307,7 +308,7 @@ def test_include_na(self): s = ['a', 'b', np.nan] res = get_dummies(s, sparse=self.sparse) exp = DataFrame({'a': {0: 1, 1: 0, 2: 0}, - 'b': {0: 0, 1: 1, 2: 0}}, dtype=np.uint8) + 'b': {0: 0, 1: 1, 2: 0}}, 
dtype=np.float32) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 @@ -315,7 +316,7 @@ def test_include_na(self): exp_na = DataFrame({nan: {0: 0, 1: 0, 2: 1}, 'a': {0: 1, 1: 0, 2: 0}, 'b': {0: 0, 1: 1, 2: 0}}, - dtype=np.uint8) + dtype=np.float32) exp_na = exp_na.reindex_axis(['a', 'b', nan], 1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns @@ -323,9 +324,28 @@ def test_include_na(self): res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse) exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], - dtype=np.uint8) + dtype=np.float32) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) + # Add `fill_value` keyword GH15926 + def test_fill_value_na(self): + s = ['a', 'b', np.nan] + res_na = get_dummies( + s, dummy_na=True, fill_value=np.nan, sparse=self.sparse) + exp_na = DataFrame({'a': [1, 0, np.nan], + 'b': [0, 1, np.nan], + np.nan: [0, 0, 1]}, + dtype=np.float32) + exp_na = exp_na.reindex_axis(['a', 'b', np.nan], 1) + assert_frame_equal(res_na, exp_na) + + res_just_na = get_dummies( + [nan], dummy_na=True, fill_value=np.nan, sparse=self.sparse) + exp_just_na = DataFrame([[1]], + columns=[np.nan], + dtype=np.float32) + assert_frame_equal(res_just_na, exp_just_na) + def test_unicode(self ): # See GH 6885 - get_dummies chokes on unicode values import unicodedata @@ -339,7 +359,7 @@ def test_unicode(self u('letter_%s') % eacute: {0: 0, 1: 1, 2: 1}}, - dtype=np.uint8) + dtype=np.float32) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self): @@ -348,7 +368,7 @@ def test_dataframe_dummies_all_obj(self): expected = DataFrame({'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}, dtype=np.uint8) + 'B_c': [0, 0, 1]}, dtype=np.float32) assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self): @@ -360,7 +380,7 @@ def test_dataframe_dummies_mix_default(self): 'B_b': [1, 1, 0], 'B_c': [0, 0, 1]}) cols = ['A_a', 'A_b', 'B_b', 'B_c'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) @@ -376,7 +396,7 @@ def test_dataframe_dummies_prefix_list(self): 'from_B_b': [1, 1, 0], 'from_B_c': [0, 0, 1]}) cols = expected.columns[1:] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']] assert_frame_equal(result, expected) @@ -389,7 +409,7 @@ def test_dataframe_dummies_prefix_str(self): [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'], - dtype=np.uint8) + dtype=np.float32) expected = expected.astype({"C": np.int64}) assert_frame_equal(result, expected) @@ -402,7 +422,7 @@ def test_dataframe_dummies_subset(self): 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) cols = ['from_A_a', 'from_A_b'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self): @@ -415,7 +435,7 @@ def test_dataframe_dummies_prefix_sep(self): 'B..c': [0, 0, 1]}) expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] cols = expected.columns[1:] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) assert_frame_equal(result, expected) result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse) @@ 
-446,11 +466,11 @@ def test_dataframe_dummies_prefix_dict(self): 'from_B_c': [0, 0, 1], 'C': [1, 2, 3]}) cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self): - df = self.df + df = self.df.copy() df.loc[3, :] = [np.nan, np.nan, np.nan] result = get_dummies(df, dummy_na=True, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3, np.nan], @@ -461,7 +481,7 @@ def test_dataframe_dummies_with_na(self): 'B_c': [0, 0, 1, 0], 'B_nan': [0, 0, 0, 1]}) cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']] assert_frame_equal(result, expected) @@ -470,6 +490,22 @@ def test_dataframe_dummies_with_na(self): expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) + result = get_dummies( + df, dummy_na=True, fill_value=np.nan, sparse=self.sparse) + expected = DataFrame({'C': [1, 2, 3, np.nan], + 'A_a': [1, 0, 1, np.nan], + 'A_b': [0, 1, 0, np.nan], + 'A_nan': [0, 0, 0, 1], + 'B_b': [1, 1, 0, np.nan], + 'B_c': [0, 0, 1, np.nan], + 'B_nan': [0, 0, 0, 1]}, + dtype=np.float64) + cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] + expected[cols] = expected[cols].astype(np.float32) + expected = expected[ + ['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']] + assert_frame_equal(result, expected) + def test_dataframe_dummies_with_categorical(self): df = self.df df['cat'] = pd.Categorical(['x', 'y', 'y']) @@ -482,7 +518,7 @@ def test_dataframe_dummies_with_categorical(self): 'cat_x': [1, 0, 0], 'cat_y': [0, 1, 1]}) cols = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']] assert_frame_equal(result, expected) @@ -499,7 +535,7 @@ def test_basic_drop_first(self): 2: 0}, 'c': {0: 0, 1: 0, - 2: 1}}, dtype=np.uint8) + 2: 1}}, dtype=np.float32) result = get_dummies(s_list, sparse=self.sparse, drop_first=True) assert_frame_equal(result, expected) @@ -537,7 +573,7 @@ def test_basic_drop_first_NA(self): res = get_dummies(s_NA, sparse=self.sparse, drop_first=True) exp = DataFrame({'b': {0: 0, 1: 1, - 2: 0}}, dtype=np.uint8) + 2: 0}}, dtype=np.float32) assert_frame_equal(res, exp) res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, @@ -547,20 +583,30 @@ def test_basic_drop_first_NA(self): 2: 0}, nan: {0: 0, 1: 0, - 2: 1}}, dtype=np.uint8).reindex_axis( + 2: 1}}, dtype=np.float32).reindex_axis( ['b', nan], 1) assert_frame_equal(res_na, exp_na) + res_na = get_dummies(s_NA, fill_value=np.nan, sparse=self.sparse, + drop_first=True) + exp_na = DataFrame({'b': [0, 1, np.nan]}, dtype=np.float32) + assert_frame_equal(res_na, exp_na) + res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse, drop_first=True) - exp_just_na = DataFrame(index=np.arange(1)) + exp_just_na = DataFrame(index=range(1)) + assert_frame_equal(res_just_na, exp_just_na) + + res_just_na = get_dummies([nan], sparse=self.sparse, + drop_first=True) + exp_just_na = DataFrame(index=range(1)) assert_frame_equal(res_just_na, exp_just_na) def test_dataframe_dummies_drop_first(self): df = self.df[['A', 'B']] result = get_dummies(df, sparse=self.sparse, drop_first=True) expected = 
DataFrame({'A_b': [0, 1, 0], - 'B_c': [0, 0, 1]}, dtype=np.uint8) + 'B_c': [0, 0, 1]}, dtype=np.float32) assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self): @@ -572,12 +618,12 @@ def test_dataframe_dummies_drop_first_with_categorical(self): 'B_c': [0, 0, 1], 'cat_y': [0, 1, 1]}) cols = ['A_b', 'B_c', 'cat_y'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'A_b', 'B_c', 'cat_y']] assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_na(self): - df = self.df + df = self.df.copy() df.loc[3, :] = [np.nan, np.nan, np.nan] result = get_dummies(df, dummy_na=True, sparse=self.sparse, drop_first=True) @@ -587,13 +633,41 @@ def test_dataframe_dummies_drop_first_with_na(self): 'B_c': [0, 0, 1, 0], 'B_nan': [0, 0, 0, 1]}) cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(np.float32) + expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] + assert_frame_equal(result, expected) + result = get_dummies(df, dummy_na=True, fill_value=np.nan, + sparse=self.sparse, drop_first=True) + expected = DataFrame({'C': [1, 2, 3, np.nan], + 'A_b': [0, 1, 0, np.nan], + 'A_nan': [0, 0, 0, 1], + 'B_c': [0, 0, 1, np.nan], + 'B_nan': [0, 0, 0, 1]}, dtype=np.float64) + cols = ['A_b', 'B_c', 'A_nan', 'B_nan'] + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] assert_frame_equal(result, expected) result = get_dummies(df, dummy_na=False, sparse=self.sparse, drop_first=True) + expected = DataFrame({'C': [1, 2, 3, np.nan], + 'A_b': [0, 1, 0, 0], + 'B_c': [0, 0, 1, 0]}, + dtype=np.float64) + cols = ['A_b', 'B_c'] + expected[cols] = expected[cols].astype(np.float32) + expected = expected[['C', 'A_b', 'B_c']] + assert_frame_equal(result, expected) + + result = get_dummies(df, dummy_na=False, sparse=self.sparse, + drop_first=True, fill_value=np.nan) + expected = DataFrame({'C': [1, 2, 3, np.nan], + 'A_b': [0, 1, 0, np.nan], + 'B_c': [0, 0, 1, np.nan]}, + dtype=np.float64) + cols = ['A_b', 'B_c'] + expected[cols] = expected[cols].astype(np.float32) expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) @@ -601,14 +675,14 @@ def test_int_int(self): data = Series([1, 2, 1]) result = pd.get_dummies(data) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], - dtype=np.uint8) + dtype=np.float32) tm.assert_frame_equal(result, expected) data = Series(pd.Categorical(['a', 'b', 'a'])) result = pd.get_dummies(data) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(['a', 'b']), - dtype=np.uint8) + dtype=np.float32) tm.assert_frame_equal(result, expected) def test_int_df(self): @@ -625,7 +699,7 @@ def test_int_df(self): [2, 2., 0, 1, 0, 1], [1, 1., 1, 0, 1, 0] ], columns=columns) - expected[columns[2:]] = expected[columns[2:]].astype(np.uint8) + expected[columns[2:]] = expected[columns[2:]].astype(np.float32) result = pd.get_dummies(data, columns=['A', 'B']) tm.assert_frame_equal(result, expected) @@ -636,7 +710,7 @@ def test_dataframe_dummies_preserve_categorical_dtype(self): ordered=ordered) result = get_dummies(cat) - data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.uint8) + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32) cols = pd.CategoricalIndex(cat.categories, categories=cat.categories, ordered=ordered) From 0304ede1a0ab7aa97b93d58b00bd04bf0a8235e0 Mon Sep 17 00:00:00 2001 From: 
beckermr
Date: Fri, 7 Apr 2017 16:42:38 -0500
Subject: [PATCH 2/9] DOC update whatsnew

---
 doc/source/whatsnew/v0.20.0.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 41716e0e26ea0..c8780961b764d 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -383,8 +383,8 @@ Other Enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Deprecate Automatic Zero Filling of Missing Values in .get_dummies
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Deprecate Automatic Zero Filling of Missing Values in pd.get_dummies
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 The ``pd.get_dummies`` function currently fills missing values with zero by default. This behavior is in conflict with the rest of the pandas API since missing values should be filled with ``fillna`` and missing values should be propagated through pandas transformations. In the future, ``get_dummies`` will propagate missing values by default. The recommended way to reproduce the current behavior of filling with zeros with the new, upcoming API is

From 22bf6ce49e22eb09954f6aeea5791d1e952e89a1 Mon Sep 17 00:00:00 2001
From: beckermr
Date: Sat, 8 Apr 2017 08:40:39 -0500
Subject: [PATCH 3/9] ENH respond to CR

---
 doc/source/whatsnew/v0.20.0.txt | 15 ++++++++-------
 pandas/core/reshape.py          | 30 +++++++++++++++---------------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index c8780961b764d..127a041952f7e 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -372,7 +372,7 @@ Other Enhancements
 - :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels `. (:issue:`15694`)
 - ``pd.read_csv()`` will now raise a ``ParserError`` error whenever any parsing error occurs (:issue:`15913`, :issue:`15925`)
 - ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`)
-- ``pd.get_dummies()`` now accepts the ``fill_value`` keyword which specifies how to fill missing values in the dummy variables. (:issue:`15923`)
+- ``pd.get_dummies()`` now accepts the ``fill_value`` keyword which specifies how to fill NaN values in the dummy variables. (:issue:`15923`)

 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

@@ -383,16 +383,17 @@ Other Enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Deprecate Automatic Zero Filling of Missing Values in pd.get_dummies
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Deprecate Automatic Zero Filling of Missing Values in ``pd.get_dummies``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-The ``pd.get_dummies`` function currently fills missing values with zero by default. This behavior is in conflict with the rest of the pandas API since missing values should be filled with ``fillna`` and missing values should be propagated through pandas transformations. In the future, ``get_dummies`` will propagate missing values by default. The recommended way to reproduce the current behavior of filling with zeros with the new, upcoming API is
+The :func:`get_dummies` function currently fills NaN values with zero by default. This behavior is in conflict with the rest of the pandas API since NaN values should be filled with ``fillna`` or a ``fill_value`` keyword, and NaN values should be propagated through pandas transformations. In the future, :func:`get_dummies` will propagate NaN values by default. (:issue:`15923`)

-.. ipython:: python
-
-   df = pd.get_dummies(df).fillna(0)
+The recommended way to reproduce the current behavior of filling NaN values with zeros with the new, upcoming API is
+
+.. ipython:: python

-For now, the current behavior of filling with zeros by default has been kept, but not specifying a fill value with the ``fill_value`` keyword will raise a ``DeprecationWarning`` with the example above.
+   df = pd.get_dummies(df, fill_value=0)


 .. _whatsnew_0200.api_breaking.deprecate_ix:
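As a minimal sketch of the options the new keyword opens up (mirroring the docstring examples added in this series; `s` is illustrative):

    import numpy as np
    import pandas as pd

    s = pd.Series(['a', 'b', np.nan])

    pd.get_dummies(s)                     # current default: NaN row filled with zeros (warns)
    pd.get_dummies(s, fill_value=0)       # explicit zero fill, no warning
    pd.get_dummies(s, fill_value=np.nan)  # propagate the missing value
    pd.get_dummies(s, fill_value=np.nan, dummy_na=True)  # also add a NaN indicator column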
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index f2792e554daa1..b55073ea4d67f 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -1060,8 +1060,8 @@ def melt_stub(df, stub, i, j, value_vars, sep):

 def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
-                fill_value=None, columns=None, sparse=False,
-                drop_first=False):
+                columns=None, sparse=False, drop_first=False,
+                fill_value=None):
     """
     Convert categorical variable into dummy/indicator variables

@@ -1078,14 +1078,6 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
         list or dictionary as with `prefix.`
     dummy_na : bool, default False
         Add a column to indicate NaNs if True.
-    fill_value : scalar, default None
-        Value to fill NaNs with. The default of `None` will fill with
-        zeros. To do no filling of NaNs, specify `fill_value=np.nan`.
-        The default behavior of filling with zeros will be deprecated
-        in the future and using this default will not raise a
-        `DeprecationWarning`.
-
-        .. versionadded:: 0.20.0
     columns : list-like, default None
         Column names in the DataFrame to be encoded.
         If `columns` is None then all the columns with
@@ -1101,6 +1093,14 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
         first level.

         .. versionadded:: 0.18.0
+    fill_value : scalar, default None
+        Value to fill NaNs with. The default of `None` will fill with
+        zeros. To do no filling of NaNs, specify `fill_value=np.nan`.
+        The default behavior of filling with zeros will be deprecated
+        in the future and using this default will not raise a
+        `FutureWarning`.
+
+        .. versionadded:: 0.20.0
     Returns
     -------
     dummies : DataFrame or SparseDataFrame
@@ -1175,16 +1175,16 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
     from pandas.tools.concat import concat
     from itertools import cycle

-    # Deprecate filling missing values with zeros, GH15926
+    # Deprecate filling NaN values with zeros, GH15926
     # When this is finally deprecated, simply remove this block
     # of code and change the default to np.nan in the function signature
     # of `get_dummies`.
     if fill_value is None:
-        warnings.warn('The default behavior of filling missing values '
+        warnings.warn('The default behavior of filling NaN values '
                       'with zeros will be deprecated. 
Use ' - '`df = pd.get_dummies(df).fillna(0)` to reproduce ' - 'this behavior', DeprecationWarning) - fill_value = 0.0 + '`df = pd.get_dummies(df, fill_value=0)` to reproduce ' + 'this behavior', FutureWarning, stack_level=3) + fill_value = 0 if isinstance(data, DataFrame): # determine columns being encoded From 766e094737034c95dfa247e308e0a1effb11c5fa Mon Sep 17 00:00:00 2001 From: beckermr Date: Sat, 8 Apr 2017 09:33:08 -0500 Subject: [PATCH 4/9] ENH added type downcasting, still needs to be refactored --- pandas/core/reshape.py | 66 +++++++++++++++++++++++++++--------- pandas/tests/test_reshape.py | 56 +++++++++++++++--------------- 2 files changed, 78 insertions(+), 44 deletions(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index b55073ea4d67f..9d580a53c5c0a 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -11,7 +11,7 @@ from pandas.types.common import (_ensure_platform_int, is_list_like, is_bool_dtype, needs_i8_conversion) -from pandas.types.cast import maybe_promote +from pandas.types.cast import maybe_promote, infer_dtype_from_scalar from pandas.types.missing import notnull import pandas.types.concat as _concat @@ -1183,9 +1183,40 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, warnings.warn('The default behavior of filling NaN values ' 'with zeros will be deprecated. Use ' '`df = pd.get_dummies(df, fill_value=0)` to reproduce ' - 'this behavior', FutureWarning, stack_level=3) + 'this behavior', FutureWarning, 3) fill_value = 0 + # Infer the proper output dtype. + # GH15926 + try: + if np.all(np.isfinite(data.values if hasattr(data, 'values') else data)): + any_null = True + else: + any_null = False + except TypeError: + any_null = False + + if any_null: + output_dtype = np.uint8 + else: + fill_value_dtype, fill_value = infer_dtype_from_scalar(fill_value) + + if 'int' in str(fill_value_dtype): + if fill_value >= 0: + if fill_value <= np.iinfo(np.uint8).max: + output_dtype = np.uint8 + else: + output_dtype = np.uint64 + else: + if fill_value >= np.iinfo(np.int8).min and fill_value <= np.iinfo(np.int8).max: + output_dtype = np.int8 + else: + output_dtype = np.int64 + elif 'float' in str(fill_value_dtype): + output_dtype = np.float32 + else: + raise ValueError('`fill_value` must be `np.nan`, an int or a float type!') + if isinstance(data, DataFrame): # determine columns being encoded @@ -1231,18 +1262,21 @@ def check_len(item, name): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first, - fill_value=fill_value) + fill_value=fill_value, + output_dtype=output_dtype) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, sparse=sparse, drop_first=drop_first, - fill_value=fill_value) + fill_value=fill_value, + output_dtype=output_dtype) return result def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, - fill_value=np.nan, sparse=False, drop_first=False): + fill_value=np.nan, sparse=False, drop_first=False, + output_dtype=np.uint8): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) @@ -1261,11 +1295,11 @@ def get_empty_Frame(data, sparse): if len(levels) == 0 and not dummy_na: return get_empty_Frame(data, sparse) - # Record missing values before we munge the codes, GH15826 - missing_codes_msk = codes == -1 + # Record NaN values before we munge the codes, GH15826 + nan_codes_msk = codes == -1 codes = codes.copy() if dummy_na: - 
codes[missing_codes_msk] = len(levels) + codes[nan_codes_msk] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again @@ -1291,7 +1325,7 @@ def get_empty_Frame(data, sparse): N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] for ndx, code, missing in zip( - range(len(codes)), codes, missing_codes_msk): + range(len(codes)), codes, nan_codes_msk): if missing: # For missing values, we have to decide what to do. # GH15926 @@ -1327,7 +1361,7 @@ def get_empty_Frame(data, sparse): sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): - sarr = np.ones(len(ixs), dtype=np.float32) + sarr = np.ones(len(ixs), dtype=output_dtype) # NaNs are marked by a negative index. # Only need to set for sparse output if @@ -1344,30 +1378,30 @@ def get_empty_Frame(data, sparse): sarr, sparse_index=IntIndex(N, ixs), fill_value=0, - dtype=np.float32) + dtype=output_dtype) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, - dtype=np.float32) + dtype=output_dtype) return out else: dummy_mat = np.eye( - number_of_cols, dtype=np.float32).take(codes, axis=0) + number_of_cols, dtype=output_dtype).take(codes, axis=0) # user specified fill value via `fill_value` GH15926 if dummy_na: - dummy_mat[missing_codes_msk, :-1] = fill_value + dummy_mat[nan_codes_msk, :-1] = fill_value else: - dummy_mat[missing_codes_msk] = fill_value + dummy_mat[nan_codes_msk] = fill_value if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame( - dummy_mat, index=index, columns=dummy_cols, dtype=np.float32) + dummy_mat, index=index, columns=dummy_cols, dtype=output_dtype) def make_axis_dummies(frame, axis='minor', transform=None): diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 453d2621cc86f..e3a09d7065664 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -243,7 +243,7 @@ def test_basic(self): 2: 0}, 'c': {0: 0, 1: 0, - 2: 1}}, dtype=np.float32) + 2: 1}}, dtype=np.uint8) assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected) assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected) @@ -262,7 +262,7 @@ def test_basic_types(self): expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, - dtype='float32', + dtype='uint8', columns=list('abc')) if not self.sparse: compare = tm.assert_frame_equal @@ -278,11 +278,11 @@ def test_basic_types(self): result = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns) tm.assert_series_equal(result.get_dtype_counts(), - Series({'float32': 8})) + Series({'uint8': 8})) result = get_dummies(s_df, sparse=self.sparse, columns=['a']) expected = Series( - {'float32': 3, 'int64': 1, 'object': 1}).sort_values() + {'uint8': 3, 'int64': 1, 'object': 1}).sort_values() tm.assert_series_equal(result.get_dtype_counts().sort_values(), expected) @@ -308,7 +308,7 @@ def test_include_na(self): s = ['a', 'b', np.nan] res = get_dummies(s, sparse=self.sparse) exp = DataFrame({'a': {0: 1, 1: 0, 2: 0}, - 'b': {0: 0, 1: 1, 2: 0}}, dtype=np.float32) + 'b': {0: 0, 1: 1, 2: 0}}, dtype=np.uint8) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 @@ -316,7 +316,7 @@ def test_include_na(self): exp_na = DataFrame({nan: {0: 0, 1: 0, 2: 1}, 'a': {0: 1, 1: 0, 2: 0}, 'b': {0: 0, 1: 1, 2: 0}}, - dtype=np.float32) + 
dtype=np.uint8) exp_na = exp_na.reindex_axis(['a', 'b', nan], 1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns @@ -324,7 +324,7 @@ def test_include_na(self): res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse) exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], - dtype=np.float32) + dtype=np.uint8) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) # Add `fill_value` keyword GH15926 @@ -359,7 +359,7 @@ def test_unicode(self u('letter_%s') % eacute: {0: 0, 1: 1, 2: 1}}, - dtype=np.float32) + dtype=np.uint8) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self): @@ -368,7 +368,7 @@ def test_dataframe_dummies_all_obj(self): expected = DataFrame({'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}, dtype=np.float32) + 'B_c': [0, 0, 1]}, dtype=np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self): @@ -380,7 +380,7 @@ def test_dataframe_dummies_mix_default(self): 'B_b': [1, 1, 0], 'B_c': [0, 0, 1]}) cols = ['A_a', 'A_b', 'B_b', 'B_c'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) @@ -396,7 +396,7 @@ def test_dataframe_dummies_prefix_list(self): 'from_B_b': [1, 1, 0], 'from_B_c': [0, 0, 1]}) cols = expected.columns[1:] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']] assert_frame_equal(result, expected) @@ -409,7 +409,7 @@ def test_dataframe_dummies_prefix_str(self): [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'], - dtype=np.float32) + dtype=np.uint8) expected = expected.astype({"C": np.int64}) assert_frame_equal(result, expected) @@ -422,7 +422,7 @@ def test_dataframe_dummies_subset(self): 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) cols = ['from_A_a', 'from_A_b'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self): @@ -435,7 +435,7 @@ def test_dataframe_dummies_prefix_sep(self): 'B..c': [0, 0, 1]}) expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] cols = expected.columns[1:] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse) @@ -466,7 +466,7 @@ def test_dataframe_dummies_prefix_dict(self): 'from_B_c': [0, 0, 1], 'C': [1, 2, 3]}) cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self): @@ -481,7 +481,7 @@ def test_dataframe_dummies_with_na(self): 'B_c': [0, 0, 1, 0], 'B_nan': [0, 0, 0, 1]}) cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']] assert_frame_equal(result, expected) @@ -518,7 +518,7 @@ def test_dataframe_dummies_with_categorical(self): 'cat_x': [1, 0, 0], 'cat_y': [0, 1, 1]}) cols = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] - expected[cols] = expected[cols].astype(np.float32) 
+ expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']] assert_frame_equal(result, expected) @@ -535,7 +535,7 @@ def test_basic_drop_first(self): 2: 0}, 'c': {0: 0, 1: 0, - 2: 1}}, dtype=np.float32) + 2: 1}}, dtype=np.uint8) result = get_dummies(s_list, sparse=self.sparse, drop_first=True) assert_frame_equal(result, expected) @@ -573,7 +573,7 @@ def test_basic_drop_first_NA(self): res = get_dummies(s_NA, sparse=self.sparse, drop_first=True) exp = DataFrame({'b': {0: 0, 1: 1, - 2: 0}}, dtype=np.float32) + 2: 0}}, dtype=np.uint8) assert_frame_equal(res, exp) res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, @@ -583,7 +583,7 @@ def test_basic_drop_first_NA(self): 2: 0}, nan: {0: 0, 1: 0, - 2: 1}}, dtype=np.float32).reindex_axis( + 2: 1}}, dtype=np.uint8).reindex_axis( ['b', nan], 1) assert_frame_equal(res_na, exp_na) @@ -606,7 +606,7 @@ def test_dataframe_dummies_drop_first(self): df = self.df[['A', 'B']] result = get_dummies(df, sparse=self.sparse, drop_first=True) expected = DataFrame({'A_b': [0, 1, 0], - 'B_c': [0, 0, 1]}, dtype=np.float32) + 'B_c': [0, 0, 1]}, dtype=np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self): @@ -618,7 +618,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self): 'B_c': [0, 0, 1], 'cat_y': [0, 1, 1]}) cols = ['A_b', 'B_c', 'cat_y'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_b', 'B_c', 'cat_y']] assert_frame_equal(result, expected) @@ -633,7 +633,7 @@ def test_dataframe_dummies_drop_first_with_na(self): 'B_c': [0, 0, 1, 0], 'B_nan': [0, 0, 0, 1]}) cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] assert_frame_equal(result, expected) @@ -656,7 +656,7 @@ def test_dataframe_dummies_drop_first_with_na(self): 'B_c': [0, 0, 1, 0]}, dtype=np.float64) cols = ['A_b', 'B_c'] - expected[cols] = expected[cols].astype(np.float32) + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) @@ -675,14 +675,14 @@ def test_int_int(self): data = Series([1, 2, 1]) result = pd.get_dummies(data) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], - dtype=np.float32) + dtype=np.uint8) tm.assert_frame_equal(result, expected) data = Series(pd.Categorical(['a', 'b', 'a'])) result = pd.get_dummies(data) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(['a', 'b']), - dtype=np.float32) + dtype=np.uint8) tm.assert_frame_equal(result, expected) def test_int_df(self): @@ -699,7 +699,7 @@ def test_int_df(self): [2, 2., 0, 1, 0, 1], [1, 1., 1, 0, 1, 0] ], columns=columns) - expected[columns[2:]] = expected[columns[2:]].astype(np.float32) + expected[columns[2:]] = expected[columns[2:]].astype(np.uint8) result = pd.get_dummies(data, columns=['A', 'B']) tm.assert_frame_equal(result, expected) @@ -710,7 +710,7 @@ def test_dataframe_dummies_preserve_categorical_dtype(self): ordered=ordered) result = get_dummies(cat) - data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32) + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.uint8) cols = pd.CategoricalIndex(cat.categories, categories=cat.categories, ordered=ordered) From f5490f8f1ea2a70ad387d69237f4ae028c74d1c0 Mon Sep 17 00:00:00 2001 From: beckermr Date: Sat, 8 Apr 
2017 14:54:21 -0500 Subject: [PATCH 5/9] ENH reworked new get_dummies --- pandas/core/reshape.py | 112 ++++++++++++-------------------- pandas/tests/test_reshape.py | 2 +- pandas/tests/types/test_cast.py | 93 ++++++++++++++++++++++++++ pandas/types/cast.py | 55 +++++++++++++++- 4 files changed, 188 insertions(+), 74 deletions(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 9d580a53c5c0a..2871d5760d0f1 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -1188,34 +1188,20 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, # Infer the proper output dtype. # GH15926 - try: - if np.all(np.isfinite(data.values if hasattr(data, 'values') else data)): - any_null = True - else: - any_null = False - except TypeError: - any_null = False - - if any_null: - output_dtype = np.uint8 - else: - fill_value_dtype, fill_value = infer_dtype_from_scalar(fill_value) - - if 'int' in str(fill_value_dtype): - if fill_value >= 0: - if fill_value <= np.iinfo(np.uint8).max: - output_dtype = np.uint8 - else: - output_dtype = np.uint64 - else: - if fill_value >= np.iinfo(np.int8).min and fill_value <= np.iinfo(np.int8).max: - output_dtype = np.int8 - else: - output_dtype = np.int64 - elif 'float' in str(fill_value_dtype): + vals = data.values.ravel() if hasattr(data, 'values') else data + isnotfinite = [] + for v in vals: + try: + isnotfinite.append(~np.isfinite(v)) + except TypeError: + isnotfinite.append(False) + if np.any(isnotfinite): + output_dtype, fill_value = infer_dtype_from_scalar( + fill_value, downcast=True, allow_uint=True) + if output_dtype == np.float16: output_dtype = np.float32 - else: - raise ValueError('`fill_value` must be `np.nan`, an int or a float type!') + else: + output_dtype = np.uint8 if isinstance(data, DataFrame): # determine columns being encoded @@ -1297,9 +1283,10 @@ def get_empty_Frame(data, sparse): # Record NaN values before we munge the codes, GH15826 nan_codes_msk = codes == -1 + num_orig_levels = len(levels) codes = codes.copy() if dummy_na: - codes[nan_codes_msk] = len(levels) + codes[nan_codes_msk] = num_orig_levels levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again @@ -1323,57 +1310,38 @@ def get_empty_Frame(data, sparse): if sparse: sparse_series = {} N = len(data) - sp_indices = [[] for _ in range(len(dummy_cols))] - for ndx, code, missing in zip( - range(len(codes)), codes, nan_codes_msk): - if missing: - # For missing values, we have to decide what to do. - # GH15926 - - if dummy_na: - # Then we need a one in the last column. - # GH15926 - sp_indices[code].append(ndx) - - if fill_value != 0: - # Then we need to mark these locations to put back another - # fill value later. (Zero fill values will be filled by the - # sparse array implicitly). - # Use a negative index here to code NaNs. - # Offset by -1 to account for zero. - # Have to add to ALL columns, except the - # last one if dummy_na. - # GH15926 - if dummy_na: - _num_cols = len(dummy_cols) - 1 - else: - _num_cols = len(dummy_cols) - for _code in range(_num_cols): - sp_indices[_code].append(-ndx - 1) - else: - # Value is not missing so do as normal. - # GH15926 - sp_indices[code].append(ndx) + # Construct lists of inds and if the value is NaN. 
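+        # For example, with dummy_na=False, a non-zero fill_value and
+        # codes == [0, 1, -1]: column 0 collects rows [0, 2], column 1
+        # collects rows [1, 2], and sp_fill marks row 2 in each column
+        # so its provisional one is replaced by fill_value below.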
+ # GH15926 + sp_indices = [None] * len(dummy_cols) + sp_fill = [None] * len(dummy_cols) + for code in np.unique(codes[codes != -1]): + # Non-zero value in sparse array if value is of the level + # or the value is NaN and it is filled non-zero and + # and it is not the dummy column for NaNs. + # GH15926 + sp_indices[code] = sorted( + np.where((codes == code) | + ((fill_value != 0) & + (code < num_orig_levels) & + nan_codes_msk))[0].tolist()) + + # Value is filled with `fill_value` if it is NaN + # and not in dummy col and fill value is non-zero. + # GH15926 + sp_fill[code] = (nan_codes_msk[sp_indices[code]] & + (fill_value != 0) & + (code < num_orig_levels)) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] - for col, ixs in zip(dummy_cols, sp_indices): - sarr = np.ones(len(ixs), dtype=output_dtype) - - # NaNs are marked by a negative index. - # Only need to set for sparse output if - # fill_value != 0. - # Ditto for any negative indexes generated above. - # GH15926 - if fill_value != 0: - ixs = np.array(ixs) - sarr[ixs < 0] = fill_value - ixs[ixs < 0] += 1 # undo the offset - ixs = np.abs(ixs) # set index back to positive. + sp_fill = sp_fill[1:] + for col, ixs, fill in zip(dummy_cols, sp_indices, sp_fill): + sarr = np.ones(len(ixs), dtype=output_dtype) + sarr[fill] = fill_value # Fill with `fill_value`, GH15926 sarr = SparseArray( sarr, sparse_index=IntIndex(N, ixs), diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index e3a09d7065664..831edf697b2ad 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -327,8 +327,8 @@ def test_include_na(self): dtype=np.uint8) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) - # Add `fill_value` keyword GH15926 def test_fill_value_na(self): + # Add `fill_value` keyword GH15926 s = ['a', 'b', np.nan] res_na = get_dummies( s, dummy_na=True, fill_value=np.nan, sparse=self.sparse) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index de6ef7af9d7f9..713ca2747de7e 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -86,6 +86,99 @@ def test_datetime_with_timezone(self): class TestInferDtype(object): + def test_infer_dtype_from_scalar_downcast_basic(self): + # Make sure downcasting works. 
GH15926 + + for dtypec in [np.int8, np.int16, np.int32, np.int64]: + data = dtypec(12) + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=False) + assert dtype == np.int8 + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == np.uint8 + + data = dtypec(-12) + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=False) + assert dtype == np.int8 + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == np.int8 + + for dtypec in [np.uint8, np.uint16, np.uint32, np.uint64]: + data = dtypec(12) + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=False) + assert dtype == np.uint8 + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == np.uint8 + + data = 12 + dtype, val = infer_dtype_from_scalar( + data, downcast=True) + assert dtype == np.int8 + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == np.uint8 + + data = -12 + dtype, val = infer_dtype_from_scalar( + data, downcast=True) + assert dtype == np.int8 + dtype, val = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == np.int8 + + + for dtypec in [np.float16, np.float32, np.float64]: + data = dtypec(12) + dtype, val = infer_dtype_from_scalar(data, downcast=True) + assert dtype == np.float16 + + data = np.float(12) + dtype, val = infer_dtype_from_scalar(data, downcast=True) + assert dtype == np.float16 + + def test_infer_dtype_from_scalar_downcast_bounds(self): + # Make sure downcasting works at bounds. GH15926 + + for dtypec, dtypec_up in [(np.uint8, np.uint16), + (np.uint16, np.uint32), + (np.uint32, np.uint64)]: + val = dtypec(np.iinfo(dtypec).max) + + data = dtypec(val - 1) + dtype, _ = infer_dtype_from_scalar( + data, downcast=True, allow_uint=False) + assert dtype == dtypec + dtype, _ = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == dtypec + + data = dtypec_up(val + 1) + dtype, _ = infer_dtype_from_scalar( + data, downcast=True, allow_uint=False) + assert dtype == dtypec_up + dtype, _ = infer_dtype_from_scalar( + data, downcast=True, allow_uint=True) + assert dtype == dtypec_up + + for dtypec, dtypec_up in [(np.float16, np.float32), + (np.float32, np.float64)]: + data = dtypec(np.finfo(dtypec).min) + dtype, _ = infer_dtype_from_scalar(data, downcast=True) + assert dtype == dtypec_up + dtype, _ = infer_dtype_from_scalar(data, downcast=True) + assert dtype == dtypec_up + + data = dtypec(np.finfo(dtypec).max) + dtype, _ = infer_dtype_from_scalar(data, downcast=True) + assert dtype == dtypec_up + dtype, _ = infer_dtype_from_scalar(data, downcast=True) + assert dtype == dtypec_up + def test_infer_dtype_from_scalar(self): # Test that _infer_dtype_from_scalar is returning correct dtype for int # and float. diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 580ce12de3333..77c59417090b5 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -312,7 +312,10 @@ def maybe_promote(dtype, fill_value=np.nan): return dtype, fill_value -def infer_dtype_from_scalar(val, pandas_dtype=False): +def infer_dtype_from_scalar(val, + pandas_dtype=False, + downcast=False, + allow_uint=False): """ interpret the dtype from a scalar @@ -322,8 +325,52 @@ def infer_dtype_from_scalar(val, pandas_dtype=False): whether to infer dtype including pandas extension types. 
If False, scalar belongs to pandas extension types is inferred as object + downcast : bool, default False + If True, downcast float and integer types to the smallest width + type that can hold `val`. + + .. versionadded:: 0.20.0 + allow_uint : bool, default False + If True and `downcast` is True, non-negative integers will be + downcast to smallest width unsigned integer type that can hold + them. Otherwise, signed types are always downcast to signed types + and the same for unsigned types. + + .. versionadded:: 0.20.0 """ + def _downcast_dtype(dtype, val): + if 'float' in str(dtype): + if ((val > np.finfo(np.float16).min and + val < np.finfo(np.float16).max) or val is np.nan): + return np.float16 + elif (val > np.finfo(np.float32).min and + val < np.finfo(np.float32).max): + return np.float32 + else: + return np.float64 + elif 'uint' in str(dtype) or (val >= 0 and allow_uint): + if val < np.iinfo(np.uint8).max: + return np.uint8 + elif val < np.iinfo(np.uint16).max: + return np.uint16 + elif val < np.iinfo(np.uint32).max: + return np.uint32 + else: + return np.uint64 + elif 'int' in str(dtype): + if (val > np.iinfo(np.int8).min and + val < np.iinfo(np.int8).max): + return np.int8 + elif (val > np.iinfo(np.int16).min and + val < np.iinfo(np.int16).max): + return np.int16 + elif (val > np.iinfo(np.int32).min and + val < np.iinfo(np.int32).max): + return np.int32 + else: + return np.int64 + dtype = np.object_ # a 1-element ndarray @@ -335,6 +382,8 @@ def infer_dtype_from_scalar(val, pandas_dtype=False): dtype = val.dtype val = val.item() + dtype = _downcast_dtype(dtype, val) if downcast else dtype + elif isinstance(val, string_types): # If we create an empty array using a string to infer @@ -370,12 +419,16 @@ def infer_dtype_from_scalar(val, pandas_dtype=False): else: dtype = np.int64 + dtype = _downcast_dtype(dtype, val) if downcast else dtype + elif is_float(val): if isinstance(val, np.floating): dtype = type(val) else: dtype = np.float64 + dtype = _downcast_dtype(dtype, val) if downcast else dtype + elif is_complex(val): dtype = np.complex_ From 3e932a8709200264a5db85667c4f593eadccf6a2 Mon Sep 17 00:00:00 2001 From: beckermr Date: Sat, 8 Apr 2017 14:56:00 -0500 Subject: [PATCH 6/9] STY removed an extra blank line --- pandas/tests/types/test_cast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index 713ca2747de7e..f0cca3803eea8 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -131,7 +131,6 @@ def test_infer_dtype_from_scalar_downcast_basic(self): data, downcast=True, allow_uint=True) assert dtype == np.int8 - for dtypec in [np.float16, np.float32, np.float64]: data = dtypec(12) dtype, val = infer_dtype_from_scalar(data, downcast=True) From 1ac101fa33da437eab4e87524f8340ae51ab9eb8 Mon Sep 17 00:00:00 2001 From: beckermr Date: Tue, 11 Apr 2017 12:50:17 -0500 Subject: [PATCH 7/9] ENH added general routine for downcasting types, refactored in other spots --- pandas/core/reshape.py | 39 +++-- pandas/tests/types/test_cast.py | 245 +++++++++++++++++++++----------- pandas/tools/util.py | 31 +--- pandas/types/cast.py | 123 +++++++++------- 4 files changed, 262 insertions(+), 176 deletions(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 2871d5760d0f1..b9090eb98e754 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -11,7 +11,8 @@ from pandas.types.common import (_ensure_platform_int, is_list_like, is_bool_dtype, needs_i8_conversion) -from pandas.types.cast 
import maybe_promote, infer_dtype_from_scalar +from pandas.types.cast import (maybe_promote, infer_dtype_from_scalar, + maybe_downcast_itemsize) from pandas.types.missing import notnull import pandas.types.concat as _concat @@ -1077,7 +1078,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix.` dummy_na : bool, default False - Add a column to indicate NaNs if True. + If True, add an extra dummy column to indicate NaNs, otherwise + no extra column is added. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with @@ -1094,11 +1096,16 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, .. versionadded:: 0.18.0 fill_value : scalar, default None - Value to fill NaNs with. The default of `None` will fill with - zeros. To do no filling of NaNs, specify `fill_value=np.nan`. - The default behavior of filling with zeros will be deprecated - in the future and using this default will not raise a - `FutureWarning`. + Value to fill NaNs with. If no missing values are found or NaN is not + used to fill them, the returned data type will be the smallest + width type that can represent the returned values. See + pandas.types.cast.maybe_downcast_itemsize for details. If NaNs are + present and NaN is used to fill them, then the smallest floating + point type (typically `np.float32`) will be used. Currently, the + default of `None` will fill with zeros. To do no filling of NaNs, + specify `fill_value=np.nan`. The default behavior of filling with + zeros will be deprecated in the future and using this default will + now raise a `FutureWarning`. .. versionadded:: 0.20.0 Returns @@ -1196,10 +1203,20 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, except TypeError: isnotfinite.append(False) if np.any(isnotfinite): - output_dtype, fill_value = infer_dtype_from_scalar( - fill_value, downcast=True, allow_uint=True) - if output_dtype == np.float16: - output_dtype = np.float32 + output_dtype, fill_value = infer_dtype_from_scalar(fill_value) + # `maybe_downcast_itemsize` only accepts arrays, so make a one + # element array and then extract the value back out. GH15926 + if 'float' in str(output_dtype) or fill_value is np.nan: + output_dtype, fill_value = maybe_downcast_itemsize( + np.array([np.float64(fill_value)]), 'float') + elif 'int' in str(output_dtype): + if fill_value >= 0: + fill_value = np.uint64(fill_value) + else: + fill_value = np.int64(fill_value) + output_dtype, fill_value \ + = maybe_downcast_itemsize(np.array([fill_value]), 'unsigned') + fill_value = output_dtype(fill_value[0]) else: output_dtype = np.uint8 diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index f0cca3803eea8..0ffdac1e878e3 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -16,7 +16,8 @@ infer_dtype_from_array, maybe_convert_string_to_object, maybe_convert_scalar, - find_common_type) + find_common_type, + maybe_downcast_itemsize) from pandas.types.dtypes import (CategoricalDtype, DatetimeTZDtype, PeriodDtype) from pandas.util import testing as tm @@ -84,99 +85,181 @@ def test_datetime_with_timezone(self): tm.assert_index_equal(res, exp) -class TestInferDtype(object): - - def test_infer_dtype_from_scalar_downcast_basic(self): - # Make sure downcasting works. 
-
-        for dtypec in [np.int8, np.int16, np.int32, np.int64]:
-            data = dtypec(12)
-            dtype, val = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=False)
-            assert dtype == np.int8
-            dtype, val = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=True)
-            assert dtype == np.uint8
+class TestMaybeDowncastItemSize(object):

-            data = dtypec(-12)
-            dtype, val = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=False)
-            assert dtype == np.int8
-            dtype, val = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=True)
-            assert dtype == np.int8
-
-        for dtypec in [np.uint8, np.uint16, np.uint32, np.uint64]:
-            data = dtypec(12)
-            dtype, val = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=False)
-            assert dtype == np.uint8
-            dtype, val = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=True)
-            assert dtype == np.uint8
-
-        data = 12
-        dtype, val = infer_dtype_from_scalar(
-            data, downcast=True)
-        assert dtype == np.int8
-        dtype, val = infer_dtype_from_scalar(
-            data, downcast=True, allow_uint=True)
-        assert dtype == np.uint8
+    @pytest.mark.parametrize(
+        "dtypec",
+        [np.float16, np.float32, np.float64])
+    def test_maybe_downcast_itemsize_float(self, dtypec):
+        # Make sure downcasting works for floats. GH15926
+
+        data = np.array([12], dtype=dtypec)
+        dtype, val = maybe_downcast_itemsize(data, 'float')
+        if np.dtype(dtypec).itemsize >= 4:
+            assert dtype == np.float32
+        else:
+            assert dtype == dtypec

-        data = -12
-        dtype, val = infer_dtype_from_scalar(
-            data, downcast=True)
+    @pytest.mark.parametrize(
+        "data, dtypec",
+        [(12, np.int8),
+         (12, np.int16),
+         (12, np.int32),
+         (12, np.int64),
+         (12, np.uint8),
+         (12, np.uint16),
+         (12, np.uint32),
+         (12, np.uint64),
+         (-12, np.int8),
+         (-12, np.int16),
+         (-12, np.int32),
+         (-12, np.int64)])
+    def test_maybe_downcast_itemsize_int(self, data, dtypec):
+        # Make sure downcasting works for ints. GH15926
+
+        data = np.array([data], dtype=dtypec)
+        dtype, val = maybe_downcast_itemsize(
+            data, downcast='integer')
         assert dtype == np.int8
-        dtype, val = infer_dtype_from_scalar(
-            data, downcast=True, allow_uint=True)
+        dtype, val = maybe_downcast_itemsize(
+            data, downcast='signed')
         assert dtype == np.int8
+        dtype, val = maybe_downcast_itemsize(
+            data, downcast='unsigned')
+        if val >= 0:
+            assert dtype == np.uint8
+        else:
+            assert dtype == dtypec
+        dtype, val = maybe_downcast_itemsize(
+            data, downcast='float')
+        if np.dtype(dtypec).itemsize >= 4:
+            assert dtype == np.float32
+        else:
+            assert dtype == dtypec

-        for dtypec in [np.float16, np.float32, np.float64]:
-            data = dtypec(12)
-            dtype, val = infer_dtype_from_scalar(data, downcast=True)
-            assert dtype == np.float16
-
-            data = np.float(12)
-            dtype, val = infer_dtype_from_scalar(data, downcast=True)
-            assert dtype == np.float16
+    @pytest.mark.parametrize(
+        "dtypec, dtypec_up",
+        [(np.uint8, np.uint16),
+         (np.uint16, np.uint32),
+         (np.uint32, np.uint64)])
+    def test_maybe_downcast_itemsize_uint_bounds(self, dtypec, dtypec_up):
+        # Make sure downcasting works at bounds for uint. GH15926
+
+        val = np.array([np.iinfo(dtypec).max], dtype=dtypec)
+
+        data = val - 1
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'unsigned')
+        assert dtype == dtypec
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'integer')
+        assert dtype == dtypec
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'signed')
+        assert dtype == dtypec
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'float')
+        if np.dtype(dtypec).itemsize >= 4:
+            assert dtype == np.float32
+        else:
+            assert dtype == dtypec

-    def test_infer_dtype_from_scalar_downcast_bounds(self):
-        # Make sure downcasting works at bounds. GH15926
+        data = val.astype(dtypec_up) + 1
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'unsigned')
+        assert dtype == dtypec_up
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'integer')
+        assert dtype \
+            == getattr(np, str(np.dtype(dtypec_up)).replace('uint', 'int'))
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'signed')
+        assert dtype \
+            == getattr(np, str(np.dtype(dtypec_up)).replace('uint', 'int'))
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'float')
+        if np.dtype(dtypec_up).itemsize >= 4:
+            assert dtype == np.float32
+        else:
+            assert dtype == dtypec_up

-        for dtypec, dtypec_up in [(np.uint8, np.uint16),
-                                  (np.uint16, np.uint32),
-                                  (np.uint32, np.uint64)]:
-            val = dtypec(np.iinfo(dtypec).max)
+    @pytest.mark.parametrize(
+        "dtypec, dtypec_up",
+        [(np.float16, np.float32),
+         (np.float32, np.float64)])
+    def test_maybe_downcast_itemsize_float_bounds(self, dtypec, dtypec_up):
+        # Make sure downcasting works at bounds for float. GH15926
+
+        data = np.array(
+            [float(np.finfo(dtypec).min) * 2.0], dtype=dtypec_up)
+        dtype, val = maybe_downcast_itemsize(data, 'float')
+        assert dtype == dtypec_up
+
+        data = np.array(
+            [float(np.finfo(dtypec).max) * 2.0], dtype=dtypec_up)
+        dtype, _ = maybe_downcast_itemsize(data, 'float')
+        assert dtype == dtypec_up
+
+        data = np.array(
+            [float(np.finfo(dtypec).min) * 0.5], dtype=dtypec)
+        dtype, val = maybe_downcast_itemsize(data, 'float')
+        assert dtype == dtypec
+
+        data = np.array(
+            [float(np.finfo(dtypec).max) * 0.5], dtype=dtypec)
+        dtype, _ = maybe_downcast_itemsize(data, 'float')
+        assert dtype == dtypec

-            data = dtypec(val - 1)
-            dtype, _ = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=False)
-            assert dtype == dtypec
-            dtype, _ = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=True)
+    @pytest.mark.parametrize(
+        "dtypec, dtypec_up",
+        [(np.int8, np.int16),
+         (np.int16, np.int32),
+         (np.int32, np.int64)])
+    def test_maybe_downcast_itemsize_int_bounds(self, dtypec, dtypec_up):
+        # Make sure downcasting works at bounds for int. GH15926
+
+        val = np.array([np.iinfo(dtypec).max], dtype=dtypec)
+
+        data = val - 1
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'unsigned')
+        assert dtype \
+            == getattr(np, str(np.dtype(dtypec)).replace('int', 'uint'))
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'integer')
+        assert dtype == dtypec
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'signed')
+        assert dtype == dtypec
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'float')
+        if np.dtype(dtypec).itemsize >= 4:
+            assert dtype == np.float32
+        else:
             assert dtype == dtypec

-            data = dtypec_up(val + 1)
-            dtype, _ = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=False)
-            assert dtype == dtypec_up
-            dtype, _ = infer_dtype_from_scalar(
-                data, downcast=True, allow_uint=True)
+        data = val.astype(dtypec_up) + 1
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'unsigned')
+        assert dtype \
+            == getattr(np, str(np.dtype(dtypec)).replace('int', 'uint'))
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'integer')
+        assert dtype \
+            == getattr(np, str(np.dtype(dtypec_up)).replace('uint', 'int'))
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'signed')
+        assert dtype \
+            == getattr(np, str(np.dtype(dtypec_up)).replace('uint', 'int'))
+        dtype, _ = maybe_downcast_itemsize(
+            data, 'float')
+        if np.dtype(dtypec_up).itemsize >= 4:
+            assert dtype == np.float32
+        else:
             assert dtype == dtypec_up

-        for dtypec, dtypec_up in [(np.float16, np.float32),
-                                  (np.float32, np.float64)]:
-            data = dtypec(np.finfo(dtypec).min)
-            dtype, _ = infer_dtype_from_scalar(data, downcast=True)
-            assert dtype == dtypec_up
-            dtype, _ = infer_dtype_from_scalar(data, downcast=True)
-            assert dtype == dtypec_up
-            data = dtypec(np.finfo(dtypec).max)
-            dtype, _ = infer_dtype_from_scalar(data, downcast=True)
-            assert dtype == dtypec_up
-            dtype, _ = infer_dtype_from_scalar(data, downcast=True)
-            assert dtype == dtypec_up
+class TestInferDtype(object):

     def test_infer_dtype_from_scalar(self):
         # Test that _infer_dtype_from_scalar is returning correct dtype for int

diff --git a/pandas/tools/util.py b/pandas/tools/util.py
index 263d2f16a4216..4f2c6bbd23951 100644
--- a/pandas/tools/util.py
+++ b/pandas/tools/util.py
@@ -9,7 +9,7 @@
                            is_decimal,
                            is_scalar as isscalar)

-from pandas.types.cast import maybe_downcast_to_dtype
+from pandas.types.cast import maybe_downcast_itemsize
 import pandas as pd
 from pandas.compat import reduce
@@ -159,9 +159,6 @@ def to_numeric(arg, errors='raise', downcast=None):
     3   -3.0
     dtype: float64
     """
-    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
-        raise ValueError('invalid downcasting method provided')
-
     is_series = False
     is_index = False
     is_scalar = False
@@ -206,31 +203,7 @@ def to_numeric(arg, errors='raise', downcast=None):
     # attempt downcast only if the data has been successfully converted
     # to a numerical dtype and if a downcast method has been specified
     if downcast is not None and is_numeric_dtype(values):
-        typecodes = None
-
-        if downcast in ('integer', 'signed'):
-            typecodes = np.typecodes['Integer']
-        elif downcast == 'unsigned' and np.min(values) >= 0:
-            typecodes = np.typecodes['UnsignedInteger']
-        elif downcast == 'float':
-            typecodes = np.typecodes['Float']
-
-            # pandas support goes only to np.float32,
-            # as float dtypes smaller than that are
-            # extremely rare and not well supported
-            float_32_char = np.dtype(np.float32).char
-            float_32_ind = typecodes.index(float_32_char)
-            typecodes = typecodes[float_32_ind:]
-
-        if typecodes is not None:
-            # from smallest to largest
-            for dtype in typecodes:
-                if np.dtype(dtype).itemsize <= values.dtype.itemsize:
-                    values = maybe_downcast_to_dtype(values, dtype)
-
-                    # successful conversion
-                    if values.dtype == dtype:
-                        break
+        _, values = maybe_downcast_itemsize(values, downcast)

     if is_series:
         return pd.Series(values, index=arg.index, name=arg.name)

diff --git a/pandas/types/cast.py b/pandas/types/cast.py
index 77c59417090b5..b724a1711f448 100644
--- a/pandas/types/cast.py
+++ b/pandas/types/cast.py
@@ -90,7 +90,11 @@ def trans(x):  # noqa
             return result

         if issubclass(dtype.type, np.floating):
-            return result.astype(dtype)
+            if np.allclose(result, trans(result).astype(dtype)):
+                return result.astype(dtype)
+            else:
+                return result
+
         elif is_bool_dtype(dtype) or is_integer_dtype(dtype):

             # if we don't have any elements, just astype it
@@ -312,10 +316,70 @@ def maybe_promote(dtype, fill_value=np.nan):
     return dtype, fill_value


-def infer_dtype_from_scalar(val,
-                            pandas_dtype=False,
-                            downcast=False,
-                            allow_uint=False):
+def maybe_downcast_itemsize(val, downcast):
+    """maybe downcast a numeric array to a smaller itemsize
+
+    Parameters
+    ----------
+    val : ndarray
+        Numeric array that may be downcast.
+    downcast : str, one of {'integer', 'signed', 'unsigned', 'float'}
+        Downcast the data to the smallest numerical dtype
+        possible according to the following rules:
+
+        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
+        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
+        - 'float': smallest float dtype (min.: np.float32)
+
+        Downcasting will only occur if the size
+        of the data's dtype is strictly larger than
+        the dtype it is to be cast to, so if none of the dtypes
+        checked satisfy that specification, no downcasting will be
+        performed on the data.
+
+        Data with dtypes smaller than the minimums above will be
+        returned as is.
+
+        .. versionadded:: 0.20.0
+
+    Returns
+    -------
+    dtype : a numpy scalar type
+    val : the maybe-downcast array
+    """
+
+    if downcast not in ('integer', 'signed', 'unsigned', 'float'):
+        raise ValueError('invalid downcasting method provided')
+
+    typecodes = None
+
+    if downcast in ('integer', 'signed'):
+        typecodes = np.typecodes['Integer']
+    elif downcast == 'unsigned' and np.min(val) >= 0:
+        typecodes = np.typecodes['UnsignedInteger']
+    elif downcast == 'float':
+        typecodes = np.typecodes['Float']
+
+        # pandas support goes only to np.float32,
+        # as float dtypes smaller than that are
+        # extremely rare and not well supported
+        float_32_char = np.dtype(np.float32).char
+        float_32_ind = typecodes.index(float_32_char)
+        typecodes = typecodes[float_32_ind:]
+
+    if typecodes is not None:
+        # from smallest to largest
+        for dtype in typecodes:
+            if np.dtype(dtype).itemsize <= val.dtype.itemsize:
+                val = maybe_downcast_to_dtype(val, dtype)
+
+                # successful conversion
+                if val.dtype == dtype:
+                    break
+
+    return val.dtype.type, val
+
+
+def infer_dtype_from_scalar(val, pandas_dtype=False):
     """
     interpret the dtype from a scalar

@@ -325,52 +388,8 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
        whether to infer dtype including pandas extension types.
        If False, scalar belongs to pandas extension types is inferred as
        object
-    downcast : bool, default False
-        If True, downcast float and integer types to the smallest width
-        type that can hold `val`.
-
-        .. versionadded:: 0.20.0
-    allow_uint : bool, default False
-        If True and `downcast` is True, non-negative integers will be
-        downcast to the smallest width unsigned integer type that can hold
-        them. Otherwise, signed types are always downcast to signed types
-        and the same for unsigned types.
-
-        .. versionadded:: 0.20.0
     """
-    def _downcast_dtype(dtype, val):
-        if 'float' in str(dtype):
-            if ((val > np.finfo(np.float16).min and
-                 val < np.finfo(np.float16).max) or val is np.nan):
-                return np.float16
-            elif (val > np.finfo(np.float32).min and
-                  val < np.finfo(np.float32).max):
-                return np.float32
-            else:
-                return np.float64
-        elif 'uint' in str(dtype) or (val >= 0 and allow_uint):
-            if val < np.iinfo(np.uint8).max:
-                return np.uint8
-            elif val < np.iinfo(np.uint16).max:
-                return np.uint16
-            elif val < np.iinfo(np.uint32).max:
-                return np.uint32
-            else:
-                return np.uint64
-        elif 'int' in str(dtype):
-            if (val > np.iinfo(np.int8).min and
-                    val < np.iinfo(np.int8).max):
-                return np.int8
-            elif (val > np.iinfo(np.int16).min and
-                  val < np.iinfo(np.int16).max):
-                return np.int16
-            elif (val > np.iinfo(np.int32).min and
-                  val < np.iinfo(np.int32).max):
-                return np.int32
-            else:
-                return np.int64
-
     dtype = np.object_

     # a 1-element ndarray
@@ -382,8 +401,6 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
         dtype = val.dtype
         val = val.item()

-        dtype = _downcast_dtype(dtype, val) if downcast else dtype
-
     elif isinstance(val, string_types):

         # If we create an empty array using a string to infer
@@ -419,16 +436,12 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
         else:
             dtype = np.int64

-        dtype = _downcast_dtype(dtype, val) if downcast else dtype
-
     elif is_float(val):
         if isinstance(val, np.floating):
             dtype = type(val)
         else:
             dtype = np.float64

-        dtype = _downcast_dtype(dtype, val) if downcast else dtype
-
     elif is_complex(val):
         dtype = np.complex_
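Reviewer note: a usage sketch of the `maybe_downcast_itemsize` routine introduced in this patch. It assumes a pandas build with the series applied; the import path matches the diff above.

```python
import numpy as np
from pandas.types.cast import maybe_downcast_itemsize

arr = np.array([1, 2, 300], dtype=np.int64)

# 300 does not fit in int8, so the smallest signed dtype is int16.
dtype, vals = maybe_downcast_itemsize(arr, 'integer')
print(vals.dtype)  # int16

# All values are non-negative, so 'unsigned' can shrink to uint16.
dtype, vals = maybe_downcast_itemsize(arr, 'unsigned')
print(vals.dtype)  # uint16

# 'float' stops at float32; pandas does not downcast below that.
dtype, vals = maybe_downcast_itemsize(arr.astype(np.float64), 'float')
print(vals.dtype)  # float32
```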
From 3111ed7f53c03a7e6e81ebd3113eb6330bbd519e Mon Sep 17 00:00:00 2001
From: beckermr
Date: Tue, 11 Apr 2017 12:54:13 -0500
Subject: [PATCH 8/9] DOC added docs for new function

---
 doc/source/whatsnew/v0.20.0.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 127a041952f7e..4db70ef7825ea 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -373,6 +373,7 @@ Other Enhancements
 - ``pd.read_csv()`` will now raise a ``ParserError`` error whenever any parsing error occurs (:issue:`15913`, :issue:`15925`)
 - ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`)
 - ``pd.get_dummies()`` now accepts the ``fill_value`` keyword which specifies how to fill NaN values in the dummy variables. (:issue:`15923`)
+- ``pd.types.cast`` has a new function ``maybe_downcast_itemsize`` which can be used to reduce the width of numeric types. (:issue:`15923`)

 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

From f7ee8f56f7ca9fab271889081806bcb4d7c67875 Mon Sep 17 00:00:00 2001
From: beckermr
Date: Tue, 11 Apr 2017 13:53:42 -0500
Subject: [PATCH 9/9] TST updated tests for new routine

---
 pandas/tests/test_reshape.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py
index 831edf697b2ad..10ecef6fe5a48 100644
--- a/pandas/tests/test_reshape.py
+++ b/pandas/tests/test_reshape.py
@@ -335,7 +335,7 @@ def test_fill_value_na(self):
         exp_na = DataFrame({'a': [1, 0, np.nan],
                             'b': [0, 1, np.nan],
                             np.nan: [0, 0, 1]},
-                           dtype=np.float32)
+                           dtype=np.float64)
         exp_na = exp_na.reindex_axis(['a', 'b', np.nan], 1)
         assert_frame_equal(res_na, exp_na)

         res_just_na = get_dummies(
                                  [nan], dummy_na=True, fill_value=np.nan,
                                  sparse=self.sparse)
         exp_just_na = DataFrame([[1]], columns=[np.nan],
-                                dtype=np.float32)
+                                dtype=np.float64)
         assert_frame_equal(res_just_na, exp_just_na)

     def test_unicode(self):
@@ -501,7 +501,7 @@ def test_dataframe_dummies_with_na(self):
                               'B_nan': [0, 0, 0, 1]}, dtype=np.float64)
         cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']
-        expected[cols] = expected[cols].astype(np.float32)
+        expected[cols] = expected[cols].astype(np.float64)
         expected = expected[
             ['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']]
         assert_frame_equal(result, expected)
@@ -589,7 +589,7 @@ def test_basic_drop_first_NA(self):
         res_na = get_dummies(s_NA, fill_value=np.nan, sparse=self.sparse,
                              drop_first=True)
-        exp_na = DataFrame({'b': [0, 1, np.nan]}, dtype=np.float32)
+        exp_na = DataFrame({'b': [0, 1, np.nan]}, dtype=np.float64)
         assert_frame_equal(res_na, exp_na)

         res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse,
@@ -645,7 +645,7 @@ def test_dataframe_dummies_drop_first_with_na(self):
                               'B_c': [0, 0, 1, np.nan],
                               'B_nan': [0, 0, 0, 1]}, dtype=np.float64)
         cols = ['A_b', 'B_c', 'A_nan', 'B_nan']
-        expected[cols] = expected[cols].astype(np.float32)
+        expected[cols] = expected[cols].astype(np.float64)
         expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']]
         assert_frame_equal(result, expected)

@@ -667,7 +667,7 @@ def test_dataframe_dummies_drop_first_with_na(self):
                               'B_c': [0, 0, 1, np.nan]}, dtype=np.float64)
         cols = ['A_b', 'B_c']
-        expected[cols] = expected[cols].astype(np.float32)
+        expected[cols] = expected[cols].astype(np.float64)
         expected = expected[['C', 'A_b', 'B_c']]
         assert_frame_equal(result, expected)
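Reviewer note: the end-to-end behavior of the finished series, mirrored from the updated tests above (assumes a pandas build with all nine patches applied):

```python
import numpy as np
import pandas as pd

s = pd.Series(['a', 'b', np.nan])

# Propagate missing values instead of zero-filling them; as in
# test_fill_value_na the resulting dummy columns are float64.
res_na = pd.get_dummies(s, dummy_na=True, fill_value=np.nan)

# Explicit zero filling keeps the compact uint8 dummies and, per the
# deprecation plan, avoids the FutureWarning raised when fill_value
# is left at its default.
res_zero = pd.get_dummies(s, fill_value=0)
```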