-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH add fill_value feature to pd.get_dummies #15926
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
78c0af5
0304ede
22bf6ce
766e094
f5490f8
3e932a8
1ac101f
3111ed7
f7ee8f5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -372,6 +372,8 @@ Other Enhancements | |
- :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels <advanced.shown_levels>`. (:issue:`15694`) | ||
- ``pd.read_csv()`` will now raise a ``ParserError`` error whenever any parsing error occurs (:issue:`15913`, :issue:`15925`) | ||
- ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`) | ||
- ``pd.get_dummies()`` now accepts the ``fill_value`` keyword which specifies how to fill NaN values in the dummy variables. (:issue:`15923`) | ||
- ``pd.types.cast`` has a new function ``maybe_downcast_itemsize`` which can be used to reduce the width of numeric types. (:issue:`15923`) | ||
|
||
|
||
.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations | ||
|
@@ -382,6 +384,19 @@ Other Enhancements | |
Backwards incompatible API changes | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. needs a ref |
||
Deprecate Automatic Zero Filling of Missing Values in ``pd.get_dummies`` | ||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
|
||
The :func:`get_dummies` function currently fills NaN values with zero by default. This behavior is in conflict with the rest of the pandas API since NaN values should be filled with ``fillna`` or a ``fill_value`` keyword, and NaN values should be propagated through pandas transformations. In the future, :func:`get_dummies` will propagate NaN values by default. (:issue:`15923`) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. double back-ticks around |
||
|
||
|
||
The recommended way to reproduce the current behavior of filling NaN values with zeros under the new, upcoming API is | ||
|
||
.. ipython:: python | ||
|
||
df = pd.get_dummies(df, fill_value=0) | ||
|
||
|
||
.. _whatsnew_0200.api_breaking.deprecate_ix: | ||
|
||
Deprecate .ix | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,13 +4,15 @@ | |
from pandas import compat | ||
import itertools | ||
import re | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
from pandas.types.common import (_ensure_platform_int, | ||
is_list_like, is_bool_dtype, | ||
needs_i8_conversion) | ||
from pandas.types.cast import maybe_promote | ||
from pandas.types.cast import (maybe_promote, infer_dtype_from_scalar, | ||
maybe_downcast_itemsize) | ||
from pandas.types.missing import notnull | ||
import pandas.types.concat as _concat | ||
|
||
|
@@ -1059,7 +1061,8 @@ def melt_stub(df, stub, i, j, value_vars, sep): | |
|
||
|
||
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, | ||
columns=None, sparse=False, drop_first=False): | ||
columns=None, sparse=False, drop_first=False, | ||
fill_value=None): | ||
""" | ||
Convert categorical variable into dummy/indicator variables | ||
|
||
|
@@ -1075,7 +1078,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, | |
If appending prefix, separator/delimiter to use. Or pass a | ||
list or dictionary as with `prefix.` | ||
dummy_na : bool, default False | ||
Add a column to indicate NaNs, if False NaNs are ignored. | ||
If True, add an extra dummy column to indicate NaNs, otherwise | ||
no extra column is added. | ||
columns : list-like, default None | ||
Column names in the DataFrame to be encoded. | ||
If `columns` is None then all the columns with | ||
|
@@ -1091,6 +1095,19 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, | |
first level. | ||
|
||
.. versionadded:: 0.18.0 | ||
fill_value : scalar, default None | ||
Value to fill NaNs with. If no missing values are found or NaN is not | ||
used to fill them, the returned data type will be the smallest | ||
width type that can represent the returned values. See | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't list internal functions. |
||
pandas.types.cast.maybe_downcast_itemsize for details. If NaNs are | ||
present and NaN is used to fill them, then the smallest floating | ||
point type (typically `np.float32`) will be used. Currently, the | ||
default of `None` will fill with zeros. To do no filling of NaNs, | ||
specify `fill_value=np.nan`. The default behavior of filling with | ||
zeros will be deprecated in the future and using this default will | ||
now raise a `FutureWarning`. | ||
|
||
.. versionadded:: 0.20.0 | ||
Returns | ||
------- | ||
dummies : DataFrame or SparseDataFrame | ||
|
@@ -1121,6 +1138,18 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, | |
1 0 1 0 | ||
2 0 0 1 | ||
|
||
>>> pd.get_dummies(s1, fill_value=np.nan) | ||
a b | ||
0 1 0 | ||
1 0 1 | ||
2 NaN NaN | ||
|
||
>>> pd.get_dummies(s1, fill_value=np.nan, dummy_na=True) | ||
a b NaN | ||
0 1 0 0 | ||
1 0 1 0 | ||
2 NaN NaN 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this the output we want? In case of a dummy_na column, I would expect the row to contain 0/1's, and not NaN/1 ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes this is the output. If you want zeros, supply fill_value=0. The point here is that a NaN means we do not know what label it is. Putting a zero in the missing spots would indicate that the example is NOT one of those labels. We of course do not know what label the example is since the label is missing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is also the added benefit of making the handling of missing values and filling them consistent across pandas so it presents a consistent API to users. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, but you ask for a dummy column, which means: treat NaNs as a separate value, and thus make a separate column for it.
Yes, you can do that, but I think There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Agreed. I'm struggling to think of a reason you would ever want There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just because you can't think of a reason doesn't mean it doesn't exist. People could fill in a prior for missing values and still want to mark examples which are missing. The point here is to be principled and consistent in terms of the pandas API while also not making assumptions about other people's data. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The consistency here is not a good argument in my opinion. With the default (which we are changing here), the NaNs will be preserved, consistent with how this is typical dealt with in pandas. When using the dummy_na column, this means that you are treating NaN no longer as a 'missing value' (in the sense of 'I don't know which one of the other values it is'), but as one of the valid values. The consequence is IMO that the other columns should be 0.
I don't understand this one.
Agreed. We could document that the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I explained an example of this in an earlier thread. Suppose you know your are missing labels at random and you have a statistically representative data set. Then filling in missing labels with the fraction of each label from the data you have is a good assumption. You might in this case still want to mark which examples had missing labels too. The point I have made many times is that the fewer assumptions pandas makes about the intentions of users and their data the better. The pandas user base is broad and covers many academic displines from the social sciences to astronomy plus a broad range of tech and industry domains. Assumptions built into pandas from one domain may not make sense for the others. The goal should be to provide general configurable tooling to work with data based on a consistent API with idiomatic syntax. Not filling missing values with zeros by default and having people supply a fill_value keyword meets these criterion |
||
|
||
>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], | ||
'C': [1, 2, 3]}) | ||
|
||
|
@@ -1153,6 +1182,44 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, | |
from pandas.tools.concat import concat | ||
from itertools import cycle | ||
|
||
# Deprecate filling NaN values with zeros, GH15926 | ||
# When this is finally deprecated, simply remove this block | ||
# of code and change the default to np.nan in the function signature | ||
# of `get_dummies`. | ||
if fill_value is None: | ||
warnings.warn('The default behavior of filling NaN values ' | ||
'with zeros will be deprecated. Use ' | ||
'`df = pd.get_dummies(df, fill_value=0)` to reproduce ' | ||
'this behavior', FutureWarning, 3) | ||
fill_value = 0 | ||
|
||
# Infer the proper output dtype. | ||
# GH15926 | ||
vals = data.values.ravel() if hasattr(data, 'values') else data | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
What do you mean here? |
||
isnotfinite = [] | ||
for v in vals: | ||
try: | ||
isnotfinite.append(~np.isfinite(v)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is pretty complicated logic and needs to be refactored |
||
except TypeError: | ||
isnotfinite.append(False) | ||
if np.any(isnotfinite): | ||
output_dtype, fill_value = infer_dtype_from_scalar(fill_value) | ||
# `maybe_downcast_itemsize` only accepts arrays, so make a one | ||
# element array and then extract the value back out. GH15926 | ||
if 'float' in str(output_dtype) or fill_value is np.nan: | ||
output_dtype, fill_value = maybe_downcast_itemsize( | ||
np.array([np.float64(fill_value)]), 'float') | ||
elif 'int' in str(output_dtype): | ||
if fill_value >= 0: | ||
fill_value = np.uint64(fill_value) | ||
else: | ||
fill_value = np.int64(fill_value) | ||
output_dtype, fill_value \ | ||
= maybe_downcast_itemsize(np.array([fill_value]), 'unsigned') | ||
fill_value = output_dtype(fill_value[0]) | ||
else: | ||
output_dtype = np.uint8 | ||
|
||
if isinstance(data, DataFrame): | ||
# determine columns being encoded | ||
|
||
|
@@ -1197,17 +1264,22 @@ def check_len(item, name): | |
|
||
dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, | ||
dummy_na=dummy_na, sparse=sparse, | ||
drop_first=drop_first) | ||
drop_first=drop_first, | ||
fill_value=fill_value, | ||
output_dtype=output_dtype) | ||
with_dummies.append(dummy) | ||
result = concat(with_dummies, axis=1) | ||
else: | ||
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, | ||
sparse=sparse, drop_first=drop_first) | ||
sparse=sparse, drop_first=drop_first, | ||
fill_value=fill_value, | ||
output_dtype=output_dtype) | ||
return result | ||
|
||
|
||
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, | ||
sparse=False, drop_first=False): | ||
fill_value=np.nan, sparse=False, drop_first=False, | ||
output_dtype=np.uint8): | ||
# Series avoids inconsistent NaN handling | ||
codes, levels = _factorize_from_iterable(Series(data)) | ||
|
||
|
@@ -1221,17 +1293,23 @@ def get_empty_Frame(data, sparse): | |
else: | ||
return SparseDataFrame(index=index, default_fill_value=0) | ||
|
||
# if all NaN | ||
if not dummy_na and len(levels) == 0: | ||
# If we get all NaN and are not making a dummy col, then just return. | ||
# GH15926 | ||
if len(levels) == 0 and not dummy_na: | ||
return get_empty_Frame(data, sparse) | ||
|
||
# Record NaN values before we munge the codes, GH15926 | ||
nan_codes_msk = codes == -1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or can call this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. though in-line this is pretty reasonable I don't see a reason to create a variable here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For future notes, you have to cache the nan-values since anything with |
||
num_orig_levels = len(levels) | ||
codes = codes.copy() | ||
if dummy_na: | ||
codes[codes == -1] = len(levels) | ||
codes[nan_codes_msk] = num_orig_levels | ||
levels = np.append(levels, np.nan) | ||
|
||
# if dummy_na, we just fake a nan level. drop_first will drop it again | ||
if drop_first and len(levels) == 1: | ||
# test for length of levels was changed to `<=` from `==` to cover | ||
# all NaN inputs, GH15926 | ||
if drop_first and len(levels) <= 1: | ||
return get_empty_Frame(data, sparse) | ||
|
||
number_of_cols = len(levels) | ||
|
@@ -1249,41 +1327,66 @@ def get_empty_Frame(data, sparse): | |
if sparse: | ||
sparse_series = {} | ||
N = len(data) | ||
sp_indices = [[] for _ in range(len(dummy_cols))] | ||
for ndx, code in enumerate(codes): | ||
if code == -1: | ||
# Blank entries if not dummy_na and code == -1, #GH4446 | ||
continue | ||
sp_indices[code].append(ndx) | ||
# Construct lists of inds and if the value is NaN. | ||
# GH15926 | ||
sp_indices = [None] * len(dummy_cols) | ||
sp_fill = [None] * len(dummy_cols) | ||
for code in np.unique(codes[codes != -1]): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use |
||
# Non-zero value in sparse array if value is of the level | ||
# or the value is NaN and it is filled non-zero and | ||
# and it is not the dummy column for NaNs. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this still feels pretty inefficient, but maybe not an issue in practice. |
||
# GH15926 | ||
sp_indices[code] = sorted( | ||
np.where((codes == code) | | ||
((fill_value != 0) & | ||
(code < num_orig_levels) & | ||
nan_codes_msk))[0].tolist()) | ||
|
||
# Value is filled with `fill_value` if it is NaN | ||
# and not in dummy col and fill value is non-zero. | ||
# GH15926 | ||
sp_fill[code] = (nan_codes_msk[sp_indices[code]] & | ||
(fill_value != 0) & | ||
(code < num_orig_levels)) | ||
|
||
if drop_first: | ||
# remove first categorical level to avoid perfect collinearity | ||
# GH12042 | ||
sp_indices = sp_indices[1:] | ||
dummy_cols = dummy_cols[1:] | ||
for col, ixs in zip(dummy_cols, sp_indices): | ||
sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8), | ||
sparse_index=IntIndex(N, ixs), fill_value=0, | ||
dtype=np.uint8) | ||
sp_fill = sp_fill[1:] | ||
|
||
for col, ixs, fill in zip(dummy_cols, sp_indices, sp_fill): | ||
sarr = np.ones(len(ixs), dtype=output_dtype) | ||
sarr[fill] = fill_value # Fill with `fill_value`, GH15926 | ||
sarr = SparseArray( | ||
sarr, | ||
sparse_index=IntIndex(N, ixs), | ||
fill_value=0, | ||
dtype=output_dtype) | ||
sparse_series[col] = SparseSeries(data=sarr, index=index) | ||
|
||
out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, | ||
default_fill_value=0, | ||
dtype=np.uint8) | ||
dtype=output_dtype) | ||
return out | ||
|
||
else: | ||
dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0) | ||
dummy_mat = np.eye( | ||
number_of_cols, dtype=output_dtype).take(codes, axis=0) | ||
|
||
if not dummy_na: | ||
# reset NaN GH4446 | ||
dummy_mat[codes == -1] = 0 | ||
# user specified fill value via `fill_value` GH15926 | ||
if dummy_na: | ||
dummy_mat[nan_codes_msk, :-1] = fill_value | ||
else: | ||
dummy_mat[nan_codes_msk] = fill_value | ||
|
||
if drop_first: | ||
# remove first GH12042 | ||
dummy_mat = dummy_mat[:, 1:] | ||
dummy_cols = dummy_cols[1:] | ||
return DataFrame(dummy_mat, index=index, columns=dummy_cols) | ||
return DataFrame( | ||
dummy_mat, index=index, columns=dummy_cols, dtype=output_dtype) | ||
|
||
|
||
def make_axis_dummies(frame, axis='minor', transform=None): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
2nd entry is not necessary, this is internal.