Skip to content

ENH:column-wise DataFrame.fillna and duplicated DataFrame.fillna with Series and Dict #30922

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ For example:
Other enhancements
^^^^^^^^^^^^^^^^^^
- :class:`Index` with object dtype supports division and multiplication (:issue:`34160`)
-
- :meth:`DataFrame.fillna` can fill NA values column-wise with a dictionary or :class:`Series` (:issue:`4514`)
-


Expand Down
29 changes: 17 additions & 12 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6134,20 +6134,25 @@ def fillna(
)

elif isinstance(value, (dict, ABCSeries)):
temp_data = self if inplace else self.copy()

if axis == 1:
raise NotImplementedError(
"Currently only can fill "
"with dict/Series column "
"by column"
)
for i, item in enumerate(temp_data.items()):
label, content = item
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this doesn't make sense with the axis here; you are updating the same column whether axis==0 or 1

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback
Yes, but the filled value differs depending on whether axis==0 or 1. Also, 'downcast' works properly when the fill is executed column by column.

temp_data.iloc[:, i] = content.fillna(
value, limit=limit, inplace=False, downcast=downcast
)
else:
for i, item in enumerate(temp_data.items()):
label, content = item
if label not in value:
continue
temp_data.iloc[:, i] = content.fillna(
value[label], limit=limit, inplace=False, downcast=downcast
)

result = self if inplace else self.copy()
for k, v in value.items():
if k not in result:
continue
obj = result[k]
obj.fillna(v, limit=limit, inplace=True, downcast=downcast)
return result if not inplace else None
temp_data = temp_data.infer_objects()
new_data = temp_data._mgr

elif not is_list_like(value):
new_data = self._mgr.fillna(
Expand Down
95 changes: 91 additions & 4 deletions pandas/tests/frame/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,10 +618,6 @@ def test_fillna_dict_series(self):
expected = df.fillna(df.max().to_dict())
tm.assert_frame_equal(result, expected)

# disable this for now
with pytest.raises(NotImplementedError, match="column by column"):
df.fillna(df.max(1), axis=1)

def test_fillna_dataframe(self):
# GH 8377
df = DataFrame(
Expand Down Expand Up @@ -710,3 +706,94 @@ def test_fill_corner(self, float_frame, float_string_frame):

# TODO(wesm): unused?
result = empty_float.fillna(value=0) # noqa

@pytest.mark.parametrize(
    "expected,fill_value",
    [
        # Series fill value: labels 0, 1, 2 line up with every row of ``df``.
        (
            DataFrame(
                [[100, 100], [200, 4], [5, 6]], columns=list("AB"), dtype="float64"
            ),
            Series([100, 200, 300]),
        ),
        # dict fill value: label 1 is absent, so the NaN in row 1 is expected
        # to survive; label 3 matches no row.
        (
            DataFrame(
                [[100, 100], [np.nan, 4], [5, 6]],
                columns=list("AB"),
                dtype="float64",
            ),
            {0: 100, 2: 300, 3: 400},
        ),
    ],
)
def test_fillna_column_wise(self, expected, fill_value):
    """Check ``DataFrame.fillna(value, axis=1)`` with a Series or dict value.

    Each NaN is expected to be replaced by the fill entry whose key equals the
    row's index label; rows without a matching key keep their NaN.
    """
    # GH 4514
    df = DataFrame([[np.nan, np.nan], [np.nan, 4], [5, 6]], columns=list("AB"))
    result = df.fillna(fill_value, axis=1)
    # Computed value first, reference second, so that a failure report labels
    # "left"/"right" the same way as the other fillna tests in this module.
    tm.assert_frame_equal(result, expected)

def test_fillna_column_wise_downcast(self):
    """Check that ``downcast="infer"`` is honoured when filling with ``axis=1``.

    Every NaN in the frame has a matching label in the fill Series, so after
    filling the expected frame holds only whole numbers and is int64.
    """
    # GH 4514
    df = DataFrame([[np.nan, 2], [3, np.nan], [np.nan, np.nan]], columns=list("AB"))
    s = Series([100, 200, 300])

    expected = DataFrame(
        [[100, 2], [3, 200], [300, 300]], columns=list("AB"), dtype="int64"
    )
    result = df.fillna(s, axis=1, downcast="infer")
    # Computed value first, reference second (pandas test convention).
    tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
    "fill_value", [Series([100, 200, 300]), {0: 100, 2: 300, 3: 400}]
)
def test_fillna_column_wise_inplace(self, fill_value):
    """Check that ``fillna(..., axis=1, inplace=True)`` matches the copying call."""
    # GH 4514
    df = DataFrame([[np.nan, np.nan], [np.nan, 4], [5, 6]], columns=list("AB"))
    # Build the reference from the non-mutating call *before* mutating ``df``.
    expected = df.fillna(fill_value, axis=1, inplace=False)
    df.fillna(fill_value, axis=1, inplace=True)
    # The mutated frame is the value under test, so it goes first.
    tm.assert_frame_equal(df, expected)

@pytest.mark.parametrize(
"fill_value",
[Series([100, 200, 300], index=[0, 1, 2]), {0: 100, 1: 200, 2: 300}],
)
def test_fillna_column_wise_duplicated_with_series_dict(self, fill_value):
# GH 4514
df = DataFrame(
[[np.nan, np.nan, 3], [np.nan, 5, np.nan], [7, np.nan, np.nan]],
columns=list("ABB"),
index=[0, 0, 1],
)
expected = DataFrame(
[[100, 100, 3], [100, 5, 100], [7, 200, 200]],
columns=list("ABB"),
index=[0, 0, 1],
dtype="float64",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be integers, no?

Copy link
Contributor Author

@proost proost Feb 21, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WillAyd
There may be disagreement, but what I intended is to keep the data type the same as before. The columns in df are float because of "np.nan", so every column in df is already float. If "fillna" changed the data type, it would mean that "fillna" both fills NA values and changes the data type. I don't think that is right; if users want to change data types, that is their responsibility.
Nevertheless, if you think this must be integers, I will change it.
That is what the "downcast" parameter is for.

)

result = df.fillna(fill_value, axis=1)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
    "fill_value",
    [
        Series([100, 200, 300], index=list("ABC")),
        {"A": 100, "B": 200, "C": 300},
    ],
)
def test_fillna_duplicated_with_series_dict(self, fill_value):
    """Fill by column label when both column and index labels are duplicated."""
    # GH 4514
    # Both frames share the same duplicated labels.
    axes = {"columns": ["A", "B", "B"], "index": [0, 0, 1]}
    nan = np.nan

    frame = DataFrame([[nan, nan, 3], [nan, 5, nan], [7, nan, nan]], **axes)
    filled = frame.fillna(fill_value)

    # Column "A" is filled with 100 and both "B" columns with 200; the "C"
    # entry of the fill value matches no column.
    expected = DataFrame(
        [[100, 200, 3], [100, 5, 200], [7, 200, 200]], dtype="float64", **axes
    )
    tm.assert_frame_equal(filled, expected)