Skip to content

BUG: numeric_only with axis=1 in DataFrame.corrwith and DataFrameGroupBy.cummin/max #47724

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10550,7 +10550,8 @@ def corrwith(
else:
return this.apply(lambda x: other.corr(x, method=method), axis=axis)

other = other._get_numeric_data()
if numeric_only_bool:
other = other._get_numeric_data()
left, right = this.align(other, join="inner", copy=False)

if axis == 1:
Expand All @@ -10563,11 +10564,15 @@ def corrwith(
right = right + left * 0

# demeaned data
ldem = left - left.mean()
rdem = right - right.mean()
ldem = left - left.mean(numeric_only=numeric_only_bool)
rdem = right - right.mean(numeric_only=numeric_only_bool)

num = (ldem * rdem).sum()
dom = (left.count() - 1) * left.std() * right.std()
dom = (
(left.count() - 1)
* left.std(numeric_only=numeric_only_bool)
* right.std(numeric_only=numeric_only_bool)
)

correl = num / dom

Expand Down
12 changes: 10 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3630,7 +3630,11 @@ def cummin(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT:
skipna = kwargs.get("skipna", True)
if axis != 0:
f = lambda x: np.minimum.accumulate(x, axis)
return self._python_apply_general(f, self._selected_obj, is_transform=True)
numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis)
obj = self._selected_obj
if numeric_only_bool:
obj = obj._get_numeric_data()
return self._python_apply_general(f, obj, is_transform=True)

return self._cython_transform(
"cummin", numeric_only=numeric_only, skipna=skipna
Expand All @@ -3650,7 +3654,11 @@ def cummax(self, axis=0, numeric_only=False, **kwargs) -> NDFrameT:
skipna = kwargs.get("skipna", True)
if axis != 0:
f = lambda x: np.maximum.accumulate(x, axis)
return self._python_apply_general(f, self._selected_obj, is_transform=True)
numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis)
obj = self._selected_obj
if numeric_only_bool:
obj = obj._get_numeric_data()
return self._python_apply_general(f, obj, is_transform=True)

return self._cython_transform(
"cummax", numeric_only=numeric_only, skipna=skipna
Expand Down
80 changes: 78 additions & 2 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,81 @@ def test_idxmin_idxmax_axis1():
gb2.idxmax(axis=1)


@pytest.mark.parametrize("numeric_only", [True, False, None])
def test_axis1_numeric_only(request, groupby_func, numeric_only):
    # Call the groupby method named by ``groupby_func`` with axis=1 on a frame
    # that has four float columns plus one object column, and check either the
    # raised TypeError or the result against the transposed DataFrame method.
    if groupby_func in ("idxmax", "idxmin"):
        pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1")
    if groupby_func in ("mad", "tshift"):
        pytest.skip("mad and tshift are deprecated")
    if groupby_func in ("corrwith", "skew"):
        xfail_reason = (
            "GH#47723 groupby.corrwith and skew do not correctly implement axis=1"
        )
        request.node.add_marker(pytest.mark.xfail(reason=xfail_reason))

    frame = DataFrame(np.random.randn(10, 4), columns=list("ABCD"))
    frame["E"] = "x"
    grouped = frame.groupby([1, 2, 3, 1, 2, 3, 1, 2, 3, 4])
    op = getattr(grouped, groupby_func)

    # fillna is the only method that is handed a positional argument here
    pos_args = ()
    if groupby_func == "fillna":
        pos_args = (0,)

    # when numeric_only is None the keyword is not passed at all
    call_kwargs = {"axis": 1}
    if numeric_only is not None:
        call_kwargs["numeric_only"] = numeric_only

    # Methods expected to reject the numeric_only keyword
    lacks_numeric_only = (
        "cumprod",
        "cumsum",
        "diff",
        "fillna",
        "pct_change",
        "rank",
        "shift",
    )
    # Methods expected to accept the axis keyword
    accepts_axis = (
        "cumprod",
        "cumsum",
        "diff",
        "pct_change",
        "rank",
        "shift",
        "cummax",
        "cummin",
        "idxmin",
        "idxmax",
        "fillna",
    )

    # Case 1: numeric_only was passed to a method that does not take it
    if numeric_only is not None and groupby_func in lacks_numeric_only:
        expected_msg = "got an unexpected keyword argument 'numeric_only'"
        with pytest.raises(TypeError, match=expected_msg):
            op(*pos_args, **call_kwargs)
        return

    # Case 2: the method does not take axis at all
    if groupby_func not in accepts_axis:
        expected_msg = "got an unexpected keyword argument 'axis'"
        if groupby_func == "skew" and not numeric_only:
            expected_warning = FutureWarning
        else:
            expected_warning = None
        with tm.assert_produces_warning(
            expected_warning, match="Dropping of nuisance columns"
        ), pytest.raises(TypeError, match=expected_msg):
            op(*pos_args, **call_kwargs)
        return

    # Case 3: the object column is kept (numeric_only falsy or omitted), so
    # the operation is expected to raise.
    # fillna and shift are successful even on object dtypes
    if not numeric_only and groupby_func not in ("fillna", "shift"):
        failure_patterns = (
            # cummax, cummin, rank
            "not supported between instances of",
            # cumprod
            "can't multiply sequence by non-int of type 'float'",
            # cumsum, diff, pct_change
            "unsupported operand type",
        )
        combined_pattern = "(" + "|".join(failure_patterns) + ")"
        with pytest.raises(TypeError, match=combined_pattern):
            op(*pos_args, **call_kwargs)
        return

    # Case 4: the call succeeds; compare against the same method applied to
    # the transposed frame and transposed back.
    result = op(*pos_args, **call_kwargs)

    if numeric_only:
        transposed = frame.drop(columns="E").T
    else:
        transposed = frame.T
    expected = getattr(transposed, groupby_func)(*pos_args).T
    if groupby_func == "shift" and not numeric_only:
        # shift with axis=1 leaves the leftmost column as numeric
        # but transposing for expected gives us object dtype
        expected = expected.astype(float)

    tm.assert_equal(result, expected)


def test_groupby_cumprod():
# GH 4095
df = DataFrame({"key": ["b"] * 10, "value": 2})
Expand Down Expand Up @@ -1321,7 +1396,7 @@ def test_deprecate_numeric_only(
assert "b" not in result.columns
elif (
# kernels that work on any dtype and have numeric_only arg
kernel in ("first", "last", "corrwith")
kernel in ("first", "last")
or (
# kernels that work on any dtype and don't have numeric_only arg
kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique")
Expand All @@ -1339,7 +1414,8 @@ def test_deprecate_numeric_only(
"(not allowed for this dtype"
"|must be a string or a number"
"|cannot be performed against 'object' dtypes"
"|must be a string or a real number)"
"|must be a string or a real number"
"|unsupported operand type)"
)
with pytest.raises(TypeError, match=msg):
method(*args, **kwargs)
Expand Down