Skip to content

BUG: DataFrame.groupby(., dropna=True, axis=0) incorrectly throws ShapeError [RESUBMIT] #37685

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ Missing
^^^^^^^

- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`)
-
- Bug in :class:`Grouper` now correctly propagates ``dropna`` argument and :meth:`DataFrameGroupBy.transform` now correctly handles missing values for ``dropna=True`` (:issue:`35612`)

MultiIndex
^^^^^^^^^^
Expand Down
1 change: 0 additions & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,6 @@ def _transform_general(self, func, *args, **kwargs):
result = maybe_downcast_numeric(result, self._selected_obj.dtype)

result.name = self._selected_obj.name
result.index = self._selected_obj.index
return result

def _transform_fast(self, result) -> Series:
Expand Down
24 changes: 19 additions & 5 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,12 +718,26 @@ def _set_result_index_ordered(

# the values/counts are repeated according to the group index
# shortcut if we have an already ordered grouper
if not self.grouper.is_monotonic:
index = Index(np.concatenate(self._get_indices(self.grouper.result_index)))
result.set_axis(index, axis=self.axis, inplace=True)
result = result.sort_index(axis=self.axis)
if self.grouper.is_monotonic:
result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
return result

# row order is scrambled => sort the rows by position in original index
original_positions = Index(
np.concatenate(self._get_indices(self.grouper.result_index))
)
result.set_axis(original_positions, axis=self.axis, inplace=True)
result = result.sort_index(axis=self.axis)

result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
rows_dropped = len(result.index) < len(self._selected_obj)

# get index by slicing original index according to original positions
# slice drops attrs => use set_axis when no rows were dropped
if rows_dropped:
sorted_indexer = result.index
result.index = self._selected_obj.index[sorted_indexer]
else:
result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
return result

def _dir_additions(self) -> Set[str]:
Expand Down
51 changes: 34 additions & 17 deletions pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,36 +171,53 @@ def test_grouper_dropna_propagation(dropna):


@pytest.mark.parametrize(
"dropna,df_expected,s_expected",
"dropna,input_index,expected_data,expected_index",
[
pytest.param(
(True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)),
(True, list("abcd"), {"B": [2, 2, 1]}, list("abc")),
(
True,
pd.DataFrame({"B": [2, 2, 1]}),
pd.Series(data=[2, 2, 1], name="B"),
marks=pytest.mark.xfail(raises=ValueError),
pd.MultiIndex.from_tuples(
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
),
{"B": [2, 2, 1]},
pd.MultiIndex.from_tuples(
[(1, "R"), (1, "B"), (2, "R")], names=["num", "col"]
),
),
(False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)),
(False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")),
(
False,
pd.DataFrame({"B": [2, 2, 1, 1]}),
pd.Series(data=[2, 2, 1, 1], name="B"),
pd.MultiIndex.from_tuples(
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
),
{"B": [2, 2, 1, 1]},
pd.MultiIndex.from_tuples(
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
),
),
],
)
def test_slice_groupby_then_transform(dropna, df_expected, s_expected):
# GH35014
def test_groupby_dataframe_slice_then_transform(
dropna, input_index, expected_data, expected_index
):
# GH35014 & GH35612

df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index)
gb = df.groupby("A", dropna=dropna)

res = gb.transform(len)
tm.assert_frame_equal(res, df_expected)
result = gb.transform(len)
expected = pd.DataFrame(expected_data, index=expected_index)
tm.assert_frame_equal(result, expected)

gb_slice = gb[["B"]]
res = gb_slice.transform(len)
tm.assert_frame_equal(res, df_expected)
result = gb[["B"]].transform(len)
expected = pd.DataFrame(expected_data, index=expected_index)
tm.assert_frame_equal(result, expected)

res = gb["B"].transform(len)
tm.assert_series_equal(res, s_expected)
result = gb["B"].transform(len)
expected = pd.Series(expected_data["B"], index=expected_index, name="B")
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,7 @@ def test_list_grouper_with_nat(self):
[
(
"transform",
Series(name=2, dtype=np.float64, index=pd.RangeIndex(0, 0, 1)),
Series(name=2, dtype=np.float64, index=Index([])),
),
(
"agg",
Expand Down