diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 561562f367db2..8d246b16ea4c4 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -347,7 +347,7 @@ Groupby/Resample/Rolling
 - Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`)
 - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`)
 - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`)
-
+- Ensured that result group order is correct when grouping on an ordered Categorical and specifying ``observed=True`` (:issue:`25871`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index e470a32b85cd6..335d40dd16949 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -302,6 +302,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
                 if observed:
                     codes = algorithms.unique1d(self.grouper.codes)
                     codes = codes[codes != -1]
+                    if sort or self.grouper.ordered:
+                        codes = np.sort(codes)
                 else:
                     codes = np.arange(len(categories))
 
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index e118135ccc75d..8dfb27ee34232 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -453,6 +453,33 @@ def test_dataframe_categorical_with_nan(observed):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("ordered", [True, False])
+@pytest.mark.parametrize("observed", [True, False])
+@pytest.mark.parametrize("sort", [True, False])
+def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
+    # GH 25871: Fix groupby sorting on ordered Categoricals
+    # Build a dataframe with a Categorical having one unobserved category
+    # ('AWOL'), and a Series with identical values
+    cat = pd.Categorical(['d', 'a', 'b', 'a', 'd', 'b'],
+                         categories=['a', 'b', 'AWOL', 'd'],
+                         ordered=ordered)
+    val = pd.Series(['d', 'a', 'b', 'a', 'd', 'b'])
+    df = pd.DataFrame({'cat': cat, 'val': val})
+
+    # aggregate on the Categorical
+    grouped = df.groupby('cat', observed=observed, sort=sort)
+    result = grouped['val'].agg('first')
+
+    # If ordering is correct, we expect index labels equal to aggregation
+    # results, except for 'observed=False', when the index contains 'AWOL'
+    # and the corresponding aggregation result is None
+    label = pd.Series(result.index.array, dtype='object')
+    aggr = pd.Series(result.array)
+    if not observed:
+        aggr[aggr.isna()] = 'AWOL'
+    tm.assert_equal(label, aggr)
+
+
 def test_datetime():
     # GH9049: ensure backward compatibility
     levels = pd.date_range('2014-01-01', periods=4)