From 6ee0371b5ead4983e82fb60dff7b5e6a758bb3ac Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 16 Nov 2023 21:57:38 -0500 Subject: [PATCH 1/5] BUG: GroupBy.value_counts sorting order --- pandas/core/groupby/groupby.py | 23 +++++--- .../groupby/methods/test_value_counts.py | 52 +++++++++++++++---- 2 files changed, 58 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3412f18a40313..d4a86bcd68db1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2826,6 +2826,22 @@ def _value_counts( ).sortlevel() result_series = result_series.reindex(multi_index, fill_value=0) + if sort: + # Sort by the values + result_series = result_series.sort_values( + ascending=ascending, kind="stable" + ) + if self.sort: + # Sort by the groupings + names = result_series.index.names + # GH#56007 - Temporarily replace names in case they are integers + result_series.index.names = range(len(names)) + index_level = list(range(len(self.grouper.groupings))) + result_series = result_series.sort_index( + level=index_level, sort_remaining=False + ) + result_series.index.names = names + if normalize: # Normalize the results by dividing by the original group sizes. # We are guaranteed to have the first N levels be the @@ -2845,13 +2861,6 @@ def _value_counts( # Handle groups of non-observed categories result_series = result_series.fillna(0.0) - if sort: - # Sort the values and then resort by the main grouping - index_level = range(len(self.grouper.groupings)) - result_series = result_series.sort_values(ascending=ascending).sort_index( - level=index_level, sort_remaining=False - ) - result: Series | DataFrame if self.as_index: result = result_series diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 200fa5fd4367d..8c440992df392 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -385,8 +385,8 @@ def test_against_frame_and_seriesgroupby( "sort, ascending, expected_rows, expected_count, expected_group_size", [ (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]), - (True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]), - (True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]), + (True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]), + (True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]), ], ) def test_compound( @@ -811,19 +811,19 @@ def test_categorical_single_grouper_observed_false( ("FR", "female", "high"), ("FR", "male", "medium"), ("FR", "female", "low"), - ("FR", "male", "high"), ("FR", "female", "medium"), + ("FR", "male", "high"), ("US", "female", "high"), ("US", "male", "low"), - ("US", "male", "medium"), - ("US", "male", "high"), - ("US", "female", "medium"), ("US", "female", "low"), - ("ASIA", "male", "low"), - ("ASIA", "male", "high"), - ("ASIA", "female", "medium"), - ("ASIA", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), ("ASIA", "female", "high"), + ("ASIA", "female", "low"), + ("ASIA", "female", "medium"), + ("ASIA", "male", "high"), + ("ASIA", "male", "low"), ("ASIA", "male", "medium"), ] @@ -1177,3 +1177,35 @@ def test_value_counts_integer_columns(): {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1} ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("vc_sort", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +def test_value_counts_sort(sort, vc_sort, normalize): + # GH#55951 + # These values give different results for all four combinations of sort options + # and would give a different result if we sorted by normalized values. + # However, value_counts only sorts by frequencies. + df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}) + gb = df.groupby("a", sort=sort) + result = gb.value_counts(sort=vc_sort, normalize=normalize) + + if normalize: + values = [2 / 3, 1 / 3, 1.0] + else: + values = [2, 1, 1] + index = MultiIndex( + levels=[[1, 2], [3, 4]], codes=[[0, 0, 1], [0, 1, 0]], names=["a", 0] + ) + expected = Series(values, index=index, name="proportion" if normalize else "count") + if sort and vc_sort: + taker = [0, 1, 2] + elif sort and not vc_sort: + taker = [0, 1, 2] + elif not sort and vc_sort: + taker = [0, 2, 1] + else: + taker = [2, 1, 0] + expected = expected.take(taker) + + tm.assert_series_equal(result, expected) From 8f54fed6e1b34444a98bd14d33b5506850723fd2 Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 16 Nov 2023 23:12:36 -0500 Subject: [PATCH 2/5] Whatsnew --- doc/source/whatsnew/v2.2.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 93b0ccad9f2aa..18c9d2fb8e3ba 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -444,6 +444,9 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) +- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_count` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`56007`) +- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_count` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`56007`) +- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_count` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`56007`) Reshaping ^^^^^^^^^ From 5d7b1225f34625658826eb71e20587bddf151bfa Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 16 Nov 2023 23:17:16 -0500 Subject: [PATCH 3/5] cleanup --- pandas/tests/groupby/methods/test_value_counts.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 8c440992df392..cfe8dcd9d74dd 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -1183,9 +1183,6 @@ def test_value_counts_integer_columns(): @pytest.mark.parametrize("normalize", [True, False]) def test_value_counts_sort(sort, vc_sort, normalize): # GH#55951 - # These values give different results for all four combinations of sort options - # and would give a different result if we sorted by normalized values. - # However, value_counts only sorts by frequencies. df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}) gb = df.groupby("a", sort=sort) result = gb.value_counts(sort=vc_sort, normalize=normalize) From ee2e4680b280cc0e733c18f023ad836dc8c8a8ea Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 17 Nov 2023 16:17:55 -0500 Subject: [PATCH 4/5] Fix categorical and add test --- pandas/core/groupby/groupby.py | 4 +-- .../groupby/methods/test_value_counts.py | 35 ++++++++++++++++++- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d4a86bcd68db1..c17badccfb59a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2821,9 +2821,9 @@ def _value_counts( for grouping in groupings ): levels_list = [ping.result_index for ping in groupings] - multi_index, _ = MultiIndex.from_product( + multi_index = MultiIndex.from_product( levels_list, names=[ping.name for ping in groupings] - ).sortlevel() + ) result_series = result_series.reindex(multi_index, fill_value=0) if sort: diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index cfe8dcd9d74dd..f1650b22acf4c 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -1184,7 +1184,7 @@ def test_value_counts_integer_columns(): def test_value_counts_sort(sort, vc_sort, normalize): # GH#55951 df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}) - gb = df.groupby("a", sort=sort) + gb = df.groupby("a", sort=sort, observed=True) result = gb.value_counts(sort=vc_sort, normalize=normalize) if normalize: @@ -1206,3 +1206,36 @@ def test_value_counts_sort(sort, vc_sort, normalize): expected = expected.take(taker) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("vc_sort", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +def test_value_counts_sort_categorical(sort, vc_sort, normalize): + # GH#55951 + df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}, dtype="category") + gb = df.groupby("a", sort=sort, observed=True) + result = gb.value_counts(sort=vc_sort, normalize=normalize) + + if normalize: + values = [2 / 3, 1 / 3, 1.0, 0.0] + else: + values = [2, 1, 1, 0] + name = "proportion" if normalize else "count" + expected = DataFrame( + { + "a": Categorical([1, 1, 2, 2]), + 0: Categorical([3, 4, 3, 4]), + name: values, + } + ).set_index(["a", 0])[name] + if sort and vc_sort: + taker = [0, 1, 2, 3] + elif sort and not vc_sort: + taker = [0, 1, 2, 3] + elif not sort and vc_sort: + taker = [0, 2, 1, 3] + else: + taker = [2, 3, 0, 1] + expected = expected.take(taker) + + tm.assert_series_equal(result, expected) From d0dad4c2e726001d2e4ef56dd9ab906cb1ef6bd3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 17 Nov 2023 16:25:07 -0500 Subject: [PATCH 5/5] cleanup --- pandas/tests/groupby/methods/test_value_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index f1650b22acf4c..2fa79c815d282 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -1184,7 +1184,7 @@ def test_value_counts_integer_columns(): def test_value_counts_sort(sort, vc_sort, normalize): # GH#55951 df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}) - gb = df.groupby("a", sort=sort, observed=True) + gb = df.groupby("a", sort=sort) result = gb.value_counts(sort=vc_sort, normalize=normalize) if normalize: