Skip to content

Commit 439906e

Browse files
authored
BUG: DataFrameGroupBy.value_counts() fails if as_index=False and there are duplicate column labels (#45160)
1 parent 006400f commit 439906e

File tree

2 files changed

+59
-24
lines changed

2 files changed

+59
-24
lines changed

pandas/core/groupby/generic.py

Lines changed: 26 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1731,32 +1731,48 @@ def value_counts(
17311731
observed=self.observed,
17321732
dropna=self.dropna,
17331733
)
1734-
result = cast(Series, gb.size())
1734+
result_series = cast(Series, gb.size())
17351735

17361736
if normalize:
17371737
# Normalize the results by dividing by the original group sizes.
17381738
# We are guaranteed to have the first N levels be the
17391739
# user-requested grouping.
1740-
levels = list(range(len(self.grouper.groupings), result.index.nlevels))
1741-
indexed_group_size = result.groupby(
1742-
result.index.droplevel(levels),
1740+
levels = list(
1741+
range(len(self.grouper.groupings), result_series.index.nlevels)
1742+
)
1743+
indexed_group_size = result_series.groupby(
1744+
result_series.index.droplevel(levels),
17431745
sort=self.sort,
17441746
observed=self.observed,
17451747
dropna=self.dropna,
17461748
).transform("sum")
17471749

1748-
result /= indexed_group_size
1750+
result_series /= indexed_group_size
17491751

17501752
if sort:
17511753
# Sort the values and then resort by the main grouping
17521754
index_level = range(len(self.grouper.groupings))
1753-
result = result.sort_values(ascending=ascending).sort_index(
1754-
level=index_level, sort_remaining=False
1755-
)
1755+
result_series = result_series.sort_values(
1756+
ascending=ascending
1757+
).sort_index(level=index_level, sort_remaining=False)
17561758

1757-
if not self.as_index:
1759+
result: Series | DataFrame
1760+
if self.as_index:
1761+
result = result_series
1762+
else:
17581763
# Convert to frame
1759-
result = result.reset_index(name="proportion" if normalize else "count")
1764+
name = "proportion" if normalize else "count"
1765+
index = result_series.index
1766+
columns = com.fill_missing_names(index.names)
1767+
if name in columns:
1768+
raise ValueError(
1769+
f"Column label '{name}' is duplicate of result column"
1770+
)
1771+
result_series.name = name
1772+
result_series.index = index.set_names(range(len(columns)))
1773+
result_frame = result_series.reset_index()
1774+
result_frame.columns = columns + [name]
1775+
result = result_frame
17601776
return result.__finalize__(self.obj, method="value_counts")
17611777

17621778

pandas/tests/groupby/test_frame_value_counts.py

Lines changed: 33 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -406,33 +406,52 @@ def test_mixed_groupings(normalize, expected_label, expected_values):
406406

407407

408408
@pytest.mark.parametrize(
409-
"test, expected_names",
409+
"test, columns, expected_names",
410410
[
411-
("repeat", ["a", None, "d", "b", "b", "e"]),
412-
("level", ["a", None, "d", "b", "c", "level_1"]),
411+
("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]),
412+
("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]),
413413
],
414414
)
415415
@pytest.mark.parametrize("as_index", [False, True])
416-
def test_column_name_clashes(test, expected_names, as_index):
417-
df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]})
418-
if test == "repeat":
419-
df.columns = list("abbde")
420-
else:
421-
df.columns = list("abcd") + ["level_1"]
422-
416+
def test_column_label_duplicates(test, columns, expected_names, as_index):
417+
# GH 44992
418+
# Test for duplicate input column labels and generated duplicate labels
419+
df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
420+
expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)]
421+
result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
423422
if as_index:
424-
result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
425423
expected = Series(
426424
data=(1, 1),
427425
index=MultiIndex.from_tuples(
428-
[(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)],
426+
expected_data,
429427
names=expected_names,
430428
),
431429
)
432430
tm.assert_series_equal(result, expected)
433431
else:
434-
with pytest.raises(ValueError, match="cannot insert"):
435-
df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
432+
expected_data = [list(row) + [1] for row in expected_data]
433+
expected_columns = list(expected_names)
434+
expected_columns[1] = "level_1"
435+
expected_columns.append("count")
436+
expected = DataFrame(expected_data, columns=expected_columns)
437+
tm.assert_frame_equal(result, expected)
438+
439+
440+
@pytest.mark.parametrize(
441+
"normalize, expected_label",
442+
[
443+
(False, "count"),
444+
(True, "proportion"),
445+
],
446+
)
447+
def test_result_label_duplicates(normalize, expected_label):
448+
# Test for result column label duplicating an input column label
449+
gb = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label]).groupby(
450+
"a", as_index=False
451+
)
452+
msg = f"Column label '{expected_label}' is duplicate of result column"
453+
with pytest.raises(ValueError, match=msg):
454+
gb.value_counts(normalize=normalize)
436455

437456

438457
def test_ambiguous_grouping():

0 commit comments

Comments
 (0)