Skip to content

Commit e62c70f

Browse files
committed
BUG: Raise MergeError when suffixes result in duplicate column names (GH#61402)
1 parent 9c5b9ee commit e62c70f

File tree

3 files changed

+23
-6
lines changed

3 files changed

+23
-6
lines changed

doc/source/whatsnew/v2.3.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ Groupby/resample/rolling
170170

171171
Reshaping
172172
^^^^^^^^^
173-
-
173+
- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`)
174174
-
175175

176176
Sparse

pandas/core/reshape/merge.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3058,17 +3058,19 @@ def renamer(x, suffix: str | None):
30583058
llabels = left._transform_index(lrenamer)
30593059
rlabels = right._transform_index(rrenamer)
30603060

3061-
dups = []
3061+
dups = set()
30623062
if not llabels.is_unique:
30633063
# Only raise when duplicates are caused by the suffixes; columns that were
30643064
# already duplicated in the original frame should not raise
3065-
dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
3065+
dups.update(llabels[(llabels.duplicated()) & (~left.duplicated())])
30663066
if not rlabels.is_unique:
3067-
dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
3067+
dups.update(rlabels[(rlabels.duplicated()) & (~right.duplicated())])
3068+
# A suffixed name may also collide with a pre-existing, un-renamed column of the other frame
3069+
dups.update(llabels.intersection(right.difference(to_rename)))
3070+
dups.update(rlabels.intersection(left.difference(to_rename)))
30683071
if dups:
30693072
raise MergeError(
3070-
f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
3071-
f"not allowed.",
3073+
f"Passing 'suffixes' which cause duplicate columns {dups} is not allowed.",
30723074
)
30733075

30743076
return llabels, rlabels

pandas/tests/reshape/merge/test_merge.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3060,3 +3060,18 @@ def test_merge_on_all_nan_column():
30603060
{"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]}
30613061
)
30623062
tm.assert_frame_equal(result, expected)
3063+
3064+
3065+
def test_merge_for_suffix_collisions():
    """Check that ``merge`` raises when suffixes produce duplicate column names.

    Three collision shapes are covered, each expected to raise
    :class:`MergeError` with a message mentioning "duplicate columns":

    1. The suffixed *left* column matches an un-renamed *right* column.
    2. The suffixed *right* column matches another column of the *right* frame.
    3. The suffixed *right* column matches an un-renamed *left* column
       (mirror of case 1, so both cross-frame checks are exercised).
    """
    # GH#61402
    narrow = DataFrame({"col1": [1], "col2": [2]})
    wide = DataFrame({"col1": [1], "col2": [2], "col2_dup": [3]})

    # Case 1: suffixes=("_dup", "") -> left "col2" becomes "col2_dup", which
    # already exists in the right frame
    with pytest.raises(MergeError, match="duplicate columns"):
        merge(narrow, wide, on="col1", suffixes=("_dup", ""))

    # Case 2: suffixes=("", "_dup") -> right "col2" becomes "col2_dup", which
    # already exists in the same (right) frame
    with pytest.raises(MergeError, match="duplicate columns"):
        merge(narrow, wide, on="col1", suffixes=("", "_dup"))

    # Case 3: frames swapped -> right "col2" becomes "col2_dup", which already
    # exists in the left frame
    with pytest.raises(MergeError, match="duplicate columns"):
        merge(wide, narrow, on="col1", suffixes=("", "_dup"))

0 commit comments

Comments
 (0)