From bc62afe69b630ae42b206ccc9943ff012e8c6bd0 Mon Sep 17 00:00:00 2001 From: Farsidetfs Date: Mon, 12 May 2025 22:35:10 +0000 Subject: [PATCH] BUG: Raise MergeError when suffixes result in duplicate column names (GH#61402) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/reshape/merge.py | 12 +++++++----- pandas/tests/reshape/merge/test_merge.py | 9 +++++++++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 099e5bc48353a..6823ef65c2e37 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -847,6 +847,7 @@ Reshaping - Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) +- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. 
(:issue:`61402`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 68d61da0cf7dd..b8ad932abc2b0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -3058,17 +3058,19 @@ def renamer(x, suffix: str | None): llabels = left._transform_index(lrenamer) rlabels = right._transform_index(rrenamer) - dups = [] + dups = set() if not llabels.is_unique: # Only warn when duplicates are caused because of suffixes, already duplicated # columns in origin should not warn - dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist() + dups.update(llabels[(llabels.duplicated()) & (~left.duplicated())]) if not rlabels.is_unique: - dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist()) + dups.update(rlabels[(rlabels.duplicated()) & (~right.duplicated())]) + # Adding a suffix can produce a name that duplicates a pre-existing column + dups.update(llabels.intersection(right.difference(to_rename))) + dups.update(rlabels.intersection(left.difference(to_rename))) if dups: raise MergeError( - f"Passing 'suffixes' which cause duplicate columns {set(dups)} is " - f"not allowed.", + f"Passing 'suffixes' which cause duplicate columns {dups} is not allowed.", ) return llabels, rlabels diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f0f67aebd85ec..f3418ad047afe 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -3060,3 +3060,12 @@ def test_merge_on_all_nan_column(): {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]} ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("suffixes", [("_dup", ""), ("", "_dup")]) +def test_merge_for_suffix_collisions(suffixes): + # GH#61402 + df1 = DataFrame({"col1": [1], "col2": [2]}) + df2 = DataFrame({"col1": [1], "col2": [2], "col2_dup": [3]}) + with pytest.raises(MergeError, match="duplicate columns"): + merge(df1, 
df2, on="col1", suffixes=suffixes)