diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 8053c17437c5e..af79f73ef5ed8 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -11102,6 +11102,7 @@ def merge(
         right_index: bool = False,
         sort: bool = False,
         suffixes: Suffixes = ("_x", "_y"),
+        force_suffixes: bool = False,
         copy: bool | lib.NoDefault = lib.no_default,
         indicator: str | bool = False,
         validate: MergeValidate | None = None,
@@ -11121,6 +11122,7 @@ def merge(
             right_index=right_index,
             sort=sort,
             suffixes=suffixes,
+            force_suffixes=force_suffixes,
             indicator=indicator,
             validate=validate,
         )
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index f762695eedb3d..cbec3cae538b6 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -154,6 +154,7 @@ def merge(
     right_index: bool = False,
     sort: bool = False,
     suffixes: Suffixes = ("_x", "_y"),
+    force_suffixes: bool = False,
     copy: bool | lib.NoDefault = lib.no_default,
     indicator: str | bool = False,
     validate: str | None = None,
@@ -396,6 +397,7 @@ def merge(
             right_index=right_index,
             sort=sort,
             suffixes=suffixes,
+            force_suffixes=force_suffixes,
             indicator=indicator,
             validate=validate,
         )
@@ -412,6 +414,7 @@ def _cross_merge(
     right_index: bool = False,
     sort: bool = False,
     suffixes: Suffixes = ("_x", "_y"),
+    force_suffixes: bool = False,
     indicator: str | bool = False,
     validate: str | None = None,
 ) -> DataFrame:
@@ -448,6 +451,7 @@ def _cross_merge(
         right_index=right_index,
         sort=sort,
         suffixes=suffixes,
+        force_suffixes=force_suffixes,
         indicator=indicator,
         validate=validate,
     )
@@ -967,6 +971,7 @@ def __init__(
         right_index: bool = False,
         sort: bool = True,
         suffixes: Suffixes = ("_x", "_y"),
+        force_suffixes: bool = False,
         indicator: str | bool = False,
         validate: str | None = None,
     ) -> None:
@@ -979,6 +984,8 @@ def __init__(
         self.on = com.maybe_make_list(on)
 
         self.suffixes = suffixes
+        self.force_suffixes = force_suffixes
+
         self.sort = sort or how == "outer"
 
         self.left_index = left_index
@@ -1088,8 +1095,19 @@ def _reindex_and_concat(
         left = self.left[:]
         right = self.right[:]
 
+        # Join-key labels are exempt from forced suffixing.
+        # NOTE(review): only ``str`` labels are kept here; non-string key labels
+        # (e.g. ints or tuples) would still be suffixed -- confirm intent.
+        keep_left = [x for x in self.left_on if isinstance(x, str)]
+        keep_right = [x for x in self.right_on if isinstance(x, str)]
+
         llabels, rlabels = _items_overlap_with_suffix(
-            self.left._info_axis, self.right._info_axis, self.suffixes
+            self.left._info_axis,
+            self.right._info_axis,
+            self.suffixes,
+            force_suffixes=self.force_suffixes,
+            keep_left=keep_left,
+            keep_right=keep_right,
         )
 
         if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
@@ -3013,7 +3031,12 @@ def _validate_operand(obj: DataFrame | Series) -> DataFrame:
 
 
 def _items_overlap_with_suffix(
-    left: Index, right: Index, suffixes: Suffixes
+    left: Index,
+    right: Index,
+    suffixes: Suffixes,
+    force_suffixes: bool = False,
+    keep_left: list | None = None,
+    keep_right: list | None = None,
 ) -> tuple[Index, Index]:
     """
     Suffixes type validation.
@@ -3028,8 +3051,18 @@ def _items_overlap_with_suffix(
             f"Passing 'suffixes' as a {type(suffixes)}, is not supported. "
             "Provide 'suffixes' as a tuple instead."
         )
+
+    if force_suffixes:
+        # sort=False: labels of mixed types cannot be ordered, and order is
+        # irrelevant for the ``in`` checks performed in ``renamer``.
+        to_rename = left.union(right, sort=False)
+        keep_left = keep_left or []
+        keep_right = keep_right or []
+    else:
+        # Default behaviour: every overlapping label is suffixed, keys included.
+        to_rename = left.intersection(right)
+        keep_left, keep_right = [], []
 
-    to_rename = left.intersection(right)
     if len(to_rename) == 0:
         return left, right
 
@@ -3038,6 +3071,6 @@ def _items_overlap_with_suffix(
     if not lsuffix and not rsuffix:
         raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
 
-    def renamer(x, suffix: str | None):
+    def renamer(x, suffix: str | None, keep: list):
         """
         Rename the left and right indices.
@@ -3054,12 +3071,12 @@ def renamer(x, suffix: str | None):
         -------
         x : renamed column name
         """
-        if x in to_rename and suffix is not None:
+        if x in to_rename and suffix is not None and x not in keep:
             return f"{x}{suffix}"
         return x
 
-    lrenamer = partial(renamer, suffix=lsuffix)
-    rrenamer = partial(renamer, suffix=rsuffix)
+    lrenamer = partial(renamer, suffix=lsuffix, keep=keep_left)
+    rrenamer = partial(renamer, suffix=rsuffix, keep=keep_right)
 
     llabels = left._transform_index(lrenamer)
     rlabels = right._transform_index(rrenamer)
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index f3418ad047afe..b24a5b09c60bc 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -2369,6 +2369,92 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("force_suffixes", [False, True])
+def test_merge_suffix_with_force_simple(force_suffixes):
+    df1 = DataFrame({"ID": [1, 2, 3], "Value": ["A", "B", "C"]})
+    df2 = DataFrame({"ID": [2, 3, 4], "Value": ["D", "E", "F"]})
+
+    # "Value" overlaps and is suffixed either way; "ID" is the join key and
+    # is left unsuffixed even when suffixes are forced.
+    expected = DataFrame(
+        [[2, "B", "D"], [3, "C", "E"]],
+        columns=["ID", "Value_left", "Value_right"],
+    )
+
+    result = merge(
+        df1,
+        df2,
+        on="ID",
+        suffixes=("_left", "_right"),
+        force_suffixes=force_suffixes,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("force_suffixes", [False, True])
+def test_merge_suffix_with_force_multi_column(force_suffixes):
+    a = DataFrame(
+        {"A": [1, 2, 3, 98], "B": [4, 5, 6, 99], "ALPHABET": ["A", "B", "C", "Z"]}
+    )
+    b = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "alphabet": ["a", "b", "c"]})
+
+    # The join keys keep their names; the non-overlapping columns gain a
+    # suffix only when suffixes are forced.
+    if force_suffixes:
+        expected_cols = ["A", "B", "ALPHABET_x", "a", "b", "alphabet_y"]
+    else:
+        expected_cols = ["A", "B", "ALPHABET", "a", "b", "alphabet"]
+    expected = DataFrame(
+        [[1, 4, "A", 1, 4, "a"], [2, 5, "B", 2, 5, "b"], [3, 6, "C", 3, 6, "c"]],
+        columns=expected_cols,
+    )
+
+    result = merge(
+        a,
+        b,
+        left_on=["A", "B"],
+        right_on=["a", "b"],
+        force_suffixes=force_suffixes,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "col1, col2, kwargs, expected_cols",
+    [
+        (0, 0, {"suffixes": ("", "_dup")}, ["0", "0_dup"]),
+        (0, 0, {"suffixes": (None, "_dup")}, [0, "0_dup"]),
+        (0, 0, {"suffixes": ("_x", "_y")}, ["0_x", "0_y"]),
+        (0, 0, {"suffixes": ["_x", "_y"]}, ["0_x", "0_y"]),
+        ("a", 0, {"suffixes": (None, "_y")}, ["a", "0_y"]),
+        (0.0, 0.0, {"suffixes": ("_x", None)}, ["0.0_x", 0.0]),
+        ("b", "b", {"suffixes": (None, "_y")}, ["b", "b_y"]),
+        ("a", "a", {"suffixes": ("_x", None)}, ["a_x", "a"]),
+        ("a", "b", {"suffixes": ("_x", None)}, ["a_x", "b"]),
+        ("a", "a", {"suffixes": (None, "_x")}, ["a", "a_x"]),
+        (0, 0, {"suffixes": ("_a", None)}, ["0_a", 0]),
+        ("a", "a", {}, ["a_x", "a_y"]),
+        (0, 0, {}, ["0_x", "0_y"]),
+    ],
+)
+def test_merge_suffix_with_force(col1, col2, kwargs, expected_cols):
+    # issue: 24782
+    a = DataFrame({col1: [1, 2, 3]})
+    b = DataFrame({col2: [4, 5, 6]})
+
+    expected = DataFrame([[1, 4], [2, 5], [3, 6]], columns=expected_cols)
+
+    result = a.merge(
+        b, left_index=True, right_index=True, force_suffixes=True, **kwargs
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = merge(
+        a, b, left_index=True, right_index=True, force_suffixes=True, **kwargs
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "how,expected",
     [