From f193bd7825477edbf2920ab9a559be9b441a9934 Mon Sep 17 00:00:00 2001 From: "Igoshev, Iaroslav" Date: Wed, 26 Jun 2024 19:30:35 +0000 Subject: [PATCH] Address comment Signed-off-by: Igoshev, Iaroslav --- modin/core/dataframe/algebra/binary.py | 4 +++ .../dataframe/pandas/dataframe/dataframe.py | 23 ++++++++++++++-- .../storage_formats/pandas/query_compiler.py | 2 ++ modin/tests/pandas/dataframe/test_binary.py | 27 +++++++++++++++++++ 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index b5e701d2d4b..b107089eda5 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -298,6 +298,7 @@ def register( cls, func: Callable[..., pandas.DataFrame], join_type: str = "outer", + sort: bool = None, labels: str = "replace", infer_dtypes: Optional[str] = None, ) -> Callable[..., PandasQueryCompiler]: @@ -310,6 +311,8 @@ def register( Binary function to execute. Have to be able to accept at least two arguments. join_type : {'left', 'right', 'outer', 'inner', None}, default: 'outer' Type of join that will be used if indices of operands are not aligned. + sort : bool, default: None + Whether to sort index and columns or not. If None, the choice is left to the operation that `sort` is forwarded to. labels : {"keep", "replace", "drop"}, default: "replace" Whether keep labels from left Modin DataFrame, replace them with labels from joined DataFrame or drop altogether to make them be computed lazily later. 
@@ -419,6 +422,7 @@ def caller( lambda x, y: func(x, y, *args, **kwargs), [other._modin_frame], join_type=join_type, + sort=sort, labels=labels, dtypes=dtypes, ), diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 991d175932f..21e2356f9b3 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -3823,6 +3823,7 @@ def n_ary_op( op, right_frames: list[PandasDataframe], join_type="outer", + sort=None, copartition_along_columns=True, labels="replace", dtypes: Optional[pandas.Series] = None, @@ -3838,6 +3839,8 @@ def n_ary_op( Modin DataFrames to join with. join_type : str, default: "outer" Type of join to apply. + sort : bool, default: None + Whether to sort index and columns or not. If None, sorting is enabled for an axis only when its labels differ between this frame and at least one of the right frames. copartition_along_columns : bool, default: True Whether to perform copartitioning along columns or not. For some ops this isn't needed (e.g., `fillna`). @@ -3854,7 +3857,16 @@ def n_ary_op( New Modin DataFrame. 
""" left_parts, list_of_right_parts, joined_index, row_lengths = self._copartition( - 0, right_frames, join_type, sort=True + 0, + right_frames, + join_type, + sort=( + not all( + self.get_axis(0).equals(right.get_axis(0)) for right in right_frames + ) + if sort is None + else sort + ), ) if copartition_along_columns: new_left_frame = self.__constructor__( @@ -3886,7 +3898,14 @@ def n_ary_op( 1, new_right_frames, join_type, - sort=False, + sort=( + not all( + self.get_axis(1).equals(right.get_axis(1)) + for right in new_right_frames + ) + if sort is None + else sort + ), ) else: joined_columns = self.copy_columns_cache(copy_lengths=True) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 7c4f7e79f55..9d4467c2085 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -460,6 +460,7 @@ def to_numpy(self, **kwargs): df_update = Binary.register( copy_df_for_func(pandas.DataFrame.update, display_name="update"), join_type="left", + sort=False, ) series_update = Binary.register( copy_df_for_func( @@ -467,6 +468,7 @@ def to_numpy(self, **kwargs): display_name="update", ), join_type="left", + sort=False, ) # Needed for numpy API diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index e153f9f892f..108e2620aac 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -527,3 +527,30 @@ def test_arithmetic_with_tricky_dtypes(val1, val2, op, request): lambda dfs: getattr(dfs[0], op)(dfs[1]), expected_exception=expected_exception, ) + + +@pytest.mark.parametrize( + "data, other_data", + [ + ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}), + ({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}), + ], +) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("match_index", 
[True, False]) +def test_bin_op_mismatched_columns(data, other_data, axis, match_index): + modin_df, pandas_df = create_test_dfs(data) + other_modin_df, other_pandas_df = create_test_dfs(other_data) + if axis == 0: + if not match_index: + modin_df.index = pandas_df.index = ["1", "2", "3"] + other_modin_df.index = other_pandas_df.index = ["2", "1", "3"] + eval_general( + modin_df, + pandas_df, + lambda df: ( + df.add(other_modin_df, axis=axis) + if isinstance(df, pd.DataFrame) + else df.add(other_pandas_df, axis=axis) + ), + )