From 66929bd35dff84d94c99545aad3b79c05b2d2784 Mon Sep 17 00:00:00 2001
From: "Igoshev, Iaroslav"
Date: Tue, 23 Apr 2024 13:43:41 +0000
Subject: [PATCH] Fix HDK

Signed-off-by: Igoshev, Iaroslav
---
 .../hdk_on_native/dataframe/dataframe.py     | 40 ++++++++++++-------
 .../dataframe_protocol/hdk/test_protocol.py  |  8 ++++
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py
index 09ffe8011a6..55c042f63db 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py
@@ -1511,16 +1511,19 @@ def _join_arrow_columns(self, other_modin_frames):
                 raise NotImplementedError("Duplicate column names")
             max_len = max(len(t) for t in tables)
             columns = [c for t in tables for c in t.columns]
+            new_dtypes = [dt for frame in frames for dt in frame.dtypes]
             # Make all columns of the same length, if required.
             for i, col in enumerate(columns):
                 if len(col) < max_len:
                     columns[i] = pyarrow.chunked_array(
                         col.chunks + [pyarrow.nulls(max_len - len(col), col.type)]
                     )
+                    new_dtypes[i] = arrow_type_to_pandas(columns[i].type)
             return self.from_arrow(
                 at=pyarrow.table(columns, column_names),
                 columns=[c for f in frames for c in f.columns],
                 encode_col_names=False,
+                new_dtypes=new_dtypes,
             )
         return None
 
@@ -3009,7 +3012,13 @@ def encoder(n):
 
     @classmethod
     def from_arrow(
-        cls, at, index_cols=None, index=None, columns=None, encode_col_names=True
+        cls,
+        at,
+        index_cols=None,
+        index=None,
+        columns=None,
+        encode_col_names=True,
+        new_dtypes=None,
     ):
         """
         Build a frame from an Arrow table.
@@ -3028,6 +3037,8 @@ def from_arrow(
             Column labels to use for resulting frame.
         encode_col_names : bool, default: True
            Encode column names.
+        new_dtypes : pandas.Index or list, optional
+            Column data types.
 
         Returns
         -------
@@ -3057,20 +3068,21 @@ def from_arrow(
 
         dtype_index = [] if index_cols is None else list(index_cols)
         dtype_index.extend(new_columns)
-        new_dtypes = []
-
-        for col in at.columns:
-            if pyarrow.types.is_dictionary(col.type):
-                new_dtypes.append(
-                    LazyProxyCategoricalDtype._build_proxy(
-                        parent=at,
-                        column_name=col._name,
-                        materializer=build_categorical_from_at,
-                        dtype=arrow_type_to_pandas(col.type.value_type),
+
+        if new_dtypes is None:
+            new_dtypes = []
+            for col in at.columns:
+                if pyarrow.types.is_dictionary(col.type):
+                    new_dtypes.append(
+                        LazyProxyCategoricalDtype._build_proxy(
+                            parent=at,
+                            column_name=col._name,
+                            materializer=build_categorical_from_at,
+                            dtype=arrow_type_to_pandas(col.type.value_type),
+                        )
                     )
-                )
-            else:
-                new_dtypes.append(cls._arrow_type_to_dtype(col.type))
+                else:
+                    new_dtypes.append(cls._arrow_type_to_dtype(col.type))
 
         if len(unsupported_cols) > 0:
             ErrorMessage.single_warning(
diff --git a/modin/tests/interchange/dataframe_protocol/hdk/test_protocol.py b/modin/tests/interchange/dataframe_protocol/hdk/test_protocol.py
index 0b3135b84fd..a21e86251b4 100644
--- a/modin/tests/interchange/dataframe_protocol/hdk/test_protocol.py
+++ b/modin/tests/interchange/dataframe_protocol/hdk/test_protocol.py
@@ -47,6 +47,14 @@ def test_simple_export(data_has_nulls, from_hdk, n_chunks):
     )
     md_df = pd.DataFrame(data)
     exported_df = export_frame(md_df, from_hdk, n_chunks=n_chunks)
+    # export_frame() splits the frame into multiple chunks. When it's
+    # split with HDK, each categorical column will have a different
+    # set of categories. When concatenating the chunks, the categorical
+    # column will be of type object.
+    cat_cols = md_df.select_dtypes(include=["category"]).columns
+    with warns_that_defaulting_to_pandas():
+        md_df[cat_cols] = md_df[cat_cols].astype(str)
+        exported_df[cat_cols] = exported_df[cat_cols].astype(str)
 
     df_equals(md_df, exported_df)
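
Note (not part of the patch): the new test comment relies on a documented pandas behavior — concatenating categorical chunks whose category sets differ falls back to object dtype. A minimal pandas-only sketch of that behavior, with made-up values and variable names, shows why the test casts the categorical columns to str before comparing:

import pandas as pd

# Two chunks of one logical column, categorized independently, end up
# with different category sets (values are made up for illustration).
chunk_a = pd.Series(["x", "y"], dtype="category")
chunk_b = pd.Series(["z", "w"], dtype="category")

# pandas cannot keep a single categorical dtype across differing
# category sets, so the concatenated column falls back to object.
combined = pd.concat([chunk_a, chunk_b], ignore_index=True)
print(combined.dtype)  # object

# Casting both sides to str, as the test does, makes the comparison
# independent of whether the column survived as categorical or not.
print(combined.astype(str).tolist())  # ['x', 'y', 'z', 'w']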