Fix HDK

Signed-off-by: Igoshev, Iaroslav <[email protected]>
YarShev · Apr 23, 2024 · 66929bd · 66929bd
1 parent e0f2fb3
commit 66929bd
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 14 deletions.
diff --git a/...n/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py b/...n/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py
@@ -1511,16 +1511,19 @@ def _join_arrow_columns(self, other_modin_frames):
                 raise NotImplementedError("Duplicate column names")
             max_len = max(len(t) for t in tables)
             columns = [c for t in tables for c in t.columns]
+            new_dtypes = [dt for frame in frames for dt in frame.dtypes]
             # Make all columns of the same length, if required.
             for i, col in enumerate(columns):
                 if len(col) < max_len:
                     columns[i] = pyarrow.chunked_array(
                         col.chunks + [pyarrow.nulls(max_len - len(col), col.type)]
                     )
+                    new_dtypes[i] = arrow_type_to_pandas(columns[i].type)
             return self.from_arrow(
                 at=pyarrow.table(columns, column_names),
                 columns=[c for f in frames for c in f.columns],
                 encode_col_names=False,
+                new_dtypes=new_dtypes,
             )
         return None
 
@@ -3009,7 +3012,13 @@ def encoder(n):
 
     @classmethod
     def from_arrow(
-        cls, at, index_cols=None, index=None, columns=None, encode_col_names=True
+        cls,
+        at,
+        index_cols=None,
+        index=None,
+        columns=None,
+        encode_col_names=True,
+        new_dtypes=None,
     ):
         """
         Build a frame from an Arrow table.
@@ -3028,6 +3037,8 @@ def from_arrow(
             Column labels to use for resulting frame.
         encode_col_names : bool, default: True
             Encode column names.
+        dtypes : pandas.Index or list, optional
+            Column data types.
 
         Returns
         -------
@@ -3057,20 +3068,21 @@ def from_arrow(
 
         dtype_index = [] if index_cols is None else list(index_cols)
         dtype_index.extend(new_columns)
-        new_dtypes = []
-
-        for col in at.columns:
-            if pyarrow.types.is_dictionary(col.type):
-                new_dtypes.append(
-                    LazyProxyCategoricalDtype._build_proxy(
-                        parent=at,
-                        column_name=col._name,
-                        materializer=build_categorical_from_at,
-                        dtype=arrow_type_to_pandas(col.type.value_type),
+
+        if new_dtypes is None:
+            new_dtypes = []
+            for col in at.columns:
+                if pyarrow.types.is_dictionary(col.type):
+                    new_dtypes.append(
+                        LazyProxyCategoricalDtype._build_proxy(
+                            parent=at,
+                            column_name=col._name,
+                            materializer=build_categorical_from_at,
+                            dtype=arrow_type_to_pandas(col.type.value_type),
+                        )
                     )
-                )
-            else:
-                new_dtypes.append(cls._arrow_type_to_dtype(col.type))
+                else:
+                    new_dtypes.append(cls._arrow_type_to_dtype(col.type))
 
         if len(unsupported_cols) > 0:
             ErrorMessage.single_warning(

diff --git a/modin/tests/interchange/dataframe_protocol/hdk/test_protocol.py b/modin/tests/interchange/dataframe_protocol/hdk/test_protocol.py
@@ -47,6 +47,14 @@ def test_simple_export(data_has_nulls, from_hdk, n_chunks):
     )
     md_df = pd.DataFrame(data)
     exported_df = export_frame(md_df, from_hdk, n_chunks=n_chunks)
+    # export_frame() splits the frame into multiple chunks. When it's
+    # split with HDK, each categorical column will have a different
+    # set of categories. When concatenating the chunks, the categorical
+    # column will be of type object.
+    cat_cols = md_df.select_dtypes(include=["category"]).columns
+    with warns_that_defaulting_to_pandas():
+        md_df[cat_cols] = md_df[cat_cols].astype(str)
+        exported_df[cat_cols] = exported_df[cat_cols].astype(str)
     df_equals(md_df, exported_df)