Skip to content

Commit

Permalink
Fix HDK
Browse files Browse the repository at this point in the history
Signed-off-by: Igoshev, Iaroslav <[email protected]>
  • Loading branch information
YarShev committed Apr 23, 2024
1 parent e0f2fb3 commit 66929bd
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1511,16 +1511,19 @@ def _join_arrow_columns(self, other_modin_frames):
raise NotImplementedError("Duplicate column names")
max_len = max(len(t) for t in tables)
columns = [c for t in tables for c in t.columns]
new_dtypes = [dt for frame in frames for dt in frame.dtypes]
# Make all columns of the same length, if required.
for i, col in enumerate(columns):
if len(col) < max_len:
columns[i] = pyarrow.chunked_array(
col.chunks + [pyarrow.nulls(max_len - len(col), col.type)]
)
new_dtypes[i] = arrow_type_to_pandas(columns[i].type)
return self.from_arrow(
at=pyarrow.table(columns, column_names),
columns=[c for f in frames for c in f.columns],
encode_col_names=False,
new_dtypes=new_dtypes,
)
return None

Expand Down Expand Up @@ -3009,7 +3012,13 @@ def encoder(n):

@classmethod
def from_arrow(
cls, at, index_cols=None, index=None, columns=None, encode_col_names=True
cls,
at,
index_cols=None,
index=None,
columns=None,
encode_col_names=True,
new_dtypes=None,
):
"""
Build a frame from an Arrow table.
Expand All @@ -3028,6 +3037,8 @@ def from_arrow(
Column labels to use for resulting frame.
encode_col_names : bool, default: True
Encode column names.
new_dtypes : pandas.Index or list, optional
Column data types. If not specified, the data types are
inferred from the column types of the Arrow table.
Returns
-------
Expand Down Expand Up @@ -3057,20 +3068,21 @@ def from_arrow(

dtype_index = [] if index_cols is None else list(index_cols)
dtype_index.extend(new_columns)
new_dtypes = []

for col in at.columns:
if pyarrow.types.is_dictionary(col.type):
new_dtypes.append(
LazyProxyCategoricalDtype._build_proxy(
parent=at,
column_name=col._name,
materializer=build_categorical_from_at,
dtype=arrow_type_to_pandas(col.type.value_type),

if new_dtypes is None:
new_dtypes = []
for col in at.columns:
if pyarrow.types.is_dictionary(col.type):
new_dtypes.append(
LazyProxyCategoricalDtype._build_proxy(
parent=at,
column_name=col._name,
materializer=build_categorical_from_at,
dtype=arrow_type_to_pandas(col.type.value_type),
)
)
)
else:
new_dtypes.append(cls._arrow_type_to_dtype(col.type))
else:
new_dtypes.append(cls._arrow_type_to_dtype(col.type))

if len(unsupported_cols) > 0:
ErrorMessage.single_warning(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ def test_simple_export(data_has_nulls, from_hdk, n_chunks):
)
md_df = pd.DataFrame(data)
exported_df = export_frame(md_df, from_hdk, n_chunks=n_chunks)
# export_frame() splits the frame into multiple chunks. When it's
# split with HDK, each categorical column will have a different
# set of categories. When concatenating the chunks, the categorical
# column will be of type object.
cat_cols = md_df.select_dtypes(include=["category"]).columns
with warns_that_defaulting_to_pandas():
md_df[cat_cols] = md_df[cat_cols].astype(str)
exported_df[cat_cols] = exported_df[cat_cols].astype(str)
df_equals(md_df, exported_df)


Expand Down

0 comments on commit 66929bd

Please sign in to comment.