diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 1d41aa8d7ee..d653054e910 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -19,6 +19,7 @@ from typing import ( Any, + Tuple, ) from pyarrow.interchange.column import ( @@ -204,7 +205,9 @@ def column_to_array( pa.Array """ buffers = col.get_buffers() - data = buffers_to_array(buffers, col.size(), + data_type = col.dtype + data = buffers_to_array(buffers, data_type, + col.size(), col.describe_null, col.offset, allow_copy) @@ -236,7 +239,9 @@ def bool_column_to_array( ) buffers = col.get_buffers() - data = buffers_to_array(buffers, col.size(), + data_type = col.dtype + data = buffers_to_array(buffers, data_type, + col.size(), col.describe_null, col.offset) data = pc.cast(data, pa.bool_()) @@ -274,11 +279,15 @@ def categorical_column_to_dictionary( raise NotImplementedError( "Non-dictionary categoricals not supported yet") + # We need to first convert the dictionary column cat_column = categorical["categories"] dictionary = column_to_array(cat_column) - + # Then we need to convert the indices + # Here we need to use the buffer data type! buffers = col.get_buffers() - indices = buffers_to_array(buffers, col.size(), + _, data_type = buffers["data"] + indices = buffers_to_array(buffers, data_type, + col.size(), col.describe_null, col.offset) @@ -326,6 +335,7 @@ def map_date_type(data_type): def buffers_to_array( buffers: ColumnBuffers, + data_type: Tuple[DtypeKind, int, str, str], length: int, describe_null: ColumnNullType, offset: int = 0, @@ -339,6 +349,9 @@ def buffers_to_array( buffer : ColumnBuffers Dictionary containing tuples of underlying buffers and their associated dtype. + data_type : Tuple[DtypeKind, int, str, str], + Dtype description of the column as a tuple ``(kind, bit-width, format string, + endianness)``. length : int The number of values in the array. describe_null: ColumnNullType @@ -360,7 +373,7 @@ def buffers_to_array( is responsible for keeping the memory owner object alive as long as the returned PyArrow array is being used. """ - data_buff, data_type = buffers["data"] + data_buff, _ = buffers["data"] try: validity_buff, validity_dtype = buffers["validity"] except TypeError: