diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 540beb37..989680c1 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -2498,7 +2498,7 @@ def map_rows( # Otherwise, return the results as a new NestedFrame return results_nf - def to_pandas(self, list_struct=False) -> pd.DataFrame: + def to_pandas(self, list_struct=False, large_list=False) -> pd.DataFrame: """Convert to an ordinal pandas DataFrame, with no NestedDtype series. NestedDtype is cast to pd.ArrowDtype @@ -2509,6 +2509,11 @@ def to_pandas(self, list_struct=False) -> pd.DataFrame: If True, cast nested columns to pandas struct-list arrow extension array columns. If False (default), cast nested columns to list-struct array columns. + large_list : bool + If False (default), use regular ``list_`` (int32 offsets). Set to + True to use ``large_list`` (int64 offsets), which is required when + the total number of nested elements across all rows exceeds + ``2**31 - 1``. Returns ------- @@ -2529,10 +2534,10 @@ def to_pandas(self, list_struct=False) -> pd.DataFrame: """ df = pd.DataFrame(self) for col in self.nested_columns: - df[col] = df[col].array.to_arrow_ext_array(list_struct=list_struct) + df[col] = df[col].array.to_arrow_ext_array(list_struct=list_struct, large_list=large_list) return df - def to_parquet(self, path, **kwargs) -> None: + def to_parquet(self, path, large_list=False, **kwargs) -> None: """Creates parquet file(s) with the data of a NestedFrame, either as a single parquet file where each nested dataset is packed into its own column or as an individual parquet file for each layer. @@ -2544,6 +2549,11 @@ def to_parquet(self, path, **kwargs) -> None: ---------- path : str The path to the parquet file + large_list : bool + If False (default), use regular ``list_`` (int32 offsets). Set to + True to use ``large_list`` (int64 offsets), which is required when + the total number of nested elements across all rows exceeds + ``2**31 - 1``. kwargs : keyword arguments, optional Keyword arguments to pass to `pyarrow.parquet.write_table @@ -2559,7 +2569,7 @@ def to_parquet(self, path, **kwargs) -> None: >>> nf = generate_data(5,5, seed=1) >>> nf.to_parquet("nestedframe.parquet") # doctest: +SKIP """ - df = self.to_pandas(list_struct=False) + df = self.to_pandas(list_struct=False, large_list=large_list) # Write through pyarrow # This is potentially not zero-copy diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index 79310e72..a943851f 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -522,6 +522,6 @@ def _cast_struct_cols_to_nested(df, reject_nesting): def _cast_list_cols_to_nested(df): """cast list columns to nested dtype""" for col, dtype in df.dtypes.items(): - if pa.types.is_list(dtype.pyarrow_dtype): + if is_pa_type_a_list(dtype.pyarrow_dtype): df[col] = pack_lists(df[[col]]) return df diff --git a/src/nested_pandas/series/_storage/list_struct_storage.py b/src/nested_pandas/series/_storage/list_struct_storage.py index 30428953..092f9d7c 100644 --- a/src/nested_pandas/series/_storage/list_struct_storage.py +++ b/src/nested_pandas/series/_storage/list_struct_storage.py @@ -5,6 +5,7 @@ import pyarrow as pa from nested_pandas.series.utils import ( + is_pa_type_a_list, normalize_list_array, transpose_struct_list_chunked, validate_list_struct_type, @@ -29,7 +30,7 @@ class ListStructStorage: def __init__( self, array: pa.ListArray | pa.FixedSizeListArray | pa.LargeListArray | pa.ChunkedArray ) -> None: - if isinstance(array, pa.ListArray): + if isinstance(array, pa.Array) and is_pa_type_a_list(array.type): array = pa.chunked_array([array]) if not isinstance(array, pa.ChunkedArray): raise ValueError("array must be of type pa.ChunkedArray") @@ -81,7 +82,7 @@ def nbytes(self) -> int: return self._data.nbytes @property - def type(self) -> pa.ListType: + def type(self) -> pa.LargeListType: """Pyarrow type of the underlying array.""" return self._data.type diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py index 0e2dd06d..ab2ca348 100644 --- a/src/nested_pandas/series/accessor.py +++ b/src/nested_pandas/series/accessor.py @@ -14,7 +14,7 @@ from nested_pandas.series.dtype import NestedDtype from nested_pandas.series.nestedseries import NestedSeries from nested_pandas.series.packer import pack_flat, pack_sorted_df_into_struct -from nested_pandas.series.utils import nested_types_mapper +from nested_pandas.series.utils import downcast_large_list_array, nested_types_mapper __all__ = ["NestSeriesAccessor"] @@ -41,13 +41,17 @@ def _check_series(series): if not isinstance(dtype, NestedDtype): raise AttributeError(f"Can only use .nest accessor with a Series of NestedDtype, got {dtype}") - def to_lists(self, columns: list[str] | str | None = None) -> pd.DataFrame: + def to_lists(self, columns: list[str] | str | None = None, large_list: bool = False) -> pd.DataFrame: """Convert nested series into dataframe of list-array columns Parameters ---------- columns : list[str] or str or None, optional Names of the column(s) to include. Default is None, which means all columns. + large_list : bool, optional + If False (default), use regular ``list_`` (int32 offsets). Set to True to + use ``large_list`` (int64 offsets), which is required when the total number + of nested elements across all rows exceeds ~2.1 billion (int32 max). Returns ------- @@ -76,7 +80,12 @@ def to_lists(self, columns: list[str] | str | None = None) -> pd.DataFrame: if len(columns) == 0: raise ValueError("Cannot convert a struct with no fields to lists") - list_df = self._series.array.pa_table.select(columns).to_pandas(types_mapper=nested_types_mapper) + list_table = self._series.array.pa_table.select(columns) + if not large_list: + list_table = pa.table( + {col: downcast_large_list_array(list_table.column(col)) for col in list_table.column_names} + ) + list_df = list_table.to_pandas(types_mapper=nested_types_mapper) list_df.index = self._series.index return list_df @@ -128,7 +137,7 @@ def to_flat(self, columns: list[str] | str | None = None) -> pd.DataFrame: for chunk in self._series.array.struct_array.iterchunks(): struct_array = cast(pa.StructArray, chunk) for column in columns: - list_array = cast(pa.ListArray, struct_array.field(column)) + list_array = cast(pa.LargeListArray, struct_array.field(column)) flat_array = list_array.flatten() flat_chunks[column].append(flat_array) @@ -676,7 +685,7 @@ def get_flat_series(self, field: str) -> pd.Series: flat_chunks = [] for nested_chunk in self._series.array.struct_array.iterchunks(): struct_array = cast(pa.StructArray, nested_chunk) - list_array = cast(pa.ListArray, struct_array.field(field)) + list_array = cast(pa.LargeListArray, struct_array.field(field)) flat_array = list_array.flatten() flat_chunks.append(flat_array) @@ -723,7 +732,7 @@ def get_list_series(self, field: str) -> pd.Series: 2 [31.34241782 3.90547832] 3 [69.23226157 16.98304196] 4 [87.63891523 87.81425034] - Name: flux, dtype: list[pyarrow] + Name: flux, dtype: large_list[pyarrow] """ list_chunked_array = self._series.array.pa_table[field] return pd.Series( @@ -757,7 +766,7 @@ def __getitem__(self, key: str | list[str]) -> NestedSeries: flat_chunks = [] for nested_chunk in self._series.array.struct_array.iterchunks(): struct_array = cast(pa.StructArray, nested_chunk) - list_array = cast(pa.ListArray, struct_array.field(key)) + list_array = cast(pa.LargeListArray, struct_array.field(key)) flat_array = list_array.flatten() flat_chunks.append(flat_array) diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py index 0802c300..f5f70ca7 100644 --- a/src/nested_pandas/series/dtype.py +++ b/src/nested_pandas/series/dtype.py @@ -14,7 +14,9 @@ from pandas.core.dtypes.base import ExtensionDtype from nested_pandas.series.utils import ( + is_pa_type_a_list, is_pa_type_is_list_struct, + normalize_struct_list_type, transpose_list_struct_type, transpose_struct_list_type, ) @@ -193,7 +195,7 @@ def __init__(self, pyarrow_dtype: pa.DataType | Mapping) -> None: # Allow from_columns-style mapping inputs if isinstance(pyarrow_dtype, Mapping): - pyarrow_dtype = pa.struct({col: pa.list_(pa_type) for col, pa_type in pyarrow_dtype.items()}) + pyarrow_dtype = pa.struct({col: pa.large_list(pa_type) for col, pa_type in pyarrow_dtype.items()}) pyarrow_dtype = cast(pa.StructType, pyarrow_dtype) self.pyarrow_dtype, self.list_struct_pa_dtype = self._validate_dtype(pyarrow_dtype) @@ -239,7 +241,7 @@ def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore nested >>> assert ( ... dtype.pyarrow_dtype - ... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())}) + ... == pa.struct({"a": pa.large_list(pa.float64()), "b": pa.large_list(pa.int64())}) ... ) """ return cls.from_columns(fields) @@ -266,15 +268,15 @@ def from_columns(cls, columns: Mapping[str, pa.DataType]) -> Self: # type: igno nested >>> assert ( ... dtype.pyarrow_dtype - ... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())}) + ... == pa.struct({"a": pa.large_list(pa.float64()), "b": pa.large_list(pa.int64())}) ... ) """ - pyarrow_dtype = pa.struct({column: pa.list_(pa_type) for column, pa_type in columns.items()}) + pyarrow_dtype = pa.struct({column: pa.large_list(pa_type) for column, pa_type in columns.items()}) pyarrow_dtype = cast(pa.StructType, pyarrow_dtype) return cls(pyarrow_dtype=pyarrow_dtype) @staticmethod - def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.ListType]: + def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.LargeListType]: """Check that the given pyarrow type is castable to the nested type. Parameters @@ -286,20 +288,27 @@ def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.ListT ------- pa.StructType Struct-list pyarrow type representing the nested type. - pa.ListType + pa.LargeListType List-struct pyarrow type representing the nested type. """ if not isinstance(pyarrow_dtype, pa.DataType): raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}") if pa.types.is_struct(pyarrow_dtype): struct_type = cast(pa.StructType, pyarrow_dtype) + # Normalize list fields to large_list for backward compatibility + # (callers may pass pa.list_ fields) + struct_type = normalize_struct_list_type(struct_type) return struct_type, transpose_struct_list_type(struct_type) - # Currently, LongList and others are not supported - if pa.types.is_list(pyarrow_dtype): - list_type = cast(pa.ListType, pyarrow_dtype) + # Support pa.large_list (and pa.list_ for backward compatibility) + if is_pa_type_a_list(pyarrow_dtype): + if not pa.types.is_large_list(pyarrow_dtype): + # Normalize regular list or fixed-size list to large_list + pyarrow_dtype = pa.large_list(pyarrow_dtype.value_type) + list_type = cast(pa.LargeListType, pyarrow_dtype) return transpose_list_struct_type(list_type), list_type raise ValueError( - f"NestedDtype can only be constructed with pa.StructType or pa.ListType only, got {pyarrow_dtype}" + "NestedDtype can only be constructed with pa.StructType, pa.LargeListType, " + f"or pa.ListType, got {pyarrow_dtype}" ) @property diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py index b816b7f4..52d948c8 100644 --- a/src/nested_pandas/series/ext_array.py +++ b/src/nested_pandas/series/ext_array.py @@ -64,6 +64,7 @@ from nested_pandas.series.nestedseries import NestedSeries # noqa from nested_pandas.series.utils import ( chunk_lengths, + downcast_large_list_array, is_pa_type_a_list, normalize_list_array, normalize_struct_list_type, @@ -185,7 +186,7 @@ def replace_with_mask(array: pa.ChunkedArray, mask: pa.BooleanArray, value: pa.A # If mask is [False, True, False, True], mask_cumsum will be [0, 1, 1, 2] # So we put value items to the right positions in broadcast_value, while duplicate some other items for # the positions where mask is False. - mask_cumsum = pa.compute.cumulative_sum(mask.cast(pa.int32())) + mask_cumsum = pa.compute.cumulative_sum(mask.cast(pa.int64())) value_index = pa.compute.subtract(mask_cumsum, 1) value_index = pa.compute.if_else(pa.compute.less(value_index, 0), 0, value_index) @@ -208,7 +209,7 @@ def convert_df_to_pa_scalar(df: pd.DataFrame, *, pa_type: pa.StructType | None) ty = scalar.type else: array = pa.array(series) - ty = pa.list_(array.type) + ty = pa.large_list(array.type) scalar = pa.scalar(array, type=ty) d[column] = scalar types[column] = ty @@ -916,7 +917,7 @@ def _pyarrow_dtype(self) -> pa.StructType: return self._dtype.pyarrow_dtype @property - def _pyarrow_list_struct_dtype(self) -> pa.ListType: + def _pyarrow_list_struct_dtype(self) -> pa.LargeListType: """PyArrow data type of the list-struct view over the ext. array""" return transpose_struct_list_type(self._pyarrow_dtype) @@ -925,7 +926,7 @@ def from_arrow_ext_array(cls, array: ArrowExtensionArray) -> Self: # type: igno """Create a NestedExtensionArray from pandas' ArrowExtensionArray""" return cls(array._pa_array) - def to_arrow_ext_array(self, list_struct: bool = False) -> ArrowExtensionArray: + def to_arrow_ext_array(self, list_struct: bool = False, large_list: bool = False) -> ArrowExtensionArray: """Convert the extension array to pandas' ArrowExtensionArray Parameters @@ -933,12 +934,19 @@ def to_arrow_ext_array(self, list_struct: bool = False) -> ArrowExtensionArray: list_struct : bool, optional If False (default), return struct-list array, otherwise return list-struct array. + large_list : bool, optional + If False (default), use regular ``list_`` (int32 offsets). Set to True to + use ``large_list`` (int64 offsets), which is required when the total number + of nested elements across all rows exceeds ~2.1 billion (int32 max). """ - if list_struct: - return ArrowExtensionArray(self.list_array) - return ArrowExtensionArray(self.struct_array) - - def to_pyarrow_scalar(self, list_struct: bool = False) -> pa.ListScalar: + arr = self.list_array if list_struct else self.struct_array + if not large_list: + arr = downcast_large_list_array(arr) + return ArrowExtensionArray(arr) + + def to_pyarrow_scalar( + self, list_struct: bool = False, large_list: bool = False + ) -> pa.LargeListScalar | pa.ListScalar: """Convert to a pyarrow scalar of a list type Parameters @@ -946,14 +954,19 @@ def to_pyarrow_scalar(self, list_struct: bool = False) -> pa.ListScalar: list_struct : bool, optional If False (default), return list-struct-list scalar, otherwise list-list-struct scalar. + large_list : bool, optional + If False (default), use regular ``list_`` (int32 offsets). Set to True to + use ``large_list`` (int64 offsets), which is required when the total number + of nested elements across all rows exceeds ~2.1 billion (int32 max). Returns ------- - pyarrow.ListScalar + pyarrow.LargeListScalar or pyarrow.ListScalar """ pa_array = self.list_array if list_struct else self.struct_array - pa_type = pa.list_(pa_array.type) - return cast(pa.ListScalar, pa.scalar(pa_array, type=pa_type)) + list_type = pa.large_list if large_list else pa.list_ + pa_type = list_type(pa_array.type) + return cast(pa.LargeListScalar | pa.ListScalar, pa.scalar(pa_array, type=pa_type)) @property def list_offsets(self) -> pa.Array: @@ -973,8 +986,8 @@ def list_offsets(self) -> pa.Array: zero_and_lengths = pa.chunked_array( [ - pa.array([0], type=pa.int32()), - pa.array(self.list_lengths, type=pa.int32()), + pa.array([0], type=pa.int64()), + pa.array(self.list_lengths, type=pa.int64()), ] ) offsets = pa.compute.cumulative_sum(zero_and_lengths) @@ -1028,7 +1041,7 @@ def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]: """ for chunk in self.struct_array.iterchunks(): struct_array: pa.StructArray = cast(pa.StructArray, chunk) - list_array: pa.ListArray = cast(pa.ListArray, struct_array.field(field)) + list_array: pa.LargeListArray = cast(pa.LargeListArray, struct_array.field(field)) for list_scalar in list_array: yield np.asarray(list_scalar.values) @@ -1104,7 +1117,7 @@ def set_flat_field(self, field: str, value: ArrayLike, *, keep_dtype: bool = Fal if isinstance(pa_array, pa.ChunkedArray): pa_array = pa_array.combine_chunks() - field_list_array = pa.ListArray.from_arrays(values=pa_array, offsets=self.list_offsets) + field_list_array = pa.LargeListArray.from_arrays(values=pa_array, offsets=self.list_offsets) return self.set_list_field(field, field_list_array, keep_dtype=keep_dtype) diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py index 74b55acd..dc5a0e12 100644 --- a/src/nested_pandas/series/packer.py +++ b/src/nested_pandas/series/packer.py @@ -320,7 +320,7 @@ def view_sorted_series_as_list_array( flat_array = pa.array(series, from_pandas=True) if isinstance(flat_array, pa.ChunkedArray): flat_array = flat_array.combine_chunks() - list_array = pa.ListArray.from_arrays( + list_array = pa.LargeListArray.from_arrays( offset, flat_array, ) @@ -356,7 +356,7 @@ def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray: offset_but_last = np.nonzero(~index.duplicated(keep="first"))[0] offset = np.append(offset_but_last, len(index)) - # Arrow uses int32 for offsets - offset = offset.astype(np.int32) + # LargeListArray uses int64 for offsets + offset = offset.astype(np.int64) return offset diff --git a/src/nested_pandas/series/utils.py b/src/nested_pandas/series/utils.py index a31b60df..7e603611 100644 --- a/src/nested_pandas/series/utils.py +++ b/src/nested_pandas/series/utils.py @@ -63,7 +63,73 @@ def is_pa_type_is_list_struct(pa_type: pa.DataType) -> bool: True if the given type is a list-type with struct values, False otherwise. """ - return pa.types.is_list(pa_type) and pa.types.is_struct(pa_type.value_type) + return (pa.types.is_large_list(pa_type) or pa.types.is_list(pa_type)) and pa.types.is_struct( + pa_type.value_type + ) + + +def zero_align_large_list_offsets(array: pa.LargeListArray) -> pa.LargeListArray: + """Realign a LargeListArray so its offsets start at zero. + + If the first offset is already zero the original array is returned unchanged. + + Parameters + ---------- + array : pa.LargeListArray + Input list array whose offsets may start at a non-zero value. + + Returns + ------- + pa.LargeListArray + List array with offsets starting at zero and values sliced accordingly. + """ + offsets = array.offsets + first = offsets[0].as_py() + if first == 0: + return array + new_offsets = pa.compute.subtract(offsets, offsets[0]) + return pa.LargeListArray.from_arrays( + values=array.values[first : offsets[-1].as_py()], + offsets=new_offsets, + ) + + +def zero_align_struct_list_offsets(array: pa.StructArray) -> pa.StructArray: + """Realign all LargeList fields in a StructArray so their offsets start at zero. + + Parameters + ---------- + array : pa.StructArray + Input struct array whose fields are LargeListArrays. + + Returns + ------- + pa.StructArray + Struct array with every field's offsets starting at zero. + + Raises + ------ + ValueError + If list lengths do not match across struct fields. + """ + value_lengths = None + list_arrays = [] + for field in array.type: + inner_array = array.field(field.name) + list_array = cast(pa.LargeListArray, inner_array) + + if value_lengths is None: + value_lengths = list_array.value_lengths() + elif not value_lengths.equals(list_array.value_lengths()): + raise ValueError( + f"List lengths do not match for struct fields {array.type.field(0).name} and {field.name}", + ) + + list_arrays.append(zero_align_large_list_offsets(list_array)) + return pa.StructArray.from_arrays( + arrays=list_arrays, + fields=struct_fields(array.type), + ) def align_struct_list_offsets(array: pa.StructArray) -> pa.StructArray: @@ -88,12 +154,12 @@ def align_struct_list_offsets(array: pa.StructArray) -> pa.StructArray: if not pa.types.is_struct(array.type): raise ValueError(f"Expected a StructArray, got {array.type}") - first_offsets: pa.ListArray | None = None + first_offsets: pa.LargeListArray | None = None for field in array.type: inner_array = array.field(field.name) - if not pa.types.is_list(inner_array.type): - raise ValueError(f"Expected a ListArray, got {inner_array.type}") - list_array = cast(pa.ListArray, inner_array) + if not pa.types.is_large_list(inner_array.type): + raise ValueError(f"Expected a LargeListArray, got {inner_array.type}") + list_array = cast(pa.LargeListArray, inner_array) if first_offsets is None: first_offsets = list_array.offsets @@ -105,31 +171,7 @@ def align_struct_list_offsets(array: pa.StructArray) -> pa.StructArray: # Return the original array if all offsets match return array - new_offsets = pa.compute.subtract(first_offsets, first_offsets[0]) - value_lengths = None - list_arrays = [] - for field in array.type: - inner_array = array.field(field.name) - list_array = cast(pa.ListArray, inner_array) - - if value_lengths is None: - value_lengths = list_array.value_lengths() - elif not value_lengths.equals(list_array.value_lengths()): - raise ValueError( - f"List lengths do not match for struct fields {array.type.field(0).name} and {field.name}", - ) - - list_arrays.append( - pa.ListArray.from_arrays( - values=list_array.values[list_array.offsets[0].as_py() : list_array.offsets[-1].as_py()], - offsets=new_offsets, - ) - ) - new_array = pa.StructArray.from_arrays( - arrays=list_arrays, - fields=struct_fields(array.type), - ) - return new_array + return zero_align_struct_list_offsets(array) def align_chunked_struct_list_offsets(array: pa.Array | pa.ChunkedArray) -> pa.ChunkedArray: @@ -159,7 +201,7 @@ def align_chunked_struct_list_offsets(array: pa.Array | pa.ChunkedArray) -> pa.C return pa.chunked_array(chunks, type=array.type) -def transpose_struct_list_type(t: pa.StructType) -> pa.ListType: +def transpose_struct_list_type(t: pa.StructType) -> pa.LargeListType: """Converts a type of struct-list array into a type of list-struct array. Parameters @@ -169,7 +211,7 @@ def transpose_struct_list_type(t: pa.StructType) -> pa.ListType: Returns ------- - pa.DataType + pa.LargeListType Type of list-struct array. Raises @@ -182,16 +224,16 @@ def transpose_struct_list_type(t: pa.StructType) -> pa.ListType: fields = [] for field in t: - if not pa.types.is_list(field.type): - raise ValueError(f"Expected a ListType, got {field.type}") - list_type = cast(pa.ListType, field.type) + if not pa.types.is_large_list(field.type): + raise ValueError(f"Expected a LargeListType, got {field.type}") + list_type = cast(pa.LargeListType, field.type) fields.append(pa.field(field.name, list_type.value_type)) - list_type = cast(pa.ListType, pa.list_(pa.struct(fields))) + list_type = cast(pa.LargeListType, pa.large_list(pa.struct(fields))) return list_type -def transpose_struct_list_array(array: pa.StructArray, validate: bool = True) -> pa.ListArray: +def transpose_struct_list_array(array: pa.StructArray, validate: bool = True) -> pa.LargeListArray: """Converts a struct-array of lists into a list-array of structs. Parameters @@ -204,7 +246,7 @@ def transpose_struct_list_array(array: pa.StructArray, validate: bool = True) -> Returns ------- - pa.ListArray + pa.LargeListArray List array of structs. """ if validate: @@ -229,7 +271,7 @@ def transpose_struct_list_array(array: pa.StructArray, validate: bool = True) -> [field.values[field.offsets[0].as_py() : field.offsets[-1].as_py()] for field in array.flatten()], names=struct_field_names(array.type), ) - return pa.ListArray.from_arrays( + return pa.LargeListArray.from_arrays( offsets=offsets, values=struct_flat_array, mask=mask, @@ -259,12 +301,12 @@ def transpose_struct_list_chunked(chunked_array: pa.ChunkedArray, validate: bool ) -def transpose_list_struct_scalar(scalar: pa.ListScalar) -> pa.StructScalar: +def transpose_list_struct_scalar(scalar: pa.LargeListScalar | pa.ListScalar) -> pa.StructScalar: """Converts a list-scalar of structs into a struct-scalar of lists. Parameters ---------- - scalar : pa.ListScalar + scalar : pa.LargeListScalar or pa.ListScalar Input list-struct scalar. Returns @@ -280,10 +322,10 @@ def transpose_list_struct_scalar(scalar: pa.ListScalar) -> pa.StructScalar: return cast(pa.StructScalar, struct_scalar) -def validate_list_struct_type(t: pa.ListType) -> None: - """Raise a ValueError if not a list-struct type.""" - if not pa.types.is_list(t): - raise ValueError(f"Expected a ListType, got {t}") +def validate_list_struct_type(t: pa.LargeListType) -> None: + """Raise a ValueError if not a large-list-struct type.""" + if not pa.types.is_large_list(t): + raise ValueError(f"Expected a LargeListType, got {t}") if not pa.types.is_struct(t.value_type): raise ValueError(f"Expected a StructType as a list value type, got {t.value_type}") @@ -295,8 +337,8 @@ def validate_struct_list_type(t: pa.StructType) -> None: raise ValueError(f"Expected a StructType, got {t}") for field in struct_fields(t): - if not pa.types.is_list(field.type): - raise ValueError(f"Expected a ListType for field {field.name}, got {field.type}") + if not pa.types.is_large_list(field.type): + raise ValueError(f"Expected a LargeListType for field {field.name}, got {field.type}") def transpose_list_struct_type(t: pa.ListType) -> pa.StructType: @@ -322,18 +364,18 @@ def transpose_list_struct_type(t: pa.ListType) -> pa.StructType: struct_type = cast(pa.StructType, t.value_type) fields = [] for field in struct_type: - fields.append(pa.field(field.name, pa.list_(field.type))) + fields.append(pa.field(field.name, pa.large_list(field.type))) struct_type = cast(pa.StructType, pa.struct(fields)) return struct_type -def transpose_list_struct_array(array: pa.ListArray) -> pa.StructArray: +def transpose_list_struct_array(array: pa.LargeListArray) -> pa.StructArray: """Converts a list-array of structs into a struct-array of lists. Parameters ---------- - array : pa.ListArray + array : pa.LargeListArray Input list array of structs. Returns @@ -348,7 +390,7 @@ def transpose_list_struct_array(array: pa.ListArray) -> pa.StructArray: fields = [] for field_values in values.flatten(): - list_array = pa.ListArray.from_arrays(offsets, field_values) + list_array = pa.LargeListArray.from_arrays(offsets, field_values) fields.append(list_array) return pa.StructArray.from_arrays( @@ -380,7 +422,7 @@ def nested_types_mapper(type: pa.DataType) -> pd.ArrowDtype | NestedDtype: """Type mapper for pyarrow .to_pandas(types_mapper) methods.""" from nested_pandas.series.dtype import NestedDtype - if pa.types.is_list(type): + if pa.types.is_large_list(type): try: return NestedDtype(type) except (ValueError, TypeError): @@ -446,8 +488,8 @@ def rechunk(array: pa.Array | pa.ChunkedArray, chunk_lens: ArrayLike) -> pa.Chun def normalize_list_array( array: pa.ListArray | pa.FixedSizeListArray | pa.LargeListArray | pa.ChunkedArray, -) -> pa.ListArray | pa.ChunkedArray: - """Convert fixed-size and large list arrays to standard ``pa.ListArray``-based arrays. +) -> pa.LargeListArray | pa.ChunkedArray: + """Convert fixed-size and regular list arrays to standard ``pa.LargeListArray``-based arrays. Parameters ---------- @@ -457,17 +499,17 @@ def normalize_list_array( Returns ------- - pa.ListArray or pa.ChunkedArray + pa.LargeListArray or pa.ChunkedArray A list array (or chunked list array) where the list type is normalized to - ``pa.ListType`` while preserving the original value type. + ``pa.LargeListType`` while preserving the original value type. Raises ------ ValueError If the input is not a list-type array (i.e. does not have a ``.type.value_type``). """ - # Pass list-array as is - if pa.types.is_list(array.type): + # Pass large-list-array as is + if pa.types.is_large_list(array.type): return array try: @@ -477,11 +519,11 @@ def normalize_list_array( "Input array must be a list-type array: pa.ListArray, pa.LargeListArray or pa.FixedSizeListArray" ) from None - return array.cast(pa.list_(value_type)) + return array.cast(pa.large_list(value_type)) def normalize_struct_list_type(struct_type: pa.StructType) -> pa.StructType: - """Convert all struct-list fields to "normal" list types. + """Convert all struct-list fields to ``pa.large_list`` types. Parameters ---------- @@ -492,7 +534,7 @@ def normalize_struct_list_type(struct_type: pa.StructType) -> pa.StructType: Returns ------- pa.StructType - Output struct type with all fields as pa.ListType. + Output struct type with all fields as pa.LargeListType. Raises ------ @@ -507,10 +549,77 @@ def normalize_struct_list_type(struct_type: pa.StructType) -> pa.StructType: value_type = field.type.value_type except AttributeError: raise ValueError("Input struct_type must be a struct-list type") from None - fields.append(pa.field(field.name, pa.list_(value_type))) + fields.append(pa.field(field.name, pa.large_list(value_type))) return pa.struct(fields) +def downcast_large_list_type(t: pa.LargeListType | pa.StructType) -> pa.ListType | pa.StructType: + """Convert a ``large_list`` type or struct-of-``large_list`` type to regular ``list_`` type(s). + + Parameters + ---------- + t : pa.LargeListType or pa.StructType + Input type. If a ``large_list`` type, it is converted to ``list_``. + If a struct type, all ``large_list`` fields are converted to ``list_``. + + Returns + ------- + pa.ListType or pa.StructType + Downcast type using int32 offsets. + """ + if pa.types.is_large_list(t): + return pa.list_(cast(pa.LargeListType, t).value_type) + if pa.types.is_struct(t): + struct_t = cast(pa.StructType, t) + fields = [] + for field in struct_fields(struct_t): + if pa.types.is_large_list(field.type): + fields.append(pa.field(field.name, pa.list_(field.type.value_type))) + else: + fields.append(field) + return pa.struct(fields) + return cast(pa.ListType | pa.StructType, t) + + +def downcast_large_list_array( + array: pa.Array | pa.ChunkedArray, +) -> pa.Array | pa.ChunkedArray: + """Cast ``large_list`` fields to regular ``list_`` in an array or chunked array. + + Handles both ``LargeListArray`` (direct downcast) and ``StructArray`` whose + fields are ``large_list`` (each field is downcast to ``list_``). + + This is a compatibility helper for consumers that do not support ``large_list`` + (e.g. Parquet files written without Arrow schema metadata, older PyArrow + consumers, etc.). + + Parameters + ---------- + array : pa.LargeListArray, pa.StructArray, or pa.ChunkedArray thereof + Input array with ``large_list`` type or struct-of-``large_list`` type. + + Returns + ------- + pa.Array or pa.ChunkedArray + The downcast array using int32 offsets. + + Raises + ------ + ValueError + If the array contains more than ``2**31 - 1`` total values and cannot + be represented with int32 offsets. Pass ``large_list=True`` to the + calling function to keep int64 offsets. + """ + try: + return array.cast(downcast_large_list_type(array.type)) + except pa.ArrowInvalid as e: + raise ValueError( + "Cannot downcast large_list to list_: the array contains more than " + f"{2**31 - 1} values, which exceeds the int32 offset range. " + "Pass large_list=True to keep int64 offsets." + ) from e + + def normalize_struct_list_array(array: pa.StructArray | pa.ChunkedArray) -> pa.StructArray | pa.ChunkedArray: """Convert all struct-list fields to "normal" list arrays. diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index a12c3e90..0f65d41f 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -2181,9 +2181,10 @@ def test_explode_non_unique_index(): lambda x: {"unaligned_nested.unaligned_t": x[:2]}, columns="nested.t", row_container="args" ).reset_index(drop=True) # Add a list column which has the same lengths - nf["aligned_list_t"] = nf["nested"].nest.to_lists("t")["t"] + # large_list=False: pandas < 3 does not support DataFrame.explode on large_list columns + nf["aligned_list_t"] = nf["nested"].nest.to_lists("t", large_list=False)["t"] # Add a list column which has different lengths - nf["unaligned_list_t"] = nf["nested"].nest.to_lists("t")["t"].list[:2] + nf["unaligned_list_t"] = nf["nested"].nest.to_lists("t", large_list=False)["t"].list[:2] # Make index non-unique nf.index = np.tile(np.arange(10), 10) nf.index.name = "my_index" diff --git a/tests/nested_pandas/series/test_accessor.py b/tests/nested_pandas/series/test_accessor.py index 8c0a9365..db42c271 100644 --- a/tests/nested_pandas/series/test_accessor.py +++ b/tests/nested_pandas/series/test_accessor.py @@ -134,6 +134,31 @@ def test_to_lists_with_columns(): assert_frame_equal(lists, desired) +def test_to_lists_large_list_false(): + """Test that to_lists(large_list=False) returns list (int32) dtype columns.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), -np.array([1.0, 2.0, 1.0])]), + pa.array([np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]), + ], + names=["a", "b"], + ) + series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1]) + + lists = series.nest.to_lists(large_list=False) + + assert lists["a"].dtype == pd.ArrowDtype(pa.list_(pa.float64())) + assert lists["b"].dtype == pd.ArrowDtype(pa.list_(pa.float64())) + # Values should be unchanged + np.testing.assert_array_equal(lists["a"].iloc[0], [1.0, 2.0, 3.0]) + np.testing.assert_array_equal(lists["b"].iloc[1], [-3.0, -4.0, -5.0]) + + # large_list=True returns large_list (int64 offsets) + lists_large = series.nest.to_lists(large_list=True) + assert lists_large["a"].dtype == pd.ArrowDtype(pa.large_list(pa.float64())) + assert lists_large["b"].dtype == pd.ArrowDtype(pa.large_list(pa.float64())) + + def test_to_lists_fails_for_empty_input(): """Test that the .nest.to_lists([]) fails when no columns are provided.""" struct_array = pa.StructArray.from_arrays( diff --git a/tests/nested_pandas/series/test_dtype.py b/tests/nested_pandas/series/test_dtype.py index bd969c67..5e62c0d3 100644 --- a/tests/nested_pandas/series/test_dtype.py +++ b/tests/nested_pandas/series/test_dtype.py @@ -24,7 +24,15 @@ def test_from_pyarrow_dtype_struct_list(pyarrow_dtype): """Test that we can construct NestedDtype from pyarrow struct type.""" dtype = NestedDtype(pyarrow_dtype) - assert dtype.pyarrow_dtype == pyarrow_dtype + # Inputs with pa.list_ are normalized to pa.large_list internally + expected_fields = [] + for i in range(pyarrow_dtype.num_fields): + field = pyarrow_dtype.field(i) + if pa.types.is_list(field.type): + expected_fields.append(pa.field(field.name, pa.large_list(field.type.value_type))) + else: + expected_fields.append(field) + assert dtype.pyarrow_dtype == pa.struct(expected_fields) @pytest.mark.parametrize( @@ -45,7 +53,8 @@ def test_from_pyarrow_dtype_struct_list(pyarrow_dtype): def test_from_pyarrow_dtype_list_struct(pyarrow_dtype): """Test that we can construct NestedDtype from pyarrow list type.""" dtype = NestedDtype(pyarrow_dtype) - assert dtype.list_struct_pa_dtype == pyarrow_dtype + # pa.list_ inputs are normalized to pa.large_list + assert dtype.list_struct_pa_dtype == pa.large_list(pyarrow_dtype.value_type) @pytest.mark.parametrize( @@ -68,7 +77,7 @@ def test_to_pandas_arrow_dtype(): """Test that NestedDtype.to_pandas_arrow_dtype() returns the correct pyarrow struct type.""" dtype = NestedDtype.from_columns({"a": pa.int64(), "b": pa.float64()}) assert dtype.to_pandas_arrow_dtype() == pd.ArrowDtype( - pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) + pa.struct([pa.field("a", pa.large_list(pa.int64())), pa.field("b", pa.large_list(pa.float64()))]) ) @@ -77,26 +86,26 @@ def test_from_pandas_arrow_dtype(): dtype_from_struct = NestedDtype.from_pandas_arrow_dtype( pd.ArrowDtype(pa.struct([pa.field("a", pa.list_(pa.int64()))])) ) - assert dtype_from_struct.pyarrow_dtype == pa.struct([pa.field("a", pa.list_(pa.int64()))]) + assert dtype_from_struct.pyarrow_dtype == pa.struct([pa.field("a", pa.large_list(pa.int64()))]) dtype_from_list = NestedDtype.from_pandas_arrow_dtype( pd.ArrowDtype(pa.list_(pa.struct([pa.field("a", pa.int64())]))) ) - assert dtype_from_list.pyarrow_dtype == pa.struct([pa.field("a", pa.list_(pa.int64()))]) + assert dtype_from_list.pyarrow_dtype == pa.struct([pa.field("a", pa.large_list(pa.int64()))]) def test_init_from_pandas_arrow_dtype(): """Test that we can construct NestedDtype from pandas.ArrowDtype in __init__.""" dtype_from_struct = NestedDtype(pd.ArrowDtype(pa.struct([pa.field("a", pa.list_(pa.int64()))]))) - assert dtype_from_struct.pyarrow_dtype == pa.struct([pa.field("a", pa.list_(pa.int64()))]) + assert dtype_from_struct.pyarrow_dtype == pa.struct([pa.field("a", pa.large_list(pa.int64()))]) dtype_from_list = NestedDtype(pd.ArrowDtype(pa.list_(pa.struct([pa.field("a", pa.int64())])))) - assert dtype_from_list.pyarrow_dtype == pa.struct([pa.field("a", pa.list_(pa.int64()))]) + assert dtype_from_list.pyarrow_dtype == pa.struct([pa.field("a", pa.large_list(pa.int64()))]) def test_to_pandas_list_struct_arrow_dtype(): """Test that NestedDtype.to_pandas_arrow_dtype(list_struct=True) returns the correct pyarrow type.""" dtype = NestedDtype.from_columns({"a": pa.list_(pa.int64()), "b": pa.float64()}) assert dtype.to_pandas_arrow_dtype(list_struct=True) == pd.ArrowDtype( - pa.list_(pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.float64())])) + pa.large_list(pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.float64())])) ) @@ -105,7 +114,7 @@ def test_from_columns(): columns = {"a": pa.int64(), "b": pa.float64()} dtype = NestedDtype.from_columns(columns) assert dtype.pyarrow_dtype == pa.struct( - [pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))] + [pa.field("a", pa.large_list(pa.int64())), pa.field("b", pa.large_list(pa.float64()))] ) @@ -114,7 +123,7 @@ def test_init_from_columns(): columns = {"a": pa.int64(), "b": pa.float64()} dtype = NestedDtype(columns) assert dtype.pyarrow_dtype == pa.struct( - [pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))] + [pa.field("a", pa.large_list(pa.int64())), pa.field("b", pa.large_list(pa.float64()))] ) diff --git a/tests/nested_pandas/series/test_ext_array.py b/tests/nested_pandas/series/test_ext_array.py index 7e5fcf62..6b0b34c5 100644 --- a/tests/nested_pandas/series/test_ext_array.py +++ b/tests/nested_pandas/series/test_ext_array.py @@ -200,7 +200,9 @@ def test_from_sequence_with_arrow_array_and_dtype(): b = [-4.0, None, -6.0] pa_type = pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) - new_pa_type = pa.struct([pa.field("a", pa.list_(pa.float64())), pa.field("b", pa.list_(pa.float64()))]) + new_pa_type = pa.struct( + [pa.field("a", pa.large_list(pa.float64())), pa.field("b", pa.large_list(pa.float64()))] + ) pa_array = pa.array( [{"a": a, "b": b}, {"a": a, "b": b}, None], @@ -285,7 +287,9 @@ def test_convert_df_to_pa_scalar(): assert pa_scalar == pa.scalar( {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}, - type=pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]), + type=pa.struct( + [pa.field("a", pa.large_list(pa.int64())), pa.field("b", pa.large_list(pa.float64()))] + ), ) @@ -296,7 +300,9 @@ def test_convert_df_to_pa_from_scalar(): assert pa_scalar == pa.scalar( {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}, - type=pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]), + type=pa.struct( + [pa.field("a", pa.large_list(pa.int64())), pa.field("b", pa.large_list(pa.float64()))] + ), ) @@ -307,7 +313,9 @@ def test_convert_df_to_pa_scalar_from_pyarrow_dtyped_df(): assert pa_scalar == pa.scalar( {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}, - type=pa.struct([pa.field("a", pa.list_(pa.int32())), pa.field("b", pa.list_(pa.float64()))]), + type=pa.struct( + [pa.field("a", pa.large_list(pa.int32())), pa.field("b", pa.large_list(pa.float64()))] + ), ) @@ -321,7 +329,9 @@ def test__box_pa_array_from_series_of_df(): ) list_of_dicts = list(NestedExtensionArray._box_pa_array(series, pa_type=None)) - desired_type = pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) + desired_type = pa.struct( + [pa.field("a", pa.large_list(pa.int64())), pa.field("b", pa.large_list(pa.float64()))] + ) assert list_of_dicts == [ pa.scalar({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}, type=desired_type), @@ -337,7 +347,9 @@ def test__box_pa_array_from_list_of_df(): ] list_of_dicts = list(NestedExtensionArray._box_pa_array(list_of_dfs, pa_type=None)) - desired_type = pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) + desired_type = pa.struct( + [pa.field("a", pa.large_list(pa.int64())), pa.field("b", pa.large_list(pa.float64()))] + ) assert list_of_dicts == [ pa.scalar({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}, type=desired_type), @@ -811,7 +823,8 @@ def test_chunked_array(): ext_array = NestedExtensionArray(struct_array) # pyarrow returns a single bool for == - assert ext_array.struct_array == pa.chunked_array(struct_array) + # ext_array normalizes list fields to large_list internally + assert ext_array.struct_array == pa.chunked_array(struct_array).cast(ext_array.struct_array.type) def test_chunked_list_struct_array(): @@ -829,7 +842,8 @@ def test_chunked_list_struct_array(): [ [{"a": 1, "b": -4.0}, {"a": 2, "b": -5.0}, {"a": 3, "b": -6.0}], [{"a": 1, "b": -3.0}, {"a": 2, "b": -4.0}, {"a": 1, "b": -5.0}], - ] + ], + type=pa.large_list(pa.struct([pa.field("a", pa.int64()), pa.field("b", pa.float64())])), ) desired = pa.chunked_array([list_array]) # pyarrow returns a single bool for == @@ -854,7 +868,7 @@ def test_to_pyarrow_scalar(): {"a": [1, 2, 1], "b": [-3.0, -4.0, -5.0]}, ], type=pa.list_( - pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) + pa.struct([pa.field("a", pa.large_list(pa.int64())), pa.field("b", pa.large_list(pa.float64()))]) ), ) desired_list_struct = pa.scalar( @@ -862,25 +876,46 @@ def test_to_pyarrow_scalar(): [{"a": 1, "b": -4.0}, {"a": 2, "b": -5.0}, {"a": 3, "b": -6.0}], [{"a": 1, "b": -3.0}, {"a": 2, "b": -4.0}, {"a": 1, "b": -5.0}], ], - type=pa.list_(pa.list_(pa.struct([pa.field("a", pa.int64()), pa.field("b", pa.float64())]))), + type=pa.list_(pa.large_list(pa.struct([pa.field("a", pa.int64()), pa.field("b", pa.float64())]))), ) # pyarrow returns a single bool for == assert ext_array.to_pyarrow_scalar(list_struct=False) == desired_struct_list assert ext_array.to_pyarrow_scalar(list_struct=True) == desired_list_struct +def test_to_pyarrow_scalar_large_list_false(): + """Test that to_pyarrow_scalar(large_list=False) returns a list (int32) scalar.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1, 2, 3]), np.array([1, 2, 1])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]), + ], + names=["a", "b"], + ) + ext_array = NestedExtensionArray(struct_array) + + scalar = ext_array.to_pyarrow_scalar(large_list=False) + assert pa.types.is_list(scalar.type) + + # large_list=True keeps large_list + scalar_large = ext_array.to_pyarrow_scalar(large_list=True) + assert pa.types.is_large_list(scalar_large.type) + + def test_list_offsets_single_chunk(): """Test that the .list_offset property is correct for a single chunk.""" struct_array = pa.StructArray.from_arrays( arrays=[ - pa.array([np.array([1, 2, 3]), np.array([1, 2, 1])], type=pa.list_(pa.uint8())), - pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]), + pa.array([np.array([1, 2, 3]), np.array([1, 2, 1])], type=pa.large_list(pa.uint8())), + pa.array( + [-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])], type=pa.large_list(pa.float64()) + ), ], names=["a", "b"], ) ext_array = NestedExtensionArray(struct_array) - desired = pa.array([0, 3, 6], type=pa.int32()) + desired = pa.array([0, 3, 6], type=pa.int64()) # pyarrow returns a single bool for == assert ext_array.list_offsets == desired @@ -889,8 +924,10 @@ def test_list_offsets_multiple_chunks(): """Test that the .list_offset property is correct for multiple chunk_lens.""" struct_array = pa.StructArray.from_arrays( arrays=[ - pa.array([np.array([1, 2, 3]), np.array([1, 2, 1])], type=pa.list_(pa.uint8())), - pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]), + pa.array([np.array([1, 2, 3]), np.array([1, 2, 1])], type=pa.large_list(pa.uint8())), + pa.array( + [-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])], type=pa.large_list(pa.float64()) + ), ], names=["a", "b"], ) @@ -1272,7 +1309,8 @@ def test_dropna(): def test___arrow_array__(): """Test that the extension array can be converted to a pyarrow array.""" list_array = pa.array( - [[{"a": 1, "b": 2}, {"a": 3, "b": 4}], [{"a": 5, "b": 6}], [{"a": -1, "b": -2}], []] + [[{"a": 1, "b": 2}, {"a": 3, "b": 4}], [{"a": 5, "b": 6}], [{"a": -1, "b": -2}], []], + type=pa.large_list(pa.struct([pa.field("a", pa.int64()), pa.field("b", pa.int64())])), ) ext_array = NestedExtensionArray(list_array) @@ -1355,7 +1393,9 @@ def test___array__(): None, pa.scalar( {"a": [-4.0, 5.0, None, 7.0], "b": ["hello", "world", "!", ""]}, - type=pa.struct([pa.field("a", pa.list_(pa.float64())), pa.field("b", pa.list_(pa.string()))]), + type=pa.struct( + [pa.field("a", pa.large_list(pa.float64())), pa.field("b", pa.large_list(pa.string()))] + ), ), ), ( @@ -1407,7 +1447,9 @@ def test__box_pa_scalar(value, pa_type, desired): None, pa.array( [None, {"a": [-4.0, 5.0, None, 7.0], "b": ["hello", "world", "!", None]}], - type=pa.struct([pa.field("a", pa.list_(pa.float64())), pa.field("b", pa.list_(pa.string()))]), + type=pa.struct( + [pa.field("a", pa.large_list(pa.float64())), pa.field("b", pa.large_list(pa.string()))] + ), ), ), ( @@ -2018,7 +2060,7 @@ def test_pop_fields_zero_chunks(): ) assert ext_array.num_chunks == 0, "Test setup is invalid" ext_array.pop_fields(["a"]) - assert ext_array._pyarrow_dtype == pa.struct({"b": pa.list_(pa.int64())}) + assert ext_array._pyarrow_dtype == pa.struct({"b": pa.large_list(pa.int64())}) def test_delete_last_field_raises(): @@ -2089,6 +2131,26 @@ def test_to_arrow_ext_array_with_list_struct_true(): ) +def test_to_arrow_ext_array_large_list_false(): + """Tests that to_arrow_ext_array(large_list=False) returns list (int32) dtype.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1, 2, 3]), np.array([1, 2, 1, 2])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + ], + names=["a", "b"], + ) + ext_array = NestedExtensionArray(struct_array) + + # Default large_list=False: outer type is struct, inner fields should be list_ (int32) + arrow_ext_array = ext_array.to_arrow_ext_array(large_list=False) + assert pa.types.is_list(arrow_ext_array.dtype.pyarrow_dtype.field("a").type) + + # large_list=True keeps large_list in inner fields + arrow_ext_array_large = ext_array.to_arrow_ext_array(large_list=True) + assert pa.types.is_large_list(arrow_ext_array_large.dtype.pyarrow_dtype.field("a").type) + + def test_series_interpolate(): """We do not support interpolate() on NestedExtensionArray.""" with pytest.raises(NotImplementedError): @@ -2105,15 +2167,21 @@ def test___init___with_list_struct_array(): ext_array = NestedExtensionArray(list_array) assert ext_array.field_names == ["a", "b"] assert ext_array.flat_length == 4 - assert pa.array(ext_array) == list_array + # list_array uses pa.list_, ext_array normalizes to pa.large_list internally + assert pa.array(ext_array).cast(list_array.type) == list_array def test__struct_array(): """Test ._struct_array property""" struct_array = pa.StructArray.from_arrays( arrays=[ - pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])]), - pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + pa.array( + [np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])], type=pa.large_list(pa.float64()) + ), + pa.array( + [-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])], + type=pa.large_list(pa.float64()), + ), ], names=["a", "b"], ) @@ -2126,8 +2194,13 @@ def test__pa_table(): """Test ._pa_table property""" struct_array = pa.StructArray.from_arrays( arrays=[ - pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])]), - pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + pa.array( + [np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])], type=pa.large_list(pa.float64()) + ), + pa.array( + [-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])], + type=pa.large_list(pa.float64()), + ), ], names=["a", "b"], ) diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py index 37cb9472..59add7f7 100644 --- a/tests/nested_pandas/series/test_packer.py +++ b/tests/nested_pandas/series/test_packer.py @@ -280,13 +280,13 @@ def test_pack_lists(): ], }, index=[1, 2, 3, 4], - dtype=pd.ArrowDtype(pa.list_(pa.int64())), + dtype=pd.ArrowDtype(pa.large_list(pa.int64())), ) series = packer.pack_lists(packed_df) offsets_reused(series) for field_name in packed_df.columns: - assert_series_equal(series.nest.to_lists()[field_name], packed_df[field_name]) + assert_series_equal(series.nest.to_lists(large_list=True)[field_name], packed_df[field_name]) def test_pack_lists_with_chunked_arrays(): @@ -303,8 +303,8 @@ def test_pack_lists_with_chunked_arrays(): ) list_df = pd.DataFrame({"a": chunked_a, "b": chunked_b}, index=[0, 1, 2, 3, 4, 5]) series = packer.pack_lists(list_df) - assert_series_equal(series.nest.to_lists()["a"], chunked_a) - assert_series_equal(series.nest.to_lists()["b"], chunked_b) + assert_series_equal(series.nest.to_lists()["a"], chunked_a, check_dtype=False) + assert_series_equal(series.nest.to_lists()["b"], chunked_b, check_dtype=False) def test_pack_lists_with_uneven_chunked_arrays(): @@ -321,8 +321,8 @@ def test_pack_lists_with_uneven_chunked_arrays(): ) list_df = pd.DataFrame({"a": chunked_a, "b": chunked_b}, index=[0, 1, 2, 3, 4, 5]) series = packer.pack_lists(list_df) - assert_series_equal(series.nest.to_lists()["a"], chunked_a) - assert_series_equal(series.nest.to_lists()["b"], chunked_b) + assert_series_equal(series.nest.to_lists()["a"], chunked_a, check_dtype=False) + assert_series_equal(series.nest.to_lists()["b"], chunked_b, check_dtype=False) def test_pack_seq_with_dfs_and_index(): @@ -484,7 +484,7 @@ def test_view_sorted_df_as_list_arrays(): ], }, index=[1, 2, 3, 4], - dtype=pd.ArrowDtype(pa.list_(pa.int64())), + dtype=pd.ArrowDtype(pa.large_list(pa.int64())), ) assert_frame_equal(nested_df, desired_nested) @@ -522,7 +522,7 @@ def test_view_sorted_series_as_list_array(): np.array([7, 8, 9]), ], index=[1, 2, 3, 4], - dtype=pd.ArrowDtype(pa.list_(pa.int64())), + dtype=pd.ArrowDtype(pa.large_list(pa.int64())), name="my_series", ) assert_series_equal(nested, desired_nested) @@ -554,7 +554,7 @@ def test_view_sorted_series_as_list_array_chunked_input(): desired = NestedSeries( pa.array([[0, 1], [2, None], [4]]), index=unique_index, - dtype=pd.ArrowDtype(pa.list_(pa.int64())), + dtype=pd.ArrowDtype(pa.large_list(pa.int64())), name="a", ) @@ -589,7 +589,7 @@ def test_view_sorted_series_as_list_array_chunked_input(): def test_calculate_sorted_index_offsets(index, offsets): """Test calculate_sorted_index_offsets().""" actual = packer.calculate_sorted_index_offsets(index) - assert actual.dtype == np.int32 + assert actual.dtype == np.int64 assert_array_equal(actual, offsets) diff --git a/tests/nested_pandas/series/test_series_utils.py b/tests/nested_pandas/series/test_series_utils.py index 32da7e51..9f5366b0 100644 --- a/tests/nested_pandas/series/test_series_utils.py +++ b/tests/nested_pandas/series/test_series_utils.py @@ -6,6 +6,7 @@ from nested_pandas.series.utils import ( align_chunked_struct_list_offsets, align_struct_list_offsets, + downcast_large_list_array, nested_types_mapper, normalize_list_array, normalize_struct_list_array, @@ -25,9 +26,9 @@ def test_align_struct_list_offsets(): with pytest.raises(ValueError): align_struct_list_offsets(pa.array([], type=pa.int64())) with pytest.raises(ValueError): - align_struct_list_offsets(pa.array([], type=pa.list_(pa.int64()))) + align_struct_list_offsets(pa.array([], type=pa.large_list(pa.int64()))) - # Raises if one of the fields is not a ListArray + # Raises if one of the fields is not a LargeListArray with pytest.raises(ValueError): align_struct_list_offsets( pa.StructArray.from_arrays([pa.array([[1, 2], [3, 4, 5]]), pa.array([1, 2])], ["a", "b"]) @@ -37,22 +38,26 @@ def test_align_struct_list_offsets(): with pytest.raises(ValueError): align_struct_list_offsets( pa.StructArray.from_arrays( - [pa.array([[1, 2], [3, 4, 5]]), pa.array([[1, 2, 3], [4, 5]])], ["a", "b"] + [ + pa.array([[1, 2], [3, 4, 5]], type=pa.large_list(pa.int64())), + pa.array([[1, 2, 3], [4, 5]], type=pa.large_list(pa.int64())), + ], + ["a", "b"], ) ) input_array = pa.StructArray.from_arrays( arrays=[ - pa.array([[1, 2], [3, 4], [], [5, 6, 7]]), - pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]]), + pa.array([[1, 2], [3, 4], [], [5, 6, 7]], type=pa.large_list(pa.int64())), + pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]], type=pa.large_list(pa.string())), ], names=["a", "b"], ) assert align_struct_list_offsets(input_array) is input_array - a = pa.array([[0, 0, 0], [1, 2], [3, 4], [], [5, 6, 7]])[1:] + a = pa.array([[0, 0, 0], [1, 2], [3, 4], [], [5, 6, 7]], type=pa.large_list(pa.int64()))[1:] assert a.offsets[0].as_py() == 3 - b = pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]]) + b = pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]], type=pa.large_list(pa.string())) assert b.offsets[0].as_py() == 0 input_array = pa.StructArray.from_arrays( arrays=[a, b], @@ -66,8 +71,8 @@ def test_align_struct_list_offsets(): def test_align_chunked_struct_list_offsets(): """Test align_chunked_struct_list_offsets function.""" # Input is an array, output is chunked array - a = pa.array([[1, 2], [3, 4], [], [5, 6, 7]]) - b = pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]]) + a = pa.array([[1, 2], [3, 4], [], [5, 6, 7]], type=pa.large_list(pa.int64())) + b = pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]], type=pa.large_list(pa.string())) input_array = pa.StructArray.from_arrays( arrays=[a, b], names=["a", "b"], @@ -90,8 +95,10 @@ def test_align_chunked_struct_list_offsets(): assert output_array.equals(input_array) # Input is an "aligned" chunked array, but offsets do not start with zero - a = pa.array([[0, 0, 0], [1, 2], [3, 4], [], [5, 6, 7]])[1:] - b = pa.array([["a", "a", "a", "a"], ["x", "y"], ["y", "x"], [], ["d", "e", "f"]])[1:] + a = pa.array([[0, 0, 0], [1, 2], [3, 4], [], [5, 6, 7]], type=pa.large_list(pa.int64()))[1:] + b = pa.array( + [["a", "a", "a", "a"], ["x", "y"], ["y", "x"], [], ["d", "e", "f"]], type=pa.large_list(pa.string()) + )[1:] input_array = pa.chunked_array( [ pa.StructArray.from_arrays( @@ -105,8 +112,8 @@ def test_align_chunked_struct_list_offsets(): assert output_array.equals(input_array) # Input is a "non-aligned" chunked array - a = pa.array([[0, 0, 0], [1, 2], [3, 4], [], [5, 6, 7]])[1:] - b = pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]]) + a = pa.array([[0, 0, 0], [1, 2], [3, 4], [], [5, 6, 7]], type=pa.large_list(pa.int64()))[1:] + b = pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]], type=pa.large_list(pa.string())) input_array = pa.chunked_array( [ pa.StructArray.from_arrays( @@ -126,16 +133,18 @@ def test_validate_struct_list_type(): validate_struct_list_type(pa.float64()) with pytest.raises(ValueError): - validate_struct_list_type(pa.list_(pa.struct({"a": pa.int64()}))) + validate_struct_list_type(pa.large_list(pa.struct({"a": pa.int64()}))) with pytest.raises(ValueError): validate_struct_list_type(pa.struct({"a": pa.float64()})) with pytest.raises(ValueError): - validate_struct_list_type(pa.struct({"a": pa.list_(pa.float64()), "b": pa.float64()})) + validate_struct_list_type(pa.struct({"a": pa.large_list(pa.float64()), "b": pa.float64()})) assert ( - validate_struct_list_type(pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.float64())})) + validate_struct_list_type( + pa.struct({"a": pa.large_list(pa.float64()), "b": pa.large_list(pa.float64())}) + ) is None ) @@ -146,14 +155,14 @@ def test_transpose_struct_list_type(): with pytest.raises(ValueError): transpose_struct_list_type(pa.int64()) with pytest.raises(ValueError): - transpose_struct_list_type(pa.list_(pa.int64())) + transpose_struct_list_type(pa.large_list(pa.int64())) - # Raises if one of the fields is not a ListType + # Raises if one of the fields is not a LargeListType with pytest.raises(ValueError): transpose_struct_list_type(pa.struct([("a", pa.int64()), ("b", pa.int64())])) - input_type = pa.struct([("a", pa.list_(pa.int64())), ("b", pa.list_(pa.string()))]) - expected_output = pa.list_(pa.struct([("a", pa.int64()), ("b", pa.string())])) + input_type = pa.struct([("a", pa.large_list(pa.int64())), ("b", pa.large_list(pa.string()))]) + expected_output = pa.large_list(pa.struct([("a", pa.int64()), ("b", pa.string())])) assert transpose_struct_list_type(input_type) == expected_output @@ -165,8 +174,8 @@ def test_transpose_list_struct_type(): with pytest.raises(ValueError): transpose_list_struct_type(pa.struct([("a", pa.int64()), ("b", pa.int64())])) - input_type = pa.list_(pa.struct([("a", pa.int64()), ("b", pa.string())])) - expected_output = pa.struct([("a", pa.list_(pa.int64())), ("b", pa.list_(pa.string()))]) + input_type = pa.large_list(pa.struct([("a", pa.int64()), ("b", pa.string())])) + expected_output = pa.struct([("a", pa.large_list(pa.int64())), ("b", pa.large_list(pa.string()))]) assert transpose_list_struct_type(input_type) == expected_output @@ -174,8 +183,8 @@ def test_transpose_struct_list_array(): """Test transpose_struct_list_array function.""" input_array = pa.StructArray.from_arrays( arrays=[ - pa.array([[1, 2], [3, 4], [], [5, 6, 7]]), - pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]]), + pa.array([[1, 2], [3, 4], [], [5, 6, 7]], type=pa.large_list(pa.int64())), + pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]], type=pa.large_list(pa.string())), ], names=["a", "b"], ) @@ -185,7 +194,8 @@ def test_transpose_struct_list_array(): [{"a": 3, "b": "y"}, {"a": 4, "b": "x"}], [], [{"a": 5, "b": "d"}, {"a": 6, "b": "e"}, {"a": 7, "b": "f"}], - ] + ], + type=pa.large_list(pa.struct([("a", pa.int64()), ("b", pa.string())])), ) actual = transpose_struct_list_array(input_array) assert actual == desired @@ -199,12 +209,13 @@ def test_transpose_list_struct_array(): [{"a": 3, "b": "y"}, {"a": 4, "b": "x"}], [], [{"a": 5, "b": "d"}, {"a": 6, "b": "e"}, {"a": 7, "b": "f"}], - ] + ], + type=pa.large_list(pa.struct([("a", pa.int64()), ("b", pa.string())])), ) desired = pa.StructArray.from_arrays( arrays=[ - pa.array([[1, 2], [3, 4], [], [5, 6, 7]]), - pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]]), + pa.array([[1, 2], [3, 4], [], [5, 6, 7]], type=pa.large_list(pa.int64())), + pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]], type=pa.large_list(pa.string())), ], names=["a", "b"], ) @@ -214,8 +225,14 @@ def test_transpose_list_struct_array(): def test_transpose_list_struct_scalar(): """Test transpose_list_struct_scalar function.""" - input_scalar = pa.scalar([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]) - desired = pa.scalar({"a": [1, 2], "b": ["x", "y"]}) + input_scalar = pa.scalar( + [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}], + type=pa.large_list(pa.struct([("a", pa.int64()), ("b", pa.string())])), + ) + desired = pa.scalar( + {"a": [1, 2], "b": ["x", "y"]}, + type=pa.struct([("a", pa.large_list(pa.int64())), ("b", pa.large_list(pa.string()))]), + ) actual = transpose_list_struct_scalar(input_scalar) assert actual == desired @@ -240,7 +257,7 @@ def test_struct_field_names(): [ (pa.float64(), False), (pa.list_(pa.float64()), False), - (pa.list_(pa.struct([("a", pa.float64()), ("b", pa.float64())])), True), + (pa.large_list(pa.struct([("a", pa.float64()), ("b", pa.float64())])), True), ], ) def test_nested_types_mapper(pa_type, is_nested): @@ -255,43 +272,80 @@ def test_nested_types_mapper(pa_type, is_nested): def test_normalize_list_array(): - """Test normalize_list_array converts to plain list arrays.""" - list_array = pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int64())) - assert normalize_list_array(list_array) is list_array + """Test normalize_list_array converts to large_list arrays.""" + large_list_array = pa.array([[1, 2], [3, 4]], type=pa.large_list(pa.int64())) + assert normalize_list_array(large_list_array) is large_list_array fixed = pa.FixedSizeListArray.from_arrays(pa.array([1, 2, 3, 4]), list_size=2) normalized_fixed = normalize_list_array(fixed) - expected_list = pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int64())) - assert pa.types.is_list(normalized_fixed.type) - assert normalized_fixed.equals(expected_list) + expected_large = pa.array([[1, 2], [3, 4]], type=pa.large_list(pa.int64())) + assert pa.types.is_large_list(normalized_fixed.type) + assert normalized_fixed.equals(expected_large) - large = pa.array([[5, 6], [7, 8]], type=pa.large_list(pa.int64())) - normalized_large = normalize_list_array(large) - expected_large = pa.array([[5, 6], [7, 8]], type=pa.list_(pa.int64())) - assert pa.types.is_list(normalized_large.type) - assert normalized_large.equals(expected_large) + list_array = pa.array([[5, 6], [7, 8]], type=pa.list_(pa.int64())) + normalized_list = normalize_list_array(list_array) + expected_normalized = pa.array([[5, 6], [7, 8]], type=pa.large_list(pa.int64())) + assert pa.types.is_large_list(normalized_list.type) + assert normalized_list.equals(expected_normalized) with pytest.raises(ValueError): normalize_list_array(pa.array([1, 2, 3])) def test_normalize_struct_list_array(): - """Test normalize_struct_list_array converts struct fields to plain list arrays.""" - list_struct = pa.StructArray.from_arrays([pa.array([[1], [2], [3]])], names=["a"]) - assert normalize_struct_list_array(list_struct) is list_struct + """Test normalize_struct_list_array converts struct fields to large_list arrays.""" + large_list_struct = pa.StructArray.from_arrays( + [pa.array([[1], [2], [3]], type=pa.large_list(pa.int64()))], names=["a"] + ) + assert normalize_struct_list_array(large_list_struct) is large_list_struct fixed = pa.FixedSizeListArray.from_arrays(pa.array([1, 2, 3, 4]), list_size=2) struct_array = pa.StructArray.from_arrays([fixed], names=["a"]) normalized = normalize_struct_list_array(struct_array) expected_array = pa.StructArray.from_arrays( - [pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int64()))], + [pa.array([[1, 2], [3, 4]], type=pa.large_list(pa.int64()))], names=["a"], ) - expected_type = pa.struct([("a", pa.list_(pa.int64()))]) + expected_type = pa.struct([("a", pa.large_list(pa.int64()))]) assert normalized.type == expected_type assert normalized.equals(expected_array) with pytest.raises(ValueError): normalize_struct_list_array(pa.array([[1, 2], [3, 4]])) + + +def test_downcast_large_list_array_already_list(): + """downcast_large_list_array is a no-op when the input already uses list_ (int32 offsets). + + This covers the ``return cast(pa.ListType | pa.StructType, t)`` fallthrough in + ``downcast_large_list_type``. It is useful when code that normally produces + ``large_list`` arrays receives data that was already stored with regular ``list_`` + offsets (e.g. loaded from an older Parquet file written without Arrow metadata). + """ + list_array = pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int64())) + result = downcast_large_list_array(list_array) + assert pa.types.is_list(result.type) + assert result.equals(list_array) + + +def test_downcast_large_list_array_mixed_struct(): + """downcast_large_list_array handles structs where only some fields are large_list. + + This covers the ``fields.append(field)`` pass-through branch in + ``downcast_large_list_type``. A struct may arrive with mixed offset types + when it originates from a source that already wrote certain columns as regular + ``list_`` — the downcast should convert ``large_list`` fields and leave the + already-regular ``list_`` fields unchanged. + """ + large_list_col = pa.array([[10, 20], [30]], type=pa.large_list(pa.int64())) + regular_list_col = pa.array([["a"], ["b", "c"]], type=pa.list_(pa.string())) + struct_array = pa.StructArray.from_arrays([large_list_col, regular_list_col], names=["x", "y"]) + + result = downcast_large_list_array(struct_array) + + assert pa.types.is_list(result.type.field("x").type), "large_list field should be downcast to list_" + assert pa.types.is_list(result.type.field("y").type), "already-list_ field should remain list_" + assert result.field("x").equals(large_list_col.cast(pa.list_(pa.int64()))) + assert result.field("y").equals(regular_list_col)