Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2498,7 +2498,7 @@ def map_rows(
# Otherwise, return the results as a new NestedFrame
return results_nf

def to_pandas(self, list_struct=False) -> pd.DataFrame:
def to_pandas(self, list_struct=False, large_list=False) -> pd.DataFrame:
"""Convert to an ordinal pandas DataFrame, with no NestedDtype series.

NestedDtype is cast to pd.ArrowDtype
Expand All @@ -2509,6 +2509,11 @@ def to_pandas(self, list_struct=False) -> pd.DataFrame:
If True, cast nested columns to pandas struct-list arrow extension
array columns. If False (default), cast nested columns to
list-struct array columns.
large_list : bool
If False (default), use regular ``list_`` (int32 offsets). Set to
True to use ``large_list`` (int64 offsets), which is required when
the total number of nested elements across all rows exceeds
``2**31 - 1``.

Returns
-------
Expand All @@ -2529,10 +2534,10 @@ def to_pandas(self, list_struct=False) -> pd.DataFrame:
"""
df = pd.DataFrame(self)
for col in self.nested_columns:
df[col] = df[col].array.to_arrow_ext_array(list_struct=list_struct)
df[col] = df[col].array.to_arrow_ext_array(list_struct=list_struct, large_list=large_list)
return df

def to_parquet(self, path, **kwargs) -> None:
def to_parquet(self, path, large_list=False, **kwargs) -> None:
"""Creates parquet file(s) with the data of a NestedFrame, either
as a single parquet file where each nested dataset is packed into its
own column or as an individual parquet file for each layer.
Expand All @@ -2544,6 +2549,11 @@ def to_parquet(self, path, **kwargs) -> None:
----------
path : str
The path to the parquet file
large_list : bool
If False (default), use regular ``list_`` (int32 offsets). Set to
True to use ``large_list`` (int64 offsets), which is required when
the total number of nested elements across all rows exceeds
``2**31 - 1``.
kwargs : keyword arguments, optional
Keyword arguments to pass to
`pyarrow.parquet.write_table
Expand All @@ -2559,7 +2569,7 @@ def to_parquet(self, path, **kwargs) -> None:
>>> nf = generate_data(5,5, seed=1)
>>> nf.to_parquet("nestedframe.parquet") # doctest: +SKIP
"""
df = self.to_pandas(list_struct=False)
df = self.to_pandas(list_struct=False, large_list=large_list)

# Write through pyarrow
# This is potentially not zero-copy
Expand Down
2 changes: 1 addition & 1 deletion src/nested_pandas/nestedframe/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,6 @@ def _cast_struct_cols_to_nested(df, reject_nesting):
def _cast_list_cols_to_nested(df):
"""cast list columns to nested dtype"""
for col, dtype in df.dtypes.items():
if pa.types.is_list(dtype.pyarrow_dtype):
if is_pa_type_a_list(dtype.pyarrow_dtype):
df[col] = pack_lists(df[[col]])
return df
5 changes: 3 additions & 2 deletions src/nested_pandas/series/_storage/list_struct_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pyarrow as pa

from nested_pandas.series.utils import (
is_pa_type_a_list,
normalize_list_array,
transpose_struct_list_chunked,
validate_list_struct_type,
Expand All @@ -29,7 +30,7 @@ class ListStructStorage:
def __init__(
self, array: pa.ListArray | pa.FixedSizeListArray | pa.LargeListArray | pa.ChunkedArray
) -> None:
if isinstance(array, pa.ListArray):
if isinstance(array, pa.Array) and is_pa_type_a_list(array.type):
array = pa.chunked_array([array])
if not isinstance(array, pa.ChunkedArray):
raise ValueError("array must be of type pa.ChunkedArray")
Expand Down Expand Up @@ -81,7 +82,7 @@ def nbytes(self) -> int:
return self._data.nbytes

@property
def type(self) -> pa.ListType:
def type(self) -> pa.LargeListType:
"""Pyarrow type of the underlying array."""
return self._data.type

Expand Down
23 changes: 16 additions & 7 deletions src/nested_pandas/series/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from nested_pandas.series.dtype import NestedDtype
from nested_pandas.series.nestedseries import NestedSeries
from nested_pandas.series.packer import pack_flat, pack_sorted_df_into_struct
from nested_pandas.series.utils import nested_types_mapper
from nested_pandas.series.utils import downcast_large_list_array, nested_types_mapper

__all__ = ["NestSeriesAccessor"]

Expand All @@ -41,13 +41,17 @@ def _check_series(series):
if not isinstance(dtype, NestedDtype):
raise AttributeError(f"Can only use .nest accessor with a Series of NestedDtype, got {dtype}")

def to_lists(self, columns: list[str] | str | None = None) -> pd.DataFrame:
def to_lists(self, columns: list[str] | str | None = None, large_list: bool = False) -> pd.DataFrame:
"""Convert nested series into dataframe of list-array columns

Parameters
----------
columns : list[str] or str or None, optional
Names of the column(s) to include. Default is None, which means all columns.
large_list : bool, optional
If False (default), use regular ``list_`` (int32 offsets). Set to True to
use ``large_list`` (int64 offsets), which is required when the total number
of nested elements across all rows exceeds ~2.1 billion (int32 max).

Returns
-------
Expand Down Expand Up @@ -76,7 +80,12 @@ def to_lists(self, columns: list[str] | str | None = None) -> pd.DataFrame:
if len(columns) == 0:
raise ValueError("Cannot convert a struct with no fields to lists")

list_df = self._series.array.pa_table.select(columns).to_pandas(types_mapper=nested_types_mapper)
list_table = self._series.array.pa_table.select(columns)
if not large_list:
list_table = pa.table(
{col: downcast_large_list_array(list_table.column(col)) for col in list_table.column_names}
)
list_df = list_table.to_pandas(types_mapper=nested_types_mapper)
list_df.index = self._series.index

return list_df
Expand Down Expand Up @@ -128,7 +137,7 @@ def to_flat(self, columns: list[str] | str | None = None) -> pd.DataFrame:
for chunk in self._series.array.struct_array.iterchunks():
struct_array = cast(pa.StructArray, chunk)
for column in columns:
list_array = cast(pa.ListArray, struct_array.field(column))
list_array = cast(pa.LargeListArray, struct_array.field(column))
flat_array = list_array.flatten()
flat_chunks[column].append(flat_array)

Expand Down Expand Up @@ -676,7 +685,7 @@ def get_flat_series(self, field: str) -> pd.Series:
flat_chunks = []
for nested_chunk in self._series.array.struct_array.iterchunks():
struct_array = cast(pa.StructArray, nested_chunk)
list_array = cast(pa.ListArray, struct_array.field(field))
list_array = cast(pa.LargeListArray, struct_array.field(field))
flat_array = list_array.flatten()
flat_chunks.append(flat_array)

Expand Down Expand Up @@ -723,7 +732,7 @@ def get_list_series(self, field: str) -> pd.Series:
2 [31.34241782 3.90547832]
3 [69.23226157 16.98304196]
4 [87.63891523 87.81425034]
Name: flux, dtype: list<item: double>[pyarrow]
Name: flux, dtype: large_list<item: double>[pyarrow]
"""
list_chunked_array = self._series.array.pa_table[field]
return pd.Series(
Expand Down Expand Up @@ -757,7 +766,7 @@ def __getitem__(self, key: str | list[str]) -> NestedSeries:
flat_chunks = []
for nested_chunk in self._series.array.struct_array.iterchunks():
struct_array = cast(pa.StructArray, nested_chunk)
list_array = cast(pa.ListArray, struct_array.field(key))
list_array = cast(pa.LargeListArray, struct_array.field(key))
flat_array = list_array.flatten()
flat_chunks.append(flat_array)

Expand Down
29 changes: 19 additions & 10 deletions src/nested_pandas/series/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
from pandas.core.dtypes.base import ExtensionDtype

from nested_pandas.series.utils import (
is_pa_type_a_list,
is_pa_type_is_list_struct,
normalize_struct_list_type,
transpose_list_struct_type,
transpose_struct_list_type,
)
Expand Down Expand Up @@ -193,7 +195,7 @@ def __init__(self, pyarrow_dtype: pa.DataType | Mapping) -> None:

# Allow from_columns-style mapping inputs
if isinstance(pyarrow_dtype, Mapping):
pyarrow_dtype = pa.struct({col: pa.list_(pa_type) for col, pa_type in pyarrow_dtype.items()})
pyarrow_dtype = pa.struct({col: pa.large_list(pa_type) for col, pa_type in pyarrow_dtype.items()})
pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)

self.pyarrow_dtype, self.list_struct_pa_dtype = self._validate_dtype(pyarrow_dtype)
Expand Down Expand Up @@ -239,7 +241,7 @@ def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore
nested<a: [double], b: [int64]>
>>> assert (
... dtype.pyarrow_dtype
... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())})
... == pa.struct({"a": pa.large_list(pa.float64()), "b": pa.large_list(pa.int64())})
... )
"""
return cls.from_columns(fields)
Expand All @@ -266,15 +268,15 @@ def from_columns(cls, columns: Mapping[str, pa.DataType]) -> Self: # type: igno
nested<a: [double], b: [int64]>
>>> assert (
... dtype.pyarrow_dtype
... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())})
... == pa.struct({"a": pa.large_list(pa.float64()), "b": pa.large_list(pa.int64())})
... )
"""
pyarrow_dtype = pa.struct({column: pa.list_(pa_type) for column, pa_type in columns.items()})
pyarrow_dtype = pa.struct({column: pa.large_list(pa_type) for column, pa_type in columns.items()})
pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
return cls(pyarrow_dtype=pyarrow_dtype)

@staticmethod
def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.ListType]:
def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.LargeListType]:
"""Check that the given pyarrow type is castable to the nested type.

Parameters
Expand All @@ -286,20 +288,27 @@ def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.ListT
-------
pa.StructType
Struct-list pyarrow type representing the nested type.
pa.ListType
pa.LargeListType
List-struct pyarrow type representing the nested type.
"""
if not isinstance(pyarrow_dtype, pa.DataType):
raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}")
if pa.types.is_struct(pyarrow_dtype):
struct_type = cast(pa.StructType, pyarrow_dtype)
# Normalize list fields to large_list for backward compatibility
# (callers may pass pa.list_ fields)
struct_type = normalize_struct_list_type(struct_type)
return struct_type, transpose_struct_list_type(struct_type)
# Currently, LongList and others are not supported
if pa.types.is_list(pyarrow_dtype):
list_type = cast(pa.ListType, pyarrow_dtype)
# Support pa.large_list (and pa.list_ for backward compatibility)
if is_pa_type_a_list(pyarrow_dtype):
if not pa.types.is_large_list(pyarrow_dtype):
# Normalize regular list or fixed-size list to large_list
pyarrow_dtype = pa.large_list(pyarrow_dtype.value_type)
list_type = cast(pa.LargeListType, pyarrow_dtype)
return transpose_list_struct_type(list_type), list_type
raise ValueError(
f"NestedDtype can only be constructed with pa.StructType or pa.ListType only, got {pyarrow_dtype}"
"NestedDtype can only be constructed with pa.StructType, pa.LargeListType, "
f"or pa.ListType, got {pyarrow_dtype}"
)

@property
Expand Down
45 changes: 29 additions & 16 deletions src/nested_pandas/series/ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
from nested_pandas.series.nestedseries import NestedSeries # noqa
from nested_pandas.series.utils import (
chunk_lengths,
downcast_large_list_array,
is_pa_type_a_list,
normalize_list_array,
normalize_struct_list_type,
Expand Down Expand Up @@ -185,7 +186,7 @@ def replace_with_mask(array: pa.ChunkedArray, mask: pa.BooleanArray, value: pa.A
# If mask is [False, True, False, True], mask_cumsum will be [0, 1, 1, 2]
# So we put value items to the right positions in broadcast_value, while duplicate some other items for
# the positions where mask is False.
mask_cumsum = pa.compute.cumulative_sum(mask.cast(pa.int32()))
mask_cumsum = pa.compute.cumulative_sum(mask.cast(pa.int64()))
value_index = pa.compute.subtract(mask_cumsum, 1)
value_index = pa.compute.if_else(pa.compute.less(value_index, 0), 0, value_index)

Expand All @@ -208,7 +209,7 @@ def convert_df_to_pa_scalar(df: pd.DataFrame, *, pa_type: pa.StructType | None)
ty = scalar.type
else:
array = pa.array(series)
ty = pa.list_(array.type)
ty = pa.large_list(array.type)
scalar = pa.scalar(array, type=ty)
d[column] = scalar
types[column] = ty
Expand Down Expand Up @@ -916,7 +917,7 @@ def _pyarrow_dtype(self) -> pa.StructType:
return self._dtype.pyarrow_dtype

@property
def _pyarrow_list_struct_dtype(self) -> pa.ListType:
def _pyarrow_list_struct_dtype(self) -> pa.LargeListType:
"""PyArrow data type of the list-struct view over the ext. array"""
return transpose_struct_list_type(self._pyarrow_dtype)

Expand All @@ -925,35 +926,47 @@ def from_arrow_ext_array(cls, array: ArrowExtensionArray) -> Self: # type: igno
"""Create a NestedExtensionArray from pandas' ArrowExtensionArray"""
return cls(array._pa_array)

def to_arrow_ext_array(self, list_struct: bool = False) -> ArrowExtensionArray:
def to_arrow_ext_array(self, list_struct: bool = False, large_list: bool = False) -> ArrowExtensionArray:
"""Convert the extension array to pandas' ArrowExtensionArray

Parameters
----------
list_struct : bool, optional
If False (default), return struct-list array, otherwise return
list-struct array.
large_list : bool, optional
If False (default), use regular ``list_`` (int32 offsets). Set to True to
use ``large_list`` (int64 offsets), which is required when the total number
of nested elements across all rows exceeds ~2.1 billion (int32 max).
"""
if list_struct:
return ArrowExtensionArray(self.list_array)
return ArrowExtensionArray(self.struct_array)

def to_pyarrow_scalar(self, list_struct: bool = False) -> pa.ListScalar:
arr = self.list_array if list_struct else self.struct_array
if not large_list:
arr = downcast_large_list_array(arr)
return ArrowExtensionArray(arr)

def to_pyarrow_scalar(
self, list_struct: bool = False, large_list: bool = False
) -> pa.LargeListScalar | pa.ListScalar:
"""Convert to a pyarrow scalar of a list type

Parameters
----------
list_struct : bool, optional
If False (default), return list-struct-list scalar,
otherwise list-list-struct scalar.
large_list : bool, optional
If False (default), use regular ``list_`` (int32 offsets). Set to True to
use ``large_list`` (int64 offsets), which is required when the total number
of nested elements across all rows exceeds ~2.1 billion (int32 max).

Returns
-------
pyarrow.ListScalar
pyarrow.LargeListScalar or pyarrow.ListScalar
"""
pa_array = self.list_array if list_struct else self.struct_array
pa_type = pa.list_(pa_array.type)
return cast(pa.ListScalar, pa.scalar(pa_array, type=pa_type))
list_type = pa.large_list if large_list else pa.list_
pa_type = list_type(pa_array.type)
return cast(pa.LargeListScalar | pa.ListScalar, pa.scalar(pa_array, type=pa_type))

@property
def list_offsets(self) -> pa.Array:
Expand All @@ -973,8 +986,8 @@ def list_offsets(self) -> pa.Array:

zero_and_lengths = pa.chunked_array(
[
pa.array([0], type=pa.int32()),
pa.array(self.list_lengths, type=pa.int32()),
pa.array([0], type=pa.int64()),
pa.array(self.list_lengths, type=pa.int64()),
]
)
offsets = pa.compute.cumulative_sum(zero_and_lengths)
Expand Down Expand Up @@ -1028,7 +1041,7 @@ def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]:
"""
for chunk in self.struct_array.iterchunks():
struct_array: pa.StructArray = cast(pa.StructArray, chunk)
list_array: pa.ListArray = cast(pa.ListArray, struct_array.field(field))
list_array: pa.LargeListArray = cast(pa.LargeListArray, struct_array.field(field))
for list_scalar in list_array:
yield np.asarray(list_scalar.values)

Expand Down Expand Up @@ -1104,7 +1117,7 @@ def set_flat_field(self, field: str, value: ArrayLike, *, keep_dtype: bool = Fal

if isinstance(pa_array, pa.ChunkedArray):
pa_array = pa_array.combine_chunks()
field_list_array = pa.ListArray.from_arrays(values=pa_array, offsets=self.list_offsets)
field_list_array = pa.LargeListArray.from_arrays(values=pa_array, offsets=self.list_offsets)

return self.set_list_field(field, field_list_array, keep_dtype=keep_dtype)

Expand Down
6 changes: 3 additions & 3 deletions src/nested_pandas/series/packer.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ def view_sorted_series_as_list_array(
flat_array = pa.array(series, from_pandas=True)
if isinstance(flat_array, pa.ChunkedArray):
flat_array = flat_array.combine_chunks()
list_array = pa.ListArray.from_arrays(
list_array = pa.LargeListArray.from_arrays(
offset,
flat_array,
)
Expand Down Expand Up @@ -356,7 +356,7 @@ def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray:
offset_but_last = np.nonzero(~index.duplicated(keep="first"))[0]
offset = np.append(offset_but_last, len(index))

# Arrow uses int32 for offsets
offset = offset.astype(np.int32)
# LargeListArray uses int64 for offsets
offset = offset.astype(np.int64)

return offset
Loading
Loading