lincc-frameworks · hombit · Apr 15, 2026 · Mar 2, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -2498,7 +2498,7 @@ def map_rows(
         # Otherwise, return the results as a new NestedFrame
         return results_nf
 
-    def to_pandas(self, list_struct=False) -> pd.DataFrame:
+    def to_pandas(self, list_struct=False, large_list=False) -> pd.DataFrame:
         """Convert to an ordinal pandas DataFrame, with no NestedDtype series.
 
         NestedDtype is cast to pd.ArrowDtype
@@ -2509,6 +2509,11 @@ def to_pandas(self, list_struct=False) -> pd.DataFrame:
             If True, cast nested columns to pandas struct-list arrow extension
             array columns. If False (default), cast nested columns to
             list-struct array columns.
+        large_list : bool
+            If False (default), use regular ``list_`` (int32 offsets). Set to
+            True to use ``large_list`` (int64 offsets), which is required when
+            the total number of nested elements across all rows exceeds
+            ``2**31 - 1``.
 
         Returns
         -------
@@ -2529,10 +2534,10 @@ def to_pandas(self, list_struct=False) -> pd.DataFrame:
         """
         df = pd.DataFrame(self)
         for col in self.nested_columns:
-            df[col] = df[col].array.to_arrow_ext_array(list_struct=list_struct)
+            df[col] = df[col].array.to_arrow_ext_array(list_struct=list_struct, large_list=large_list)
         return df
 
-    def to_parquet(self, path, **kwargs) -> None:
+    def to_parquet(self, path, large_list=False, **kwargs) -> None:
         """Creates parquet file(s) with the data of a NestedFrame, either
         as a single parquet file where each nested dataset is packed into its
         own column or as an individual parquet file for each layer.
@@ -2544,6 +2549,11 @@ def to_parquet(self, path, **kwargs) -> None:
         ----------
         path : str
             The path to the parquet file
+        large_list : bool
+            If False (default), use regular ``list_`` (int32 offsets). Set to
+            True to use ``large_list`` (int64 offsets), which is required when
+            the total number of nested elements across all rows exceeds
+            ``2**31 - 1``.
         kwargs : keyword arguments, optional
             Keyword arguments to pass to
             `pyarrow.parquet.write_table
@@ -2559,7 +2569,7 @@ def to_parquet(self, path, **kwargs) -> None:
         >>> nf = generate_data(5,5, seed=1)
         >>> nf.to_parquet("nestedframe.parquet")  # doctest: +SKIP
         """
-        df = self.to_pandas(list_struct=False)
+        df = self.to_pandas(list_struct=False, large_list=large_list)
 
         # Write through pyarrow
         # This is potentially not zero-copy

diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py
@@ -522,6 +522,6 @@ def _cast_struct_cols_to_nested(df, reject_nesting):
 def _cast_list_cols_to_nested(df):
     """cast list columns to nested dtype"""
     for col, dtype in df.dtypes.items():
-        if pa.types.is_list(dtype.pyarrow_dtype):
+        if is_pa_type_a_list(dtype.pyarrow_dtype):
             df[col] = pack_lists(df[[col]])
     return df
diff --git a/src/nested_pandas/series/_storage/list_struct_storage.py b/src/nested_pandas/series/_storage/list_struct_storage.py
@@ -5,6 +5,7 @@
 import pyarrow as pa
 
 from nested_pandas.series.utils import (
+    is_pa_type_a_list,
     normalize_list_array,
     transpose_struct_list_chunked,
     validate_list_struct_type,
@@ -29,7 +30,7 @@ class ListStructStorage:
     def __init__(
         self, array: pa.ListArray | pa.FixedSizeListArray | pa.LargeListArray | pa.ChunkedArray
     ) -> None:
-        if isinstance(array, pa.ListArray):
+        if isinstance(array, pa.Array) and is_pa_type_a_list(array.type):
             array = pa.chunked_array([array])
         if not isinstance(array, pa.ChunkedArray):
             raise ValueError("array must be of type pa.ChunkedArray")
@@ -81,7 +82,7 @@ def nbytes(self) -> int:
         return self._data.nbytes
 
     @property
-    def type(self) -> pa.ListType:
+    def type(self) -> pa.LargeListType:
         """Pyarrow type of the underlying array."""
         return self._data.type
 

diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py
@@ -14,7 +14,7 @@
 from nested_pandas.series.dtype import NestedDtype
 from nested_pandas.series.nestedseries import NestedSeries
 from nested_pandas.series.packer import pack_flat, pack_sorted_df_into_struct
-from nested_pandas.series.utils import nested_types_mapper
+from nested_pandas.series.utils import downcast_large_list_array, nested_types_mapper
 
 __all__ = ["NestSeriesAccessor"]
 
@@ -41,13 +41,17 @@ def _check_series(series):
         if not isinstance(dtype, NestedDtype):
             raise AttributeError(f"Can only use .nest accessor with a Series of NestedDtype, got {dtype}")
 
-    def to_lists(self, columns: list[str] | str | None = None) -> pd.DataFrame:
+    def to_lists(self, columns: list[str] | str | None = None, large_list: bool = False) -> pd.DataFrame:
         """Convert nested series into dataframe of list-array columns
 
         Parameters
         ----------
         columns : list[str] or str or None, optional
             Names of the column(s) to include. Default is None, which means all columns.
+        large_list : bool, optional
+            If False (default), use regular ``list_`` (int32 offsets). Set to True to
+            use ``large_list`` (int64 offsets), which is required when the total number
+            of nested elements across all rows exceeds ~2.1 billion (int32 max).
 
         Returns
         -------
@@ -76,7 +80,12 @@ def to_lists(self, columns: list[str] | str | None = None) -> pd.DataFrame:
         if len(columns) == 0:
             raise ValueError("Cannot convert a struct with no fields to lists")
 
-        list_df = self._series.array.pa_table.select(columns).to_pandas(types_mapper=nested_types_mapper)
+        list_table = self._series.array.pa_table.select(columns)
+        if not large_list:
+            list_table = pa.table(
+                {col: downcast_large_list_array(list_table.column(col)) for col in list_table.column_names}
+            )
+        list_df = list_table.to_pandas(types_mapper=nested_types_mapper)
         list_df.index = self._series.index
 
         return list_df
@@ -128,7 +137,7 @@ def to_flat(self, columns: list[str] | str | None = None) -> pd.DataFrame:
         for chunk in self._series.array.struct_array.iterchunks():
             struct_array = cast(pa.StructArray, chunk)
             for column in columns:
-                list_array = cast(pa.ListArray, struct_array.field(column))
+                list_array = cast(pa.LargeListArray, struct_array.field(column))
                 flat_array = list_array.flatten()
                 flat_chunks[column].append(flat_array)
 
@@ -676,7 +685,7 @@ def get_flat_series(self, field: str) -> pd.Series:
         flat_chunks = []
         for nested_chunk in self._series.array.struct_array.iterchunks():
             struct_array = cast(pa.StructArray, nested_chunk)
-            list_array = cast(pa.ListArray, struct_array.field(field))
+            list_array = cast(pa.LargeListArray, struct_array.field(field))
             flat_array = list_array.flatten()
             flat_chunks.append(flat_array)
 
@@ -723,7 +732,7 @@ def get_list_series(self, field: str) -> pd.Series:
         2    [31.34241782  3.90547832]
         3    [69.23226157 16.98304196]
         4    [87.63891523 87.81425034]
-        Name: flux, dtype: list<item: double>[pyarrow]
+        Name: flux, dtype: large_list<item: double>[pyarrow]
         """
         list_chunked_array = self._series.array.pa_table[field]
         return pd.Series(
@@ -757,7 +766,7 @@ def __getitem__(self, key: str | list[str]) -> NestedSeries:
         flat_chunks = []
         for nested_chunk in self._series.array.struct_array.iterchunks():
             struct_array = cast(pa.StructArray, nested_chunk)
-            list_array = cast(pa.ListArray, struct_array.field(key))
+            list_array = cast(pa.LargeListArray, struct_array.field(key))
             flat_array = list_array.flatten()
             flat_chunks.append(flat_array)
 

diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py
@@ -14,7 +14,9 @@
 from pandas.core.dtypes.base import ExtensionDtype
 
 from nested_pandas.series.utils import (
+    is_pa_type_a_list,
     is_pa_type_is_list_struct,
+    normalize_struct_list_type,
     transpose_list_struct_type,
     transpose_struct_list_type,
 )
@@ -193,7 +195,7 @@ def __init__(self, pyarrow_dtype: pa.DataType | Mapping) -> None:
 
         # Allow from_columns-style mapping inputs
         if isinstance(pyarrow_dtype, Mapping):
-            pyarrow_dtype = pa.struct({col: pa.list_(pa_type) for col, pa_type in pyarrow_dtype.items()})
+            pyarrow_dtype = pa.struct({col: pa.large_list(pa_type) for col, pa_type in pyarrow_dtype.items()})
             pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
 
         self.pyarrow_dtype, self.list_struct_pa_dtype = self._validate_dtype(pyarrow_dtype)
@@ -239,7 +241,7 @@ def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self:  # type: ignore
         nested<a: [double], b: [int64]>
         >>> assert (
         ...     dtype.pyarrow_dtype
-        ...     == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())})
+        ...     == pa.struct({"a": pa.large_list(pa.float64()), "b": pa.large_list(pa.int64())})
         ... )
         """
         return cls.from_columns(fields)
@@ -266,15 +268,15 @@ def from_columns(cls, columns: Mapping[str, pa.DataType]) -> Self:  # type: igno
         nested<a: [double], b: [int64]>
         >>> assert (
         ...     dtype.pyarrow_dtype
-        ...     == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())})
+        ...     == pa.struct({"a": pa.large_list(pa.float64()), "b": pa.large_list(pa.int64())})
         ... )
         """
-        pyarrow_dtype = pa.struct({column: pa.list_(pa_type) for column, pa_type in columns.items()})
+        pyarrow_dtype = pa.struct({column: pa.large_list(pa_type) for column, pa_type in columns.items()})
         pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
         return cls(pyarrow_dtype=pyarrow_dtype)
 
     @staticmethod
-    def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.ListType]:
+    def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.LargeListType]:
         """Check that the given pyarrow type is castable to the nested type.
 
         Parameters
@@ -286,20 +288,27 @@ def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.ListT
         -------
         pa.StructType
             Struct-list pyarrow type representing the nested type.
-        pa.ListType
+        pa.LargeListType
             List-struct pyarrow type representing the nested type.
         """
         if not isinstance(pyarrow_dtype, pa.DataType):
             raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}")
         if pa.types.is_struct(pyarrow_dtype):
             struct_type = cast(pa.StructType, pyarrow_dtype)
+            # Normalize list fields to large_list for backward compatibility
+            # (callers may pass pa.list_ fields)
+            struct_type = normalize_struct_list_type(struct_type)
             return struct_type, transpose_struct_list_type(struct_type)
-        # Currently, LongList and others are not supported
-        if pa.types.is_list(pyarrow_dtype):
-            list_type = cast(pa.ListType, pyarrow_dtype)
+        # Support pa.large_list (and pa.list_ for backward compatibility)
+        if is_pa_type_a_list(pyarrow_dtype):
+            if not pa.types.is_large_list(pyarrow_dtype):
+                # Normalize regular list or fixed-size list to large_list
+                pyarrow_dtype = pa.large_list(pyarrow_dtype.value_type)
+            list_type = cast(pa.LargeListType, pyarrow_dtype)
             return transpose_list_struct_type(list_type), list_type
         raise ValueError(
-            f"NestedDtype can only be constructed with pa.StructType or pa.ListType only, got {pyarrow_dtype}"
+            "NestedDtype can only be constructed with pa.StructType, pa.LargeListType, "
+            f"or pa.ListType, got {pyarrow_dtype}"
         )
 
     @property

diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py
@@ -64,6 +64,7 @@
 from nested_pandas.series.nestedseries import NestedSeries  # noqa
 from nested_pandas.series.utils import (
     chunk_lengths,
+    downcast_large_list_array,
     is_pa_type_a_list,
     normalize_list_array,
     normalize_struct_list_type,
@@ -185,7 +186,7 @@ def replace_with_mask(array: pa.ChunkedArray, mask: pa.BooleanArray, value: pa.A
     # If mask is [False, True, False, True], mask_cumsum will be [0, 1, 1, 2]
     # So we put value items to the right positions in broadcast_value, while duplicate some other items for
     # the positions where mask is False.
-    mask_cumsum = pa.compute.cumulative_sum(mask.cast(pa.int32()))
+    mask_cumsum = pa.compute.cumulative_sum(mask.cast(pa.int64()))
     value_index = pa.compute.subtract(mask_cumsum, 1)
     value_index = pa.compute.if_else(pa.compute.less(value_index, 0), 0, value_index)
 
@@ -208,7 +209,7 @@ def convert_df_to_pa_scalar(df: pd.DataFrame, *, pa_type: pa.StructType | None)
             ty = scalar.type
         else:
             array = pa.array(series)
-            ty = pa.list_(array.type)
+            ty = pa.large_list(array.type)
             scalar = pa.scalar(array, type=ty)
         d[column] = scalar
         types[column] = ty
@@ -916,7 +917,7 @@ def _pyarrow_dtype(self) -> pa.StructType:
         return self._dtype.pyarrow_dtype
 
     @property
-    def _pyarrow_list_struct_dtype(self) -> pa.ListType:
+    def _pyarrow_list_struct_dtype(self) -> pa.LargeListType:
         """PyArrow data type of the list-struct view over the ext. array"""
         return transpose_struct_list_type(self._pyarrow_dtype)
 
@@ -925,35 +926,47 @@ def from_arrow_ext_array(cls, array: ArrowExtensionArray) -> Self:  # type: igno
         """Create a NestedExtensionArray from pandas' ArrowExtensionArray"""
         return cls(array._pa_array)
 
-    def to_arrow_ext_array(self, list_struct: bool = False) -> ArrowExtensionArray:
+    def to_arrow_ext_array(self, list_struct: bool = False, large_list: bool = False) -> ArrowExtensionArray:
         """Convert the extension array to pandas' ArrowExtensionArray
 
         Parameters
         ----------
         list_struct : bool, optional
             If False (default), return struct-list array, otherwise return
             list-struct array.
+        large_list : bool, optional
+            If False (default), use regular ``list_`` (int32 offsets). Set to True to
+            use ``large_list`` (int64 offsets), which is required when the total number
+            of nested elements across all rows exceeds ~2.1 billion (int32 max).
         """
-        if list_struct:
-            return ArrowExtensionArray(self.list_array)
-        return ArrowExtensionArray(self.struct_array)
-
-    def to_pyarrow_scalar(self, list_struct: bool = False) -> pa.ListScalar:
+        arr = self.list_array if list_struct else self.struct_array
+        if not large_list:
+            arr = downcast_large_list_array(arr)
+        return ArrowExtensionArray(arr)
+
+    def to_pyarrow_scalar(
+        self, list_struct: bool = False, large_list: bool = False
+    ) -> pa.LargeListScalar | pa.ListScalar:
         """Convert to a pyarrow scalar of a list type
 
         Parameters
         ----------
         list_struct : bool, optional
             If False (default), return list-struct-list scalar,
             otherwise list-list-struct scalar.
+        large_list : bool, optional
+            If False (default), use regular ``list_`` (int32 offsets). Set to True to
+            use ``large_list`` (int64 offsets), which is required when the total number
+            of nested elements across all rows exceeds ~2.1 billion (int32 max).
 
         Returns
         -------
-        pyarrow.ListScalar
+        pyarrow.LargeListScalar or pyarrow.ListScalar
         """
         pa_array = self.list_array if list_struct else self.struct_array
-        pa_type = pa.list_(pa_array.type)
-        return cast(pa.ListScalar, pa.scalar(pa_array, type=pa_type))
+        list_type = pa.large_list if large_list else pa.list_
+        pa_type = list_type(pa_array.type)
+        return cast(pa.LargeListScalar | pa.ListScalar, pa.scalar(pa_array, type=pa_type))
 
     @property
     def list_offsets(self) -> pa.Array:
@@ -973,8 +986,8 @@ def list_offsets(self) -> pa.Array:
 
         zero_and_lengths = pa.chunked_array(
             [
-                pa.array([0], type=pa.int32()),
-                pa.array(self.list_lengths, type=pa.int32()),
+                pa.array([0], type=pa.int64()),
+                pa.array(self.list_lengths, type=pa.int64()),
             ]
         )
         offsets = pa.compute.cumulative_sum(zero_and_lengths)
@@ -1028,7 +1041,7 @@ def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]:
         """
         for chunk in self.struct_array.iterchunks():
             struct_array: pa.StructArray = cast(pa.StructArray, chunk)
-            list_array: pa.ListArray = cast(pa.ListArray, struct_array.field(field))
+            list_array: pa.LargeListArray = cast(pa.LargeListArray, struct_array.field(field))
             for list_scalar in list_array:
                 yield np.asarray(list_scalar.values)
 
@@ -1104,7 +1117,7 @@ def set_flat_field(self, field: str, value: ArrayLike, *, keep_dtype: bool = Fal
 
         if isinstance(pa_array, pa.ChunkedArray):
             pa_array = pa_array.combine_chunks()
-        field_list_array = pa.ListArray.from_arrays(values=pa_array, offsets=self.list_offsets)
+        field_list_array = pa.LargeListArray.from_arrays(values=pa_array, offsets=self.list_offsets)
 
         return self.set_list_field(field, field_list_array, keep_dtype=keep_dtype)
 

diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py
@@ -320,7 +320,7 @@ def view_sorted_series_as_list_array(
     flat_array = pa.array(series, from_pandas=True)
     if isinstance(flat_array, pa.ChunkedArray):
         flat_array = flat_array.combine_chunks()
-    list_array = pa.ListArray.from_arrays(
+    list_array = pa.LargeListArray.from_arrays(
         offset,
         flat_array,
     )
@@ -356,7 +356,7 @@ def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray:
     offset_but_last = np.nonzero(~index.duplicated(keep="first"))[0]
     offset = np.append(offset_but_last, len(index))
 
-    # Arrow uses int32 for offsets
-    offset = offset.astype(np.int32)
+    # LargeListArray uses int64 for offsets
+    offset = offset.astype(np.int64)
 
     return offset