diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc index 7e27bbdb749..d0f5f32b090 100644 --- a/cpp/src/arrow/extension/variable_shape_tensor.cc +++ b/cpp/src/arrow/extension/variable_shape_tensor.cc @@ -262,9 +262,9 @@ Result> VariableShapeTensorType::MakeTensor( internal::Permute(permutation, &dim_names); } + internal::Permute(permutation, &shape); ARROW_ASSIGN_OR_RAISE( auto strides, internal::ComputeStrides(ext_type.value_type(), shape, permutation)); - internal::Permute(permutation, &shape); ARROW_ASSIGN_OR_RAISE(const auto buffer, internal::SliceTensorBuffer(*data_array, value_type, shape)); diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 290ce09befb..4eb0583a68e 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -101,6 +101,7 @@ may expose data type-specific methods or properties. JsonArray UuidArray Bool8Array + VariableShapeTensorArray .. _api.scalar: diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 18a40d877c3..ebce638897b 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -165,6 +165,7 @@ def print_entry(label, value): dictionary, run_end_encoded, bool8, fixed_shape_tensor, json_, opaque, uuid, + variable_shape_tensor, field, type_for_alias, DataType, DictionaryType, StructType, @@ -178,6 +179,7 @@ def print_entry(label, value): RunEndEncodedType, Bool8Type, FixedShapeTensorType, JsonType, OpaqueType, UuidType, UnknownExtensionType, + VariableShapeTensorType, register_extension_type, unregister_extension_type, DictionaryMemo, KeyValueMetadata, @@ -214,6 +216,7 @@ def print_entry(label, value): StructArray, ExtensionArray, RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, JsonArray, OpaqueArray, UuidArray, + VariableShapeTensorArray, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, @@ -231,7 +234,8 
@@ def print_entry(label, value): FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, - FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar) + FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar, + VariableShapeTensorScalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ec58ac727e5..307eba9e4f6 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4621,7 +4621,7 @@ cdef class FixedShapeTensorArray(ExtensionArray): and the rest of the dimensions will match the permuted shape of the fixed shape tensor. - The conversion is zero-copy. + The conversion is zero-copy if data is primitive numeric and without nulls. Returns ------- @@ -4705,17 +4705,7 @@ cdef class FixedShapeTensorArray(ExtensionArray): "Cannot convert 1D array or scalar to fixed shape tensor array") if np.prod(obj.shape) == 0: raise ValueError("Expected a non-empty ndarray") - if dim_names is not None: - if not isinstance(dim_names, Sequence): - raise TypeError("dim_names must be a tuple or list") - if len(dim_names) != len(obj.shape[1:]): - raise ValueError( - (f"The length of dim_names ({len(dim_names)}) does not match" - f"the number of tensor dimensions ({len(obj.shape[1:])})." 
- ) - ) - if not all(isinstance(name, str) for name in dim_names): - raise TypeError("Each element of dim_names must be a string") + _validate_dim_names(dim_names, len(obj.shape[1:])) permutation = (-np.array(obj.strides)).argsort(kind='stable') if permutation[0] != 0: @@ -4874,6 +4864,509 @@ cdef class Bool8Array(ExtensionArray): return Bool8Array.from_storage(storage_arr) +def _check_sequence_param(value, ndim, name): + if value is None: + return False + if not isinstance(value, Sequence): + raise TypeError(f"{name} must be a tuple or list") + if len(value) != ndim: + raise ValueError( + (f"The length of {name} ({len(value)}) does not match" + f" the number of tensor dimensions ({ndim}).")) + return True + + +def _validate_dim_names(dim_names, ndim): + if not _check_sequence_param(dim_names, ndim, "dim_names"): + return + if not all(isinstance(name, str) for name in dim_names): + raise TypeError("Each element of dim_names must be a string") + + +def _validate_permutation(permutation, ndim): + if not _check_sequence_param(permutation, ndim, "permutation"): + return None + normalized = [int(x) for x in permutation] + if sorted(normalized) != list(range(ndim)): + raise ValueError( + "permutation must contain each dimension index exactly once") + return normalized + + +def _validate_uniform_shape(uniform_shape, ndim): + if not _check_sequence_param(uniform_shape, ndim, "uniform_shape"): + return + for value in uniform_shape: + if value is not None and value < 0: + raise ValueError( + "uniform_shape must contain non-negative values") + + +def _infer_uniform_shape(shape_rows, ndim): + if len(shape_rows) == 0: + return None + inferred = [] + for i in range(ndim): + axis_size = shape_rows[0][i] + if all(shape[i] == axis_size for shape in shape_rows): + inferred.append(axis_size) + else: + inferred.append(None) + if all(x is None for x in inferred): + return None + return inferred + + +def _permutation_from_strides(arr): + """Infer the dimension permutation from array 
strides. + + Note: for arrays with size-1 dimensions, the inferred permutation + may be unreliable since size-1 strides are unconstrained. Callers + should skip permutation validation for such arrays. + """ + return [int(x) for x in + (-np.array(arr.strides, dtype=np.int64)).argsort(kind="stable")] + + +cdef class VariableShapeTensorArray(ExtensionArray): + """ + Concrete class for variable shape tensor extension arrays. + + Examples + -------- + Define the extension type for tensor array + + >>> import pyarrow as pa + >>> tensor_type = pa.variable_shape_tensor(pa.float64(), 2) + + Create an extension array + + >>> shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.int32(), 2)) + >>> values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.float64())) + >>> arr = pa.StructArray.from_arrays([values, shapes], names=["data", "shape"]) + >>> pa.ExtensionArray.from_storage(tensor_type, arr) + + -- is_valid: all not null + -- child 0 type: list + [ + [ + 1, + 2, + 3, + 4, + 5, + 6 + ], + [ + 7, + 8 + ] + ] + -- child 1 type: fixed_size_list[2] + [ + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + """ + + @staticmethod + def from_numpy_ndarray(obj, dim_names=None, permutation=None, uniform_shape=None, + value_type=None, ndim=None): + """ + Convert a sequence of numpy.ndarrays to a variable shape tensor extension array. + The length of the input sequence becomes the length of the output array. + + Parameters + ---------- + obj : Sequence[numpy.ndarray] + Sequence of ndarrays with matching dtype, ndim, and memory permutation. + dim_names : tuple or list of strings, default None + Explicit names to tensor dimensions. + permutation : tuple or list of integers, default None + Physical permutation for all input arrays. If None, inferred from strides. + uniform_shape : tuple or list of integers or None, default None + Optional known uniform dimensions in physical order. If None, inferred. + value_type : pyarrow.DataType or numpy dtype, default None + Optional explicit tensor value type. 
Required with empty input. + ndim : int, default None + Optional explicit tensor rank. Required with empty input. + """ + cdef: + list arrays + list shape_rows + int array_ndim + int i + object base_dtype + DataType arrow_type + object normalized_permutation + object permutation_metadata + object shape_type + object values + object shapes + object struct_arr + object ext_type + + if isinstance(obj, np.ndarray): + raise TypeError("obj must be a sequence of numpy arrays") + if not isinstance(obj, Sequence) or isinstance(obj, (str, bytes)): + raise TypeError("obj must be a sequence of numpy arrays") + arrays = list(obj) + + if value_type is not None and not isinstance(value_type, DataType): + try: + value_type = from_numpy_dtype(np.dtype(value_type)) + except (TypeError, ValueError) as exc: + raise TypeError( + "value_type must be a pyarrow.DataType or numpy dtype" + ) from exc + + if len(arrays) == 0: + if value_type is None or ndim is None: + raise ValueError( + "For empty input, both value_type and ndim must be provided") + if ndim < 0: + raise ValueError("ndim must be non-negative") + + _validate_dim_names(dim_names, ndim) + permutation = _validate_permutation(permutation, ndim) + _validate_uniform_shape(uniform_shape, ndim) + + shape_type = list_(int32(), list_size=ndim) + values = array([], list_(value_type)) + shapes = array([], shape_type) + struct_arr = StructArray.from_arrays( + [values, shapes], names=["data", "shape"]) + ext_type = variable_shape_tensor( + value_type, + ndim, + dim_names=dim_names, + permutation=permutation, + uniform_shape=uniform_shape + ) + return ExtensionArray.from_storage(ext_type, struct_arr) + + for i, arr in enumerate(arrays): + if not isinstance(arr, np.ndarray): + raise TypeError(f"obj[{i}] must be a numpy.ndarray") + if arr.ndim == 0: + raise ValueError("Cannot convert scalar to variable shape tensor array") + + base_dtype = arrays[0].dtype + array_ndim = arrays[0].ndim + arrow_type = from_numpy_dtype(base_dtype) + + if 
value_type is not None and value_type != arrow_type: + raise TypeError( + f"numpy array dtype {base_dtype} does not match value_type {value_type}") + + if ndim is not None and ndim != array_ndim: + raise ValueError( + f"ndim must match numpy arrays ndim ({array_ndim}). Got {ndim}.") + ndim = array_ndim + + for i, arr in enumerate(arrays[1:], start=1): + if arr.dtype != base_dtype: + raise TypeError( + f"obj[{i}] has dtype {arr.dtype}; expected {base_dtype}") + if arr.ndim != ndim: + raise ValueError(f"obj[{i}] has ndim {arr.ndim}; expected {ndim}") + + _validate_dim_names(dim_names, ndim) + normalized_permutation = _validate_permutation(permutation, ndim) + + # Infer permutation if not provided by the user. Prefer arrays + # without size-1 dimensions since their strides are unambiguous. + if normalized_permutation is None: + for arr in arrays: + if all(s > 1 for s in arr.shape): + normalized_permutation = _permutation_from_strides(arr) + break + else: + # All arrays have size-1 dims; use first array's strides + normalized_permutation = _permutation_from_strides(arrays[0]) + + # Validate permutation consistency for arrays without size-1 + # dims (size-1 strides are unconstrained, so skip those). 
+ for i, arr in enumerate(arrays): + if any(s <= 1 for s in arr.shape): + continue + ndarray_permutation_list = _permutation_from_strides(arr) + if ndarray_permutation_list != normalized_permutation: + raise ValueError( + (f"obj[{i}] has permutation {ndarray_permutation_list}; " + f"expected {list(normalized_permutation)}")) + + shape_rows = [ + [int(x) for x in np.take(arr.shape, normalized_permutation)] + for arr in arrays + ] + + if uniform_shape is not None: + _validate_uniform_shape(uniform_shape, ndim) + for i, value in enumerate(uniform_shape): + if value is not None: + if any(shape[i] != value for shape in shape_rows): + raise ValueError( + (f"uniform_shape[{i}]={value} does not match input shape " + f"dimension values")) + else: + uniform_shape = _infer_uniform_shape(shape_rows, ndim) + + # Verify that ravel(order="K") + inferred permutation are consistent + # by round-tripping the first non-empty array. + for arr in arrays: + if arr.size > 0: + raveled = np.ravel(arr, order="K") + physical_shape = tuple( + np.take(arr.shape, normalized_permutation)) + reconstructed = raveled.reshape(physical_shape) + inv_perm = list(np.argsort(normalized_permutation)) + reconstructed_logical = np.transpose(reconstructed, inv_perm) + if not np.array_equal(reconstructed_logical, arr): + raise ValueError( + "Array memory layout is incompatible with variable " + "shape tensor representation. 
Consider making the " + "array contiguous first with np.ascontiguousarray().") + break + + values = array([np.ravel(arr, order="K") for arr in arrays], list_(arrow_type)) + shapes = array(shape_rows, list_(int32(), list_size=ndim)) + struct_arr = StructArray.from_arrays([values, shapes], names=["data", "shape"]) + + if np.array_equal(normalized_permutation, np.arange(ndim)): + permutation_metadata = None + else: + permutation_metadata = normalized_permutation + + ext_type = variable_shape_tensor( + arrow_type, + ndim, + dim_names=dim_names, + permutation=permutation_metadata, + uniform_shape=uniform_shape, + ) + return ExtensionArray.from_storage(ext_type, struct_arr) + + def to_numpy_ndarray_list(self): + """ + Convert variable shape tensor extension array to a list of numpy.ndarrays. + + Returns + ------- + list + List containing one ndarray per valid element and None for null elements. + """ + return [x.to_numpy() if x.is_valid else None for x in self] + + def to_row_splits(self): + """ + Return row_splits/offsets for the flattened values representation. + + Returns + ------- + Int32Array + One more element than the array length. First value is always 0. + """ + offsets = self.storage.field("data").offsets + if len(offsets) == 0: + return offsets + base = offsets[0].as_py() + if base == 0: + return offsets + return _pc().subtract(offsets, base) + + def to_offsets(self): + """ + Alias for :meth:`to_row_splits`. + """ + return self.to_row_splits() + + @staticmethod + def from_row_splits(values, row_splits, shapes, dim_names=None, permutation=None, + uniform_shape=None, value_type=None, ndim=None): + """ + Construct a VariableShapeTensorArray from flat values and row_splits. + + Parameters + ---------- + values : array-like + Flat tensor values. + row_splits : array-like + Monotonically increasing boundaries into ``values`` with + ``row_splits[0] == 0``. + shapes : array-like + Physical tensor shapes for each row. 
+ dim_names : tuple or list of strings, default None + Explicit names for tensor dimensions. + permutation : tuple or list of integers, default None + Physical permutation of tensor dimensions. + uniform_shape : tuple or list of integers or None, default None + Optional known uniform dimensions in physical order. If None, + inferred from shapes. + value_type : pyarrow.DataType or numpy dtype, default None + Optional explicit tensor value type. Required when values are + empty. + ndim : int, default None + Optional explicit tensor rank. If None, inferred from shapes. + """ + cdef: + object values_arr + object row_splits_arr + object shape_arr + list splits + int i + object shape_type + object data + object struct_arr + object ext_type + list shape_rows + + if value_type is not None and not isinstance(value_type, DataType): + try: + value_type = from_numpy_dtype(np.dtype(value_type)) + except (TypeError, ValueError) as exc: + raise TypeError( + "value_type must be a pyarrow.DataType or numpy dtype" + ) from exc + + values_arr = asarray(values, type=value_type) + if isinstance(values_arr, ChunkedArray): + if values_arr.num_chunks != 1: + raise TypeError("values must be an Array or single-chunk ChunkedArray") + values_arr = values_arr.chunk(0) + + if len(values_arr) == 0 and value_type is None: + raise TypeError("value_type must be provided when values are empty") + + row_splits_arr = asarray(row_splits, type=int32()) + if isinstance(row_splits_arr, ChunkedArray): + if row_splits_arr.num_chunks != 1: + raise TypeError( + "row_splits must be an Array or single-chunk ChunkedArray") + row_splits_arr = row_splits_arr.chunk(0) + + splits = row_splits_arr.to_pylist() + if len(splits) == 0: + raise ValueError("row_splits must contain at least one value") + if any(x is None for x in splits): + raise ValueError("row_splits must not contain nulls") + if splits[0] != 0: + raise ValueError("row_splits must start with 0") + for i in range(1, len(splits)): + if splits[i] < splits[i 
- 1]: + raise ValueError("row_splits must be monotonically non-decreasing") + if splits[-1] != len(values_arr): + raise ValueError( + f"row_splits[-1] ({splits[-1]}) must equal len(values) ({len(values_arr)})") + + if ndim is None: + if isinstance(shapes, Array): + if shapes.type.id != _Type_FIXED_SIZE_LIST: + raise TypeError("shapes must be fixed-size list of int32 values") + ndim = shapes.type.list_size + else: + shape_rows = list(shapes) + if len(shape_rows) == 0: + raise ValueError("ndim must be provided when shapes is empty") + ndim = len(shape_rows[0]) + elif ndim < 0: + raise ValueError("ndim must be non-negative") + + _validate_dim_names(dim_names, ndim) + permutation = _validate_permutation(permutation, ndim) + + shape_type = list_(int32(), list_size=ndim) + shape_arr = asarray(shapes, type=shape_type) + if isinstance(shape_arr, ChunkedArray): + if shape_arr.num_chunks != 1: + raise TypeError("shapes must be an Array or single-chunk ChunkedArray") + shape_arr = shape_arr.chunk(0) + if len(shape_arr) != len(splits) - 1: + raise ValueError( + (f"shapes length ({len(shape_arr)}) must equal number of rows " + f"({len(splits) - 1})")) + + shape_rows = shape_arr.to_pylist() + + # Validate that each row's shape product matches its segment length + for i in range(len(splits) - 1): + expected_size = 1 + for dim in shape_rows[i]: + expected_size *= dim + actual_size = splits[i + 1] - splits[i] + if expected_size != actual_size: + raise ValueError( + (f"shapes[{i}] product ({expected_size}) does not match " + f"row_splits interval ({actual_size})")) + + if uniform_shape is not None: + _validate_uniform_shape(uniform_shape, ndim) + for i, value in enumerate(uniform_shape): + if value is not None: + if any(shape[i] != value for shape in shape_rows): + raise ValueError( + (f"uniform_shape[{i}]={value} does not match " + f"shape dimension values")) + else: + uniform_shape = _infer_uniform_shape(shape_rows, ndim) + + data = ListArray.from_arrays(row_splits_arr, 
values_arr) + struct_arr = StructArray.from_arrays([data, shape_arr], names=["data", "shape"]) + ext_type = variable_shape_tensor( + values_arr.type, + ndim, + dim_names=dim_names, + permutation=permutation, + uniform_shape=uniform_shape + ) + return ExtensionArray.from_storage(ext_type, struct_arr) + + @staticmethod + def from_offsets(values, offsets, shapes, dim_names=None, permutation=None, + uniform_shape=None, value_type=None, ndim=None): + """ + Alias for :meth:`from_row_splits`. + + Parameters + ---------- + values : array-like + Flat tensor values. + offsets : array-like + Monotonically increasing boundaries into ``values`` with + ``offsets[0] == 0``. + shapes : array-like + Physical tensor shapes for each row. + dim_names : tuple or list of strings, default None + Explicit names for tensor dimensions. + permutation : tuple or list of integers, default None + Physical permutation of tensor dimensions. + uniform_shape : tuple or list of integers or None, default None + Optional known uniform dimensions in physical order. If None, + inferred from shapes. + value_type : pyarrow.DataType or numpy dtype, default None + Optional explicit tensor value type. Required when values are + empty. + ndim : int, default None + Optional explicit tensor rank. If None, inferred from shapes. 
+ """ + return VariableShapeTensorArray.from_row_splits( + values, + offsets, + shapes, + dim_names=dim_names, + permutation=permutation, + uniform_shape=uniform_shape, + value_type=value_type, + ndim=ndim, + ) + + cdef dict _array_classes = { _Type_NA: NullArray, _Type_BOOL: BooleanArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e96a7d84696..281693325c3 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -908,6 +908,14 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: const shared_ptr[CBuffer] null_bitmap, ) + @staticmethod + CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( + shared_ptr[CDataType], + const shared_ptr[CArray]& offsets, + const shared_ptr[CArray]& keys, + const shared_ptr[CArray]& items, + CMemoryPool* pool) + shared_ptr[CArray] keys() shared_ptr[CArray] items() CMapType* map_type() @@ -1184,6 +1192,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: void set_chunksize(int64_t chunksize) cdef cppclass CTensor" arrow::Tensor": + CTensor(const shared_ptr[CDataType]& type, + const shared_ptr[CBuffer]& data, + const vector[int64_t]& shape, + const vector[int64_t]& strides, + const vector[c_string]& dim_names) shared_ptr[CDataType] type() shared_ptr[CBuffer] data() @@ -3014,6 +3027,24 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() +cdef extern from "arrow/extension/variable_shape_tensor.h" namespace "arrow::extension" nogil: + cdef cppclass CVariableShapeTensorType \ + " arrow::extension::VariableShapeTensorType"(CExtensionType): + + CResult[shared_ptr[CTensor]] MakeTensor(const shared_ptr[CExtensionScalar]& scalar) const + + @staticmethod + CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]& value_type, + const int32_t ndim, + const vector[int64_t] permutation, + const vector[c_string] dim_names, + const vector[optional[int64_t]] uniform_shape) + + const 
shared_ptr[CDataType] value_type() + const int32_t ndim() + const vector[int64_t] permutation() + const vector[c_string] dim_names() + const vector[optional[int64_t]] uniform_shape() cdef extern from "arrow/extension/json.h" namespace "arrow::extension" nogil: cdef cppclass CJsonType" arrow::extension::JsonExtensionType"(CExtensionType): @@ -3034,7 +3065,7 @@ cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil: cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension" nogil: cdef cppclass CFixedShapeTensorType \ - " arrow::extension::FixedShapeTensorType"(CExtensionType): + " arrow::extension::FixedShapeTensorType"(CExtensionType) nogil: CResult[shared_ptr[CTensor]] MakeTensor(const shared_ptr[CExtensionScalar]& scalar) const diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 683faa7855c..0f16a3e3d82 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -195,6 +195,11 @@ cdef class ExtensionType(BaseExtensionType): const CPyExtensionType* cpy_ext_type +cdef class VariableShapeTensorType(BaseExtensionType): + cdef: + const CVariableShapeTensorType* tensor_ext_type + + cdef class FixedShapeTensorType(BaseExtensionType): cdef: const CFixedShapeTensorType* tensor_ext_type diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index d1fa1192deb..b0035ce79cb 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -131,6 +131,8 @@ cdef api object pyarrow_wrap_data_type( out = Bool8Type.__new__(Bool8Type) elif extension_name == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) + elif extension_name == b"arrow.variable_shape_tensor": + out = VariableShapeTensorType.__new__(VariableShapeTensorType) elif extension_name == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) elif extension_name == b"arrow.uuid": diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 83cabcf447d..c0851e39e68 100644 
--- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1484,7 +1484,7 @@ cdef class FixedShapeTensorScalar(ExtensionScalar): The resulting ndarray's shape matches the permuted shape of the fixed shape tensor scalar. - The conversion is zero-copy. + The conversion is zero-copy if data is primitive numeric and without nulls. Returns ------- @@ -1539,6 +1539,47 @@ cdef class Bool8Scalar(ExtensionScalar): py_val = super().as_py() return None if py_val is None else py_val != 0 + +cdef class VariableShapeTensorScalar(ExtensionScalar): + """ + Concrete class for variable shape tensor extension scalar. + """ + + def to_numpy(self): + """ + Convert variable shape tensor extension scalar to a numpy.ndarray. + + The conversion is zero-copy if data is primitive numeric and without nulls. + + Returns + ------- + numpy.ndarray + """ + if not self.is_valid: + raise ValueError("Cannot convert null scalar to numpy array") + return self.to_tensor().to_numpy() + + def to_tensor(self): + """ + Convert variable shape tensor extension scalar to a pyarrow.Tensor. 
+ + Returns + ------- + tensor : pyarrow.Tensor + """ + if not self.is_valid: + raise ValueError("Cannot convert null scalar to Tensor") + cdef: + CVariableShapeTensorType* c_type = static_pointer_cast[CVariableShapeTensorType, CDataType]( + self.wrapped.get().type).get() + shared_ptr[CExtensionScalar] scalar = static_pointer_cast[CExtensionScalar, CScalar](self.wrapped) + shared_ptr[CTensor] ctensor + + with nogil: + ctensor = GetResultValue(c_type.MakeTensor(scalar)) + return pyarrow_wrap_tensor(ctensor) + + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, _Type_UINT8: UInt8Scalar, diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index c947b06e0e9..f45c81a1b0d 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -28,7 +28,6 @@ import numpy as np except ImportError: np = None - import pyarrow as pa from pyarrow.vendored.version import Version @@ -1399,7 +1398,7 @@ def test_uuid_extension(): assert isinstance(array[0], pa.UuidScalar) -def test_tensor_type(): +def test_fixed_shape_tensor_type(): tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3]) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" assert tensor_type.storage_type == pa.list_(pa.int8(), 6) @@ -1424,6 +1423,56 @@ def test_tensor_type(): assert tensor_type.permutation is None +def test_variable_shape_tensor_type(): + tensor_type = pa.variable_shape_tensor(pa.int8(), 2) + expected_storage_type = pa.struct([ + pa.field("data", pa.list_(pa.int8())), + pa.field("shape", pa.list_(pa.int32(), 2)) + ]) + assert tensor_type.extension_name == "arrow.variable_shape_tensor" + assert tensor_type.storage_type == expected_storage_type + assert tensor_type.ndim == 2 + assert tensor_type.dim_names is None + assert tensor_type.permutation is None + assert tensor_type.uniform_shape is None + + tensor_type = pa.variable_shape_tensor(pa.int64(), 3, dim_names=['C', 'H', 'W']) + 
expected_storage_type = pa.struct([ + pa.field("data", pa.list_(pa.int64())), + pa.field("shape", pa.list_(pa.int32(), 3)) + ]) + assert tensor_type.extension_name == "arrow.variable_shape_tensor" + assert tensor_type.storage_type == expected_storage_type + assert tensor_type.ndim == 3 + assert tensor_type.dim_names == ['C', 'H', 'W'] + assert tensor_type.permutation is None + assert tensor_type.uniform_shape is None + + tensor_type = pa.variable_shape_tensor(pa.bool_(), 2, permutation=[1, 0]) + expected_storage_type = pa.struct([ + pa.field("data", pa.list_(pa.bool_())), + pa.field("shape", pa.list_(pa.int32(), 2)) + ]) + assert tensor_type.extension_name == "arrow.variable_shape_tensor" + assert tensor_type.storage_type == expected_storage_type + assert tensor_type.ndim == 2 + assert tensor_type.dim_names is None + assert tensor_type.permutation == [1, 0] + assert tensor_type.uniform_shape is None + + tensor_type = pa.variable_shape_tensor(pa.float64(), 2, uniform_shape=[1, None]) + expected_storage_type = pa.struct([ + pa.field("data", pa.list_(pa.float64())), + pa.field("shape", pa.list_(pa.int32(), 2)) + ]) + assert tensor_type.extension_name == "arrow.variable_shape_tensor" + assert tensor_type.storage_type == expected_storage_type + assert tensor_type.ndim == 2 + assert tensor_type.dim_names is None + assert tensor_type.permutation is None + assert tensor_type.uniform_shape == [1, None] + + @pytest.mark.numpy @pytest.mark.parametrize("np_type_str", ("int8", "int64", "float32")) def test_tensor_class_methods(np_type_str): @@ -1589,6 +1638,413 @@ def test_tensor_array_from_numpy(np_type_str): @pytest.mark.numpy +@pytest.mark.parametrize("value_type", ( + "int8", "int32", "int64", "float64")) +def test_variable_shape_tensor_class_methods(value_type): + value_type = getattr(np, value_type) + ndim = 2 + shape_type = pa.list_(pa.int32(), ndim) + arrow_type = pa.from_numpy_dtype(value_type) + tensor_type = pa.variable_shape_tensor( + arrow_type, + ndim, + 
dim_names=["H", "W"], + permutation=[0, 1], + uniform_shape=[None, None], + ) + fields = [pa.field("data", pa.list_(arrow_type)), pa.field("shape", shape_type)] + + shapes = pa.array([[2, 3], [2, 1]], shape_type) + values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(arrow_type)) + struct_arr = pa.StructArray.from_arrays([values, shapes], fields=fields) + arr = pa.ExtensionArray.from_storage(tensor_type, struct_arr) + basic_arr = pa.ExtensionArray.from_storage( + pa.variable_shape_tensor(arrow_type, ndim, uniform_shape=[2, None]), struct_arr + ) + + storage = pa.array( + [([1, 2, 3, 4, 5, 6], [2, 3]), ([7, 8], [2, 1])], type=pa.struct(fields) + ) + assert pa.ExtensionArray.from_storage(tensor_type, storage).equals(arr) + + assert arr.type == tensor_type + + ndarray_list = [ + np.array([[1, 2, 3], [4, 5, 6]], dtype=value_type), + np.array([[7], [8]], dtype=value_type), + ] + list(np.testing.assert_array_equal(x.to_numpy(), y) for x, y in + zip(arr, ndarray_list)) + + from_numpy = pa.VariableShapeTensorArray.from_numpy_ndarray(ndarray_list) + assert from_numpy.equals(basic_arr) + assert from_numpy.type.uniform_shape == [2, None] + + assert arr.to_pylist() == [ + {"data": [1, 2, 3, 4, 5, 6], "shape": [2, 3]}, + {"data": [7, 8], "shape": [2, 1]}, + ] + + expected_0 = np.array([[1, 2, 3], [4, 5, 6]], dtype=value_type) + expected_1 = np.array([[7], [8]], dtype=value_type) + + np.testing.assert_array_equal(arr[0].to_tensor().to_numpy(), expected_0) + np.testing.assert_array_equal(arr[1].to_tensor().to_numpy(), expected_1) + + np.testing.assert_array_equal(arr[0].to_numpy(), expected_0) + np.testing.assert_array_equal(arr[1].to_numpy(), expected_1) + + assert arr[0].to_tensor().equals( + pa.Tensor.from_numpy(expected_0, dim_names=["H", "W"])) + + assert arr[1].to_tensor().equals( + pa.Tensor.from_numpy(expected_1, dim_names=["H", "W"])) + + shapes = pa.array([[2, 3], [0, 0]], shape_type) + values = pa.array([[1, 2, 3, 4, 5, 6], []], pa.list_(arrow_type)) + struct_arr = 
pa.StructArray.from_arrays([values, shapes], fields=fields) + arr = pa.ExtensionArray.from_storage(tensor_type, struct_arr) + np.testing.assert_array_equal(arr[1].to_tensor().to_numpy(), np.array( + [], dtype=value_type).reshape(shapes[1].as_py())) + + +@pytest.mark.numpy +@pytest.mark.parametrize("value_type", ("int8", "int64", "float32")) +def test_variable_shape_tensor_array_from_numpy(value_type): + value_type = np.dtype(value_type).type() + arrow_type = pa.from_numpy_dtype(value_type) + + arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], + dtype=value_type, order="C") + tensor_array_from_numpy = pa.VariableShapeTensorArray.from_numpy_ndarray([arr]) + assert isinstance(tensor_array_from_numpy.type, pa.VariableShapeTensorType) + assert tensor_array_from_numpy.type.value_type == arrow_type + assert tensor_array_from_numpy.type.ndim == 3 + assert tensor_array_from_numpy.type.permutation is None + assert tensor_array_from_numpy.type.uniform_shape == [2, 2, 3] + + f_arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], + dtype=value_type, order="F") + with pytest.raises(ValueError, match=r"obj\[1\] has permutation"): + pa.VariableShapeTensorArray.from_numpy_ndarray([f_arr, arr]) + with pytest.raises(ValueError, match=r"obj\[1\] has ndim"): + pa.VariableShapeTensorArray.from_numpy_ndarray([arr.reshape((12, 1)), arr]) + with pytest.raises(TypeError, match=r"obj\[1\] has dtype"): + pa.VariableShapeTensorArray.from_numpy_ndarray([arr.astype(np.int32()), arr]) + + from numpy.lib.stride_tricks import as_strided + + flat_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type) + bw = value_type.itemsize + + arr = flat_arr.reshape(1, 3, 4) + tensor_array_from_numpy = pa.VariableShapeTensorArray.from_numpy_ndarray([arr]) + assert tensor_array_from_numpy.type.ndim == 3 + assert tensor_array_from_numpy.type.permutation is None + assert tensor_array_from_numpy.type.uniform_shape == [1, 3, 4] + + arr = as_strided(flat_arr, 
shape=(1, 2, 3, 2), + strides=(bw * 12, bw * 6, bw, bw * 3)) + tensor_array_from_numpy = pa.VariableShapeTensorArray.from_numpy_ndarray([arr]) + assert tensor_array_from_numpy.type.ndim == 4 + assert tensor_array_from_numpy.type.permutation == [0, 1, 3, 2] + assert tensor_array_from_numpy[0].to_tensor() == pa.Tensor.from_numpy(arr) + assert tensor_array_from_numpy.type.uniform_shape == [1, 2, 2, 3] + + arr = flat_arr.reshape(1, 2, 3, 2) + result = pa.VariableShapeTensorArray.from_numpy_ndarray([arr]) + expected = np.array( + [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type) + np.testing.assert_array_equal(result[0].to_numpy(), expected) + + arr = np.arange(24, dtype=value_type).reshape((2, 3, 4)) + arr = np.transpose(arr, (2, 0, 1)) + result = pa.VariableShapeTensorArray.from_numpy_ndarray([arr]) + assert result.type.permutation == [1, 2, 0] + expected_tensor_view = np.transpose(arr, np.argsort(result.type.permutation)) + tensor = result[0].to_tensor() + assert list(tensor.shape) == list(expected_tensor_view.shape) + result_ndarray = result[0].to_numpy() + assert list(result_ndarray.shape) == list(expected_tensor_view.shape) + + arr = np.array([1, 2, 3, 4], dtype=value_type) + result = pa.VariableShapeTensorArray.from_numpy_ndarray([arr]) + assert result.type.ndim == 1 + assert result.type.uniform_shape == [4] + np.testing.assert_array_equal(result[0].to_numpy(), arr) + + arr = np.array(1, dtype=value_type) + with pytest.raises(ValueError, match="Cannot convert scalar"): + pa.VariableShapeTensorArray.from_numpy_ndarray([arr]) + + arr = np.array([], dtype=value_type).reshape((0)) + result = pa.VariableShapeTensorArray.from_numpy_ndarray([arr]) + assert result.type.ndim == 1 + np.testing.assert_array_equal(result[0].to_numpy(), arr) + + arr = np.array([], dtype=value_type).reshape((0, 3, 2)) + result = pa.VariableShapeTensorArray.from_numpy_ndarray([arr]) + assert result.type.ndim == 3 + np.testing.assert_array_equal(result[0].to_numpy(), 
arr) + + arr = np.array([], dtype=value_type).reshape((3, 0, 2)) + result = pa.VariableShapeTensorArray.from_numpy_ndarray([arr]) + assert result.type.ndim == 3 + np.testing.assert_array_equal(result[0].to_numpy(), arr) + + +@pytest.mark.numpy +@pytest.mark.parametrize("dtype", ( + "int8", "int16", "int32", "int64", "float32", "float64")) +def test_variable_shape_tensor_roundtrip_2d(dtype): + """Roundtrip C-contiguous 2D arrays through variable shape tensor.""" + dtype = np.dtype(dtype) + arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=dtype) + + result = pa.VariableShapeTensorArray.from_numpy_ndarray([arr]) + assert result.type.permutation is None + assert result.type.ndim == 2 + + tensor = result[0].to_tensor() + assert tensor == pa.Tensor.from_numpy(arr) + assert list(tensor.shape) == list(arr.shape) + assert list(tensor.strides) == list(arr.strides) + np.testing.assert_array_equal(result[0].to_numpy(), arr) + + stored_shape = result.storage.field("shape")[0].as_py() + assert stored_shape == list(arr.shape) + + # from_storage constructor path + arrow_type = pa.from_numpy_dtype(dtype) + tensor_type = pa.variable_shape_tensor(arrow_type, 2) + fields = [ + pa.field("data", pa.list_(arrow_type)), + pa.field("shape", pa.list_(pa.int32(), 2)), + ] + values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], + pa.list_(arrow_type)) + shapes = pa.array([[2, 3], [2, 1]], pa.list_(pa.int32(), 2)) + storage = pa.StructArray.from_arrays([values, shapes], fields=fields) + ext_arr = pa.ExtensionArray.from_storage(tensor_type, storage) + + assert isinstance(ext_arr, pa.VariableShapeTensorArray) + assert ext_arr.type == tensor_type + expected_0 = np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype) + expected_1 = np.array([[7], [8]], dtype=dtype) + np.testing.assert_array_equal(ext_arr[0].to_numpy(), expected_0) + np.testing.assert_array_equal(ext_arr[1].to_numpy(), expected_1) + + +@pytest.mark.numpy +def test_variable_shape_tensor_from_numpy_empty_input_schema(): + arr = 
pa.VariableShapeTensorArray.from_numpy_ndarray( + [], + value_type=pa.int32(), + ndim=2, + dim_names=["H", "W"], + permutation=[1, 0], + uniform_shape=[None, None], + ) + assert len(arr) == 0 + assert arr.type.value_type == pa.int32() + assert arr.type.ndim == 2 + assert arr.type.dim_names == ["H", "W"] + assert arr.type.permutation == [1, 0] + assert arr.type.uniform_shape == [None, None] + + with pytest.raises(ValueError, match="both value_type and ndim must be provided"): + pa.VariableShapeTensorArray.from_numpy_ndarray([]) + + +@pytest.mark.numpy +def test_variable_shape_tensor_from_numpy_validation(): + arr = np.array([[1, 2], [3, 4]], dtype=np.int32) + result = pa.VariableShapeTensorArray.from_numpy_ndarray((arr,)) + assert result.type.ndim == 2 + + with pytest.raises(TypeError, match="obj must be a sequence of numpy arrays"): + pa.VariableShapeTensorArray.from_numpy_ndarray(arr) + + with pytest.raises(TypeError, match=r"obj\[1\] must be a numpy.ndarray"): + pa.VariableShapeTensorArray.from_numpy_ndarray([arr, [1, 2]]) + + with pytest.raises(ValueError, match="permutation must contain each dimension"): + pa.VariableShapeTensorArray.from_numpy_ndarray([arr], permutation=[0, 0]) + + with pytest.raises(ValueError, match=r"obj\[0\] has permutation"): + pa.VariableShapeTensorArray.from_numpy_ndarray([arr], permutation=[1, 0]) + + with pytest.raises(ValueError, match=r"uniform_shape\[0\]=3"): + pa.VariableShapeTensorArray.from_numpy_ndarray([arr], uniform_shape=[3, None]) + + +@pytest.mark.numpy +def test_variable_shape_tensor_row_split_adapters(): + values = pa.array([1, 2, 3, 4, 5, 6, 7, 8], pa.int32()) + row_splits = pa.array([0, 6, 8], pa.int32()) + shapes = pa.array([[2, 3], [2, 1]], pa.list_(pa.int32(), 2)) + + arr = pa.VariableShapeTensorArray.from_row_splits( + values, row_splits, shapes, dim_names=["H", "W"]) + arr2 = pa.VariableShapeTensorArray.from_offsets( + values, row_splits, shapes, dim_names=["H", "W"]) + + assert arr.equals(arr2) + assert 
arr.type.dim_names == ["H", "W"] + assert arr.type.uniform_shape == [2, None] + assert arr.to_row_splits().to_pylist() == [0, 6, 8] + assert arr.to_offsets().to_pylist() == [0, 6, 8] + assert arr[1:].to_row_splits().to_pylist() == [0, 2] + + ndarray_list = arr.to_numpy_ndarray_list() + np.testing.assert_array_equal( + ndarray_list[0], np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)) + np.testing.assert_array_equal( + ndarray_list[1], np.array([[7], [8]], dtype=np.int32)) + + +@pytest.mark.numpy +def test_variable_shape_tensor_from_row_splits_validation(): + shapes = pa.array([[2, 3], [2, 1]], pa.list_(pa.int32(), 2)) + + # Empty row_splits + with pytest.raises(ValueError, match="at least one value"): + pa.VariableShapeTensorArray.from_row_splits( + pa.array([], pa.int32()), pa.array([], pa.int32()), shapes, + value_type=pa.int32()) + + # Nulls in row_splits + with pytest.raises(ValueError, match="must not contain nulls"): + pa.VariableShapeTensorArray.from_row_splits( + pa.array([1, 2, 3], pa.int32()), + pa.array([0, None, 3], pa.int32()), shapes) + + # Non-zero start + with pytest.raises(ValueError, match="must start with 0"): + pa.VariableShapeTensorArray.from_row_splits( + pa.array([1, 2, 3], pa.int32()), + pa.array([1, 3], pa.int32()), + pa.array([[1, 3]], pa.list_(pa.int32(), 2))) + + # Non-monotonic row_splits + with pytest.raises(ValueError, match="monotonically non-decreasing"): + pa.VariableShapeTensorArray.from_row_splits( + pa.array([1, 2, 3, 4, 5, 6, 7, 8], pa.int32()), + pa.array([0, 8, 6], pa.int32()), shapes) + + # row_splits[-1] != len(values) + with pytest.raises(ValueError, match="must equal len"): + pa.VariableShapeTensorArray.from_row_splits( + pa.array([1, 2, 3], pa.int32()), + pa.array([0, 6, 8], pa.int32()), shapes) + + # Shapes length mismatch + with pytest.raises(ValueError, match="shapes length"): + pa.VariableShapeTensorArray.from_row_splits( + pa.array([1, 2, 3, 4, 5, 6, 7, 8], pa.int32()), + pa.array([0, 6, 8], pa.int32()), + 
pa.array([[2, 3]], pa.list_(pa.int32(), 2))) + + # Shape product doesn't match segment length + with pytest.raises(ValueError, match="shapes.*product.*does not match"): + pa.VariableShapeTensorArray.from_row_splits( + pa.array([1, 2, 3, 4, 5, 6, 7, 8], pa.int32()), + pa.array([0, 6, 8], pa.int32()), + # shape [2,2] claims 4 elements but segment has 6 + pa.array([[2, 2], [2, 1]], pa.list_(pa.int32(), 2))) + + # Invalid permutation + with pytest.raises(ValueError, match="permutation must contain each dimension"): + pa.VariableShapeTensorArray.from_row_splits( + pa.array([1, 2, 3, 4, 5, 6, 7, 8], pa.int32()), + pa.array([0, 6, 8], pa.int32()), shapes, + permutation=[0, 0]) + + # Invalid dim_names + with pytest.raises(ValueError, match="length of dim_names"): + pa.VariableShapeTensorArray.from_row_splits( + pa.array([1, 2, 3, 4, 5, 6, 7, 8], pa.int32()), + pa.array([0, 6, 8], pa.int32()), shapes, + dim_names=["only_one"]) + + # Invalid uniform_shape + with pytest.raises(ValueError, match="uniform_shape.*does not match"): + pa.VariableShapeTensorArray.from_row_splits( + pa.array([1, 2, 3, 4, 5, 6, 7, 8], pa.int32()), + pa.array([0, 6, 8], pa.int32()), shapes, + uniform_shape=[99, None]) + + # Negative ndim + with pytest.raises(ValueError, match="ndim must be non-negative"): + pa.VariableShapeTensorArray.from_row_splits( + pa.array([1, 2, 3, 4, 5, 6, 7, 8], pa.int32()), + pa.array([0, 6, 8], pa.int32()), shapes, + ndim=-1) + + # Single-element array (happy path edge case) + arr = pa.VariableShapeTensorArray.from_row_splits( + pa.array([1, 2, 3, 4, 5, 6], pa.int32()), + pa.array([0, 6], pa.int32()), + pa.array([[2, 3]], pa.list_(pa.int32(), 2))) + assert len(arr) == 1 + np.testing.assert_array_equal( + arr[0].to_numpy(), + np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)) + + # value_type as numpy dtype string + arr = pa.VariableShapeTensorArray.from_row_splits( + pa.array([1, 2, 3, 4, 5, 6], pa.int32()), + pa.array([0, 6], pa.int32()), + pa.array([[2, 3]], 
pa.list_(pa.int32(), 2)), + value_type="int32") + assert arr.type.value_type == pa.int32() + + +@pytest.mark.numpy +def test_variable_shape_tensor_to_numpy_ndarray_list_with_nulls(): + tensor_type = pa.variable_shape_tensor(pa.int32(), 2) + + data = pa.array([[1, 2, 3, 4, 5, 6], [10, 20], [7, 8]], pa.list_(pa.int32())) + shapes = pa.array([[2, 3], [1, 2], [2, 1]], + pa.list_(pa.int32(), 2)) + mask = pa.array([False, True, False]) + + storage = pa.StructArray.from_arrays( + [data, shapes], names=["data", "shape"], mask=mask) + arr = pa.ExtensionArray.from_storage(tensor_type, storage) + + result = arr.to_numpy_ndarray_list() + assert len(result) == 3 + np.testing.assert_array_equal( + result[0], np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)) + assert result[1] is None + np.testing.assert_array_equal( + result[2], np.array([[7], [8]], dtype=np.int32)) + + +@pytest.mark.numpy +def test_variable_shape_tensor_metadata_roundtrip_from_numpy(): + ndarray_list = [ + np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32), + np.array([[7], [8]], dtype=np.int32), + ] + arr = pa.VariableShapeTensorArray.from_numpy_ndarray( + ndarray_list, dim_names=["H", "W"]) + batch = pa.RecordBatch.from_arrays([arr], ["ext"]) + + buf = ipc_write_batch(batch) + batch = ipc_read_batch(buf) + result = batch.column(0) + + assert isinstance(result, pa.VariableShapeTensorArray) + assert result.type.dim_names == ["H", "W"] + assert result.type.permutation is None + assert result.type.uniform_shape == [2, None] + out_list = result.to_numpy_ndarray_list() + np.testing.assert_array_equal(out_list[0], ndarray_list[0]) + np.testing.assert_array_equal(out_list[1], ndarray_list[1]) + + @pytest.mark.parametrize("tensor_type", ( pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]), pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]), @@ -1619,6 +2075,48 @@ def test_tensor_type_ipc(tensor_type): assert result.type.shape == [2, 2, 3] +@pytest.mark.parametrize("tensor_type", ( + 
pa.variable_shape_tensor(pa.int8(), 2), + pa.variable_shape_tensor(pa.int8(), 2, permutation=[1, 0]), + pa.variable_shape_tensor(pa.int8(), 2, dim_names=['H', 'W']), + pa.variable_shape_tensor(pa.int8(), 2, uniform_shape=[None, None]), +)) +def test_variable_shape_tensor_type_ipc(tensor_type): + values_type = tensor_type.storage_type.field(0).type + shape_type = tensor_type.storage_type.field(1).type + values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], values_type) + shapes = pa.array([[2, 3], [1, 2]], shape_type) + + struct_arr = pa.StructArray.from_arrays([values, shapes], names=["data", "shape"]) + arr = pa.ExtensionArray.from_storage(tensor_type, struct_arr) + batch = pa.RecordBatch.from_arrays([arr], ["ext"]) + + # check the built array has exactly the expected class + tensor_class = tensor_type.__arrow_ext_class__() + assert isinstance(arr, tensor_class) + + buf = ipc_write_batch(batch) + del batch + batch = ipc_read_batch(buf) + + result = batch.column(0) + # check the deserialized array class is the expected one + assert isinstance(result, tensor_class) + assert result.type.extension_name == "arrow.variable_shape_tensor" + assert arr.storage.to_pylist() == [ + {"data": [1, 2, 3, 4, 5, 6], "shape": [2, 3]}, + {"data": [7, 8], "shape": [1, 2]}, + ] + + # we get back an actual TensorType + assert isinstance(result.type, pa.VariableShapeTensorType) + assert result.type.value_type == pa.int8() + assert result.type.ndim == 2 + assert result.type.permutation == tensor_type.permutation + assert result.type.dim_names == tensor_type.dim_names + assert result.type.uniform_shape == tensor_type.uniform_shape + + def test_tensor_type_equality(): tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" @@ -1628,6 +2126,14 @@ def test_tensor_type_equality(): assert tensor_type == tensor_type2 assert not tensor_type == tensor_type3 + tensor_type = pa.variable_shape_tensor(pa.int8(), 2) + assert 
tensor_type.extension_name == "arrow.variable_shape_tensor" + + tensor_type2 = pa.variable_shape_tensor(pa.int8(), 2) + tensor_type3 = pa.variable_shape_tensor(pa.uint8(), 2) + assert tensor_type == tensor_type2 + assert not tensor_type == tensor_type3 + def test_tensor_type_cast(): tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3]) @@ -1693,7 +2199,7 @@ def test_extension_to_pandas_storage_type(registered_period_type): assert isinstance(result["ext"].dtype, pd.ArrowDtype) -def test_tensor_type_is_picklable(pickle_module): +def test_fixed_shape_tensor_type_is_picklable(pickle_module): # GH-35599 expected_type = pa.fixed_shape_tensor(pa.int32(), (2, 2)) @@ -1709,6 +2215,34 @@ def test_tensor_type_is_picklable(pickle_module): assert result == expected_arr +def test_variable_shape_tensor_type_is_picklable(pickle_module): + expected_type = pa.variable_shape_tensor(pa.int32(), 2) + result = pickle_module.loads(pickle_module.dumps(expected_type)) + + assert result == expected_type + + shapes = pa.array([[2, 3], [1, 2]], pa.list_(pa.int32(), 2)) + values = pa.array([[1, 2, 3, 4, 5, 6], [7, 8]], pa.list_(pa.int32())) + arr = pa.StructArray.from_arrays([values, shapes], names=["data", "shape"]) + expected_arr = pa.ExtensionArray.from_storage(expected_type, arr) + + result = pickle_module.loads(pickle_module.dumps(expected_arr)) + + assert result == expected_arr + + # Pickle with all optional params populated + expected_type = pa.variable_shape_tensor( + pa.float32(), 3, + dim_names=['C', 'H', 'W'], + permutation=[0, 2, 1], + uniform_shape=[3, None, None]) + result = pickle_module.loads(pickle_module.dumps(expected_type)) + assert result == expected_type + assert result.dim_names == ['C', 'H', 'W'] + assert result.permutation == [0, 2, 1] + assert result.uniform_shape == [3, None, None] + + @pytest.mark.parametrize(("tensor_type", "text"), [ ( pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]), diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 
e84f1b073f6..58aeedd3253 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1988,6 +1988,105 @@ cdef class UuidType(BaseExtensionType): return UuidScalar +cdef class VariableShapeTensorType(BaseExtensionType): + """ + Concrete class for variable shape tensor extension type. + + Examples + -------- + Create an instance of variable shape tensor extension type: + + >>> import pyarrow as pa + >>> pa.variable_shape_tensor(pa.int32(), 2) + VariableShapeTensorType(extension) + + Create an instance of variable shape tensor extension type with + permutation: + + >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3, + ... permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.tensor_ext_type = type.get() + + @property + def value_type(self): + """ + Data type of an individual tensor. + """ + return pyarrow_wrap_data_type(self.tensor_ext_type.value_type()) + + @property + def ndim(self): + """ + Number of dimensions of the tensors. + """ + return self.tensor_ext_type.ndim() + + @property + def dim_names(self): + """ + Explicit names of the dimensions. + """ + list_of_bytes = self.tensor_ext_type.dim_names() + if len(list_of_bytes) != 0: + return [frombytes(x) for x in list_of_bytes] + else: + return None + + @property + def permutation(self): + """ + Indices of the dimensions ordering. + """ + indices = self.tensor_ext_type.permutation() + if len(indices) != 0: + return indices + else: + return None + + @property + def uniform_shape(self): + """ + Shape over dimensions that are guaranteed to be constant. + + Returns None if no uniform shape metadata is present on the type. + When present, returns a list of length ``ndim`` where integer values + indicate uniform (fixed) dimensions and None values indicate + variable dimensions. 
Note that ``[None, None]`` (metadata present, + all dimensions variable) is distinct from ``None`` (no metadata). + """ + cdef: + vector[optional[int64_t]] c_uniform_shape = self.tensor_ext_type.uniform_shape() + length = c_uniform_shape.size() + + if length == 0: + return None + + uniform_shape = [] + for i in range(length): + if c_uniform_shape[i].has_value(): + uniform_shape.append(c_uniform_shape[i].value()) + else: + uniform_shape.append(None) + + return uniform_shape + + def __arrow_ext_class__(self): + return VariableShapeTensorArray + + def __reduce__(self): + return variable_shape_tensor, (self.value_type, self.ndim, + self.dim_names, self.permutation, self.uniform_shape) + + def __arrow_ext_scalar_class__(self): + return VariableShapeTensorScalar + + cdef class FixedShapeTensorType(BaseExtensionType): """ Concrete class for fixed shape tensor extension type. @@ -5758,6 +5857,123 @@ def opaque(DataType storage_type, str type_name not None, str vendor_name not No return out + +def variable_shape_tensor(DataType value_type, ndim, dim_names=None, permutation=None, + uniform_shape=None): + """ + Create instance of variable shape tensor extension type with number of + dimensions and optional names of tensor dimensions and indices of the + desired logical ordering of dimensions. + + Parameters + ---------- + value_type : DataType + Data type of individual tensor elements. + ndim : integer + The number of dimensions of the contained tensors. + dim_names : tuple or list of strings, default None + Explicit names of tensor dimensions. + permutation : tuple or list of integers, default None + Indices of the desired ordering of the original dimensions. + The indices contain a permutation of the values ``[0, 1, .., N-1]`` where + N is the number of dimensions. The permutation indicates which dimension + of the logical layout corresponds to which dimension of the physical tensor. + For more information on this parameter see + :ref:`variable_shape_tensor_extension`. 
+ uniform_shape : tuple or list of integers, default None + Shape of dimensions that are guaranteed to stay constant over all tensors + in the array if all their non-uniform sizes were replaced by None. + + Examples + -------- + Create an instance of variable shape tensor extension type: + + >>> import pyarrow as pa + >>> tensor_type = pa.variable_shape_tensor(pa.int32(), 2) + >>> tensor_type + VariableShapeTensorType(extension) + + Inspect the data type: + + >>> tensor_type.value_type + DataType(int32) + >>> tensor_type.ndim + 2 + + Create a table with variable shape tensor extension array: + + >>> fields = [pa.field("data", pa.list_(pa.int32())), pa.field("shape", pa.list_(pa.int32(), 2))] + >>> storage = pa.array([([1, 2, 3, 4, 5, 6], [2, 3]), ([7, 8], [1, 2])], type=pa.struct(fields)) + >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage) + >>> pa.table([tensor], names=["tensor_array"]) + pyarrow.Table + tensor_array: extension + ---- + tensor_array: [ -- is_valid: all not null + -- child 0 type: list + [[1,2,3,4,5,6],[7,8]] + -- child 1 type: fixed_size_list[2] + [[2,3],[1,2]]] + + Create an instance of variable shape tensor extension type with names + of tensor dimensions: + + >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3, + ... dim_names=['C', 'H', 'W']) + >>> tensor_type.dim_names + ['C', 'H', 'W'] + + Create an instance of variable shape tensor extension type with + permutation: + + >>> tensor_type = pa.variable_shape_tensor(pa.int8(), 3, + ... 
permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + + Returns + ------- + type : VariableShapeTensorType + """ + + cdef: + int32_t c_ndim + vector[int64_t] c_permutation + vector[c_string] c_dim_names + vector[optional[int64_t]] c_uniform_shape + shared_ptr[CDataType] c_tensor_ext_type + + if value_type is None: + raise TypeError('value_type must not be None') + if ndim is None: + raise TypeError('ndim must not be None') + + c_ndim = ndim + + if permutation is not None: + for i in permutation: + c_permutation.push_back(i) + + if dim_names is not None: + for x in dim_names: + c_dim_names.push_back(tobytes(x)) + + if uniform_shape is not None: + for x in uniform_shape: + if x is None: + c_uniform_shape.push_back(nullopt) + else: + c_uniform_shape.push_back((x)) + + cdef VariableShapeTensorType out = VariableShapeTensorType.__new__(VariableShapeTensorType) + + with nogil: + c_tensor_ext_type = GetResultValue(CVariableShapeTensorType.Make( + value_type.sp_type, c_ndim, c_permutation, c_dim_names, c_uniform_shape)) + + out.init(c_tensor_ext_type) + return out + + cdef dict _type_aliases = { 'null': null, 'bool': bool_,