From d86d427ecc332e205872ce15702f260d39c75fdb Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 9 Feb 2026 20:54:52 +0100 Subject: [PATCH] Add type stubs for core data structures Add type annotation stubs for array, table, tensor, builder, memory, device, config, and types modules. Includes type-ignore annotations in related tests. --- python/pyarrow-stubs/pyarrow/array.pyi | 894 +++++++++++++++++++ python/pyarrow-stubs/pyarrow/builder.pyi | 51 ++ python/pyarrow-stubs/pyarrow/config.pyi | 72 ++ python/pyarrow-stubs/pyarrow/device.pyi | 66 ++ python/pyarrow-stubs/pyarrow/memory.pyi | 94 ++ python/pyarrow-stubs/pyarrow/table.pyi | 686 ++++++++++++++ python/pyarrow-stubs/pyarrow/tensor.pyi | 268 ++++++ python/pyarrow-stubs/pyarrow/types.pyi | 227 +++++ python/pyarrow/array.pxi | 2 +- python/pyarrow/scalar.pxi | 2 +- python/pyarrow/tests/test_array.py | 71 +- python/pyarrow/tests/test_convert_builtin.py | 42 +- python/pyarrow/tests/test_device.py | 12 +- python/pyarrow/tests/test_schema.py | 11 +- python/pyarrow/tests/test_sparse_tensor.py | 20 +- python/pyarrow/tests/test_table.py | 120 ++- python/pyarrow/tests/test_tensor.py | 2 +- python/pyarrow/tests/test_types.py | 55 +- 18 files changed, 2561 insertions(+), 134 deletions(-) create mode 100644 python/pyarrow-stubs/pyarrow/array.pyi create mode 100644 python/pyarrow-stubs/pyarrow/builder.pyi create mode 100644 python/pyarrow-stubs/pyarrow/config.pyi create mode 100644 python/pyarrow-stubs/pyarrow/device.pyi create mode 100644 python/pyarrow-stubs/pyarrow/memory.pyi create mode 100644 python/pyarrow-stubs/pyarrow/table.pyi create mode 100644 python/pyarrow-stubs/pyarrow/tensor.pyi create mode 100644 python/pyarrow-stubs/pyarrow/types.pyi diff --git a/python/pyarrow-stubs/pyarrow/array.pyi b/python/pyarrow-stubs/pyarrow/array.pyi new file mode 100644 index 00000000000..547e9c949d5 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/array.pyi @@ -0,0 +1,894 @@ +# Licensed to the Apache Software Foundation (ASF) under 
one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +from collections.abc import Iterable, Iterator, Sequence + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from typing import ( + Any, + Generic, + Literal, + TypeVar, +) + +import numpy as np +import pandas as pd + +from pyarrow._compute import CastOptions +from pyarrow._stubs_typing import ( + ArrayLike, + Indices, + Mask, + Order, + SupportArrowArray, + SupportArrowDeviceArray, + SupportPyArrowArray, +) +from pyarrow.lib import ( + Buffer, + Device, + MemoryManager, + MemoryPool, + Tensor, + _Weakrefable, +) +from typing_extensions import deprecated +import builtins + +from .scalar import ( # noqa: F401 + BinaryScalar, + BinaryViewScalar, + BooleanScalar, + Date32Scalar, + Date64Scalar, + DictionaryScalar, + DoubleScalar, + DurationScalar, + ExtensionScalar, + FixedSizeBinaryScalar, + FixedSizeListScalar, + FloatScalar, + HalfFloatScalar, + Int16Scalar, + Int32Scalar, + Int64Scalar, + Int8Scalar, + LargeBinaryScalar, + LargeListScalar, + LargeStringScalar, + ListScalar, + ListViewScalar, + MapScalar, + MonthDayNanoIntervalScalar, + NullScalar, + RunEndEncodedScalar, + Scalar, + StringScalar, + StringViewScalar, + StructScalar, + Time32Scalar, + Time64Scalar, + 
TimestampScalar, + UInt16Scalar, + UInt32Scalar, + UInt64Scalar, + UInt8Scalar, + UnionScalar, +) +from .device import DeviceAllocationType +from ._types import ( # noqa: F401 + BaseExtensionType, + BinaryType, + DataType, + Field, + Float64Type, + Int64Type, + MapType, + StringType, + StructType, + _AsPyType, + _BasicDataType, + _BasicValueT, + _DataTypeT, + _IndexT, + _RunEndType, + _Size, + _Time32Unit, + _Time64Unit, + _Tz, + _Unit, +) +from ._stubs_typing import NullableCollection + + +def array( + values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray + | SupportArrowDeviceArray | SupportPyArrowArray, + type: Any | None = None, + mask: Mask | pd.Series[bool] | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> ArrayLike: ... + + +def asarray( + values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray + | SupportArrowDeviceArray, + type: _DataTypeT | Any | None = None, +) -> Array[Scalar[_DataTypeT]] | ArrayLike: ... + + +def nulls( + size: int, + type: Any | None = None, + memory_pool: MemoryPool | None = None, +) -> ArrayLike: ... + + +def repeat( + value: Any, + size: int, + memory_pool: MemoryPool | None = None, +) -> ArrayLike: ... + + +def infer_type(values: Iterable[Any], mask: Mask | None = None, + from_pandas: bool = False) -> DataType: ... + + +class ArrayStatistics(_Weakrefable): + @property + def null_count(self) -> int | None: ... + + @property + def distinct_count(self) -> int | None: ... + + @property + def is_null_count_exact(self) -> bool | None: ... + + @property + def is_distinct_count_exact(self) -> bool | None: ... + + @property + def min(self) -> Any | None: ... + + @property + def is_min_exact(self) -> bool | None: ... + + @property + def max(self) -> Any | None: ... + + @property + def is_max_exact(self) -> bool | None: ... 
+ + +_ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series) + + +class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): + def to_pandas( + self, + memory_pool: MemoryPool | None = None, + categories: list | tuple | None = None, + strings_to_categorical: bool = False, + zero_copy_only: bool = False, + integer_object_nulls: bool = False, + date_as_object: bool = True, + timestamp_as_object: bool = False, + use_threads: bool = True, + deduplicate_objects: bool = True, + ignore_metadata: bool = False, + safe: bool = True, + split_blocks: bool = False, + self_destruct: bool = False, + maps_as_pydicts: Literal["None", "lossy", "strict"] | None = None, + types_mapper: Any = None, # Callable[[DataType], ExtensionDtype | None] | None + coerce_temporal_nanoseconds: bool = False, + ) -> _ConvertAs: ... + + +_CastAs = TypeVar("_CastAs", bound=DataType) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) +_ScalarT = TypeVar("_ScalarT", bound=Scalar) + + +class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + def as_py(self) -> list[Any]: ... + + def diff(self, other: Self) -> str: ... + + # Private attribute used internally (e.g., for column names in batches) + _name: str | None + + def cast( + self, + target_type: _CastAs | str, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_CastAs]]: ... + + def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: ... + + def sum(self, **kwargs) -> _Scalar_co: ... + + @property + def type(self: Array[Scalar[_DataTypeT]]) -> _DataTypeT: ... + def unique(self) -> Self: ... + + def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: ... + + def value_counts(self) -> StructArray: ... 
+ + @staticmethod + def from_pandas( + obj: pd.Series | np.ndarray | ArrayLike, + *, + mask: Mask | None = None, + type: _DataTypeT | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_DataTypeT]] | Array[Scalar]: ... + + @staticmethod + def from_buffers( + type: _DataTypeT, + length: int, + buffers: Sequence[Buffer | None], + null_count: int = -1, + offset=0, + children: NullableCollection[Array[Scalar[_DataTypeT]]] | None = None, + ) -> Array[Scalar[_DataTypeT]]: ... + + @property + def null_count(self) -> int: ... + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... + def __iter__(self) -> Iterator[_Scalar_co]: ... + + def to_string( + self, + *, + indent: int = 2, + top_level_indent: int = 0, + window: int = 10, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: ... + + format = to_string + def equals(self, other: Array | Any) -> bool: ... + + def __len__(self) -> int: ... + + def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: ... + + def is_nan(self) -> BooleanArray: ... + + def is_valid(self) -> BooleanArray: ... + + def fill_null( + self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType + ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: ... + + def __getitem__(self, key: int | builtins.slice) -> _Scalar_co | Self: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def take(self, indices: Indices) -> Self: ... + + def drop_null(self) -> Self: ... + + def filter( + self, + mask: Mask, + *, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + ) -> Self: ... + + def index( + self: Array[_ScalarT] | Array[Scalar[_BasicDataType[_AsPyType]]], + value: _ScalarT | _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... 
+ + def sort(self, order: Order = "ascending", **kwargs) -> Self: ... + + def __array__(self, dtype: np.dtype | None = None, + copy: bool | None = None) -> np.ndarray: ... + + def to_numpy(self, zero_copy_only: bool = True, + writable: bool = False) -> np.ndarray: ... + + def to_pylist( + self, + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[Any]: ... + + tolist = to_pylist + def validate(self, *, full: bool = False) -> None: ... + + @property + def offset(self) -> int: ... + + def buffers(self) -> list[Buffer | None]: ... + + def copy_to(self, destination: MemoryManager | Device) -> Self: ... + + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: ... + + def __arrow_c_array__(self, requested_schema=None) -> Any: ... + + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + + @classmethod + def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: ... + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ... + + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def __dlpack__(self, stream: int | None = None) -> Any: ... + + def __dlpack_device__(self) -> tuple[int, int]: ... + + @property + def device_type(self) -> DeviceAllocationType: ... + + @property + def is_cpu(self) -> bool: ... + + @property + def statistics(self) -> ArrayStatistics | None: ... + + +class NullArray(Array[NullScalar]): + ... + + +class BooleanArray(Array[BooleanScalar]): + @property + def false_count(self) -> int: ... + @property + def true_count(self) -> int: ... + + +class NumericArray(Array[_ScalarT]): + ... + + +class IntegerArray(NumericArray[_ScalarT]): + ... + + +class FloatingPointArray(NumericArray[_ScalarT]): + ... 
+ + +class Int8Array(IntegerArray[Int8Scalar]): + ... + + +class UInt8Array(IntegerArray[UInt8Scalar]): + ... + + +class Int16Array(IntegerArray[Int16Scalar]): + ... + + +class UInt16Array(IntegerArray[UInt16Scalar]): + ... + + +class Int32Array(IntegerArray[Int32Scalar]): + ... + + +class UInt32Array(IntegerArray[UInt32Scalar]): + ... + + +class Int64Array(IntegerArray[Int64Scalar]): + ... + + +class UInt64Array(IntegerArray[UInt64Scalar]): + ... + + +class Date32Array(NumericArray[Date32Scalar]): + ... + + +class Date64Array(NumericArray[Date64Scalar]): + ... + + +class TimestampArray(NumericArray[TimestampScalar[_Unit, _Tz]]): + ... + + +class Time32Array(NumericArray[Time32Scalar[_Time32Unit]]): + ... + + +class Time64Array(NumericArray[Time64Scalar[_Time64Unit]]): + ... + + +class DurationArray(NumericArray[DurationScalar[_Unit]]): + ... + + +class MonthDayNanoIntervalArray(Array[MonthDayNanoIntervalScalar]): + ... + + +class HalfFloatArray(FloatingPointArray[HalfFloatScalar]): + ... + + +class FloatArray(FloatingPointArray[FloatScalar]): + ... + + +class DoubleArray(FloatingPointArray[DoubleScalar]): + ... + + +class FixedSizeBinaryArray(Array[FixedSizeBinaryScalar]): + ... + + +class Decimal32Array(FixedSizeBinaryArray): + ... + + +class Decimal64Array(FixedSizeBinaryArray): + ... + + +class Decimal128Array(FixedSizeBinaryArray): + ... + + +class Decimal256Array(FixedSizeBinaryArray): + ... + + +class BaseListArray(Array[_ScalarT]): + def flatten(self, recursive: bool = False) -> Array: ... + + def value_parent_indices(self) -> Int64Array: ... + + def value_lengths(self) -> Int32Array: ... 
+ + +class ListArray(BaseListArray[_ScalarT]): + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int] | list[int | None], + values: Array[Scalar[_DataTypeT]] | list[int] | list[float] | list[str] + | list[bytes] | list, + *, + type: _DataTypeT | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> (ListArray[ListScalar[ + _DataTypeT | Int64Type | Float64Type | StringType | BinaryType + ]] | ListArray): ... + + @property + def values(self) -> Array: ... + + @property + def offsets(self) -> Int32Array: ... + + +class LargeListArray(BaseListArray[LargeListScalar[_DataTypeT]]): + @classmethod + def from_arrays( + cls, + offsets: Int64Array | list[int] | list[int | None], + values: Array[Scalar[_DataTypeT]] | Array, + *, + type: _DataTypeT | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListArray[_DataTypeT]: ... + + @property + def values(self) -> Array: ... + + @property + def offsets(self) -> Int64Array: ... + + +class ListViewArray(BaseListArray[ListViewScalar[_DataTypeT]]): + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array[Scalar[_DataTypeT]] | Array, + *, + type: _DataTypeT | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListViewArray[_DataTypeT]: ... + + @property + def values(self) -> Array: ... + + @property + def offsets(self) -> Int32Array: ... + + @property + def sizes(self) -> Int32Array: ... + + +class LargeListViewArray(BaseListArray[LargeListScalar[_DataTypeT]]): + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array[Scalar[_DataTypeT]] | Array, + *, + type: _DataTypeT | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListViewArray[_DataTypeT]: ... + + @property + def values(self) -> Array: ... + + @property + def offsets(self) -> Int64Array: ... + + @property + def sizes(self) -> Int64Array: ... 
+ + +class FixedSizeListArray(BaseListArray[FixedSizeListScalar[_DataTypeT, _Size]]): + @classmethod + def from_arrays( + cls, + values: Array[Scalar[_DataTypeT]], + list_size: _Size | None = None, + *, + type: DataType | None = None, + mask: Mask | None = None, + ) -> FixedSizeListArray[_DataTypeT, _Size | None]: ... + + @property + def values(self) -> BaseListArray[ListScalar[_DataTypeT]]: ... + + +_MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) +_MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) + + +class MapArray(BaseListArray[MapScalar[_MapKeyT, _MapItemT]]): + @classmethod + def from_arrays( + cls, + offsets: Int64Array | list[int] | None, + keys: Array[Scalar[_MapKeyT]] | np.ndarray | list | None = None, + items: Array[Scalar[_MapItemT]] | np.ndarray | list | None = None, + values: Array | DataType | None = None, + *, + type: DataType | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> MapArray[_MapKeyT, _MapItemT]: ... + + @property + def keys(self) -> Array: ... + + @property + def items(self) -> Array: ... + + +class UnionArray(Array[UnionScalar]): + @deprecated("Use fields() instead") + def child(self, pos: int) -> Field: ... + + def field(self, pos: int) -> Array: ... + + @property + def type_codes(self) -> Int8Array: ... + + @property + def offsets(self) -> Int32Array: ... + + @staticmethod + def from_dense( + types: Int8Array, + value_offsets: Int32Array, + children: NullableCollection[Array], + field_names: list[str] | None = None, + type_codes: Int8Array | list[int] | None = None, + ) -> UnionArray: ... + + @staticmethod + def from_sparse( + types: Int8Array, + children: NullableCollection[Array], + field_names: list[str] | None = None, + type_codes: Int8Array | list[int] | None = None, + ) -> UnionArray: ... 
+ + +class StringArray(Array[StringScalar]): + @staticmethod + def from_buffers( # type: ignore[override] + length: int, + value_offsets: Buffer, + data: Buffer, + null_bitmap: Buffer | None = None, + null_count: int | None = -1, + offset: int | None = 0, + ) -> StringArray: ... + + +class LargeStringArray(Array[LargeStringScalar]): + @staticmethod + def from_buffers( # type: ignore[override] + length: int, + value_offsets: Buffer, + data: Buffer, + null_bitmap: Buffer | None = None, + null_count: int | None = -1, + offset: int | None = 0, + ) -> LargeStringArray: ... + + +class StringViewArray(Array[StringViewScalar]): + ... + + +class BinaryArray(Array[BinaryScalar]): + @property + def total_values_length(self) -> int: ... + + +class LargeBinaryArray(Array[LargeBinaryScalar]): + @property + def total_values_length(self) -> int: ... + + +class BinaryViewArray(Array[BinaryViewScalar]): + ... + + +class DictionaryArray(Array[DictionaryScalar[_IndexT, _BasicValueT]]): + def dictionary_encode(self) -> Self: ... # type: ignore[override] + def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: ... + + @property + def indices(self) -> Array[Scalar[_IndexT]]: ... + @property + def dictionary(self) -> Array[Scalar[_BasicValueT]]: ... + + @staticmethod + def from_buffers( # type: ignore[override] + type: _BasicValueT, + length: int, + buffers: list[Buffer], + dictionary: Array | np.ndarray | pd.Series, + null_count: int = -1, + offset: int = 0, + ) -> DictionaryArray[Any, _BasicValueT]: ... + + @staticmethod + def from_arrays( + indices: Indices | Sequence[int | None], + dictionary: Array | np.ndarray | pd.Series | list[Any], + mask: np.ndarray | pd.Series | BooleanArray | None = None, + ordered: bool = False, + from_pandas: bool = False, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> DictionaryArray: ... + + +class StructArray(Array[StructScalar]): + def field(self, index: int | str) -> Array: ... 
+ + def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: ... + + @staticmethod + def from_arrays( + arrays: Iterable[Array | np.ndarray | list], + names: Sequence[str] | list[Field] | None = None, + fields: list[Field] | None = None, + mask=None, + memory_pool: MemoryPool | None = None, + type: StructType | None = None, + ) -> StructArray: ... + + def sort(self, order: Order = "ascending", by: str | + None = None, **kwargs) -> StructArray: ... + + +class RunEndEncodedArray(Array[RunEndEncodedScalar[_RunEndType, _BasicValueT]]): + @staticmethod + def from_arrays( + run_ends: Int16Array | Int32Array | Int64Array | list[int], + values: Array | list[Any], type: DataType | None = None, + ) -> RunEndEncodedArray[Any, _BasicValueT]: ... + + @staticmethod + def from_buffers( # type: ignore[override] + type: DataType, + length: int, + buffers: list[Buffer] | list[None], + null_count: int = -1, + offset=0, + children: tuple[Array, Array] | list[list[int]] | None = None, + ) -> RunEndEncodedArray[Any, _BasicValueT]: ... + + @property + def run_ends(self) -> Array[Scalar[_RunEndType]]: ... + + @property + def values(self) -> Array[Scalar[_BasicValueT]]: ... + + def find_physical_offset(self) -> int: ... + + def find_physical_length(self) -> int: ... + + +_ArrayT = TypeVar("_ArrayT", bound=Array) + + +class ExtensionArray(Array[ExtensionScalar], Generic[_ArrayT]): + @property + def storage(self) -> Any: ... + + @staticmethod + def from_storage(typ: BaseExtensionType, + storage: _ArrayT) -> ExtensionArray[_ArrayT]: ... + + +class JsonArray(ExtensionArray[_ArrayT]): + ... + + +class UuidArray(ExtensionArray[_ArrayT]): + ... + + +class FixedShapeTensorArray(ExtensionArray[_ArrayT]): + def to_numpy_ndarray(self) -> np.ndarray: ... + + def to_tensor(self) -> Tensor: ... + + @classmethod + def from_numpy_ndarray( + cls, obj: np.ndarray, + dim_names: list[str] | tuple[str, ...] | None = None + ) -> Self: ... + + +class OpaqueArray(ExtensionArray[_ArrayT]): + ... 
+ + +class Bool8Array(ExtensionArray): + def to_numpy(self, zero_copy_only: bool = ..., + writable: bool = ...) -> np.ndarray: ... + + @classmethod + def from_storage(cls, storage: Int8Array) -> Self: ... # type: ignore[override] + + @classmethod + def from_numpy(cls, obj: np.ndarray) -> Self: ... + + +def concat_arrays(arrays: Iterable[_ArrayT], + memory_pool: MemoryPool | None = None) -> _ArrayT: ... + + +def _empty_array(type: _DataTypeT) -> Array[Scalar[_DataTypeT]]: ... + + +__all__ = [ + "array", + "asarray", + "nulls", + "repeat", + "infer_type", + "_PandasConvertible", + "Array", + "NullArray", + "BooleanArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "FixedSizeBinaryArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "BaseListArray", + "ListArray", + "LargeListArray", + "ListViewArray", + "LargeListViewArray", + "FixedSizeListArray", + "MapArray", + "UnionArray", + "StringArray", + "LargeStringArray", + "StringViewArray", + "BinaryArray", + "LargeBinaryArray", + "BinaryViewArray", + "DictionaryArray", + "StructArray", + "RunEndEncodedArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "FixedShapeTensorArray", + "concat_arrays", + "_empty_array", + "_CastAs", +] diff --git a/python/pyarrow-stubs/pyarrow/builder.pyi b/python/pyarrow-stubs/pyarrow/builder.pyi new file mode 100644 index 00000000000..9001d9835b6 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/builder.pyi @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Iterable + +from pyarrow.lib import MemoryPool, _Weakrefable + +from .array import StringArray, StringViewArray + + +class StringBuilder(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | float | None): ... + + def append_values(self, values: Iterable[str | bytes | float | None]): ... + + def finish(self) -> StringArray: ... + + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + + +class StringViewBuilder(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | float | None): ... + + def append_values(self, values: Iterable[str | bytes | float | None]): ... + + def finish(self) -> StringViewArray: ... + + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + + +__all__ = ["StringBuilder", "StringViewBuilder"] diff --git a/python/pyarrow-stubs/pyarrow/config.pyi b/python/pyarrow-stubs/pyarrow/config.pyi new file mode 100644 index 00000000000..069b70e553a --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/config.pyi @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import NamedTuple + + +class VersionInfo(NamedTuple): + major: int + minor: int + patch: int + + +class CppBuildInfo(NamedTuple): + version: str + version_info: VersionInfo + so_version: str + full_so_version: str + compiler_id: str + compiler_version: str + compiler_flags: str + git_id: str + git_description: str + package_kind: str + build_type: str + + +class BuildInfo(NamedTuple): + build_type: str + cpp_build_info: CppBuildInfo + + +class RuntimeInfo(NamedTuple): + simd_level: str + detected_simd_level: str + + +build_info: BuildInfo +cpp_build_info: CppBuildInfo +cpp_version: str +cpp_version_info: VersionInfo + + +def runtime_info() -> RuntimeInfo: ... +def set_timezone_db_path(path: str) -> None: ... + + +__all__ = [ + "VersionInfo", + "BuildInfo", + "CppBuildInfo", + "RuntimeInfo", + "build_info", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "set_timezone_db_path", +] diff --git a/python/pyarrow-stubs/pyarrow/device.pyi b/python/pyarrow-stubs/pyarrow/device.pyi new file mode 100644 index 00000000000..7787ac44deb --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/device.pyi @@ -0,0 +1,66 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum + +from pyarrow.lib import _Weakrefable + + +class DeviceAllocationType(enum.Enum): + CPU = enum.auto() + CUDA = enum.auto() + CUDA_HOST = enum.auto() + OPENCL = enum.auto() + VULKAN = enum.auto() + METAL = enum.auto() + VPI = enum.auto() + ROCM = enum.auto() + ROCM_HOST = enum.auto() + EXT_DEV = enum.auto() + CUDA_MANAGED = enum.auto() + ONEAPI = enum.auto() + WEBGPU = enum.auto() + HEXAGON = enum.auto() + + +class Device(_Weakrefable): + @property + def type_name(self) -> str: ... + + @property + def device_id(self) -> int: ... + + @property + def is_cpu(self) -> bool: ... + + @property + def device_type(self) -> DeviceAllocationType: ... + + +class MemoryManager(_Weakrefable): + @property + def device(self) -> Device: ... + + @property + def is_cpu(self) -> bool: ... + + +def default_cpu_memory_manager() -> MemoryManager: ... + + +__all__ = ["DeviceAllocationType", "Device", + "MemoryManager", "default_cpu_memory_manager"] diff --git a/python/pyarrow-stubs/pyarrow/memory.pyi b/python/pyarrow-stubs/pyarrow/memory.pyi new file mode 100644 index 00000000000..f80e01ab21c --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/memory.pyi @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow.lib import _Weakrefable + + +class MemoryPool(_Weakrefable): + def release_unused(self) -> None: ... + + def bytes_allocated(self) -> int: ... + + def total_bytes_allocated(self) -> int: ... + + def max_memory(self) -> int | None: ... + + def num_allocations(self) -> int: ... + + def print_stats(self) -> None: ... + + @property + def backend_name(self) -> str: ... + + +class LoggingMemoryPool(MemoryPool): + ... + + +class ProxyMemoryPool(MemoryPool): + ... + + +def default_memory_pool() -> MemoryPool: ... + + +def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: ... + + +def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: ... + + +def system_memory_pool() -> MemoryPool: ... + + +def jemalloc_memory_pool() -> MemoryPool: ... + + +def mimalloc_memory_pool() -> MemoryPool: ... + + +def set_memory_pool(pool: MemoryPool) -> None: ... + + +def log_memory_allocations(enable: bool = True) -> None: ... + + +def total_allocated_bytes() -> int: ... + + +def jemalloc_set_decay_ms(decay_ms: int) -> None: ... + + +def supported_memory_backends() -> list[str]: ... 
+ + +__all__ = [ + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "default_memory_pool", + "proxy_memory_pool", + "logging_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "set_memory_pool", + "log_memory_allocations", + "total_allocated_bytes", + "jemalloc_set_decay_ms", + "supported_memory_backends", +] diff --git a/python/pyarrow-stubs/pyarrow/table.pyi b/python/pyarrow-stubs/pyarrow/table.pyi new file mode 100644 index 00000000000..6dd61674d40 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/table.pyi @@ -0,0 +1,686 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from collections.abc import ( + Collection, Generator, Iterable, Iterator, Sequence, Mapping) +from typing import Any, Generic, Literal, TypeVar +import builtins + +import numpy as np +import pandas as pd + +from numpy.typing import NDArray +from pyarrow._compute import ( + CastOptions, + CountOptions, + FunctionOptions, + ScalarAggregateOptions, + TDigestOptions, + VarianceOptions, +) +from pyarrow._stubs_typing import ( + Indices, + Mask, + NullEncoding, + NullSelectionBehavior, + Order, + SupportArrowArray, + SupportArrowDeviceArray, + SupportArrowStream, +) +from pyarrow.compute import Expression +from pyarrow.interchange.dataframe import _PyArrowDataFrame +from pyarrow.lib import Device, MemoryManager, MemoryPool, Schema +from pyarrow.lib import Field as _Field + +from .array import Array, StructArray, _CastAs, _PandasConvertible +from .device import DeviceAllocationType +from .io import Buffer +from ._ipc import RecordBatchReader +from .scalar import BooleanScalar, Int64Scalar, Scalar, StructScalar +from .tensor import Tensor +from ._stubs_typing import NullableCollection +from ._types import DataType, _AsPyType, _BasicDataType, _DataTypeT + +Field: TypeAlias = _Field[DataType] +_ScalarT = TypeVar("_ScalarT", bound=Scalar) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) +ArrayOrChunkedArray: TypeAlias = Array[_Scalar_co] | ChunkedArray[_Scalar_co] + +_Aggregation: TypeAlias = Literal[ + "all", + "any", + "approximate_median", + "count", + "count_all", + "count_distinct", + "distinct", + "first", + "first_last", + "last", + "list", + "max", + "mean", + "min", + "min_max", + "one", + "product", + "stddev", + "sum", + "tdigest", + "variance", +] +_AggregationPrefixed: TypeAlias = Literal[ + "hash_all", + "hash_any", 
+ "hash_approximate_median", + "hash_count", + "hash_count_all", + "hash_count_distinct", + "hash_distinct", + "hash_first", + "hash_first_last", + "hash_last", + "hash_list", + "hash_max", + "hash_mean", + "hash_min", + "hash_min_max", + "hash_one", + "hash_product", + "hash_stddev", + "hash_sum", + "hash_tdigest", + "hash_variance", +] +Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed | str +AggregateOptions: TypeAlias = (ScalarAggregateOptions | CountOptions + | TDigestOptions | VarianceOptions | FunctionOptions) + +UnarySelector: TypeAlias = str +NullarySelector: TypeAlias = tuple[()] +NarySelector: TypeAlias = list[str] | tuple[str, ...] +ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector + + +class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + + def as_py(self) -> list[Any]: ... + + @property + def data(self) -> Self: ... + @property + def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: ... + + # Private attribute used internally for column names + _name: str | None + + def length(self) -> int: ... + + __len__ = length + + def to_string( + self, + *, + indent: int = 0, + window: int = 5, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: ... + + format = to_string + def validate(self, *, full: bool = False) -> None: ... + + @property + def null_count(self) -> int: ... + + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... + + def __getitem__( + self, key: int | np.integer | builtins.slice) -> _Scalar_co | Self: ... + + def getitem(self, i: int) -> Scalar: ... + def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[BooleanScalar]: ... + + def is_nan(self) -> ChunkedArray[BooleanScalar]: ... + + def is_valid(self) -> ChunkedArray[BooleanScalar]: ... 
+ + def cast( + self, target_type: _CastAs | str | None, safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None + ) -> Self | ChunkedArray[Scalar[_CastAs]]: ... + + def fill_null(self, fill_value: Scalar[_DataTypeT] | Any) -> Self: ... + + def equals(self, other: Self | Any) -> bool: ... + + def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: ... + + def __array__(self, dtype: np.dtype | None = None, + copy: bool | None = None) -> np.ndarray: ... + + def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: ... + + def flatten(self, memory_pool: MemoryPool | + None = None) -> list[ChunkedArray[Any]]: ... + + def combine_chunks(self, memory_pool: MemoryPool | + None = None) -> Array[_Scalar_co]: ... + + def unique(self) -> ChunkedArray[_Scalar_co]: ... + + def value_counts(self) -> StructArray: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def filter(self, mask: Mask, + null_selection_behavior: NullSelectionBehavior = "drop") -> Self: ... + + def index( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + value: Scalar[_DataTypeT] | _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + + def take(self, indices: Indices) -> Self: ... + + def drop_null(self) -> Self: ... + + def sort(self, order: Order = "ascending", **kwargs) -> Self: ... + + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... + + @property + def num_chunks(self) -> int: ... + + def chunk(self, i: int) -> Array[_Scalar_co]: ... + + @property + def chunks(self) -> list[Array[_Scalar_co]]: ... + + def iterchunks( + self: ArrayOrChunkedArray[_ScalarT], + ) -> Generator[Array, None, None]: ... + + def __iter__(self) -> Iterator[_Scalar_co]: ... 
+ + def to_pylist( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType | None]: ... + + def __arrow_c_stream__(self, requested_schema=None) -> Any: ... + + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: ... + + @property + def is_cpu(self) -> bool: ... + + +def chunked_array( + arrays: Iterable[NullableCollection[Any]] + | Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray] + | Iterable[Array[_ScalarT]] | Array[_ScalarT] + | SupportArrowArray | SupportArrowStream, + type: DataType | str | None = None, +) -> ChunkedArray[Scalar[Any]] | ChunkedArray[_ScalarT]: ... + + +_ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any]) + + +class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): + def __array__(self, dtype: np.dtype | None = None, + copy: bool | None = None) -> np.ndarray: ... + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: ... + + def __getitem__(self, key: int | str | slice) -> _ColumnT | Self: ... + + def __len__(self) -> int: ... + def column(self, i: int | str) -> _ColumnT: ... + + @property + def column_names(self) -> list[str]: ... + + @property + def columns(self) -> list[_ColumnT]: ... + + def drop_null(self) -> Self: ... + + def field(self, i: int | str) -> Field: ... + + @classmethod + def from_pydict( + cls, + mapping: + Mapping[Any, ArrayOrChunkedArray[Any] | list[Any] | np.ndarray | range], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: ... + + @classmethod + def from_pylist( + cls, + mapping: Sequence[Mapping[str, Any]], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: ... + + def itercolumns(self) -> Generator[_ColumnT, None, None]: ... + + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... 
+ @property + def shape(self) -> tuple[int, int]: ... + + @property + def schema(self) -> Schema: ... + @property + def nbytes(self) -> int: ... + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: ... + + def take(self, indices: Indices) -> Self: ... + + def filter( + self, + mask: Mask | Expression, + null_selection_behavior: NullSelectionBehavior = "drop") -> Self: ... + + def to_pydict( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> dict[str, list[Any]]: ... + + def to_pylist( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> list[dict[str, Any]]: ... + + def to_string(self, *, show_metadata: bool = False, + preview_cols: int = 0) -> str: ... + + def remove_column(self, i: int) -> Self: ... + def drop_columns(self, columns: str | list[str]) -> Self: ... + + def add_column(self, i: int, field_: str | Field, + column: ArrayOrChunkedArray[Any] | list[list[Any]]) -> Self: ... + + def append_column( + self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: ... + + +class RecordBatch(_Tabular[Array]): + def validate(self, *, full: bool = False) -> None: ... + + def replace_schema_metadata( + self, + metadata: dict[str, str] + | dict[bytes, bytes] + | dict[bytes, str] + | dict[str, bytes] + | None = None + ) -> Self: ... + + @property + def num_columns(self) -> int: ... + + @property + def num_rows(self) -> int: ... + + @property + def schema(self) -> Schema: ... + + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... + + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: ... + + def remove_column(self, i: int) -> Self: ... + + def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... + + def rename_columns(self, names: list[str] | dict[str, str]) -> Self: ... 
+ + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def equals(self, other: Self | Any, check_metadata: bool = False) -> bool: ... + + def select(self, columns: Iterable[str] | + Iterable[int] | NDArray[np.str_]) -> Self: ... + + def cast(self, target_schema: Schema, safe: bool | None = None, + options: CastOptions | None = None) -> Self: ... + + @classmethod + def from_arrays( + cls, + arrays: Iterable[Any], + names: list[str] | tuple[str, ...] | None = None, + schema: Schema | None = None, + metadata: Mapping[bytes, bytes] + | Mapping[str, str] + | Mapping[bytes, str] + | Mapping[str, bytes] + | None = None, + ) -> Self: ... + + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: Sequence[str | int] | None = None, + ) -> Self: ... + + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[StructScalar] + ) -> Self: ... + + def to_struct_array(self) -> StructArray: ... + + def to_tensor( + self, + null_to_nan: bool = False, + row_major: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Tensor: ... + + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): ... + + @classmethod + def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: ... + + def __arrow_c_array__(self, requested_schema=None): ... + + def __arrow_c_stream__(self, requested_schema=None): ... + + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... + + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + + @classmethod + def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: ... + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs): ... 
+ + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... + + @property + def device_type(self) -> DeviceAllocationType: ... + + @property + def is_cpu(self) -> bool: ... + + def copy_to(self, destination: MemoryManager | Device) -> Self: ... + + +def table_to_blocks(options, table: Table, categories, extension_columns): ... + + +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] + + +class Table(_Tabular[ChunkedArray[Any]]): + def validate(self, *, full: bool = False) -> None: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def select(self, columns: Iterable[str] | + Iterable[int] | NDArray[np.str_]) -> Self: ... + + def replace_schema_metadata( + self, metadata: dict[str, str] + | dict[bytes, bytes] + | dict[bytes, str] + | dict[str, bytes] + | None = None + ) -> Self: ... + + def flatten(self, memory_pool: MemoryPool | None = None) -> Self: ... + + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: ... + + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... + + def equals(self, other: Self | Any, check_metadata: bool = False) -> bool: ... + + def cast(self, target_schema: Schema, safe: bool | None = None, + options: CastOptions | None = None) -> Self: ... + + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: Sequence[str | int] | None = None, + safe: bool = True, + ) -> Self: ... + + @classmethod + def from_arrays( + cls, + arrays: + Collection[ArrayOrChunkedArray[Any] | Collection[NDArray[Any]] | list[Any]], + names: list[str] | tuple[str, ...] 
| None = None, + schema: Schema | None = None, + metadata: Mapping[bytes, bytes] + | Mapping[str, str] + | Mapping[bytes, str] + | Mapping[str, bytes] | None = None, + ) -> Self: ... + + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[StructScalar] + ) -> Self: ... + + def to_struct_array( + self, max_chunksize: int | None = None + ) -> ChunkedArray[StructScalar]: ... + + @classmethod + def from_batches(cls, batches: Iterable[RecordBatch], + schema: Schema | None = None) -> Self: ... + + def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: ... + + def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: ... + + @property + def schema(self) -> Schema: ... + + @property + def num_columns(self) -> int: ... + + @property + def num_rows(self) -> int: ... + + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... + + def add_column(self, i: int, field_: str | Field, + column: ArrayOrChunkedArray[Any] | list[list[Any]]) -> Self: ... + + def remove_column(self, i: int) -> Self: ... + + def set_column(self, i: int, field_: str | Field, + column: ArrayOrChunkedArray[Any] | list[list[Any]]) -> Self: ... + + def rename_columns(self, names: list[str] | dict[str, str]) -> Self: ... + + def drop(self, columns: str | list[str]) -> Self: ... + + def group_by(self, keys: str | list[str], + use_threads: bool = True) -> TableGroupBy: ... + + def join( + self, + right_table: Self, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> Self: ... + + def join_asof( + self, + right_table: Self, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> Self: ... 
+ + def __arrow_c_stream__(self, requested_schema=None): ... + + @property + def is_cpu(self) -> bool: ... + + +def record_batch( + data: Mapping[str, list[Any] | Array[Any]] + | Collection[Array[Any] | ChunkedArray[Any] | list[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowDeviceArray, + names: list[str] | Schema | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, +) -> RecordBatch: ... + + +def table( + data: Collection[ArrayOrChunkedArray[Any] | list[Any] | range | str] + | pd.DataFrame + | SupportArrowArray + | SupportArrowStream + | SupportArrowDeviceArray + | Mapping[str, list[Any] | Array[Any] | ChunkedArray[Any] | range] + | Mapping[str, Any], + names: list[str] | Schema | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + nthreads: int | None = None, +) -> Table: ... + + +def concat_tables( + tables: Iterable[Table], + memory_pool: MemoryPool | None = None, + promote_options: Literal["none", "default", "permissive"] = "none", + **kwargs: Any, +) -> Table: ... + + +class TableGroupBy: + + keys: str | list[str] + + def __init__(self, table: Table, keys: str | + list[str], use_threads: bool = True): ... + + def aggregate( + self, + aggregations: Iterable[ + tuple[ColumnSelector, Aggregation] + | tuple[ColumnSelector, Aggregation, AggregateOptions | None] + ], + ) -> Table: ... + + def _table(self) -> Table: ... + @property + def _use_threads(self) -> bool: ... + + +def concat_batches( + recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None +) -> RecordBatch: ... 
+ + +__all__ = [ + "ChunkedArray", + "chunked_array", + "_Tabular", + "RecordBatch", + "table_to_blocks", + "Table", + "record_batch", + "table", + "concat_tables", + "TableGroupBy", + "concat_batches", + "Aggregation", + "AggregateOptions", +] diff --git a/python/pyarrow-stubs/pyarrow/tensor.pyi b/python/pyarrow-stubs/pyarrow/tensor.pyi new file mode 100644 index 00000000000..ba40c7b299d --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/tensor.pyi @@ -0,0 +1,268 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from collections.abc import Sequence +import numpy as np + +from pyarrow.lib import _Weakrefable +from pyarrow._types import DataType +from scipy.sparse import coo_matrix, csr_matrix +from sparse import COO # type: ignore[import-untyped, import-not-found] + + +class Tensor(_Weakrefable): + @classmethod + def from_numpy(cls, obj: np.ndarray, + dim_names: Sequence[str] | None = None) -> Self: ... + + def to_numpy(self) -> np.ndarray: ... + + def equals(self, other: Tensor) -> bool: ... + + def dim_name(self, i: int) -> str: ... + + @property + def dim_names(self) -> list[str]: ... 
+
+    @property
+    def is_mutable(self) -> bool: ...
+
+    @property
+    def is_contiguous(self) -> bool: ...
+
+    @property
+    def ndim(self) -> int: ...
+
+    @property
+    def size(self) -> int: ...
+
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+
+    @property
+    def strides(self) -> tuple[int, ...]: ...
+
+    @property
+    def type(self) -> DataType: ...
+
+
+class SparseCOOTensor(_Weakrefable):
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray,
+                         dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        coords: np.ndarray,
+        shape: Sequence[int],
+        dim_names: Sequence[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_scipy(cls, obj: coo_matrix,
+                   dim_names: Sequence[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_pydata_sparse(
+        cls, obj: COO, dim_names: Sequence[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: ...
+
+    def to_scipy(self) -> coo_matrix: ...
+
+    def to_pydata_sparse(self) -> COO: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
+    @property
+    def has_canonical_format(self) -> bool: ...
+    @property
+    def type(self) -> DataType: ...
+
+
+class SparseCSRMatrix(_Weakrefable):
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray,
+                         dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        indptr: np.ndarray,
+        indices: np.ndarray,
+        shape: Sequence[int],
+        dim_names: Sequence[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_scipy(cls, obj: csr_matrix,
+                   dim_names: Sequence[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+    def to_scipy(self) -> csr_matrix: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
+    @property
+    def type(self) -> DataType: ...
+
+
+class SparseCSCMatrix(_Weakrefable):
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray,
+                         dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        indptr: np.ndarray,
+        indices: np.ndarray,
+        shape: tuple[int, ...],
+        dim_names: list[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_scipy(cls, obj: csr_matrix,
+                   dim_names: list[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+    def to_scipy(self) -> csr_matrix: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
+
+
+class SparseCSFTensor(_Weakrefable):
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray,
+                         dim_names: Sequence[str] | None = None) -> Self: ...
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        indptr: Sequence[np.ndarray],
+        indices: Sequence[np.ndarray],
+        shape: tuple[int, ...],
+        axis_order: Sequence[int] | None = None,
+        dim_names: Sequence[str] | None = None,
+    ) -> Self: ...
+
+    @classmethod
+    def from_tensor(cls, obj: Tensor) -> Self: ...
+
+    def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ...
+
+    def to_tensor(self) -> Tensor: ...
+
+    def equals(self, other: Self) -> bool: ...
+
+    @property
+    def is_mutable(self) -> bool: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def dim_name(self, i: int) -> str: ...
+
+    @property
+    def dim_names(self) -> list[str]: ...
+    @property
+    def non_zero_length(self) -> int: ...
+    @property
+    def type(self) -> DataType: ...
+
+
+__all__ = [
+    "Tensor",
+    "SparseCOOTensor",
+    "SparseCSRMatrix",
+    "SparseCSCMatrix",
+    "SparseCSFTensor",
+]
diff --git a/python/pyarrow-stubs/pyarrow/types.pyi b/python/pyarrow-stubs/pyarrow/types.pyi
new file mode 100644
index 00000000000..9e5a0568db0
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/types.pyi
@@ -0,0 +1,227 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +import sys +from enum import IntEnum + +from typing import Any + +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +import pyarrow.lib as lib + +from pyarrow.lib import ( + BinaryType, + BinaryViewType, + BoolType, + DataType, + Date32Type, + Date64Type, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + DenseUnionType, + DictionaryType, + DurationType, + FixedSizeBinaryType, + FixedSizeListType, + Float16Type, + Float32Type, + Float64Type, + Int8Type, + Int16Type, + Int32Type, + Int64Type, + LargeBinaryType, + LargeListType, + LargeListViewType, + LargeStringType, + ListType, + ListViewType, + MapType, + MonthDayNanoIntervalType, + NullType, + RunEndEncodedType, + SparseUnionType, + StringType, + StringViewType, + StructType, + Time32Type, + Time64Type, + TimestampType, + UInt8Type, + UInt16Type, + UInt32Type, + UInt64Type, +) + +_SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type +_UnsignedInteger: TypeAlias = UInt8Type | UInt16Type | UInt32Type | UInt64Type +_Integer: TypeAlias = _SignedInteger | _UnsignedInteger +_Floating: TypeAlias = Float16Type | Float32Type | Float64Type +_Decimal: TypeAlias = ( + Decimal32Type[Any, Any] + | Decimal64Type[Any, Any] + | Decimal128Type[Any, Any] + | Decimal256Type[Any, Any] +) +_Date: TypeAlias = Date32Type | Date64Type +_Time: TypeAlias = Time32Type[Any] | Time64Type[Any] +_Interval: TypeAlias = MonthDayNanoIntervalType +_Temporal: TypeAlias = (TimestampType[Any, Any] + | DurationType[Any] | _Time | _Date | _Interval) +_Union: TypeAlias = SparseUnionType | DenseUnionType +_Nested: TypeAlias = ( + ListType[Any] + | FixedSizeListType[Any, Any] + | LargeListType[Any] + | ListViewType[Any] + | LargeListViewType[Any] + | StructType + | MapType[Any, Any, Any] + | _Union +) + + +def is_null(t: DataType) -> 
TypeIs[NullType]: ... +def is_boolean(t: DataType) -> TypeIs[BoolType]: ... +def is_integer(t: DataType) -> TypeIs[_Integer]: ... +def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: ... +def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: ... +def is_int8(t: DataType) -> TypeIs[Int8Type]: ... +def is_int16(t: DataType) -> TypeIs[Int16Type]: ... +def is_int32(t: DataType) -> TypeIs[Int32Type]: ... +def is_int64(t: DataType) -> TypeIs[Int64Type]: ... +def is_uint8(t: DataType) -> TypeIs[UInt8Type]: ... +def is_uint16(t: DataType) -> TypeIs[UInt16Type]: ... +def is_uint32(t: DataType) -> TypeIs[UInt32Type]: ... +def is_uint64(t: DataType) -> TypeIs[UInt64Type]: ... +def is_floating(t: DataType) -> TypeIs[_Floating]: ... +def is_float16(t: DataType) -> TypeIs[Float16Type]: ... +def is_float32(t: DataType) -> TypeIs[Float32Type]: ... +def is_float64(t: DataType) -> TypeIs[Float64Type]: ... +def is_list(t: DataType) -> TypeIs[ListType[Any]]: ... +def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: ... +def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: ... +def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: ... +def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: ... +def is_struct(t: DataType) -> TypeIs[StructType]: ... +def is_union(t: DataType) -> TypeIs[_Union]: ... +def is_nested(t: DataType) -> TypeIs[_Nested]: ... +def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: ... +def is_temporal(t: DataType) -> TypeIs[_Temporal]: ... +def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: ... +def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: ... +def is_time(t: DataType) -> TypeIs[_Time]: ... +def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: ... +def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: ... +def is_binary(t: DataType) -> TypeIs[BinaryType]: ... +def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: ... 
+def is_unicode(t: DataType) -> TypeIs[StringType]: ... +def is_string(t: DataType) -> TypeIs[StringType]: ... +def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: ... +def is_large_string(t: DataType) -> TypeIs[LargeStringType]: ... +def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: ... +def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: ... +def is_string_view(t: DataType) -> TypeIs[StringViewType]: ... +def is_date(t: DataType) -> TypeIs[_Date]: ... +def is_date32(t: DataType) -> TypeIs[Date32Type]: ... +def is_date64(t: DataType) -> TypeIs[Date64Type]: ... +def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: ... +def is_decimal(t: DataType) -> TypeIs[_Decimal]: ... +def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: ... +def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: ... +def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: ... +def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: ... +def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: ... +def is_interval(t: DataType) -> TypeIs[_Interval]: ... +def is_primitive(t: DataType) -> bool: ... +def is_boolean_value(obj: Any) -> bool: ... +def is_integer_value(obj: Any) -> bool: ... +def is_float_value(obj: Any) -> bool: ... 
+ + +__all__ = [ + "lib", + "is_binary", + "is_binary_view", + "is_boolean", + "is_date", + "is_date32", + "is_date64", + "is_decimal", + "is_decimal128", + "is_decimal256", + "is_decimal32", + "is_decimal64", + "is_dictionary", + "is_duration", + "is_fixed_size_binary", + "is_fixed_size_list", + "is_float16", + "is_float32", + "is_float64", + "is_floating", + "is_int16", + "is_int32", + "is_int64", + "is_int8", + "is_integer", + "is_interval", + "is_large_binary", + "is_large_list", + "is_large_list_view", + "is_large_string", + "is_large_unicode", + "is_list", + "is_list_view", + "is_map", + "is_nested", + "is_null", + "is_primitive", + "is_run_end_encoded", + "is_signed_integer", + "is_string", + "is_string_view", + "is_struct", + "is_temporal", + "is_time", + "is_time32", + "is_time64", + "is_timestamp", + "is_uint16", + "is_uint32", + "is_uint64", + "is_uint8", + "is_unicode", + "is_union", + "is_unsigned_integer", +] + + +class TypesEnum(IntEnum): + INTERVAL_MONTHS = 0 + INTERVAL_DAY_TIME = 1 + INTERVAL_MONTH_DAY_NANO = 2 diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ec58ac727e5..47ce1ab9b81 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3632,7 +3632,7 @@ cdef class FixedSizeListArray(BaseListArray): Or create from a values array, list size and matching type: >>> typ = pa.list_(pa.field("values", pa.int64()), 2) - >>> arr = pa.FixedSizeListArray.from_arrays(values,type=typ) + >>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ) >>> arr [ diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 83cabcf447d..16fed344e4d 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1036,7 +1036,7 @@ cdef class StructScalar(Scalar, Mapping): Parameters ---------- - index : Union[int, str] + key : Union[int, str] Index / position or name of the field. 
Returns diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index d09d9f45c7d..8a257ca48d6 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -18,19 +18,23 @@ from collections.abc import Iterable import datetime import decimal -import hypothesis as h -import hypothesis.strategies as st +import hypothesis as h # type: ignore[import-not-found] +import hypothesis.strategies as st # type: ignore[import-not-found] import itertools -import pytest +import pytest # type: ignore[import-not-found] import struct import subprocess import sys import weakref +from typing import TYPE_CHECKING -try: +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None +else: + try: + import numpy as np + except ImportError: + np = None import pyarrow as pa import pyarrow.tests.strategies as past @@ -71,7 +75,7 @@ def test_constructor_raises(): # This could happen by wrong capitalization. # ARROW-2638: prevent calling extension class constructors directly with pytest.raises(TypeError): - pa.Array([1, 2]) + pa.Array([1, 2]) # type: ignore[reportCallIssue] def test_list_format(): @@ -321,11 +325,11 @@ def test_asarray(): arr = pa.array(range(4)) - # The iterator interface gives back an array of Int64Value's + # The iterator interface gives back an array of Int64Type's np_arr = np.asarray([_ for _ in arr]) assert np_arr.tolist() == [0, 1, 2, 3] assert np_arr.dtype == np.dtype('O') - assert isinstance(np_arr[0], pa.lib.Int64Value) + assert isinstance(np_arr[0], pa.lib.Int64Type) # Calling with the arrow array gives back an array with 'int64' dtype np_arr = np.asarray(arr) @@ -649,8 +653,8 @@ def test_array_eq(): @pytest.mark.numpy def test_array_from_buffers(): - values_buf = pa.py_buffer(np.int16([4, 5, 6, 7])) - nulls_buf = pa.py_buffer(np.uint8([0b00001101])) + values_buf = pa.py_buffer(np.array([4, 5, 6, 7], dtype=np.int16())) + nulls_buf = pa.py_buffer(np.array([0b00001101], dtype=np.uint8())) arr = 
pa.Array.from_buffers(pa.int16(), 4, [nulls_buf, values_buf]) assert arr.type == pa.int16() assert arr.to_pylist() == [4, None, 6, 7] @@ -665,7 +669,9 @@ def test_array_from_buffers(): assert arr.to_pylist() == [None, 6, 7] with pytest.raises(TypeError): - pa.Array.from_buffers(pa.int16(), 3, ['', ''], offset=1) + pa.Array.from_buffers( + pa.int16(), 3, ['', ''], offset=1 # type: ignore[reportArgumentType] + ) def test_string_binary_from_buffers(): @@ -859,7 +865,8 @@ def test_struct_array_from_chunked(): chunked_arr = pa.chunked_array([[1, 2, 3], [4, 5, 6]]) with pytest.raises(TypeError, match="Expected Array"): - pa.StructArray.from_arrays([chunked_arr], ["foo"]) + pa.StructArray.from_arrays( + [chunked_arr], ["foo"]) # type: ignore[reportArgumentType] @pytest.mark.parametrize("offset", (0, 1)) @@ -1179,24 +1186,24 @@ def test_map_from_arrays(): keys = pa.array(pykeys, type='binary') items = pa.array(pyitems, type='i4') - result = pa.MapArray.from_arrays(offsets, keys, items) + result = pa.MapArray.from_arrays(offsets, keys, items) # type: ignore[arg-type] expected = pa.array(pyentries, type=pa.map_(pa.binary(), pa.int32())) assert result.equals(expected) # pass in the type explicitly - result = pa.MapArray.from_arrays(offsets, keys, items, pa.map_( - keys.type, - items.type - )) + result = pa.MapArray.from_arrays(offsets, keys, items, # type: ignore[arg-type] + pa.map_(keys.type, items.type)) assert result.equals(expected) # pass in invalid types with pytest.raises(pa.ArrowTypeError, match='Expected map type, got string'): - pa.MapArray.from_arrays(offsets, keys, items, pa.string()) + pa.MapArray.from_arrays( + offsets, keys, items, pa.string() # type: ignore[arg-type] + ) with pytest.raises(pa.ArrowTypeError, match='Mismatching map items type'): - pa.MapArray.from_arrays(offsets, keys, items, pa.map_( + pa.MapArray.from_arrays(offsets, keys, items, pa.map_( # type: ignore[arg-type] keys.type, # Larger than the original i4 pa.int64() @@ -1234,7 +1241,7 @@ def 
test_map_from_arrays(): # error if null bitmap and offsets with nulls passed msg1 = 'Ambiguous to specify both validity map and offsets with nulls' with pytest.raises(pa.ArrowInvalid, match=msg1): - pa.MapArray.from_arrays(offsets, keys, items, pa.map_( + pa.MapArray.from_arrays(offsets, keys, items, pa.map_( # type: ignore[arg-type] keys.type, items.type), mask=pa.array([False, True, False], type=pa.bool_()) @@ -2718,7 +2725,7 @@ def test_interval_array_from_relativedelta(): assert arr.type == pa.month_day_nano_interval() expected_list = [ None, - pa.MonthDayNano([13, 8, + pa.MonthDayNano([13, 8, # type: ignore[arg-type] (datetime.timedelta(seconds=1, microseconds=1, minutes=1, hours=1) // datetime.timedelta(microseconds=1)) * 1000])] @@ -2751,7 +2758,7 @@ def test_interval_array_from_tuple(): assert arr.type == pa.month_day_nano_interval() expected_list = [ None, - pa.MonthDayNano([1, 2, -3])] + pa.MonthDayNano([1, 2, -3])] # type: ignore[arg-type] expected = pa.array(expected_list) assert arr.equals(expected) assert arr.to_pylist() == expected_list @@ -2772,8 +2779,8 @@ def test_interval_array_from_dateoffset(): assert arr.type == pa.month_day_nano_interval() expected_list = [ None, - pa.MonthDayNano([13, 8, 3661000001001]), - pa.MonthDayNano([0, 0, 0])] + pa.MonthDayNano([13, 8, 3661000001001]), # type: ignore[arg-type] + pa.MonthDayNano([0, 0, 0])] # type: ignore[arg-type] expected = pa.array(expected_list) assert arr.equals(expected) expected_from_pandas = [ @@ -2937,7 +2944,7 @@ def test_buffers_primitive(): # Slicing does not affect the buffers but the offset a_sliced = a[1:] buffers = a_sliced.buffers() - a_sliced.offset == 1 + assert a_sliced.offset == 1 assert len(buffers) == 2 null_bitmap = buffers[0].to_pybytes() assert 1 <= len(null_bitmap) <= 64 # XXX this is varying @@ -2945,7 +2952,7 @@ def test_buffers_primitive(): assert struct.unpack('hhxxh', buffers[1].to_pybytes()) == (1, 2, 4) - a = pa.array(np.int8([4, 5, 6])) + a = pa.array(np.array([4, 5, 
6], dtype=np.int8)) buffers = a.buffers() assert len(buffers) == 2 # No null bitmap from Numpy int array @@ -3031,7 +3038,7 @@ def test_nbytes_size(): def test_invalid_tensor_constructor_repr(): # ARROW-2638: prevent calling extension class constructors directly with pytest.raises(TypeError): - repr(pa.Tensor([1])) + repr(pa.Tensor([1])) # type: ignore[reportCallIssue] def test_invalid_tensor_construction(): @@ -3549,7 +3556,7 @@ def test_array_supported_masks(): with pytest.raises(pa.ArrowTypeError): arr = pa.array([4, None, 4, 3], - mask=[1.0, 2.0, 3.0, 4.0]) + mask=[1.0, 2.0, 3.0, 4.0]) # type: ignore[reportArgumentType] with pytest.raises(pa.ArrowTypeError): arr = pa.array([4, None, 4, 3], @@ -3836,11 +3843,11 @@ def test_concat_array_invalid_type(): # ARROW-9920 - do not segfault on non-array input with pytest.raises(TypeError, match="should contain Array objects"): - pa.concat_arrays([None]) + pa.concat_arrays([None]) # type: ignore[reportArgumentType] arr = pa.chunked_array([[0, 1], [3, 4]]) with pytest.raises(TypeError, match="should contain Array objects"): - pa.concat_arrays(arr) + pa.concat_arrays(arr) # type: ignore[reportArgumentType] @pytest.mark.pandas @@ -4369,7 +4376,7 @@ def test_non_cpu_array(): with pytest.raises(NotImplementedError): [i for i in iter(arr)] with pytest.raises(NotImplementedError): - arr == arr2 + _ = arr == arr2 with pytest.raises(NotImplementedError): arr.is_null() with pytest.raises(NotImplementedError): diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index c10ae0f62b4..6e48a4ff076 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -21,13 +21,18 @@ import itertools import math import re +from typing import TYPE_CHECKING, cast import hypothesis as h import pytest -try: + +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None +else: + try: + import numpy as np + except ImportError: + np = None 
from pyarrow.pandas_compat import _pandas_api # noqa import pyarrow as pa @@ -66,7 +71,7 @@ def __int__(self): class MyBrokenInt: def __int__(self): - 1/0 # MARKER + _ = 1/0 # MARKER def test_iterable_types(): @@ -137,7 +142,7 @@ def test_object_with_getitem(): # https://github.com/apache/arrow/issues/34944 # considered as sequence because of __getitem__, but has no length with pytest.raises(TypeError, match="has no len()"): - pa.array(ObjectWithOnlyGetitem()) + pa.array(ObjectWithOnlyGetitem()) # type: ignore[arg-type] def _as_list(xs): @@ -845,7 +850,7 @@ def test_large_binary_value(ty): assert isinstance(arr, pa.Array) assert arr.type == ty assert len(arr) == 4 - buf = arr[1].as_buffer() + buf = cast(pa.FixedSizeBinaryScalar, arr[1]).as_buffer() assert len(buf) == len(s) * nrepeats @@ -1091,11 +1096,11 @@ def expected_datetime_value(dt): ), ] utcdata = [ - pytz.utc.localize(data[0]), + pytz.utc.localize(cast(datetime.datetime, data[0])), data[1], None, - data[3].astimezone(pytz.utc), - data[4].astimezone(pytz.utc), + cast(datetime.datetime, data[3]).astimezone(pytz.utc), + cast(datetime.datetime, data[4]).astimezone(pytz.utc), ] ty = pa.timestamp(unit, tz=timezone) @@ -1223,9 +1228,9 @@ def test_sequence_timestamp_from_mixed_builtin_and_pandas_datetimes(): None, ] utcdata = [ - data[0].astimezone(pytz.utc), - pytz.utc.localize(data[1]), - data[2].astimezone(pytz.utc), + cast(datetime.datetime, data[0]).astimezone(pytz.utc), + pytz.utc.localize(cast(datetime.datetime, data[1])), + cast(datetime.datetime, data[2]).astimezone(pytz.utc), None, ] @@ -2103,8 +2108,8 @@ def test_map_from_dicts(): assert arr.to_pylist() == expected # With omitted values - data[1] = None - expected[1] = None + data[1] = None # type: ignore[call-overload] + expected[1] = None # type: ignore[call-overload] arr = pa.array(expected, type=pa.map_(pa.binary(), pa.int32())) @@ -2429,6 +2434,7 @@ def test_nested_auto_chunking(ty, char): } +@pytest.mark.numpy @pytest.mark.large_memory def 
test_array_from_pylist_data_overflow(): # Regression test for ARROW-12983 @@ -2451,6 +2457,7 @@ def test_array_from_pylist_data_overflow(): assert len(arr.chunks) > 1 +@pytest.mark.numpy @pytest.mark.slow @pytest.mark.large_memory def test_array_from_pylist_offset_overflow(): @@ -2475,6 +2482,7 @@ def test_array_from_pylist_offset_overflow(): assert len(arr.chunks) > 1 +@pytest.mark.numpy @parametrize_with_collections_types @pytest.mark.parametrize(('data', 'scalar_data', 'value_type'), [ ([True, False, None], [pa.scalar(True), pa.scalar(False), None], pa.bool_()), @@ -2512,8 +2520,10 @@ def test_array_from_pylist_offset_overflow(): pa.timestamp('us') ), ( - [pa.MonthDayNano([1, -1, -10100])], - [pa.scalar(pa.MonthDayNano([1, -1, -10100]))], + [pa.MonthDayNano([1, -1, -10100])], # type: ignore[call-arg, arg-type] + [pa.scalar( + pa.MonthDayNano([1, -1, -10100]) # type: ignore[call-arg, arg-type] + )], pa.month_day_nano_interval() ), (["a", "b"], [pa.scalar("a"), pa.scalar("b")], pa.string()), diff --git a/python/pyarrow/tests/test_device.py b/python/pyarrow/tests/test_device.py index dc1a51e6d00..00f8bbf720d 100644 --- a/python/pyarrow/tests/test_device.py +++ b/python/pyarrow/tests/test_device.py @@ -59,11 +59,15 @@ def test_copy_to(): batch_copied = batch.copy_to(dest) assert batch_copied.equals(batch) - assert batch_copied["col"].buffers()[1].device == mm.device - assert batch_copied["col"].buffers()[1].address != arr.buffers()[1].address + buffer = batch_copied.column("col").buffers()[1] + assert buffer is not None + assert buffer.device == mm.device + buffer_orig = arr.buffers()[1] + assert buffer_orig is not None + assert buffer.address != buffer_orig.address with pytest.raises(TypeError, match="Argument 'destination' has incorrect type"): - arr.copy_to(mm.device.device_type) + arr.copy_to(mm.device.device_type) # type: ignore[arg-type] with pytest.raises(TypeError, match="Argument 'destination' has incorrect type"): - batch.copy_to(mm.device.device_type) + 
batch.copy_to(mm.device.device_type) # type: ignore[arg-type] diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 029e14ca162..5a7b9989358 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.util as test_util @@ -259,7 +259,7 @@ def test_schema(): child 0, item: int8""" with pytest.raises(TypeError): - pa.schema([None]) + pa.schema([None]) # type: ignore[list-item] def test_schema_weakref(): @@ -548,7 +548,7 @@ def test_schema_equals_invalid_type(): for val in [None, 'string', pa.array([1, 2])]: with pytest.raises(TypeError): - schema.equals(val) + schema.equals(val) # type: ignore[invalid-argument-type] def test_schema_equality_operators(): @@ -594,7 +594,7 @@ def test_schema_get_fields(): with pytest.raises(KeyError): schema.field('other') with pytest.raises(TypeError): - schema.field(0.0) + schema.field(0.0) # type: ignore[arg-type] with pytest.raises(IndexError): schema.field(4) @@ -706,6 +706,7 @@ def test_empty_table(): assert table.schema == schema +@pytest.mark.numpy @pytest.mark.pandas def test_schema_from_pandas(): import pandas as pd @@ -782,7 +783,7 @@ def test_schema_merge(): # raise proper error when passing a non-Schema value with pytest.raises(TypeError): - pa.unify_schemas([a, 1]) + pa.unify_schemas([a, 1]) # type: ignore[list-item] def test_undecodable_metadata(): diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index eca8090d77a..2ce48b651b1 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -26,15 +26,16 @@ import pyarrow as pa try: - from scipy.sparse import csr_array, coo_array, csr_matrix, coo_matrix + from scipy.sparse import ( # type: ignore[reportMissingModuleSource] + csr_array, coo_array, csr_matrix, coo_matrix) except ImportError: 
- coo_matrix = None - csr_matrix = None - csr_array = None - coo_array = None + coo_matrix = None # type: ignore[assignment, misc] + csr_matrix = None # type: ignore[assignment, misc] + csr_array = None # type: ignore[assignment, misc] + coo_array = None # type: ignore[assignment, misc] try: - import sparse + import sparse # type: ignore[import-untyped, import-not-found] except ImportError: sparse = None @@ -401,7 +402,7 @@ def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type): assert np.array_equal(array, result_array) -@pytest.mark.skipif(not coo_matrix, reason="requires scipy") +@pytest.mark.skipif(coo_matrix is None, reason="requires scipy") @pytest.mark.parametrize('sparse_object', (coo_array, coo_matrix)) @pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, @@ -443,7 +444,7 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, assert out_scipy_matrix.has_canonical_format -@pytest.mark.skipif(not csr_matrix, reason="requires scipy") +@pytest.mark.skipif(csr_matrix is None, reason="requires scipy") @pytest.mark.parametrize('sparse_object', (csr_array, csr_matrix)) @pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type, @@ -483,7 +484,8 @@ def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type): shape = (4, 6) dim_names = ("x", "y") - sparse_array = sparse.COO(data=data, coords=coords, shape=shape) + sparse_array = sparse.COO( # type: ignore[reportOptionalMemberAccess] + data=data, coords=coords, shape=shape) sparse_tensor = pa.SparseCOOTensor.from_pydata_sparse(sparse_array, dim_names=dim_names) out_sparse_array = sparse_tensor.to_pydata_sparse() diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index b65fb7d952c..6263afd03a5 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ 
-18,12 +18,13 @@ from collections import OrderedDict from collections.abc import Iterable import sys +from typing import cast import weakref try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa import pyarrow.compute as pc @@ -418,7 +419,8 @@ def test_to_pandas_empty_table(): table = pa.table(df) result = table.schema.empty_table().to_pandas() assert result.shape == (0, 2) - tm.assert_frame_equal(result, df.iloc[:0]) + expected = df.iloc[:0] + tm.assert_frame_equal(result, expected) @pytest.mark.pandas @@ -486,12 +488,25 @@ def test_chunked_array_unify_dictionaries(): pa.array(["foo", "bar", None, "foo"]).dictionary_encode(), pa.array(["quux", None, "foo"]).dictionary_encode(), ]) - assert arr.chunk(0).dictionary.equals(pa.array(["foo", "bar"])) - assert arr.chunk(1).dictionary.equals(pa.array(["quux", "foo"])) + chunk_0 = arr.chunk(0) + assert isinstance(chunk_0, pa.DictionaryArray) + assert chunk_0.dictionary.equals(pa.array(["foo", "bar"])) + + chunk_1 = arr.chunk(1) + assert isinstance(chunk_1, pa.DictionaryArray) + assert chunk_1.dictionary.equals(pa.array(["quux", "foo"])) + arr = arr.unify_dictionaries() expected_dict = pa.array(["foo", "bar", "quux"]) - assert arr.chunk(0).dictionary.equals(expected_dict) - assert arr.chunk(1).dictionary.equals(expected_dict) + + chunk_0 = arr.chunk(0) + assert isinstance(chunk_0, pa.DictionaryArray) + assert chunk_0.dictionary.equals(expected_dict) + + chunk_1 = arr.chunk(1) + assert isinstance(chunk_1, pa.DictionaryArray) + assert chunk_1.dictionary.equals(expected_dict) + assert arr.to_pylist() == ["foo", "bar", None, "foo", "quux", None, "foo"] @@ -716,7 +731,7 @@ def test_recordbatch_take(): def test_recordbatch_column_sets_private_name(): # ARROW-6429 rb = pa.record_batch([pa.array([1, 2, 3, 4])], names=['a0']) - assert rb[0]._name == 'a0' + assert rb.column(0)._name == 'a0' def test_recordbatch_from_arrays_validate_schema(): @@ -798,7 +813,7 @@ def 
test_recordbatch_get_field(): batch.field('d') with pytest.raises(TypeError): - batch.field(None) + batch.field(None) # type: ignore[arg-type] with pytest.raises(IndexError): batch.field(4) @@ -819,7 +834,7 @@ def test_recordbatch_select_column(): batch.column('d') with pytest.raises(TypeError): - batch.column(None) + batch.column(None) # type: ignore[arg-type] with pytest.raises(IndexError): batch.column(4) @@ -933,7 +948,10 @@ def test_table_from_struct_array_chunked_array(): [[{"ints": 1}, {"floats": 1.0}]], type=pa.struct([("ints", pa.int32()), ("floats", pa.float32())]), ) - result = pa.Table.from_struct_array(chunked_struct_array) + assert isinstance(chunked_struct_array.type, pa.StructType) + # Cast to the proper type for type checker + struct_chunked_array = cast(pa.ChunkedArray, chunked_struct_array) + result = pa.Table.from_struct_array(struct_chunked_array) assert result.equals(pa.Table.from_arrays( [ pa.array([1, None], type=pa.int32()), @@ -1189,7 +1207,7 @@ def test_recordbatch_to_tensor_null(): batch.to_tensor() result = batch.to_tensor(null_to_nan=True, row_major=False) - x = np.column_stack([arr1, arr2]).astype(np.float64, order="F") + x = np.column_stack([arr1, arr2]).astype(np.float64, order="F") # type: ignore[no-matching-overload] expected = pa.Tensor.from_numpy(x) np.testing.assert_equal(result.to_numpy(), x) @@ -1223,7 +1241,7 @@ def test_recordbatch_to_tensor_null(): ) result = batch.to_tensor(null_to_nan=True, row_major=False) - x = np.column_stack([arr1, arr2]).astype(np.float32, order="F") + x = np.column_stack([arr1, arr2]).astype(np.float32, order="F") # type: ignore[no-matching-overload] expected = pa.Tensor.from_numpy(x) np.testing.assert_equal(result.to_numpy(), x) @@ -1339,7 +1357,7 @@ def test_recordbatchlist_schema_equals(): def test_table_column_sets_private_name(): # ARROW-6429 t = pa.table([pa.array([1, 2, 3, 4])], names=['a0']) - assert t[0]._name == 'a0' + assert t.column(0)._name == 'a0' def test_table_equals(): @@ -1500,7 
+1518,8 @@ def test_table_from_arrays_preserves_column_metadata(): field1 = pa.field('field2', pa.int64(), nullable=False) table = pa.Table.from_arrays([arr0, arr1], schema=pa.schema([field0, field1])) - assert b"a" in table.field(0).metadata + field0_metadata = table.field(0).metadata + assert field0_metadata is not None and b"a" in field0_metadata assert table.field(1).nullable is False @@ -1565,7 +1584,7 @@ def test_table_get_field(): table.field('d') with pytest.raises(TypeError): - table.field(None) + table.field(None) # type: ignore[arg-type] with pytest.raises(IndexError): table.field(4) @@ -1586,7 +1605,7 @@ def test_table_select_column(): table.column('d') with pytest.raises(TypeError): - table.column(None) + table.column(None) # type: ignore[arg-type] with pytest.raises(IndexError): table.column(4) @@ -1879,22 +1898,41 @@ def test_table_unify_dictionaries(): table = pa.Table.from_batches([batch1, batch2]) table = table.replace_schema_metadata({b"key1": b"value1"}) - assert table.column(0).chunk(0).dictionary.equals( - pa.array(["foo", "bar"])) - assert table.column(0).chunk(1).dictionary.equals( - pa.array(["quux", "foo"])) - assert table.column(1).chunk(0).dictionary.equals( - pa.array([123, 456, 789])) - assert table.column(1).chunk(1).dictionary.equals( - pa.array([456, 789])) + chunk_0_0 = table.column(0).chunk(0) + assert isinstance(chunk_0_0, pa.DictionaryArray) + assert chunk_0_0.dictionary.equals(pa.array(["foo", "bar"])) + + chunk_0_1 = table.column(0).chunk(1) + assert isinstance(chunk_0_1, pa.DictionaryArray) + assert chunk_0_1.dictionary.equals(pa.array(["quux", "foo"])) + + chunk_1_0 = table.column(1).chunk(0) + assert isinstance(chunk_1_0, pa.DictionaryArray) + assert chunk_1_0.dictionary.equals(pa.array([123, 456, 789])) + + chunk_1_1 = table.column(1).chunk(1) + assert isinstance(chunk_1_1, pa.DictionaryArray) + assert chunk_1_1.dictionary.equals(pa.array([456, 789])) table = table.unify_dictionaries(pa.default_memory_pool()) 
expected_dict_0 = pa.array(["foo", "bar", "quux"]) expected_dict_1 = pa.array([123, 456, 789]) - assert table.column(0).chunk(0).dictionary.equals(expected_dict_0) - assert table.column(0).chunk(1).dictionary.equals(expected_dict_0) - assert table.column(1).chunk(0).dictionary.equals(expected_dict_1) - assert table.column(1).chunk(1).dictionary.equals(expected_dict_1) + + chunk_0_0 = table.column(0).chunk(0) + assert isinstance(chunk_0_0, pa.DictionaryArray) + assert chunk_0_0.dictionary.equals(expected_dict_0) + + chunk_0_1 = table.column(0).chunk(1) + assert isinstance(chunk_0_1, pa.DictionaryArray) + assert chunk_0_1.dictionary.equals(expected_dict_0) + + chunk_1_0 = table.column(1).chunk(0) + assert isinstance(chunk_1_0, pa.DictionaryArray) + assert chunk_1_0.dictionary.equals(expected_dict_1) + + chunk_1_1 = table.column(1).chunk(1) + assert isinstance(chunk_1_1, pa.DictionaryArray) + assert chunk_1_1.dictionary.equals(expected_dict_1) assert table.to_pydict() == { 'a': ["foo", "bar", None, "foo", "quux", "foo", None, "quux"], @@ -1964,13 +2002,13 @@ def test_concat_tables_invalid_option(): t = pa.Table.from_arrays([list(range(10))], names=('a',)) with pytest.raises(ValueError, match="Invalid promote_options: invalid"): - pa.concat_tables([t, t], promote_options="invalid") + pa.concat_tables([t, t], promote_options="invalid") # type: ignore[arg-type] def test_concat_tables_none_table(): # ARROW-11997 with pytest.raises(AttributeError): - pa.concat_tables([None]) + pa.concat_tables([None]) # type: ignore[arg-type] @pytest.mark.pandas @@ -2113,7 +2151,7 @@ def test_concat_batches_different_schema(): def test_concat_batches_none_batches(): # ARROW-11997 with pytest.raises(AttributeError): - pa.concat_batches([None]) + pa.concat_batches([None]) # type: ignore[arg-type] @pytest.mark.parametrize( @@ -2264,7 +2302,7 @@ def test_from_arrays_schema(data, klass): # with different and incompatible schema schema = pa.schema([('strs', pa.utf8()), ('floats', 
pa.timestamp('s'))]) with pytest.raises((NotImplementedError, TypeError)): - pa.Table.from_pydict(data, schema=schema) + pa.Table.from_pydict(data, schema=schema) # type: ignore[arg-type] # Cannot pass both schema and metadata / names with pytest.raises(ValueError): @@ -2369,7 +2407,7 @@ def test_table_from_pydict_arrow_arrays(data, klass): # with different and incompatible schema schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))]) with pytest.raises((NotImplementedError, TypeError)): - pa.Table.from_pydict(data, schema=schema) + pa.Table.from_pydict(data, schema=schema) # type: ignore[arg-type] @pytest.mark.parametrize('data, klass', [ @@ -2386,7 +2424,7 @@ def test_table_from_pydict_schema(data, klass): schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64()), ('ints', pa.int64())]) with pytest.raises(KeyError, match='ints'): - pa.Table.from_pydict(data, schema=schema) + pa.Table.from_pydict(data, schema=schema) # type: ignore[arg-type] # data has columns not present in schema -> ignored schema = pa.schema([('strs', pa.utf8())]) @@ -2590,10 +2628,10 @@ def test_table_factory_function_args_pandas(): def test_factory_functions_invalid_input(): with pytest.raises(TypeError, match="Expected pandas DataFrame, python"): - pa.table("invalid input") + pa.table("invalid input") # type: ignore[arg-type] with pytest.raises(TypeError, match="Expected pandas DataFrame"): - pa.record_batch("invalid input") + pa.record_batch("invalid input") # type: ignore[arg-type] def test_table_repr_to_string(): @@ -2727,8 +2765,8 @@ def test_table_function_unicode_schema(): schema = pa.schema([(col_a, pa.int32()), (col_b, pa.string())]) result = pa.table(d, schema=schema) - assert result[0].chunk(0).equals(pa.array([1, 2, 3], type='int32')) - assert result[1].chunk(0).equals(pa.array(['a', 'b', 'c'], type='string')) + assert result.column(0).chunk(0).equals(pa.array([1, 2, 3], type='int32')) + assert result.column(1).chunk(0).equals(pa.array(['a', 'b', 'c'], 
type='string')) def test_table_take_vanilla_functionality(): @@ -3603,7 +3641,7 @@ def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_arr # equals() test with pytest.raises(NotImplementedError): - cuda_chunked_array == cuda_chunked_array + cuda_chunked_array == cuda_chunked_array # type: ignore[reportUnusedExpression] # to_pandas() test with pytest.raises(NotImplementedError): @@ -3860,7 +3898,7 @@ def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, # __dataframe__() test with pytest.raises(NotImplementedError): - from_dataframe(cuda_recordbatch.__dataframe__()) + from_dataframe(cuda_recordbatch.__dataframe__()) # type: ignore[misc] def verify_cuda_table(table, expected_schema): @@ -4059,7 +4097,7 @@ def test_table_non_cpu(cuda_context, cpu_table, cuda_table, # __dataframe__() test with pytest.raises(NotImplementedError): - from_dataframe(cuda_table.__dataframe__()) + from_dataframe(cuda_table.__dataframe__()) # type: ignore[misc] # __reduce__() test with pytest.raises(NotImplementedError): diff --git a/python/pyarrow/tests/test_tensor.py b/python/pyarrow/tests/test_tensor.py index debb1066280..c3726fdbbf4 100644 --- a/python/pyarrow/tests/test_tensor.py +++ b/python/pyarrow/tests/test_tensor.py @@ -213,7 +213,7 @@ def test_tensor_memoryview(): dtype = data.dtype lst = data.tolist() tensor = pa.Tensor.from_numpy(data) - m = memoryview(tensor) + m = memoryview(tensor) # type: ignore[reportArgumentType] assert m.format == expected_format assert m.shape == data.shape assert m.strides == data.strides diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 539f0172454..c224392510d 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -24,16 +24,22 @@ import pytest import hypothesis as h import hypothesis.strategies as st -try: - import hypothesis.extra.pytz as tzst -except ImportError: - tzst = None +from typing import Any, TYPE_CHECKING import 
weakref -try: +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None + import hypothesis.extra.pytz as tzst +else: + try: + import numpy as np + except ImportError: + np = None + try: + import hypothesis.extra.pytz as tzst + except ImportError: + tzst = None + import pyarrow as pa import pyarrow.types as types import pyarrow.tests.strategies as past @@ -411,7 +417,7 @@ def test_tzinfo_to_string_errors(): if tzst: timezones = tzst.timezones() else: - timezones = st.none() + timezones = st.none() # type: ignore[assignment] @h.given(timezones) @@ -465,7 +471,7 @@ class BuggyTimezone2(datetime.tzinfo): def tzname(self, dt): return None - def utcoffset(self, dt): + def utcoffset(self, dt): # type: ignore[override] return "one hour" class BuggyTimezone3(datetime.tzinfo): @@ -473,7 +479,7 @@ class BuggyTimezone3(datetime.tzinfo): Wrong timezone name type """ - def tzname(self, dt): + def tzname(self, dt): # type: ignore[override] return 240 def utcoffset(self, dt): @@ -732,13 +738,13 @@ def test_struct_type(): # Neither integer nor string with pytest.raises(TypeError): - ty[None] + ty[None] # type: ignore[reportArgumentType] with pytest.raises(TypeError): - ty.field(None) + ty.field(None) # type: ignore[reportArgumentType] for a, b in zip(ty, fields): - a == b + assert a == b # Construct from list of tuples ty = pa.struct([('a', pa.int64()), @@ -746,7 +752,7 @@ def test_struct_type(): ('b', pa.int32())]) assert list(ty) == fields for a, b in zip(ty, fields): - a == b + assert a == b # Construct from mapping fields = [pa.field('a', pa.int64()), @@ -755,7 +761,7 @@ def test_struct_type(): ('b', pa.int32())])) assert list(ty) == fields for a, b in zip(ty, fields): - a == b + assert a == b # Invalid args with pytest.raises(TypeError): @@ -862,7 +868,7 @@ def test_dictionary_type(): # invalid index type raises with pytest.raises(TypeError): - pa.dictionary(pa.string(), pa.int64()) + pa.dictionary(pa.string(), pa.int64()) # type: ignore[reportArgumentType] def 
test_dictionary_ordered_equals(): @@ -951,7 +957,7 @@ def test_run_end_encoded_type(): pa.run_end_encoded(None, pa.utf8()) with pytest.raises(ValueError): - pa.run_end_encoded(pa.int8(), pa.utf8()) + pa.run_end_encoded(pa.int8(), pa.utf8()) # type: ignore[reportArgumentType] @pytest.mark.parametrize('t,check_func', [ @@ -1084,12 +1090,12 @@ def test_timedelta_overflow(): pa.scalar(d, type=pa.duration('ns')) # microsecond resolution, not overflow - pa.scalar(d, type=pa.duration('us')).as_py() == d + assert pa.scalar(d, type=pa.duration('us')).as_py() == d # second/millisecond resolution, not overflow for d in [datetime.timedelta.min, datetime.timedelta.max]: - pa.scalar(d, type=pa.duration('ms')).as_py() == d - pa.scalar(d, type=pa.duration('s')).as_py() == d + _ = pa.scalar(d, type=pa.duration('ms')).as_py() == d + _ = pa.scalar(d, type=pa.duration('s')).as_py() == d def test_type_equality_operators(): @@ -1127,11 +1133,11 @@ def test_key_value_metadata(): assert m1 != {'a': 'A', 'b': 'C'} with pytest.raises(TypeError): - pa.KeyValueMetadata({'a': 1}) + pa.KeyValueMetadata({'a': 1}) # type: ignore[reportArgumentType] with pytest.raises(TypeError): - pa.KeyValueMetadata({1: 'a'}) + pa.KeyValueMetadata({1: 'a'}) # type: ignore[reportArgumentType] with pytest.raises(TypeError): - pa.KeyValueMetadata(a=1) + pa.KeyValueMetadata(a=1) # type: ignore[reportArgumentType] expected = [(b'a', b'A'), (b'b', b'B')] result = [(k, v) for k, v in m3.items()] @@ -1258,6 +1264,7 @@ def test_field_metadata(): assert f1.metadata is None assert f2.metadata == {} + assert f3.metadata is not None assert f3.metadata[b'bizz'] == b'bazz' @@ -1394,7 +1401,7 @@ def __arrow_c_schema__(self): return self.schema.__arrow_c_schema__() -class SchemaMapping(Mapping): +class SchemaMapping(Mapping[Any, Any]): def __init__(self, schema): self.schema = schema