diff --git a/Cargo.lock b/Cargo.lock index 72e567c..e2cdce2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -790,9 +790,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" dependencies = [ "serde", ] @@ -4934,6 +4934,7 @@ dependencies = [ name = "zarrista" version = "0.1.0-beta.2" dependencies = [ + "bytes", "half", "icechunk", "ndarray", diff --git a/Cargo.toml b/Cargo.toml index 61072aa..0a1f8b4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ crate-type = ["cdylib"] abi3-py311 = ["pyo3/abi3-py311"] [dependencies] +bytes = "1.12.0" half = "2" icechunk = "2.0.0" ndarray = "0.17.2" diff --git a/dev-docs/specs/2026-06-18-arrow-export-design.md b/dev-docs/specs/2026-06-18-arrow-export-design.md new file mode 100644 index 0000000..0798082 --- /dev/null +++ b/dev-docs/specs/2026-06-18-arrow-export-design.md @@ -0,0 +1,217 @@ +# zarrista — Arrow export faces for the decoded result types + +**Date:** 2026-06-18 +**Status:** Approved (architecture realigned to the +[decoded result types](2026-06-18-data-result-types-design.md) spike) + +## Framing: Arrow is one face on the result types + +Reads no longer return a single `Data` object. Per +[decoded result types](2026-06-18-data-result-types-design.md), a read returns one +of **four concrete classes** built from the raw post-codec `ArrayBytes`, each +holding `bytes::Bytes` (refcounted, zero-copy): + +| class | `ArrayBytes` layout | payload held | +|---|---|---| +| `Tensor` | `Fixed` | values | +| `VariableArray` | `Variable` | values + offsets | +| `MaskedTensor` | `Optional(Fixed)` | values + mask | +| `MaskedVariableArray` | `Optional(Variable)` | values + offsets + mask | + +These are **format-neutral payloads**; consumers reach them through thin, +**co-equal export faces** — today the buffer protocol (`to_numpy`/`buffer()` on +`Tensor`), with this spec adding **Arrow** (the PyCapsule interface), and **DLPack** +a likely future face. No decision about the result types is made to serve one +format; each face adapts to the payload, not the reverse. + +This matters because **no single interchange format covers all Zarr dtypes** (see +the coverage matrix). Arrow uniquely expresses the variable-length and masked +layouts the buffer protocol can't; DLPack uniquely expresses exotic numerics +(complex, bfloat16) Arrow can't; the buffer protocol is the universal fixed-width +baseline. Several faces are a necessity, not a luxury — and none is privileged. + +Arrow specifically is **additive and exploratory**: not a strong pull in the +zarr-nd community, not the organizing principle of the library. Its concrete payoff +is the **zero-copy variable-length / masked** path, where the buffer protocol simply +cannot represent the data. + +## Goal + +Implement the Arrow PyCapsule interface (`__arrow_c_array__` / `__arrow_c_schema__`) +on the result classes so a decoded chunk/subset can be handed to pyarrow, polars, +arro3, datafusion, etc. **`Tensor` first** (the dtypes that already round-trip); +`VariableArray` and the masked classes follow as their dtypes land, but their Arrow +mapping is designed here because they're where Arrow earns its keep. + +## Coverage matrix (why several faces) + +The full zarrs dtype set is: bool; int 8/16/32/64 + int2/int4; uint 8/16/32/64 + +uint2/uint4; float 16/32/64 + subfloat (float8); complex64/128; raw_bits; +fixed_length_utf32; string; bytes; datetime64 / timedelta64. There is **no +struct/compound and no dictionary/categorical dtype** — Zarr v2's compound dtypes are +only a v3 *extension* point (zarrs models the closest, opaque case as `raw_bits` = +numpy `void`), and "categorical" in Zarr is a *codec/filter* (`numcodecs.Categorize`) +yielding an integer array, not a dtype. So Arrow's nested/`Struct`/`Dictionary` +strengths have nothing to map *from* in Zarr today. + +| Zarr dtype | buffer protocol | Arrow | DLPack (future) | +|---|---|---|---| +| int / uint / float32-64 | ✅ | ✅ | ✅ | +| float16 | ✅ | ✅ `Float16` | ✅ | +| bool | ✅ (1 byte) | ✅ `arrow.bool8` | ✅ | +| datetime64 / timedelta64 | ✅ (as int64) | ⚠️ `Timestamp`/`Duration` — zero-copy only for s/ms/us/ns without NaT (see note) | ✅ | +| variable-length string/bytes | ❌ | ✅ `Utf8`/`Binary` | ❌ | +| complex64/128 | ✅ (2 floats) | ❌ no native complex | ✅ | +| subfloat (float8) / bfloat16 | ⚠️ no PEP-3118 code | ❌ | ✅ | +| int2/4, uint2/4 (sub-byte) | ❌ | ❌ | ❌ (awkward everywhere) | + +For Zarr's actual dtypes, Arrow's unique value is **variable-length string/bytes** and +**masked** data, plus a **semantic** bonus on the temporal types; DLPack uniquely +covers the exotic numerics; the buffer protocol is the universal fixed-width baseline. + +## Per-class Arrow mapping + +The result class (set by the `ArrayBytes` layout) determines the Arrow array kind; +the dtype refines it. Values are always **zero-copy** (a `bytes::Bytes` clone); +only the small side buffers (offsets, validity) are ever copied. + +| class | Arrow array | values | offsets | validity | +|---|---|---|---|---| +| `Tensor` | primitive (+ `arrow.bool8` for bool) | zero-copy | — | — | +| `VariableArray` | `Utf8`/`LargeUtf8` (string) or `Binary`/`LargeBinary` (bytes) | zero-copy | `usize` → `i32`/`i64` (copy) | — | +| `MaskedTensor` | primitive **+ validity bitmap** | zero-copy | — | byte-mask → bitmap (copy) | +| `MaskedVariableArray` | `Utf8`/`Binary` **+ validity bitmap** | zero-copy | `usize` → `i32`/`i64` (copy) | byte-mask → bitmap (copy) | + +Two layout conversions are intrinsic to Arrow and unavoidable (but cheap — side +buffers, not bulk data): + +- **Offsets.** zarrs hands variable-length offsets as `usize` (currently materialized + to `Vec`; zarrs#406 tracks avoiding that). Arrow needs `i32` (`Utf8`/`Binary`) + or `i64` (`LargeUtf8`/`LargeBinary`). Pick the `Large` variant when the last offset + exceeds `i32::MAX`, else `i32`; either way it's a copy of the small offsets array. +- **Validity.** zarrs `Optional` carries a mask of **1 byte per element** (0 = invalid, + non-zero = valid). Arrow validity is **1 bit per element** (1 = valid). So masked + export **bit-packs** the byte-mask into a validity bitmap — `bit[i] = mask[i] != 0` — + a small copy, and derives `null_count`. (This is the inverse of the `bool8` case, + where Arrow happens to match our byte layout and *no* packing is needed.) + +### `bool` via `arrow.bool8` (zero-copy) + +`Tensor` of `bool` uses the +[`arrow.bool8` canonical extension](https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean) +(int8 storage, 0=false / non-zero=true) — exactly our 1-byte-per-bool layout. So bool +values export **zero-copy**, no bit-packing. Consumers that don't understand the +extension degrade gracefully to the int8 storage. (Note the asymmetry: a `bool` +*value* column needs no packing via `bool8`, but a *validity mask* always bit-packs, +because Arrow validity is defined as a bitmap.) + +### `Tensor` dtype mapping (first round) + +| dtype | Arrow type | +|---|---| +| int8/16/32/64 | `Int8/16/32/64` | +| uint8/16/32/64 | `UInt8/16/32/64` | +| float32/64 | `Float32/64` | +| float16 | `Float16` | +| bool | `Int8` + `arrow.bool8` | + +## Zero-copy mechanism (simpler now, thanks to `bytes::Bytes`) + +Because the payload is `bytes::Bytes` (refcounted), the values buffer needs **no +`Py_INCREF`/release-callback dance**. Build the arrow-rs `Buffer` over the bytes via +`Buffer::from_custom_allocation(ptr, len, owner)`, where `owner` is a cloned +`bytes::Bytes` (an `Arc`-like handle) — the Arrow buffer keeps the allocation alive by +holding its own refcount, dropped when the consumer releases the Arrow array. +`pyo3-arrow` wraps the resulting array in the PyCapsule. One allocation backs the +buffer protocol, `to_numpy`, and Arrow simultaneously via cheap `Bytes` clones. + +## Contiguity & alignment + +The values payload is a single contiguous `bytes::Bytes` blob in C-order — so unlike +the old `ArrayD` design there is **no stride/non-contiguity case to handle**; +Arrow's contiguity requirement is always satisfied for the values buffer. + +The live concern is **alignment**, and the decision (from the result-types spec) is +**stay unaligned**: the bytes come straight from the decode allocator, contractually +1-byte aligned (de-facto 16 from malloc). Consequences for Arrow: + +- **Correctness:** unaffected. Arrow's C Data Interface treats 64-byte alignment as + *advisory* (≈8-byte floor). +- **Performance:** a SIMD kernel or strict validator (some pyarrow paths) may silently + **re-copy to realign**, turning a zero-copy export into one consumer-side copy. +- **Owning consumers** (`pa.array(...)` that materializes, compute kernels) pay any + realign copy as part of work they were already doing — we never pay a *dedicated* + alignment copy. + +If it ever bites, expose an explicit aligned-copy path rather than aligning every +read (the result-types spec's lazy-alignment stance). + +## Side-channel shape + +Arrow arrays are logically 1-D. Each result class exposes its data as a **flat, +length-`prod(shape)` Arrow array** plus its existing **`shape`** property; consumers +reshape if they care. We deliberately do **not** use `arrow.fixed_shape_tensor` (its +semantics are "batch of tensors", leading dim = batch — wrong for a single chunk). + +## API surface + +On each result class (`Tensor` first): + +```python +obj.__arrow_c_array__(requested_schema=None) -> (schema_capsule, array_capsule) +obj.__arrow_c_schema__() -> schema_capsule +obj.shape # already present; the Arrow array is flat length prod(shape) +``` + +`pa.array(obj)`, `pl.Series(obj)`, `arro3.core.Array.from_arrow(obj)` all work via the +capsule protocol; recover N-D structure by combining with `obj.shape`. + +**Optional introspection** (revisit when implementing): a per-class `arrow_copy: bool` +could advertise whether an export copies bulk data. With the `bytes::Bytes` design the +values are always zero-copy, so this would only ever flag alignment realign risk or +the (always-copied) side buffers — lower value than under the old design, so it's +deferred rather than specified. + +## Tooling + +Use [`pyo3-arrow`](https://crates.io/crates/pyo3-arrow) to build the Arrow arrays and +expose the capsules — no pyarrow dependency in Rust. (Separately, `dlpark`'s pyo3 +feature pins pyo3 0.25 vs our 0.29 — see the result-types spec — but that's a DLPack +concern, not Arrow.) + +## Testing + +- **Round-trip vs. zarr-python** (extends the existing harness): write numeric arrays, + read with zarrista, assert the Arrow export of `Tensor` matches — e.g. + `pa.array(tensor)` (or arro3) reshaped via `tensor.shape` equals the zarr-python + numpy array. Cover every numeric dtype and `bool` (verifying `arrow.bool8`), plus a + multi-dim chunk for flat-array + `shape`. +- **Zero-copy assertion:** the Arrow values buffer pointer equals `tensor.buffer()`'s + pointer (no copy), and the Arrow array outlives a dropped Python reference (the + `Bytes` refcount holds). +- **When variable/masked land:** assert `VariableArray` → `Utf8`/`Binary` with correct + offsets, and masked → correct validity bitmap + `null_count` from the byte-mask. +- **Tooling:** `maturin develop`; `uv run --no-project pytest`. + +## Out of scope (deferred) + +- `VariableArray` / masked Arrow export (designed above; lands with the + variable-length and nullable dtype work). +- DLPack export (co-equal future face; covers complex/bfloat16 Arrow can't). +- Arrow *import* / writing. +- complex / float8 / bfloat16 dtypes (no Arrow representation — DLPack territory). +- temporal dtypes (datetime64 / timedelta64 → `Timestamp`/`Duration`) — a natural + Arrow follow-up, but **not uniformly zero-copy**: both are int64/LE/epoch-based so + the values buffer matches, but (1) Arrow supports only s/ms/us/ns — calendar/other + units (D/h/m/W/M/Y/sub-ns) need a unit cast and W/M/Y have no faithful Arrow + representation; and (2) numpy encodes NaT as the `INT64_MIN` sentinel while Arrow + uses a validity bitmap, so any NaT (a valid Zarr fill value) forces an O(n) scan to + build a bitmap. Zero-copy only holds for unit ∈ {s, ms, us, ns} with no NaT. + +## Risks + +- **Buffer alignment.** As above: correctness fine, possible silent consumer-side + realign copy; `to_numpy` unaffected (buffer protocol has no alignment requirement). +- **`arrow.bool8` consumer support.** A relatively recent canonical extension; older + consumers see int8 storage rather than boolean. Acceptable (graceful degradation). +- **`pyo3-arrow` / arrow-rs version coupling.** Pin deliberately. diff --git a/dev-docs/specs/2026-06-18-data-result-types-design.md b/dev-docs/specs/2026-06-18-data-result-types-design.md new file mode 100644 index 0000000..4dd8726 --- /dev/null +++ b/dev-docs/specs/2026-06-18-data-result-types-design.md @@ -0,0 +1,151 @@ +# zarrista — decoded result types (ArrayBytes-backed `Data`) + +**Date:** 2026-06-18 +**Status:** Prototyping (spike on branch `kyle/dlpack-tensor-spike`) + +## Goal + +Reorient what a read returns. Today a read decodes into a typed +`ndarray::ArrayD` wrapped in a single `Data` class, dispatched by a 12-arm +per-dtype macro. Replace that with a **format-neutral, zero-copy** payload built +from the raw post-codec [`ArrayBytes`], surfaced as **four concrete Python result +classes**. This collapses the dispatch to a single generic retrieval and sets up +the multi-protocol faces (buffer protocol now; Arrow, DLPack later — see +[arrow-export-design](2026-06-18-arrow-export-design.md)). + +## Why (key findings that drove this) + +- **The typed path copies *every* buffer.** `retrieve::>` / + `::>` go through `convert_from_bytes_slice` → + `bytemuck::allocation::pod_collect_to_vec`, which **unconditionally** allocates + a fresh aligned `Vec` and `copy_from_slice`s into it — no alignment fast + path. So today's `Data` pays a full copy per read. Retrieving raw `ArrayBytes` + avoids it. +- **`FromArrayBytes` is post-codec.** All retrieval funnels through + `retrieve_array_subset_opt`, which runs the *entire* codec pipeline and then + calls `T::from_array_bytes(bytes, shape, data_type)`. Implementing our own + `FromArrayBytes` customizes only the final "bytes → our type" step; it does + **not** skip codecs. zarrs also passes us the region shape, so we never + re-derive it. +- **`ArrayBytes` has three layouts, and `Tensor`/`into_fixed` only handles one.** + `ArrayBytes` is `Fixed | Variable | Optional` (the last is data + a 1-byte/elem + validity mask). zarrs's own `Tensor` does `bytes.into_fixed()`, which errors for + both `Variable` and `Optional`. So `Tensor` is fixed-dense-only; we need our own + type to represent all three. +- **`ArrayBytes` does no alignment handling.** It's `Cow<'static, [u8]>` straight + from the decode allocation — contractually 1-byte aligned (de-facto 16 from + system malloc; a borrowed shard sub-slice could be less). The alignment we have + *today* is purely the side effect of the typed-path copy above. (See + [Alignment](#alignment).) + +## Architecture + +### One Rust type in, four Python types out + +We implement `FromArrayBytes` for an internal `Decoded` enum, retrieve +`::` (one call, no macro), and convert to the matching Python class: + +| `ArrayBytes` variant | Python class | exposes | +|---|---|---| +| `Fixed` | **`Tensor`** | buffer protocol + `to_numpy` (+ Arrow/DLPack later) | +| `Variable` | **`VariableArray`** | (skeleton; Arrow later) | +| `Optional(Fixed)` | **`MaskedTensor`** | (skeleton; Arrow-with-validity later) | +| `Optional(Variable)` | **`MaskedVariableArray`** | (skeleton) | + +**Separate classes, not one union class with conditionally-erroring methods.** +The type *is* the information the consumer needs (`isinstance`), each class +implements exactly the faces it can support, and it mirrors numpy (`MaskedArray` +is its own type) and Arrow (typed array classes). The "union" lives only as the +throwaway `Decoded` enum at the FFI seam, surfaced via `IntoPyObject` so both the +sync and async retrieve paths just return `Decoded`. + +### Zero-copy payload + +`Tensor` holds the decoded bytes as `bytes::Bytes` (refcounted, cheaply cloned), +obtained by moving the `Fixed` `Cow::Owned(Vec)` into `Bytes` (zero-copy). +Every face takes a cheap `Bytes` clone: + +- **buffer protocol:** `pyo3_bytes::PyBytes::new(bytes.clone())` — zero-copy, + already implements the buffer protocol. +- **`to_numpy`:** `np.frombuffer(PyBytes, dtype).reshape(shape)`, where `dtype` is + the zarr v3 dtype name (numpy accepts the same names for the fixed numerics) and + the bytes are native-endian (matching numpy's default). + +This is the "Rust hands over bytes, Python interprets the dtype" model: numpy owns +the reinterpretation, so we never do a Rust-side `Vec→Vec` cast (which would +require alignment — the exact pain hit in +[async-tiff#165](https://github.com/developmentseed/async-tiff/pull/165), which +fell back to `bytemuck::try_cast_vec` + copy). + +## Alignment + +**Decision: do not align up front.** Buffers are whatever the decode allocator +produced (unaligned in the worst case). + +- **Buffer protocol:** no alignment requirement; numpy handles unaligned. +- **`np.frombuffer` / view consumers:** numpy views unaligned bytes (sets + `aligned=False`); correct, and fine on x86-64/aarch64 where unaligned loads are + cheap. **No copy.** +- **Owning consumers** (`np.array(data)`, `.copy()`, most ops): numpy allocates + its own *aligned* buffer and copies into it — so alignment is **fused into the + copy they were already doing**. We never pay a *dedicated* alignment copy. +- **Arrow:** recommends 64-byte but the C Data Interface relaxes to advisory; + pyarrow mostly tolerates, occasionally realign-copies in strict kernels. + +Rejected alternatives: changing zarrs to allocate aligned output (not a one-liner; +allocations spread across codecs, and borrowed shard sub-slices can't align without +a copy — worth an upstream feature request, not a local fix); and +`retrieve_*_into` with our own aligned buffer (doesn't support variable-length and +needs `unsafe` — not adopting yet). If a consumer ever needs guaranteed alignment, +expose an explicit aligned-copy method rather than copying on every read. + +## Dispatch collapse (the payoff) + +Sync and async `retrieve_array_subset` / `retrieve_chunk` each become a single +`retrieve::(…)` call — deleting the 12-arm `for_each_dtype!` macro, the +`DataInner` enum, and the hand-rolled `unsafe` buffer-protocol code in `data.rs`. +`Decoded: IntoPyObject` builds the right class. + +## Faces: now vs. later + +- **Now:** buffer protocol + `to_numpy` on `Tensor`. The other three classes + carry their full decoded payload (values bytes + offsets and/or mask via + `ArrayBytes`/`Optional` `into_parts()`) plus `shape`/`dtype`, but don't yet + expose it to numpy — `to_numpy` raises `NotImplementedError`. (Offsets are + currently copied to `Vec`; zarrs#406 tracks avoiding that.) +- **Later (own specs):** Arrow `__arrow_c_array__` on all four (variable-length and + masked are where Arrow earns its keep — zero-copy `String`/`Binary` + validity + bitmaps); DLPack `__dlpack__` on `Tensor`/`MaskedTensor`. +- **Zero-copy introspection** (`contiguous`, `arrow_copy`, `buffer_protocol_copy`) + from the Arrow spec applies here too. + +## DLPack note (deferred, with a real blocker) + +zarrs has a `dlpack` feature wiring its `Tensor` to the `dlpark` crate +(`SafeManagedTensorVersioned`). But **`dlpark` 0.6's `pyo3` feature pins pyo3 +0.25, and we're on pyo3 0.29** — two pyo3 versions can't share one extension +module. So we cannot use `dlpark`'s capsule helper directly; DLPack would mean +hand-rolling the `PyCapsule` (name `dltensor`/`dltensor_versioned` + a deleter +calling the managed tensor's `deleter`) over `dlpark`'s core (which *is* pyo3-free +and usable), or waiting for a `dlpark` pyo3 bump. DLPack also only covers the plain +numerics + bf16 (no complex, no var-length), so it's one face among several, not a +replacement. Deferred. + +## Naming + +Provisional: `Tensor`, `VariableArray`, `MaskedTensor`, `MaskedVariableArray`. +("Tensor" deliberately names only the fixed-dense class, not the umbrella — there +is no single umbrella class by design.) + +## Testing + +- Round-trip vs. zarr-python: write fixed numeric arrays, read with zarrista, + assert `Tensor.to_numpy()` (and `np.frombuffer(tensor.buffer())`) equal the + zarr-python array across dtypes/shapes, including a multi-dim chunk. +- Assert a variable-length / masked array returns the correct class (not `Tensor`). +- `maturin develop`; `uv run --no-project pytest`. + +## Out of scope (this spike) + +Exposing data from `VariableArray`/`MaskedTensor`/`MaskedVariableArray`; Arrow and +DLPack faces; complex/temporal/sub-byte dtype numpy mappings; writing. diff --git a/docs/api/data.md b/docs/api/data.md deleted file mode 100644 index 02e0d60..0000000 --- a/docs/api/data.md +++ /dev/null @@ -1,5 +0,0 @@ -# Data - -::: zarrista.Data - options: - show_bases: false diff --git a/docs/api/decoded_array.md b/docs/api/decoded_array.md new file mode 100644 index 0000000..9184757 --- /dev/null +++ b/docs/api/decoded_array.md @@ -0,0 +1,41 @@ +# DecodedArray + +Reading from an [`Array`][zarrista.Array] (via `retrieve_array_subset`, +`retrieve_chunk`, or `[...]`) returns a `DecodedArray`: one of four concrete result +types, chosen by the decoded byte layout of the dtype. Use `isinstance` to narrow to +a concrete type before calling its layout-specific methods. + +- [`Tensor`](#zarrista.Tensor) — fixed-width, dense data. +- [`VariableArray`](#zarrista.VariableArray) — variable-length data (e.g. strings or + bytes). +- [`MaskedTensor`](#zarrista.MaskedTensor) — fixed-width data with a validity mask. +- [`MaskedVariableArray`](#zarrista.MaskedVariableArray) — variable-length data with a + validity mask. + +::: zarrista.DecodedArray + options: + show_bases: false + +## Tensor + +::: zarrista.Tensor + options: + show_bases: false + +## VariableArray + +::: zarrista.VariableArray + options: + show_bases: false + +## MaskedTensor + +::: zarrista.MaskedTensor + options: + show_bases: false + +## MaskedVariableArray + +::: zarrista.MaskedVariableArray + options: + show_bases: false diff --git a/mkdocs.yml b/mkdocs.yml index 5fe7a36..2e37dc6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -26,7 +26,7 @@ nav: - api/store.md - api/chunk-grid.md - api/codec.md - - api/data.md + - api/decoded_array.md - api/dtype.md - Blog: - blog/index.md diff --git a/python/zarrista/__init__.py b/python/zarrista/__init__.py index 984981c..4517188 100644 --- a/python/zarrista/__init__.py +++ b/python/zarrista/__init__.py @@ -1,3 +1,5 @@ +from typing import TypeAlias + from . import codec, exceptions from ._zarrista import ( Array, @@ -5,27 +7,43 @@ AsyncArray, AsyncGroup, ChunkGrid, - Data, DataType, FilesystemStore, FillValue, Group, + MaskedTensor, + MaskedVariableArray, MemoryStore, + Tensor, + VariableArray, __version__, ) +DecodedArray: TypeAlias = Tensor | VariableArray | MaskedTensor | MaskedVariableArray +"""The result of a read: one of the four decoded array layouts. + +Which one is returned depends on the dtype's byte layout (fixed vs. variable, and +whether it carries a validity mask). Use `isinstance` to narrow to a concrete +type before using layout-specific methods. +""" + + __all__ = [ "Array", "ArrayBytes", "AsyncArray", "AsyncGroup", + "DecodedArray", "ChunkGrid", - "Data", "DataType", "FillValue", "FilesystemStore", "Group", + "MaskedTensor", + "MaskedVariableArray", "MemoryStore", + "Tensor", + "VariableArray", "__version__", "codec", "exceptions", diff --git a/python/zarrista/_array.pyi b/python/zarrista/_array.pyi index 28a8f84..3d8d1c5 100644 --- a/python/zarrista/_array.pyi +++ b/python/zarrista/_array.pyi @@ -5,7 +5,7 @@ from zarrista.codec import CodecOptions from ._chunks import ChunkGrid from ._codec import CodecChain -from ._data import Data +from ._decoded_array import DecodedArray from ._dtype import DataType from ._store import AsyncStore, FilesystemStore, MemoryStore @@ -51,7 +51,7 @@ class Array: """The array's path in the store.""" def retrieve_array_subset( self, selection: Selection, **codec_options: Unpack[CodecOptions] - ) -> Data: + ) -> DecodedArray: """Read and decode an array region selected with numpy-style basic indexing. The result is ndim-preserving (consistent with a zarrs `ArraySubset`): an @@ -61,7 +61,7 @@ class Array: """ def retrieve_chunk( self, chunk_indices: list[int], **codec_options: Unpack[CodecOptions] - ) -> Data: + ) -> DecodedArray: """Read and decode the chunk at the given chunk grid indices. Keyword arguments are passed as [`CodecOptions`][zarrista.codec.CodecOptions]. @@ -69,7 +69,7 @@ class Array: @property def shape(self) -> list[int]: """The array shape.""" - def __getitem__(self, selection: Selection) -> Data: + def __getitem__(self, selection: Selection) -> DecodedArray: """Read a region with numpy-style basic indexing, e.g. `arr[0:10, :, 5]`. Sugar for `retrieve_array_subset`. @@ -111,7 +111,7 @@ class AsyncArray: """The array's path in the store.""" async def retrieve_array_subset( self, selection: Selection, **codec_options: Unpack[CodecOptions] - ) -> Data: + ) -> DecodedArray: """Read and decode an array region selected with numpy-style basic indexing. The result is ndim-preserving (consistent with a zarrs `ArraySubset`): an @@ -121,7 +121,7 @@ class AsyncArray: """ async def retrieve_chunk( self, chunk_indices: list[int], **codec_options: Unpack[CodecOptions] - ) -> Data: + ) -> DecodedArray: """Read and decode the chunk at the given chunk grid indices. Keyword arguments are passed as [`CodecOptions`][zarrista.codec.CodecOptions]. @@ -129,7 +129,7 @@ class AsyncArray: @property def shape(self) -> list[int]: """The array shape.""" - async def __getitem__(self, selection: Selection) -> Data: + async def __getitem__(self, selection: Selection) -> DecodedArray: """Read a region with numpy-style basic indexing: `await arr[0:10, :, 5]`. Sugar for `retrieve_array_subset`. diff --git a/python/zarrista/_data.pyi b/python/zarrista/_data.pyi deleted file mode 100644 index dc30c8c..0000000 --- a/python/zarrista/_data.pyi +++ /dev/null @@ -1,25 +0,0 @@ -import sys -from typing import Any - -from numpy.typing import NDArray - -if sys.version_info >= (3, 12): - from collections.abc import Buffer -else: - from typing_extensions import Buffer - -class Data(Buffer): - """A decoded chunk of array data. - - Implements the Python buffer protocol (PEP 3118), so dtypes with a native - buffer representation can be read zero-copy via `memoryview(data)` or - `np.asarray(data)`. Dtypes without one raise `BufferError`. - """ - - def __buffer__(self, flags: int) -> memoryview: ... - def to_numpy(self) -> NDArray[Any]: - """Convert the chunk into a NumPy array. - - Zero-copy when the dtype supports the buffer protocol; otherwise the - data is copied and converted. - """ diff --git a/python/zarrista/_decoded_array.pyi b/python/zarrista/_decoded_array.pyi new file mode 100644 index 0000000..83bcc1a --- /dev/null +++ b/python/zarrista/_decoded_array.pyi @@ -0,0 +1,79 @@ +import sys +from typing import Any, TypeAlias + +from numpy.typing import NDArray + +from ._dtype import DataType + +if sys.version_info >= (3, 12): + from collections.abc import Buffer +else: + from typing_extensions import Buffer + +class Tensor: + """Fixed-width, dense decoded array data. + + The decoded bytes are held zero-copy. Reinterpret them as a NumPy array with + `to_numpy()`, or get the raw bytes as a buffer-protocol object via `buffer()`. + """ + + @property + def shape(self) -> list[int]: + """The shape of the decoded region.""" + @property + def dtype(self) -> DataType: + """The Zarr data type.""" + def buffer(self) -> Buffer: + """The raw decoded bytes as a zero-copy buffer-protocol object.""" + def to_numpy(self) -> NDArray[Any]: + """Access a NumPy array view over Rust memory. + + This is a zero-copy view via `np.frombuffer`. + """ + +class VariableArray: + """Variable-length decoded data (e.g. strings or bytes). + + Not yet exposed to NumPy. + """ + + @property + def shape(self) -> list[int]: + """The shape of the decoded region.""" + @property + def dtype(self) -> DataType: + """The Zarr data type.""" + +class MaskedTensor: + """Fixed-width decoded data with a validity mask. + + Not yet exposed to NumPy. + """ + + @property + def shape(self) -> list[int]: + """The shape of the decoded region.""" + @property + def dtype(self) -> DataType: + """The Zarr data type.""" + +class MaskedVariableArray: + """Variable-length decoded data with a validity mask. + + Not yet exposed to NumPy. + """ + + @property + def shape(self) -> list[int]: + """The shape of the decoded region.""" + @property + def dtype(self) -> DataType: + """The Zarr data type.""" + +DecodedArray: TypeAlias = Tensor | VariableArray | MaskedTensor | MaskedVariableArray +"""The result of a read: one of the four decoded array layouts. + +Which one is returned depends on the dtype's byte layout (fixed vs. variable, and +whether it carries a validity mask). Use `isinstance` to narrow to a concrete +type before using layout-specific methods. +""" diff --git a/python/zarrista/_zarrista.pyi b/python/zarrista/_zarrista.pyi index c6783a8..0ff4154 100644 --- a/python/zarrista/_zarrista.pyi +++ b/python/zarrista/_zarrista.pyi @@ -1,7 +1,7 @@ from ._array import Array, AsyncArray from ._array_bytes import ArrayBytes from ._chunks import ChunkGrid -from ._data import Data +from ._decoded_array import MaskedTensor, MaskedVariableArray, Tensor, VariableArray from ._dtype import DataType from ._fill_value import FillValue from ._group import AsyncGroup, Group @@ -15,11 +15,14 @@ __all__ = [ "AsyncArray", "AsyncGroup", "ChunkGrid", - "Data", "DataType", "FillValue", "FilesystemStore", "Group", + "MaskedTensor", + "MaskedVariableArray", "MemoryStore", + "Tensor", + "VariableArray", "__version__", ] diff --git a/src/array/async.rs b/src/array/async.rs index ec8a1e0..b8e6a80 100644 --- a/src/array/async.rs +++ b/src/array/async.rs @@ -6,13 +6,11 @@ use crate::array::selection::PySelection; use crate::array::util::PyChunkIndices; use crate::chunks::PyChunkGrid; use crate::codec::{PyCodecChain, PyCodecOptions}; -use crate::data::{for_each_dtype, DataInner, PyData}; +use crate::decoded_array::DecodedArray; use crate::dtype::PyDataType; use crate::error::ZarristaError; use crate::node::PyNodePath; use crate::storage::PyAsyncStorage; -use ndarray::ArrayD; -use pyo3::exceptions::PyNotImplementedError; use pyo3::prelude::*; use pyo3_async_runtimes::tokio::future_into_py; use pyo3_bytes::PyBytes; @@ -123,30 +121,15 @@ impl PyAsyncArray { py: Python<'py>, selection: PySelection, ) -> PyResult> { - use zarrs::array::data_type::*; - let inner = self.inner.clone(); let array_subset = selection.to_array_subset(inner.shape())?; future_into_py(py, async move { - let dtype = inner.data_type(); - - macro_rules! arm { - ($dtype:ty, $variant:ident, $elem:ty) => { - if dtype.is::<$dtype>() { - let data = inner - .async_retrieve_array_subset::>(&array_subset) - .await - .map_err(ZarristaError::from)?; - return Ok(PyData::from(DataInner::$variant(data))); - } - }; - } - for_each_dtype!(arm); - - Err(PyNotImplementedError::new_err(format!( - "reading data type {dtype} is not supported yet" - ))) + let decoded = inner + .async_retrieve_array_subset::(&array_subset) + .await + .map_err(ZarristaError::from)?; + Ok(decoded) }) } @@ -157,35 +140,17 @@ impl PyAsyncArray { chunk_indices: PyChunkIndices, codec_options: Option, ) -> PyResult> { - use zarrs::array::data_type::*; - let inner = self.inner.clone(); let codec_options = codec_options .map(|opts| opts.into_inner()) .unwrap_or_default(); future_into_py(py, async move { - let dtype = inner.data_type(); - - macro_rules! arm { - ($dtype:ty, $variant:ident, $elem:ty) => { - if dtype.is::<$dtype>() { - let chunk = inner - .async_retrieve_chunk_opt::>( - chunk_indices.as_ref(), - &codec_options, - ) - .await - .map_err(ZarristaError::from)?; - return Ok(PyData::from(DataInner::$variant(chunk))); - } - }; - } - for_each_dtype!(arm); - - Err(PyNotImplementedError::new_err(format!( - "reading data type {dtype} is not supported yet" - ))) + let decoded = inner + .async_retrieve_chunk_opt::(chunk_indices.as_ref(), &codec_options) + .await + .map_err(ZarristaError::from)?; + Ok(decoded) }) } diff --git a/src/array/sync.rs b/src/array/sync.rs index 5d45c66..32387c2 100644 --- a/src/array/sync.rs +++ b/src/array/sync.rs @@ -4,13 +4,11 @@ use crate::array::selection::PySelection; use crate::array::util::PyChunkIndices; use crate::chunks::PyChunkGrid; use crate::codec::{PyCodecChain, PyCodecOptions}; -use crate::data::{for_each_dtype, DataInner, PyData}; +use crate::decoded_array::DecodedArray; use crate::dtype::PyDataType; use crate::error::ZarristaResult; use crate::node::PyNodePath; use crate::storage::PySyncStorage; -use ndarray::ArrayD; -use pyo3::exceptions::PyNotImplementedError; use pyo3::prelude::*; use pyo3_bytes::PyBytes; use pythonize::pythonize; @@ -33,7 +31,7 @@ impl PyArray { #[pymethods] impl PyArray { /// Read a region with numpy-style basic indexing, e.g. `arr[0:10, :, 5]`. - fn __getitem__(&self, selection: PySelection) -> ZarristaResult { + fn __getitem__(&self, selection: PySelection) -> ZarristaResult { self.retrieve_array_subset(selection) } @@ -101,29 +99,13 @@ impl PyArray { self.inner.path().as_str() } - /// Read a region of the array as `Data`, using numpy-style basic indexing. - fn retrieve_array_subset(&self, selection: PySelection) -> ZarristaResult { - use zarrs::array::data_type::*; - + /// Read a region of the array, using numpy-style basic indexing. + /// + /// Returns one of the decoded result classes (`Tensor`, `VariableArray`, + /// `MaskedTensor`, `MaskedVariableArray`) depending on the dtype layout. + fn retrieve_array_subset(&self, selection: PySelection) -> ZarristaResult { let array_subset = selection.to_array_subset(self.inner.shape())?; - let dtype = self.inner.data_type(); - - macro_rules! arm { - ($dtype:ty, $variant:ident, $elem:ty) => { - if dtype.is::<$dtype>() { - let data = self - .inner - .retrieve_array_subset::>(&array_subset)?; - return Ok(PyData::from(DataInner::$variant(data))); - } - }; - } - for_each_dtype!(arm); - - Err(PyNotImplementedError::new_err(format!( - "reading data type {dtype} is not supported yet" - )) - .into()) + Ok(self.inner.retrieve_array_subset(&array_subset)?) } #[pyo3(signature = (chunk_indices, **codec_options))] @@ -131,31 +113,13 @@ impl PyArray { &self, chunk_indices: PyChunkIndices, codec_options: Option, - ) -> ZarristaResult { - use zarrs::array::data_type::*; - - let dtype = self.inner.data_type(); + ) -> ZarristaResult { let codec_options = codec_options .map(|opts| opts.into_inner()) .unwrap_or_default(); - - macro_rules! arm { - ($dtype:ty, $variant:ident, $elem:ty) => { - if dtype.is::<$dtype>() { - let chunk = self.inner.retrieve_chunk_opt::>( - chunk_indices.as_ref(), - &codec_options, - )?; - return Ok(PyData::from(DataInner::$variant(chunk))); - } - }; - } - for_each_dtype!(arm); - - Err(PyNotImplementedError::new_err(format!( - "reading data type {dtype} is not supported yet" - )) - .into()) + Ok(self + .inner + .retrieve_chunk_opt(chunk_indices.as_ref(), &codec_options)?) } fn retrieve_encoded_chunk( @@ -171,38 +135,4 @@ impl PyArray { fn shape(&self) -> &[u64] { self.inner.shape() } - - // /// The chunk shape (size of a chunk along each dimension). - // #[getter] - // fn chunks(&self) -> PyResult> { - // // TODO: review - // let origin = vec![0u64; self.inner.shape().len()]; - // let chunk_shape = self.inner.chunk_shape(&origin).map_err(to_py_err)?; - // Ok(chunk_shape.iter().map(|n| n.get()).collect()) - // } - - // /// The fill value as a Python scalar (or `None` if not interpretable). - // #[getter] - // fn fill_value(&self, py: Python<'_>) -> PyResult { - // dtype::fill_value_to_py( - // py, - // self.inner.data_type(), - // self.inner.fill_value().as_ne_bytes(), - // ) - // } - - // /// Read a region with numpy-style basic indexing. - // fn __getitem__(&self, py: Python<'_>, key: &Bound<'_, PyAny>) -> PyResult { - // let shape = self.inner.shape().to_vec(); - // let (ranges, out_shape) = parse_index(py, key, &shape)?; - // let subset = ArraySubset::new_with_ranges(&ranges); - // dtype::read_region(py, &self.inner, &Region::Subset(&subset), &out_shape) - // } - - // /// Read a single chunk by its chunk coordinates. - // fn get_chunk(&self, py: Python<'_>, chunk_coords: Vec) -> PyResult { - // let chunk_shape = self.inner.chunk_shape(&chunk_coords).map_err(to_py_err)?; - // let out_shape: Vec = chunk_shape.iter().map(|n| n.get() as usize).collect(); - // dtype::read_region(py, &self.inner, &Region::Chunk(&chunk_coords), &out_shape) - // } } diff --git a/src/data.rs b/src/data.rs deleted file mode 100644 index f271a01..0000000 --- a/src/data.rs +++ /dev/null @@ -1,254 +0,0 @@ -//! A Python-exposed array type that implements the buffer protocol. -//! -//! This module provides `PyData`, an ND array type that can be used with numpy -//! via Python's buffer protocol. The buffer protocol allows Python objects to -//! expose raw memory buffers, enabling zero-copy interoperability with numpy. -//! -//! ## Buffer Protocol Overview -//! -//! The buffer protocol is defined in [PEP 3118] and allows objects to expose their -//! internal data as a contiguous or strided memory region. Key concepts: -//! -//! [PEP 3118](https://peps.python.org/pep-3118/) -//! -//! - **view**: A `Py_buffer` struct that describes how to interpret the memory -//! - **format**: A string describing the element type (e.g., "` on the PyData struct -//! itself. This means their lifetime is tied to the PyData object, which is kept -//! alive by the `Py_INCREF` call. This avoids the need to leak/free allocations -//! in `__getbuffer__`/`__releasebuffer__`. - -use std::ffi::{c_char, c_void, CStr}; -use std::os::raw::c_int; - -use ndarray::ArrayD; -use numpy::PyArray; -use pyo3::exceptions::PyBufferError; -use pyo3::ffi; -use pyo3::prelude::*; - -/// The decoded data of a chunk, with variants for each supported dtype. -/// -/// This is kept separate from `PyData` 1. because Python classes can't be defined as Rust enums and 2. so that we have a clean place to manage data types not supported by numpy. -pub enum DataInner { - Bool(ArrayD), - Float16(ArrayD), - Float32(ArrayD), - Float64(ArrayD), - Int16(ArrayD), - Int32(ArrayD), - Int64(ArrayD), - Int8(ArrayD), - Uint16(ArrayD), - Uint32(ArrayD), - Uint64(ArrayD), - Uint8(ArrayD), -} - -/// Invoke `$arm!(ZarrsDataType, DataInnerVariant, rust_elem_type)` once per -/// supported dtype. The caller supplies an `arm!` macro that turns a single -/// `(dtype, variant, elem)` triple into a retrieval (e.g. `retrieve_chunk` or -/// `retrieve_array_subset`), keeping the dtype list defined in exactly one place. -/// -/// The `ZarrsDataType` idents (e.g. `BoolDataType`) and `half::f16` are resolved -/// in the caller's scope, so callers must have `use zarrs::array::data_type::*`. -macro_rules! for_each_dtype { - ($arm:ident) => { - $arm!(BoolDataType, Bool, bool); - $arm!(Int8DataType, Int8, i8); - $arm!(Int16DataType, Int16, i16); - $arm!(Int32DataType, Int32, i32); - $arm!(Int64DataType, Int64, i64); - $arm!(UInt8DataType, Uint8, u8); - $arm!(UInt16DataType, Uint16, u16); - $arm!(UInt32DataType, Uint32, u32); - $arm!(UInt64DataType, Uint64, u64); - $arm!(Float16DataType, Float16, half::f16); - $arm!(Float32DataType, Float32, f32); - $arm!(Float64DataType, Float64, f64); - }; -} -pub(crate) use for_each_dtype; - -/// Run `$body` against the inner `ArrayD`, with `$a` bound to it regardless of -/// element type. Exhaustive, so a new `DataInner` variant is a compile error. -macro_rules! with_array { - ($data:expr, $a:ident => $body:expr) => {{ - use DataInner::*; - match $data { - Bool($a) => $body, - Float16($a) => $body, - Float32($a) => $body, - Float64($a) => $body, - Int16($a) => $body, - Int32($a) => $body, - Int64($a) => $body, - Int8($a) => $body, - Uint16($a) => $body, - Uint32($a) => $body, - Uint64($a) => $body, - Uint8($a) => $body, - } - }}; -} - -impl DataInner { - /// The PEP 3118 buffer-protocol format string for this dtype, or `None` if - /// the dtype has no buffer-protocol representation (e.g. dtypes with no - /// matching native layout). `None` is the signal that consumers must copy. - fn buffer_format(&self) -> Option<&'static CStr> { - use DataInner::*; - - let format = match self { - Bool(_) => c"?", - Int8(_) => c"b", - Int16(_) => c"h", - Int32(_) => c"i", - Int64(_) => c"q", - Uint8(_) => c"B", - Uint16(_) => c"H", - Uint32(_) => c"I", - Uint64(_) => c"Q", - Float16(_) => c"e", - Float32(_) => c"f", - Float64(_) => c"d", - }; - Some(format) - } - - /// The size of a single element in bytes. - fn itemsize(&self) -> usize { - use DataInner::*; - - match self { - Bool(_) | Int8(_) | Uint8(_) => 1, - Float16(_) | Int16(_) | Uint16(_) => 2, - Float32(_) | Int32(_) | Uint32(_) => 4, - Float64(_) | Int64(_) | Uint64(_) => 8, - } - } - - /// A raw pointer to the first element of the backing buffer. - fn data_ptr(&self) -> *mut c_void { - with_array!(self, a => a.as_ptr() as *mut c_void) - } - - /// Copy the decoded chunk into a fresh NumPy array. Used as the fallback for - /// dtypes that cannot be exposed via the buffer protocol. - fn to_numpy_with_copy<'py>(&self, py: Python<'py>) -> Bound<'py, PyAny> { - with_array!(&self, a => PyArray::from_array(py, a).into_any()) - } -} - -#[pyclass(module = "zarrista", frozen, name = "Data")] -pub struct PyData { - inner: DataInner, - /// Shape in elements. Cached here so the buffer protocol can hand out a - /// pointer with a lifetime tied to this (frozen) object. - shape: Box<[isize]>, - /// Strides in bytes, cached for the same reason. - strides: Box<[isize]>, -} - -impl From for PyData { - fn from(inner: DataInner) -> Self { - let itemsize = inner.itemsize() as isize; - let shape: Box<[isize]> = - with_array!(&inner, a => a.shape().iter().map(|&d| d as isize).collect()); - let strides: Box<[isize]> = with_array!(&inner, - a => a.strides().iter().map(|&s| s * itemsize).collect()); - Self { - inner, - shape, - strides, - } - } -} - -#[pymethods] -impl PyData { - /// Convert the decoded chunk into a NumPy array. - /// - /// When the dtype has a buffer-protocol representation this is a zero-copy - /// view (numpy reads our buffer directly). Otherwise the data is copied and - /// converted into a fresh array. - fn to_numpy<'py>(slf: Bound<'py, Self>) -> PyResult> { - let py = slf.py(); - if slf.borrow().inner.buffer_format().is_some() { - // Zero-copy: `np.asarray` views our buffer via `__getbuffer__`. - py.import("numpy")?.call_method1("asarray", (&slf,)) - } else { - // Fallback: copy/convert into a new numpy array. - Ok(slf.borrow().inner.to_numpy_with_copy(py)) - } - } - - /// Buffer-protocol export (PEP 3118): lets any consumer (`memoryview`, - /// numpy, pyarrow, …) read the data without a copy. - /// - /// # Safety - /// `view` is a valid `Py_buffer` provided by the interpreter. We pin `self` - /// for the view's lifetime with `Py_INCREF`, and `shape`/`strides` are owned - /// by `self`, so all pointers stay valid until `__releasebuffer__`. - unsafe fn __getbuffer__( - slf: PyRef<'_, Self>, - view: *mut ffi::Py_buffer, - flags: c_int, - ) -> PyResult<()> { - if view.is_null() { - return Err(PyBufferError::new_err("null buffer view")); - } - let Some(format) = slf.inner.buffer_format() else { - return Err(PyBufferError::new_err( - "this data type has no buffer-protocol representation", - )); - }; - // We only ever expose a read-only buffer. - if (flags & ffi::PyBUF_WRITABLE) == ffi::PyBUF_WRITABLE { - return Err(PyBufferError::new_err("buffer is read-only")); - } - - let itemsize = slf.inner.itemsize() as isize; - let len = slf.shape.iter().product::() * itemsize; - - (*view).buf = slf.inner.data_ptr(); - (*view).len = len; - (*view).itemsize = itemsize; - (*view).readonly = 1; - (*view).ndim = slf.shape.len() as c_int; - (*view).format = if (flags & ffi::PyBUF_FORMAT) == ffi::PyBUF_FORMAT { - format.as_ptr() as *mut c_char - } else { - std::ptr::null_mut() - }; - (*view).shape = slf.shape.as_ptr() as *mut isize; - (*view).strides = slf.strides.as_ptr() as *mut isize; - (*view).suboffsets = std::ptr::null_mut(); - (*view).internal = std::ptr::null_mut(); - - // Keep `self` (and therefore the buffer) alive while the view exists; - // Python calls `Py_DECREF` after `__releasebuffer__` returns. - (*view).obj = slf.as_ptr(); - ffi::Py_INCREF((*view).obj); - - Ok(()) - } - - /// Required counterpart to `__getbuffer__`. Nothing to free: all memory is - /// owned by `self`, and Python handles the `Py_DECREF` on `view.obj`. - unsafe fn __releasebuffer__(&self, _view: *mut ffi::Py_buffer) {} -} - -impl PyData {} diff --git a/src/decoded_array.rs b/src/decoded_array.rs new file mode 100644 index 0000000..81ca5cc --- /dev/null +++ b/src/decoded_array.rs @@ -0,0 +1,245 @@ +//! Decoded array data exposed to Python. +//! +//! SPIKE: instead of decoding into a typed `ndarray::ArrayD` (which copies +//! every buffer via `bytemuck::pod_collect_to_vec`), we retrieve the raw +//! post-codec [`ArrayBytes`] and wrap it zero-copy. Retrieval is a single +//! generic call (`retrieve::`) — no per-dtype macro — because we +//! implement [`FromArrayBytes`] on our own [`DecodedArray`] type. +//! +//! `ArrayBytes` has three layouts (`Fixed`, `Variable`, `Optional`), which we +//! surface as four concrete Python classes so each exposes exactly the faces it +//! can support: +//! +//! - [`PyTensor`] — fixed-width, dense. Buffer protocol + `to_numpy`. +//! - [`PyVariableArray`] — variable-length (string/bytes). (skeleton) +//! - [`PyMaskedTensor`] — fixed-width with a validity mask. (skeleton) +//! - [`PyMaskedVariableArray`] — variable-length with a validity mask. (skeleton) +//! +//! Buffers are **not** aligned: numpy's `frombuffer` tolerates unaligned data +//! (it sets `aligned=False`), and any consumer that materializes an owned array +//! pays the alignment copy as part of the copy it was already doing. + +use std::borrow::Cow; + +use bytes::Bytes; +use pyo3::exceptions::PyNotImplementedError; +use pyo3::prelude::*; +use pyo3::IntoPyObjectExt; +use pyo3_bytes::PyBytes; +use zarrs::array::{ArrayBytes, ArrayError, DataType, FromArrayBytes}; + +use crate::dtype::PyDataType; + +/// Fixed-width, dense decoded data. +/// +/// We don't use the upstream `Tensor` type because its bytes are not reference counted, and thus +/// don't play nicely with buffer protocol export +#[pyclass(module = "zarrista", frozen, name = "Tensor")] +pub struct PyTensor { + bytes: Bytes, + data_type: DataType, + shape: Vec, +} + +#[pymethods] +impl PyTensor { + #[getter] + fn shape(&self) -> &[u64] { + &self.shape + } + + #[getter] + fn dtype(&self) -> PyDataType { + self.data_type.clone().into() + } + + /// The raw decoded bytes, zero-copy, as a buffer-protocol object. + // TODO: it might be nice for the Tensor itself to implement the buffer protocol, instead of + // having to call the buffer method? :shrug: + fn buffer(&self) -> PyBytes { + PyBytes::new(self.bytes.clone()) + } + + /// Reinterpret the raw bytes as a numpy array of this dtype and shape. + /// + /// Zero-copy view (`np.frombuffer`) — numpy tolerates an unaligned buffer. + fn to_numpy<'py>(&self, py: Python<'py>) -> PyResult> { + // TODO: will the name_v3 always be understood by numpy? + let name = self.data_type.name_v3().ok_or_else(|| { + PyNotImplementedError::new_err(format!( + "data type {} has no zarr v3 name / numpy mapping", + self.data_type + )) + })?; + let np = py.import("numpy")?; + let flat = np.call_method1("frombuffer", (self.buffer(), name.into_owned()))?; + flat.call_method1("reshape", (&self.shape,)) + } +} + +/// Variable-length data (string/bytes). Skeleton: carries metadata only for now. +#[pyclass(module = "zarrista", frozen, name = "VariableArray")] +pub struct PyVariableArray { + #[expect(dead_code)] + bytes: Bytes, + #[expect(dead_code)] + offsets: Vec, + data_type: DataType, + shape: Vec, +} + +#[pymethods] +impl PyVariableArray { + #[getter] + fn shape(&self) -> &[u64] { + &self.shape + } + + #[getter] + fn dtype(&self) -> PyDataType { + self.data_type.clone().into() + } +} + +/// Fixed-width data with a validity mask. Skeleton. +#[pyclass(module = "zarrista", frozen, name = "MaskedTensor")] +pub struct PyMaskedTensor { + #[expect(dead_code)] + bytes: Bytes, + /// The mask is 1 byte per element where 0 = invalid/missing, non-zero = valid/present. + #[expect(dead_code)] + mask: Bytes, + data_type: DataType, + shape: Vec, +} + +#[pymethods] +impl PyMaskedTensor { + #[getter] + fn shape(&self) -> &[u64] { + &self.shape + } + + #[getter] + fn dtype(&self) -> PyDataType { + self.data_type.clone().into() + } +} + +/// Variable-length data with a validity mask. Skeleton. +#[pyclass(module = "zarrista", frozen, name = "MaskedVariableArray")] +pub struct PyMaskedVariableArray { + #[expect(dead_code)] + bytes: Bytes, + #[expect(dead_code)] + offsets: Vec, + /// The mask is 1 byte per element where 0 = invalid/missing, non-zero = valid/present. + #[expect(dead_code)] + mask: Bytes, + data_type: DataType, + shape: Vec, +} + +#[pymethods] +impl PyMaskedVariableArray { + #[getter] + fn shape(&self) -> &[u64] { + &self.shape + } + + #[getter] + fn dtype(&self) -> PyDataType { + self.data_type.clone().into() + } +} + +/// Internal decoded result, produced by our [`FromArrayBytes`] impl. Carries the +/// post-codec bytes (zero-copy), the data type, and the region shape (which +/// zarrs hands us, so we never have to re-derive it). +pub enum DecodedArray { + Tensor(PyTensor), + Variable(PyVariableArray), + MaskedTensor(PyMaskedTensor), + MaskedVariable(PyMaskedVariableArray), +} + +impl FromArrayBytes for DecodedArray { + fn from_array_bytes( + bytes: ArrayBytes<'static>, + shape: &[u64], + data_type: &DataType, + ) -> Result { + let shape = shape.to_vec(); + let data_type = data_type.clone(); + Ok(match bytes { + ArrayBytes::Fixed(bytes) => DecodedArray::Tensor(PyTensor { + bytes: cow_to_bytes(bytes), + data_type, + shape, + }), + ArrayBytes::Variable(v) => { + let (buf, offsets) = v.into_parts(); + DecodedArray::Variable(PyVariableArray { + bytes: cow_to_bytes(buf), + // Ideally in the future we'll avoid a copy: + // https://github.com/zarrs/zarrs/issues/406 + offsets: offsets.to_vec(), + data_type, + shape, + }) + } + ArrayBytes::Optional(optional) => { + let (data, mask) = optional.into_parts(); + match *data { + ArrayBytes::Fixed(fixed) => DecodedArray::MaskedTensor(PyMaskedTensor { + bytes: cow_to_bytes(fixed), + mask: cow_to_bytes(mask), + data_type, + shape, + }), + ArrayBytes::Variable(variable) => { + let (buf, offsets) = variable.into_parts(); + DecodedArray::MaskedVariable(PyMaskedVariableArray { + bytes: cow_to_bytes(buf), + // Ideally in the future we'll avoid a copy: + // https://github.com/zarrs/zarrs/issues/406 + offsets: offsets.to_vec(), + mask: cow_to_bytes(mask), + data_type, + shape, + }) + } + ArrayBytes::Optional(_) => { + unreachable!("nested optional is not a valid layout") + } + } + } + }) + } +} + +impl<'py> IntoPyObject<'py> for DecodedArray { + type Target = PyAny; + type Output = Bound<'py, PyAny>; + type Error = PyErr; + + fn into_pyobject(self, py: Python<'py>) -> Result { + match self { + DecodedArray::Tensor(py_tensor) => py_tensor.into_bound_py_any(py), + DecodedArray::Variable(py_variable_array) => py_variable_array.into_bound_py_any(py), + DecodedArray::MaskedTensor(py_masked_tensor) => py_masked_tensor.into_bound_py_any(py), + DecodedArray::MaskedVariable(py_masked_variable_array) => { + py_masked_variable_array.into_bound_py_any(py) + } + } + } +} + +/// Move a `'static` `Cow<[u8]>` into `bytes::Bytes`. Owned is a zero-copy move; +/// borrowed (rare for retrieval) copies. +fn cow_to_bytes(cow: Cow<'static, [u8]>) -> Bytes { + match cow { + Cow::Owned(v) => Bytes::from(v), + Cow::Borrowed(b) => Bytes::copy_from_slice(b), + } +} diff --git a/src/lib.rs b/src/lib.rs index c8f6d4f..42481c0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,7 @@ mod array; mod array_bytes; mod chunks; mod codec; -mod data; +mod decoded_array; mod dtype; mod error; mod exceptions; @@ -20,7 +20,7 @@ use crate::array::{PyArray, PyAsyncArray}; use crate::array_bytes::PyArrayBytes; use crate::chunks::PyChunkGrid; use crate::codec::register_codec_module; -use crate::data::PyData; +use crate::decoded_array::{PyMaskedTensor, PyMaskedVariableArray, PyTensor, PyVariableArray}; use crate::dtype::PyDataType; use crate::exceptions::register_exceptions_module; use crate::fill_value::PyFillValue; @@ -37,7 +37,10 @@ fn _zarrista(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/tests/test_indexing.py b/tests/test_indexing.py index d33c18f..8d0010e 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -12,7 +12,7 @@ import zarr from numpy.typing import NDArray from obstore.store import LocalStore -from zarrista import Array, AsyncArray, FilesystemStore +from zarrista import Array, AsyncArray, FilesystemStore, Tensor @pytest.fixture @@ -38,6 +38,25 @@ def test_slice_read_matches_numpy(int32_array: tuple[Path, NDArray[np.int32]]): np.testing.assert_array_equal(result, data[0:2, :, 5:7]) +def test_fixed_dtype_returns_tensor(int32_array: tuple[Path, NDArray[np.int32]]): + """A fixed-width dtype decodes to a `Tensor` carrying `shape`/`dtype`; its raw + `buffer()` reinterprets to the same array as `to_numpy()`.""" + path, data = int32_array + arr = Array.open(FilesystemStore(path)) + + tensor = arr.retrieve_array_subset((slice(0, 2), slice(None), slice(5, 7))) + assert isinstance(tensor, Tensor) + assert tensor.shape == [2, 64, 2] + assert tensor.dtype == arr.dtype + + expected = data[0:2, :, 5:7] + np.testing.assert_array_equal(tensor.to_numpy(), expected) + + # buffer() exposes the raw decoded bytes; reinterpreting matches to_numpy(). + from_buffer = np.frombuffer(tensor.buffer(), dtype="int32").reshape(tensor.shape) + np.testing.assert_array_equal(from_buffer, expected) + + def test_getitem_matches_retrieve_array_subset( int32_array: tuple[Path, NDArray[np.int32]], ):