diff --git a/Cargo.lock b/Cargo.lock
index 72e567c..e2cdce2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -790,9 +790,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "bytes"
-version = "1.11.1"
+version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
+checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593"
 dependencies = [
  "serde",
 ]
@@ -4934,6 +4934,7 @@ dependencies = [
 name = "zarrista"
 version = "0.1.0-beta.2"
 dependencies = [
+ "bytes",
  "half",
  "icechunk",
  "ndarray",
diff --git a/Cargo.toml b/Cargo.toml
index 61072aa..0a1f8b4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,6 +20,7 @@ crate-type = ["cdylib"]
 abi3-py311 = ["pyo3/abi3-py311"]
 
 [dependencies]
+bytes = "1.12.0"
 half = "2"
 icechunk = "2.0.0"
 ndarray = "0.17.2"
diff --git a/dev-docs/specs/2026-06-18-arrow-export-design.md b/dev-docs/specs/2026-06-18-arrow-export-design.md
new file mode 100644
index 0000000..0798082
--- /dev/null
+++ b/dev-docs/specs/2026-06-18-arrow-export-design.md
@@ -0,0 +1,217 @@
+# zarrista — Arrow export faces for the decoded result types
+
+**Date:** 2026-06-18
+**Status:** Approved (architecture realigned to the
+[decoded result types](2026-06-18-data-result-types-design.md) spike)
+
+## Framing: Arrow is one face on the result types
+
+Reads no longer return a single `Data` object. Per
+[decoded result types](2026-06-18-data-result-types-design.md), a read returns one
+of **four concrete classes** built from the raw post-codec `ArrayBytes`, each
+holding `bytes::Bytes` (refcounted, zero-copy):
+
+| class | `ArrayBytes` layout | payload held |
+|---|---|---|
+| `Tensor` | `Fixed` | values |
+| `VariableArray` | `Variable` | values + offsets |
+| `MaskedTensor` | `Optional(Fixed)` | values + mask |
+| `MaskedVariableArray` | `Optional(Variable)` | values + offsets + mask |
+
+These are **format-neutral payloads**; consumers reach them through thin,
+**co-equal export faces** — today the buffer protocol (`to_numpy`/`buffer()` on
+`Tensor`), with this spec adding **Arrow** (the PyCapsule interface), and **DLPack**
+a likely future face. No decision about the result types is made to serve one
+format; each face adapts to the payload, not the reverse.
+
+This matters because **no single interchange format covers all Zarr dtypes** (see
+the coverage matrix). Arrow uniquely expresses the variable-length and masked
+layouts the buffer protocol can't; DLPack uniquely expresses exotic numerics
+(complex, bfloat16) Arrow can't; the buffer protocol is the universal fixed-width
+baseline. Several faces are a necessity, not a luxury — and none is privileged.
+
+Arrow specifically is **additive and exploratory**: not a strong pull in the
+zarr-nd community, not the organizing principle of the library. Its concrete payoff
+is the **zero-copy variable-length / masked** path, where the buffer protocol simply
+cannot represent the data.
+
+## Goal
+
+Implement the Arrow PyCapsule interface (`__arrow_c_array__` / `__arrow_c_schema__`)
+on the result classes so a decoded chunk/subset can be handed to pyarrow, polars,
+arro3, datafusion, etc. **`Tensor` first** (the dtypes that already round-trip);
+`VariableArray` and the masked classes follow as their dtypes land, but their Arrow
+mapping is designed here because they're where Arrow earns its keep.
+
+## Coverage matrix (why several faces)
+
+The full zarrs dtype set is: bool; int 8/16/32/64 + int2/int4; uint 8/16/32/64 +
+uint2/uint4; float 16/32/64 + subfloat (float8); complex64/128; raw_bits;
+fixed_length_utf32; string; bytes; datetime64 / timedelta64. There is **no
+struct/compound and no dictionary/categorical dtype** — Zarr v2's compound dtypes are
+only a v3 *extension* point (zarrs models the closest, opaque case as `raw_bits` =
+numpy `void`), and "categorical" in Zarr is a *codec/filter* (`numcodecs.Categorize`)
+yielding an integer array, not a dtype. So Arrow's nested/`Struct`/`Dictionary`
+strengths have nothing to map *from* in Zarr today.
+
+| Zarr dtype | buffer protocol | Arrow | DLPack (future) |
+|---|---|---|---|
+| int / uint / float32-64 | ✅ | ✅ | ✅ |
+| float16 | ✅ | ✅ `Float16` | ✅ |
+| bool | ✅ (1 byte) | ✅ `arrow.bool8` | ✅ |
+| datetime64 / timedelta64 | ✅ (as int64) | ⚠️ `Timestamp`/`Duration` — zero-copy only for s/ms/us/ns without NaT (see note) | ✅ |
+| variable-length string/bytes | ❌ | ✅ `Utf8`/`Binary` | ❌ |
+| complex64/128 | ✅ (2 floats) | ❌ no native complex | ✅ |
+| subfloat (float8) / bfloat16 | ⚠️ no PEP-3118 code | ❌ | ✅ |
+| int2/4, uint2/4 (sub-byte) | ❌ | ❌ | ❌ (awkward everywhere) |
+
+For Zarr's actual dtypes, Arrow's unique value is **variable-length string/bytes** and
+**masked** data, plus a **semantic** bonus on the temporal types; DLPack uniquely
+covers the exotic numerics; the buffer protocol is the universal fixed-width baseline.
+
+## Per-class Arrow mapping
+
+The result class (set by the `ArrayBytes` layout) determines the Arrow array kind;
+the dtype refines it. Values are always **zero-copy** (a `bytes::Bytes` clone);
+only the small side buffers (offsets, validity) are ever copied.
+
+| class | Arrow array | values | offsets | validity |
+|---|---|---|---|---|
+| `Tensor` | primitive (+ `arrow.bool8` for bool) | zero-copy | — | — |
+| `VariableArray` | `Utf8`/`LargeUtf8` (string) or `Binary`/`LargeBinary` (bytes) | zero-copy | `usize` → `i32`/`i64` (copy) | — |
+| `MaskedTensor` | primitive **+ validity bitmap** | zero-copy | — | byte-mask → bitmap (copy) |
+| `MaskedVariableArray` | `Utf8`/`Binary` **+ validity bitmap** | zero-copy | `usize` → `i32`/`i64` (copy) | byte-mask → bitmap (copy) |
+
+Two layout conversions are intrinsic to Arrow and unavoidable (but cheap — side
+buffers, not bulk data):
+
+- **Offsets.** zarrs hands variable-length offsets as `usize` (currently materialized
+  to `Vec<usize>`; zarrs#406 tracks avoiding that). Arrow needs `i32` (`Utf8`/`Binary`)
+  or `i64` (`LargeUtf8`/`LargeBinary`). Pick the `Large` variant when the last offset
+  exceeds `i32::MAX`, else `i32`; either way it's a copy of the small offsets array.
+- **Validity.** zarrs `Optional` carries a mask of **1 byte per element** (0 = invalid,
+  non-zero = valid). Arrow validity is **1 bit per element** (1 = valid). So masked
+  export **bit-packs** the byte-mask into a validity bitmap — `bit[i] = mask[i] != 0` —
+  a small copy, and derives `null_count`. (This is the inverse of the `bool8` case,
+  where Arrow happens to match our byte layout and *no* packing is needed.)
+
+### `bool` via `arrow.bool8` (zero-copy)
+
+`Tensor` of `bool` uses the
+[`arrow.bool8` canonical extension](https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean)
+(int8 storage, 0=false / non-zero=true) — exactly our 1-byte-per-bool layout. So bool
+values export **zero-copy**, no bit-packing. Consumers that don't understand the
+extension degrade gracefully to the int8 storage. (Note the asymmetry: a `bool`
+*value* column needs no packing via `bool8`, but a *validity mask* always bit-packs,
+because Arrow validity is defined as a bitmap.)
+
+### `Tensor` dtype mapping (first round)
+
+| dtype | Arrow type |
+|---|---|
+| int8/16/32/64 | `Int8/16/32/64` |
+| uint8/16/32/64 | `UInt8/16/32/64` |
+| float32/64 | `Float32/64` |
+| float16 | `Float16` |
+| bool | `Int8` + `arrow.bool8` |
+
+## Zero-copy mechanism (simpler now, thanks to `bytes::Bytes`)
+
+Because the payload is `bytes::Bytes` (refcounted), the values buffer needs **no
+`Py_INCREF`/release-callback dance**. Build the arrow-rs `Buffer` over the bytes via
+`Buffer::from_custom_allocation(ptr, len, owner)`, where `owner` is a cloned
+`bytes::Bytes` (an `Arc`-like handle) — the Arrow buffer keeps the allocation alive by
+holding its own refcount, dropped when the consumer releases the Arrow array.
+`pyo3-arrow` wraps the resulting array in the PyCapsule. One allocation backs the
+buffer protocol, `to_numpy`, and Arrow simultaneously via cheap `Bytes` clones.
+
+## Contiguity & alignment
+
+The values payload is a single contiguous `bytes::Bytes` blob in C-order — so unlike
+the old `ArrayD<T>` design there is **no stride/non-contiguity case to handle**;
+Arrow's contiguity requirement is always satisfied for the values buffer.
+
+The live concern is **alignment**, and the decision (from the result-types spec) is
+**stay unaligned**: the bytes come straight from the decode allocator, contractually
+1-byte aligned (de-facto 16 from malloc). Consequences for Arrow:
+
+- **Correctness:** unaffected. Arrow's C Data Interface treats 64-byte alignment as
+  *advisory* (≈8-byte floor).
+- **Performance:** a SIMD kernel or strict validator (some pyarrow paths) may silently
+  **re-copy to realign**, turning a zero-copy export into one consumer-side copy.
+- **Owning consumers** (`pa.array(...)` that materializes, compute kernels) pay any
+  realign copy as part of work they were already doing — we never pay a *dedicated*
+  alignment copy.
+
+If it ever bites, expose an explicit aligned-copy path rather than aligning every
+read (the result-types spec's lazy-alignment stance).
+
+## Side-channel shape
+
+Arrow arrays are logically 1-D. Each result class exposes its data as a **flat,
+length-`prod(shape)` Arrow array** plus its existing **`shape`** property; consumers
+reshape if they care. We deliberately do **not** use `arrow.fixed_shape_tensor` (its
+semantics are "batch of tensors", leading dim = batch — wrong for a single chunk).
+
+## API surface
+
+On each result class (`Tensor` first):
+
+```python
+obj.__arrow_c_array__(requested_schema=None) -> (schema_capsule, array_capsule)
+obj.__arrow_c_schema__() -> schema_capsule
+obj.shape  # already present; the Arrow array is flat length prod(shape)
+```
+
+`pa.array(obj)`, `pl.Series(obj)`, `arro3.core.Array.from_arrow(obj)` all work via the
+capsule protocol; recover N-D structure by combining with `obj.shape`.
+
+**Optional introspection** (revisit when implementing): a per-class `arrow_copy: bool`
+could advertise whether an export copies bulk data. With the `bytes::Bytes` design the
+values are always zero-copy, so this would only ever flag alignment realign risk or
+the (always-copied) side buffers — lower value than under the old design, so it's
+deferred rather than specified.
+
+## Tooling
+
+Use [`pyo3-arrow`](https://crates.io/crates/pyo3-arrow) to build the Arrow arrays and
+expose the capsules — no pyarrow dependency in Rust. (Separately, `dlpark`'s pyo3
+feature pins pyo3 0.25 vs our 0.29 — see the result-types spec — but that's a DLPack
+concern, not Arrow.)
+
+## Testing
+
+- **Round-trip vs. zarr-python** (extends the existing harness): write numeric arrays,
+  read with zarrista, assert the Arrow export of `Tensor` matches — e.g.
+  `pa.array(tensor)` (or arro3) reshaped via `tensor.shape` equals the zarr-python
+  numpy array. Cover every numeric dtype and `bool` (verifying `arrow.bool8`), plus a
+  multi-dim chunk for flat-array + `shape`.
+- **Zero-copy assertion:** the Arrow values buffer pointer equals `tensor.buffer()`'s
+  pointer (no copy), and the Arrow array outlives a dropped Python reference (the
+  `Bytes` refcount holds).
+- **When variable/masked land:** assert `VariableArray` → `Utf8`/`Binary` with correct
+  offsets, and masked → correct validity bitmap + `null_count` from the byte-mask.
+- **Tooling:** `maturin develop`; `uv run --no-project pytest`.
+
+## Out of scope (deferred)
+
+- `VariableArray` / masked Arrow export (designed above; lands with the
+  variable-length and nullable dtype work).
+- DLPack export (co-equal future face; covers complex/bfloat16 Arrow can't).
+- Arrow *import* / writing.
+- complex / float8 / bfloat16 dtypes (no Arrow representation — DLPack territory).
+- temporal dtypes (datetime64 / timedelta64 → `Timestamp`/`Duration`) — a natural
+  Arrow follow-up, but **not uniformly zero-copy**: both are int64/LE/epoch-based so
+  the values buffer matches, but (1) Arrow supports only s/ms/us/ns — calendar/other
+  units (D/h/m/W/M/Y/sub-ns) need a unit cast and W/M/Y have no faithful Arrow
+  representation; and (2) numpy encodes NaT as the `INT64_MIN` sentinel while Arrow
+  uses a validity bitmap, so any NaT (a valid Zarr fill value) forces an O(n) scan to
+  build a bitmap. Zero-copy only holds for unit ∈ {s, ms, us, ns} with no NaT.
+
+## Risks
+
+- **Buffer alignment.** As above: correctness fine, possible silent consumer-side
+  realign copy; `to_numpy` unaffected (buffer protocol has no alignment requirement).
+- **`arrow.bool8` consumer support.** A relatively recent canonical extension; older
+  consumers see int8 storage rather than boolean. Acceptable (graceful degradation).
+- **`pyo3-arrow` / arrow-rs version coupling.** Pin deliberately.
diff --git a/dev-docs/specs/2026-06-18-data-result-types-design.md b/dev-docs/specs/2026-06-18-data-result-types-design.md
new file mode 100644
index 0000000..4dd8726
--- /dev/null
+++ b/dev-docs/specs/2026-06-18-data-result-types-design.md
@@ -0,0 +1,151 @@
+# zarrista — decoded result types (ArrayBytes-backed `Data`)
+
+**Date:** 2026-06-18
+**Status:** Prototyping (spike on branch `kyle/dlpack-tensor-spike`)
+
+## Goal
+
+Reorient what a read returns. Today a read decodes into a typed
+`ndarray::ArrayD<T>` wrapped in a single `Data` class, dispatched by a 12-arm
+per-dtype macro. Replace that with a **format-neutral, zero-copy** payload built
+from the raw post-codec [`ArrayBytes`], surfaced as **four concrete Python result
+classes**. This collapses the dispatch to a single generic retrieval and sets up
+the multi-protocol faces (buffer protocol now; Arrow, DLPack later — see
+[arrow-export-design](2026-06-18-arrow-export-design.md)).
+
+## Why (key findings that drove this)
+
+- **The typed path copies *every* buffer.** `retrieve::<ArrayD<T>>` /
+  `::<Vec<T>>` go through `convert_from_bytes_slice` →
+  `bytemuck::allocation::pod_collect_to_vec`, which **unconditionally** allocates
+  a fresh aligned `Vec<T>` and `copy_from_slice`s into it — no alignment fast
+  path. So today's `Data` pays a full copy per read. Retrieving raw `ArrayBytes`
+  avoids it.
+- **`FromArrayBytes` is post-codec.** All retrieval funnels through
+  `retrieve_array_subset_opt`, which runs the *entire* codec pipeline and then
+  calls `T::from_array_bytes(bytes, shape, data_type)`. Implementing our own
+  `FromArrayBytes` customizes only the final "bytes → our type" step; it does
+  **not** skip codecs. zarrs also passes us the region shape, so we never
+  re-derive it.
+- **`ArrayBytes` has three layouts, and `Tensor`/`into_fixed` only handles one.**
+  `ArrayBytes` is `Fixed | Variable | Optional` (the last is data + a 1-byte/elem
+  validity mask). zarrs's own `Tensor` does `bytes.into_fixed()`, which errors for
+  both `Variable` and `Optional`. So `Tensor` is fixed-dense-only; we need our own
+  type to represent all three.
+- **`ArrayBytes` does no alignment handling.** It's `Cow<'static, [u8]>` straight
+  from the decode allocation — contractually 1-byte aligned (de-facto 16 from
+  system malloc; a borrowed shard sub-slice could be less). The alignment we have
+  *today* is purely the side effect of the typed-path copy above. (See
+  [Alignment](#alignment).)
+
+## Architecture
+
+### One Rust type in, four Python types out
+
+We implement `FromArrayBytes` for an internal `Decoded` enum, retrieve
+`::<Decoded>` (one call, no macro), and convert to the matching Python class:
+
+| `ArrayBytes` variant | Python class | exposes |
+|---|---|---|
+| `Fixed` | **`Tensor`** | buffer protocol + `to_numpy` (+ Arrow/DLPack later) |
+| `Variable` | **`VariableArray`** | (skeleton; Arrow later) |
+| `Optional(Fixed)` | **`MaskedTensor`** | (skeleton; Arrow-with-validity later) |
+| `Optional(Variable)` | **`MaskedVariableArray`** | (skeleton) |
+
+**Separate classes, not one union class with conditionally-erroring methods.**
+The type *is* the information the consumer needs (`isinstance`), each class
+implements exactly the faces it can support, and it mirrors numpy (`MaskedArray`
+is its own type) and Arrow (typed array classes). The "union" lives only as the
+throwaway `Decoded` enum at the FFI seam, surfaced via `IntoPyObject` so both the
+sync and async retrieve paths just return `Decoded`.
+
+### Zero-copy payload
+
+`Tensor` holds the decoded bytes as `bytes::Bytes` (refcounted, cheaply cloned),
+obtained by moving the `Fixed` `Cow::Owned(Vec<u8>)` into `Bytes` (zero-copy).
+Every face takes a cheap `Bytes` clone:
+
+- **buffer protocol:** `pyo3_bytes::PyBytes::new(bytes.clone())` — zero-copy,
+  already implements the buffer protocol.
+- **`to_numpy`:** `np.frombuffer(PyBytes, dtype).reshape(shape)`, where `dtype` is
+  the zarr v3 dtype name (numpy accepts the same names for the fixed numerics) and
+  the bytes are native-endian (matching numpy's default).
+
+This is the "Rust hands over bytes, Python interprets the dtype" model: numpy owns
+the reinterpretation, so we never do a Rust-side `Vec<u8>→Vec<T>` cast (which would
+require alignment — the exact pain hit in
+[async-tiff#165](https://github.com/developmentseed/async-tiff/pull/165), which
+fell back to `bytemuck::try_cast_vec` + copy).
+
+## Alignment
+
+**Decision: do not align up front.** Buffers are whatever the decode allocator
+produced (unaligned in the worst case).
+
+- **Buffer protocol:** no alignment requirement; numpy handles unaligned.
+- **`np.frombuffer` / view consumers:** numpy views unaligned bytes (sets
+  `aligned=False`); correct, and fine on x86-64/aarch64 where unaligned loads are
+  cheap. **No copy.**
+- **Owning consumers** (`np.array(data)`, `.copy()`, most ops): numpy allocates
+  its own *aligned* buffer and copies into it — so alignment is **fused into the
+  copy they were already doing**. We never pay a *dedicated* alignment copy.
+- **Arrow:** recommends 64-byte but the C Data Interface relaxes to advisory;
+  pyarrow mostly tolerates, occasionally realign-copies in strict kernels.
+
+Rejected alternatives: changing zarrs to allocate aligned output (not a one-liner;
+allocations spread across codecs, and borrowed shard sub-slices can't align without
+a copy — worth an upstream feature request, not a local fix); and
+`retrieve_*_into` with our own aligned buffer (doesn't support variable-length and
+needs `unsafe` — not adopting yet). If a consumer ever needs guaranteed alignment,
+expose an explicit aligned-copy method rather than copying on every read.
+
+## Dispatch collapse (the payoff)
+
+Sync and async `retrieve_array_subset` / `retrieve_chunk` each become a single
+`retrieve::<Decoded>(…)` call — deleting the 12-arm `for_each_dtype!` macro, the
+`DataInner` enum, and the hand-rolled `unsafe` buffer-protocol code in `data.rs`.
+`Decoded: IntoPyObject` builds the right class.
+
+## Faces: now vs. later
+
+- **Now:** buffer protocol + `to_numpy` on `Tensor`. The other three classes
+  carry their full decoded payload (values bytes + offsets and/or mask via
+  `ArrayBytes`/`Optional` `into_parts()`) plus `shape`/`dtype`, but don't yet
+  expose it to numpy — `to_numpy` raises `NotImplementedError`. (Offsets are
+  currently copied to `Vec<usize>`; zarrs#406 tracks avoiding that.)
+- **Later (own specs):** Arrow `__arrow_c_array__` on all four (variable-length and
+  masked are where Arrow earns its keep — zero-copy `String`/`Binary` + validity
+  bitmaps); DLPack `__dlpack__` on `Tensor`/`MaskedTensor`.
+- **Zero-copy introspection** (`contiguous`, `arrow_copy`, `buffer_protocol_copy`)
+  from the Arrow spec applies here too.
+
+## DLPack note (deferred, with a real blocker)
+
+zarrs has a `dlpack` feature wiring its `Tensor` to the `dlpark` crate
+(`SafeManagedTensorVersioned`). But **`dlpark` 0.6's `pyo3` feature pins pyo3
+0.25, and we're on pyo3 0.29** — two pyo3 versions can't share one extension
+module. So we cannot use `dlpark`'s capsule helper directly; DLPack would mean
+hand-rolling the `PyCapsule` (name `dltensor`/`dltensor_versioned` + a deleter
+calling the managed tensor's `deleter`) over `dlpark`'s core (which *is* pyo3-free
+and usable), or waiting for a `dlpark` pyo3 bump. DLPack also only covers the plain
+numerics + bf16 (no complex, no var-length), so it's one face among several, not a
+replacement. Deferred.
+
+## Naming
+
+Provisional: `Tensor`, `VariableArray`, `MaskedTensor`, `MaskedVariableArray`.
+("Tensor" deliberately names only the fixed-dense class, not the umbrella — there
+is no single umbrella class by design.)
+
+## Testing
+
+- Round-trip vs. zarr-python: write fixed numeric arrays, read with zarrista,
+  assert `Tensor.to_numpy()` (and `np.frombuffer(tensor.buffer())`) equal the
+  zarr-python array across dtypes/shapes, including a multi-dim chunk.
+- Assert a variable-length / masked array returns the correct class (not `Tensor`).
+- `maturin develop`; `uv run --no-project pytest`.
+
+## Out of scope (this spike)
+
+Exposing data from `VariableArray`/`MaskedTensor`/`MaskedVariableArray`; Arrow and
+DLPack faces; complex/temporal/sub-byte dtype numpy mappings; writing.
diff --git a/docs/api/data.md b/docs/api/data.md
deleted file mode 100644
index 02e0d60..0000000
--- a/docs/api/data.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Data
-
-::: zarrista.Data
-    options:
-      show_bases: false
diff --git a/docs/api/decoded_array.md b/docs/api/decoded_array.md
new file mode 100644
index 0000000..9184757
--- /dev/null
+++ b/docs/api/decoded_array.md
@@ -0,0 +1,41 @@
+# DecodedArray
+
+Reading from an [`Array`][zarrista.Array] (via `retrieve_array_subset`,
+`retrieve_chunk`, or `[...]`) returns a `DecodedArray`: one of four concrete result
+types, chosen by the decoded byte layout of the dtype. Use `isinstance` to narrow to
+a concrete type before calling its layout-specific methods.
+
+- [`Tensor`](#zarrista.Tensor) — fixed-width, dense data.
+- [`VariableArray`](#zarrista.VariableArray) — variable-length data (e.g. strings or
+  bytes).
+- [`MaskedTensor`](#zarrista.MaskedTensor) — fixed-width data with a validity mask.
+- [`MaskedVariableArray`](#zarrista.MaskedVariableArray) — variable-length data with a
+  validity mask.
+
+::: zarrista.DecodedArray
+    options:
+      show_bases: false
+
+## Tensor
+
+::: zarrista.Tensor
+    options:
+      show_bases: false
+
+## VariableArray
+
+::: zarrista.VariableArray
+    options:
+      show_bases: false
+
+## MaskedTensor
+
+::: zarrista.MaskedTensor
+    options:
+      show_bases: false
+
+## MaskedVariableArray
+
+::: zarrista.MaskedVariableArray
+    options:
+      show_bases: false
diff --git a/mkdocs.yml b/mkdocs.yml
index 5fe7a36..2e37dc6 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -26,7 +26,7 @@ nav:
       - api/store.md
       - api/chunk-grid.md
       - api/codec.md
-      - api/data.md
+      - api/decoded_array.md
       - api/dtype.md
   - Blog:
       - blog/index.md
diff --git a/python/zarrista/__init__.py b/python/zarrista/__init__.py
index 984981c..4517188 100644
--- a/python/zarrista/__init__.py
+++ b/python/zarrista/__init__.py
@@ -1,3 +1,5 @@
+from typing import TypeAlias
+
 from . import codec, exceptions
 from ._zarrista import (
     Array,
@@ -5,27 +7,43 @@
     AsyncArray,
     AsyncGroup,
     ChunkGrid,
-    Data,
     DataType,
     FilesystemStore,
     FillValue,
     Group,
+    MaskedTensor,
+    MaskedVariableArray,
     MemoryStore,
+    Tensor,
+    VariableArray,
     __version__,
 )
 
+DecodedArray: TypeAlias = Tensor | VariableArray | MaskedTensor | MaskedVariableArray
+"""The result of a read: one of the four decoded array layouts.
+
+Which one is returned depends on the dtype's byte layout (fixed vs. variable, and
+whether it carries a validity mask). Use `isinstance` to narrow to a concrete
+type before using layout-specific methods.
+"""
+
+
 __all__ = [
     "Array",
     "ArrayBytes",
     "AsyncArray",
     "AsyncGroup",
+    "DecodedArray",
     "ChunkGrid",
-    "Data",
     "DataType",
     "FillValue",
     "FilesystemStore",
     "Group",
+    "MaskedTensor",
+    "MaskedVariableArray",
     "MemoryStore",
+    "Tensor",
+    "VariableArray",
     "__version__",
     "codec",
     "exceptions",
diff --git a/python/zarrista/_array.pyi b/python/zarrista/_array.pyi
index 28a8f84..3d8d1c5 100644
--- a/python/zarrista/_array.pyi
+++ b/python/zarrista/_array.pyi
@@ -5,7 +5,7 @@ from zarrista.codec import CodecOptions
 
 from ._chunks import ChunkGrid
 from ._codec import CodecChain
-from ._data import Data
+from ._decoded_array import DecodedArray
 from ._dtype import DataType
 from ._store import AsyncStore, FilesystemStore, MemoryStore
 
@@ -51,7 +51,7 @@ class Array:
         """The array's path in the store."""
     def retrieve_array_subset(
         self, selection: Selection, **codec_options: Unpack[CodecOptions]
-    ) -> Data:
+    ) -> DecodedArray:
         """Read and decode an array region selected with numpy-style basic indexing.
 
         The result is ndim-preserving (consistent with a zarrs `ArraySubset`): an
@@ -61,7 +61,7 @@ class Array:
         """
     def retrieve_chunk(
         self, chunk_indices: list[int], **codec_options: Unpack[CodecOptions]
-    ) -> Data:
+    ) -> DecodedArray:
         """Read and decode the chunk at the given chunk grid indices.
 
         Keyword arguments are passed as [`CodecOptions`][zarrista.codec.CodecOptions].
@@ -69,7 +69,7 @@ class Array:
     @property
     def shape(self) -> list[int]:
         """The array shape."""
-    def __getitem__(self, selection: Selection) -> Data:
+    def __getitem__(self, selection: Selection) -> DecodedArray:
         """Read a region with numpy-style basic indexing, e.g. `arr[0:10, :, 5]`.
 
         Sugar for `retrieve_array_subset`.
@@ -111,7 +111,7 @@ class AsyncArray:
         """The array's path in the store."""
     async def retrieve_array_subset(
         self, selection: Selection, **codec_options: Unpack[CodecOptions]
-    ) -> Data:
+    ) -> DecodedArray:
         """Read and decode an array region selected with numpy-style basic indexing.
 
         The result is ndim-preserving (consistent with a zarrs `ArraySubset`): an
@@ -121,7 +121,7 @@ class AsyncArray:
         """
     async def retrieve_chunk(
         self, chunk_indices: list[int], **codec_options: Unpack[CodecOptions]
-    ) -> Data:
+    ) -> DecodedArray:
         """Read and decode the chunk at the given chunk grid indices.
 
         Keyword arguments are passed as [`CodecOptions`][zarrista.codec.CodecOptions].
@@ -129,7 +129,7 @@ class AsyncArray:
     @property
     def shape(self) -> list[int]:
         """The array shape."""
-    async def __getitem__(self, selection: Selection) -> Data:
+    async def __getitem__(self, selection: Selection) -> DecodedArray:
         """Read a region with numpy-style basic indexing: `await arr[0:10, :, 5]`.
 
         Sugar for `retrieve_array_subset`.
diff --git a/python/zarrista/_data.pyi b/python/zarrista/_data.pyi
deleted file mode 100644
index dc30c8c..0000000
--- a/python/zarrista/_data.pyi
+++ /dev/null
@@ -1,25 +0,0 @@
-import sys
-from typing import Any
-
-from numpy.typing import NDArray
-
-if sys.version_info >= (3, 12):
-    from collections.abc import Buffer
-else:
-    from typing_extensions import Buffer
-
-class Data(Buffer):
-    """A decoded chunk of array data.
-
-    Implements the Python buffer protocol (PEP 3118), so dtypes with a native
-    buffer representation can be read zero-copy via `memoryview(data)` or
-    `np.asarray(data)`. Dtypes without one raise `BufferError`.
-    """
-
-    def __buffer__(self, flags: int) -> memoryview: ...
-    def to_numpy(self) -> NDArray[Any]:
-        """Convert the chunk into a NumPy array.
-
-        Zero-copy when the dtype supports the buffer protocol; otherwise the
-        data is copied and converted.
-        """
diff --git a/python/zarrista/_decoded_array.pyi b/python/zarrista/_decoded_array.pyi
new file mode 100644
index 0000000..83bcc1a
--- /dev/null
+++ b/python/zarrista/_decoded_array.pyi
@@ -0,0 +1,79 @@
+import sys
+from typing import Any, TypeAlias
+
+from numpy.typing import NDArray
+
+from ._dtype import DataType
+
+if sys.version_info >= (3, 12):
+    from collections.abc import Buffer
+else:
+    from typing_extensions import Buffer
+
+class Tensor:
+    """Fixed-width, dense decoded array data.
+
+    The decoded bytes are held zero-copy. Reinterpret them as a NumPy array with
+    `to_numpy()`, or get the raw bytes as a buffer-protocol object via `buffer()`.
+    """
+
+    @property
+    def shape(self) -> list[int]:
+        """The shape of the decoded region."""
+    @property
+    def dtype(self) -> DataType:
+        """The Zarr data type."""
+    def buffer(self) -> Buffer:
+        """The raw decoded bytes as a zero-copy buffer-protocol object."""
+    def to_numpy(self) -> NDArray[Any]:
+        """Access a NumPy array view over Rust memory.
+
+        This is a zero-copy view via `np.frombuffer`.
+        """
+
+class VariableArray:
+    """Variable-length decoded data (e.g. strings or bytes).
+
+    Not yet exposed to NumPy.
+    """
+
+    @property
+    def shape(self) -> list[int]:
+        """The shape of the decoded region."""
+    @property
+    def dtype(self) -> DataType:
+        """The Zarr data type."""
+
+class MaskedTensor:
+    """Fixed-width decoded data with a validity mask.
+
+    Not yet exposed to NumPy.
+    """
+
+    @property
+    def shape(self) -> list[int]:
+        """The shape of the decoded region."""
+    @property
+    def dtype(self) -> DataType:
+        """The Zarr data type."""
+
+class MaskedVariableArray:
+    """Variable-length decoded data with a validity mask.
+
+    Not yet exposed to NumPy.
+    """
+
+    @property
+    def shape(self) -> list[int]:
+        """The shape of the decoded region."""
+    @property
+    def dtype(self) -> DataType:
+        """The Zarr data type."""
+
+DecodedArray: TypeAlias = Tensor | VariableArray | MaskedTensor | MaskedVariableArray
+"""The result of a read: one of the four decoded array layouts.
+
+Which one is returned depends on the dtype's byte layout (fixed vs. variable, and
+whether it carries a validity mask). Use `isinstance` to narrow to a concrete
+type before using layout-specific methods.
+"""
diff --git a/python/zarrista/_zarrista.pyi b/python/zarrista/_zarrista.pyi
index c6783a8..0ff4154 100644
--- a/python/zarrista/_zarrista.pyi
+++ b/python/zarrista/_zarrista.pyi
@@ -1,7 +1,7 @@
 from ._array import Array, AsyncArray
 from ._array_bytes import ArrayBytes
 from ._chunks import ChunkGrid
-from ._data import Data
+from ._decoded_array import MaskedTensor, MaskedVariableArray, Tensor, VariableArray
 from ._dtype import DataType
 from ._fill_value import FillValue
 from ._group import AsyncGroup, Group
@@ -15,11 +15,14 @@ __all__ = [
     "AsyncArray",
     "AsyncGroup",
     "ChunkGrid",
-    "Data",
     "DataType",
     "FillValue",
     "FilesystemStore",
     "Group",
+    "MaskedTensor",
+    "MaskedVariableArray",
     "MemoryStore",
+    "Tensor",
+    "VariableArray",
     "__version__",
 ]
diff --git a/src/array/async.rs b/src/array/async.rs
index ec8a1e0..b8e6a80 100644
--- a/src/array/async.rs
+++ b/src/array/async.rs
@@ -6,13 +6,11 @@ use crate::array::selection::PySelection;
 use crate::array::util::PyChunkIndices;
 use crate::chunks::PyChunkGrid;
 use crate::codec::{PyCodecChain, PyCodecOptions};
-use crate::data::{for_each_dtype, DataInner, PyData};
+use crate::decoded_array::DecodedArray;
 use crate::dtype::PyDataType;
 use crate::error::ZarristaError;
 use crate::node::PyNodePath;
 use crate::storage::PyAsyncStorage;
-use ndarray::ArrayD;
-use pyo3::exceptions::PyNotImplementedError;
 use pyo3::prelude::*;
 use pyo3_async_runtimes::tokio::future_into_py;
 use pyo3_bytes::PyBytes;
@@ -123,30 +121,15 @@ impl PyAsyncArray {
         py: Python<'py>,
         selection: PySelection,
     ) -> PyResult<Bound<'py, PyAny>> {
-        use zarrs::array::data_type::*;
-
         let inner = self.inner.clone();
         let array_subset = selection.to_array_subset(inner.shape())?;
 
         future_into_py(py, async move {
-            let dtype = inner.data_type();
-
-            macro_rules! arm {
-                ($dtype:ty, $variant:ident, $elem:ty) => {
-                    if dtype.is::<$dtype>() {
-                        let data = inner
-                            .async_retrieve_array_subset::<ArrayD<$elem>>(&array_subset)
-                            .await
-                            .map_err(ZarristaError::from)?;
-                        return Ok(PyData::from(DataInner::$variant(data)));
-                    }
-                };
-            }
-            for_each_dtype!(arm);
-
-            Err(PyNotImplementedError::new_err(format!(
-                "reading data type {dtype} is not supported yet"
-            )))
+            let decoded = inner
+                .async_retrieve_array_subset::<DecodedArray>(&array_subset)
+                .await
+                .map_err(ZarristaError::from)?;
+            Ok(decoded)
         })
     }
 
@@ -157,35 +140,17 @@ impl PyAsyncArray {
         chunk_indices: PyChunkIndices,
         codec_options: Option<PyCodecOptions>,
     ) -> PyResult<Bound<'py, PyAny>> {
-        use zarrs::array::data_type::*;
-
         let inner = self.inner.clone();
         let codec_options = codec_options
             .map(|opts| opts.into_inner())
             .unwrap_or_default();
 
         future_into_py(py, async move {
-            let dtype = inner.data_type();
-
-            macro_rules! arm {
-                ($dtype:ty, $variant:ident, $elem:ty) => {
-                    if dtype.is::<$dtype>() {
-                        let chunk = inner
-                            .async_retrieve_chunk_opt::<ArrayD<$elem>>(
-                                chunk_indices.as_ref(),
-                                &codec_options,
-                            )
-                            .await
-                            .map_err(ZarristaError::from)?;
-                        return Ok(PyData::from(DataInner::$variant(chunk)));
-                    }
-                };
-            }
-            for_each_dtype!(arm);
-
-            Err(PyNotImplementedError::new_err(format!(
-                "reading data type {dtype} is not supported yet"
-            )))
+            let decoded = inner
+                .async_retrieve_chunk_opt::<DecodedArray>(chunk_indices.as_ref(), &codec_options)
+                .await
+                .map_err(ZarristaError::from)?;
+            Ok(decoded)
         })
     }
 
diff --git a/src/array/sync.rs b/src/array/sync.rs
index 5d45c66..32387c2 100644
--- a/src/array/sync.rs
+++ b/src/array/sync.rs
@@ -4,13 +4,11 @@ use crate::array::selection::PySelection;
 use crate::array::util::PyChunkIndices;
 use crate::chunks::PyChunkGrid;
 use crate::codec::{PyCodecChain, PyCodecOptions};
-use crate::data::{for_each_dtype, DataInner, PyData};
+use crate::decoded_array::DecodedArray;
 use crate::dtype::PyDataType;
 use crate::error::ZarristaResult;
 use crate::node::PyNodePath;
 use crate::storage::PySyncStorage;
-use ndarray::ArrayD;
-use pyo3::exceptions::PyNotImplementedError;
 use pyo3::prelude::*;
 use pyo3_bytes::PyBytes;
 use pythonize::pythonize;
@@ -33,7 +31,7 @@ impl PyArray {
 #[pymethods]
 impl PyArray {
     /// Read a region with numpy-style basic indexing, e.g. `arr[0:10, :, 5]`.
-    fn __getitem__(&self, selection: PySelection) -> ZarristaResult<PyData> {
+    fn __getitem__(&self, selection: PySelection) -> ZarristaResult<DecodedArray> {
         self.retrieve_array_subset(selection)
     }
 
@@ -101,29 +99,13 @@ impl PyArray {
         self.inner.path().as_str()
     }
 
-    /// Read a region of the array as `Data`, using numpy-style basic indexing.
-    fn retrieve_array_subset(&self, selection: PySelection) -> ZarristaResult<PyData> {
-        use zarrs::array::data_type::*;
-
+    /// Read a region of the array, using numpy-style basic indexing.
+    ///
+    /// Returns one of the decoded result classes (`Tensor`, `VariableArray`,
+    /// `MaskedTensor`, `MaskedVariableArray`) depending on the dtype layout.
+    fn retrieve_array_subset(&self, selection: PySelection) -> ZarristaResult<DecodedArray> {
         let array_subset = selection.to_array_subset(self.inner.shape())?;
-        let dtype = self.inner.data_type();
-
-        macro_rules! arm {
-            ($dtype:ty, $variant:ident, $elem:ty) => {
-                if dtype.is::<$dtype>() {
-                    let data = self
-                        .inner
-                        .retrieve_array_subset::<ArrayD<$elem>>(&array_subset)?;
-                    return Ok(PyData::from(DataInner::$variant(data)));
-                }
-            };
-        }
-        for_each_dtype!(arm);
-
-        Err(PyNotImplementedError::new_err(format!(
-            "reading data type {dtype} is not supported yet"
-        ))
-        .into())
+        Ok(self.inner.retrieve_array_subset(&array_subset)?)
     }
 
     #[pyo3(signature = (chunk_indices, **codec_options))]
@@ -131,31 +113,13 @@ impl PyArray {
         &self,
         chunk_indices: PyChunkIndices,
         codec_options: Option<PyCodecOptions>,
-    ) -> ZarristaResult<PyData> {
-        use zarrs::array::data_type::*;
-
-        let dtype = self.inner.data_type();
+    ) -> ZarristaResult<DecodedArray> {
         let codec_options = codec_options
             .map(|opts| opts.into_inner())
             .unwrap_or_default();
-
-        macro_rules! arm {
-            ($dtype:ty, $variant:ident, $elem:ty) => {
-                if dtype.is::<$dtype>() {
-                    let chunk = self.inner.retrieve_chunk_opt::<ArrayD<$elem>>(
-                        chunk_indices.as_ref(),
-                        &codec_options,
-                    )?;
-                    return Ok(PyData::from(DataInner::$variant(chunk)));
-                }
-            };
-        }
-        for_each_dtype!(arm);
-
-        Err(PyNotImplementedError::new_err(format!(
-            "reading data type {dtype} is not supported yet"
-        ))
-        .into())
+        Ok(self
+            .inner
+            .retrieve_chunk_opt(chunk_indices.as_ref(), &codec_options)?)
     }
 
     fn retrieve_encoded_chunk(
@@ -171,38 +135,4 @@ impl PyArray {
     fn shape(&self) -> &[u64] {
         self.inner.shape()
     }
-
-    // /// The chunk shape (size of a chunk along each dimension).
-    // #[getter]
-    // fn chunks(&self) -> PyResult<Vec<u64>> {
-    //     // TODO: review
-    //     let origin = vec![0u64; self.inner.shape().len()];
-    //     let chunk_shape = self.inner.chunk_shape(&origin).map_err(to_py_err)?;
-    //     Ok(chunk_shape.iter().map(|n| n.get()).collect())
-    // }
-
-    // /// The fill value as a Python scalar (or `None` if not interpretable).
-    // #[getter]
-    // fn fill_value(&self, py: Python<'_>) -> PyResult<PyObject> {
-    //     dtype::fill_value_to_py(
-    //         py,
-    //         self.inner.data_type(),
-    //         self.inner.fill_value().as_ne_bytes(),
-    //     )
-    // }
-
-    // /// Read a region with numpy-style basic indexing.
-    // fn __getitem__(&self, py: Python<'_>, key: &Bound<'_, PyAny>) -> PyResult<PyObject> {
-    //     let shape = self.inner.shape().to_vec();
-    //     let (ranges, out_shape) = parse_index(py, key, &shape)?;
-    //     let subset = ArraySubset::new_with_ranges(&ranges);
-    //     dtype::read_region(py, &self.inner, &Region::Subset(&subset), &out_shape)
-    // }
-
-    // /// Read a single chunk by its chunk coordinates.
-    // fn get_chunk(&self, py: Python<'_>, chunk_coords: Vec<u64>) -> PyResult<PyObject> {
-    //     let chunk_shape = self.inner.chunk_shape(&chunk_coords).map_err(to_py_err)?;
-    //     let out_shape: Vec<usize> = chunk_shape.iter().map(|n| n.get() as usize).collect();
-    //     dtype::read_region(py, &self.inner, &Region::Chunk(&chunk_coords), &out_shape)
-    // }
 }
diff --git a/src/data.rs b/src/data.rs
deleted file mode 100644
index f271a01..0000000
--- a/src/data.rs
+++ /dev/null
@@ -1,254 +0,0 @@
-//! A Python-exposed array type that implements the buffer protocol.
-//!
-//! This module provides `PyData`, an ND array type that can be used with numpy
-//! via Python's buffer protocol. The buffer protocol allows Python objects to
-//! expose raw memory buffers, enabling zero-copy interoperability with numpy.
-//!
-//! ## Buffer Protocol Overview
-//!
-//! The buffer protocol is defined in [PEP 3118] and allows objects to expose their
-//! internal data as a contiguous or strided memory region. Key concepts:
-//!
-//! [PEP 3118](https://peps.python.org/pep-3118/)
-//!
-//! - **view**: A `Py_buffer` struct that describes how to interpret the memory
-//! - **format**: A string describing the element type (e.g., "<H" for little-endian uint16)
-//! - **shape**: Array dimensions (e.g., [height, width, bands] for a 3D array)
-//! - **strides**: Byte offsets between consecutive elements in each dimension
-//!
-//! ## Reference Counting
-//!
-//! When `__getbuffer__` is called, we increment the reference count on the PyData
-//! object (`Py_INCREF`) to ensure it stays alive while the buffer view exists.
-//! Python's `PyBuffer_Release` automatically calls `Py_DECREF` after `__releasebuffer__`
-//! returns, so we don't need to manually decrement the count.
-//!
-//! ## Memory Safety
-//!
-//! The shape and strides arrays are stored as `Box<[isize]>` on the PyData struct
-//! itself. This means their lifetime is tied to the PyData object, which is kept
-//! alive by the `Py_INCREF` call. This avoids the need to leak/free allocations
-//! in `__getbuffer__`/`__releasebuffer__`.
-
-use std::ffi::{c_char, c_void, CStr};
-use std::os::raw::c_int;
-
-use ndarray::ArrayD;
-use numpy::PyArray;
-use pyo3::exceptions::PyBufferError;
-use pyo3::ffi;
-use pyo3::prelude::*;
-
-/// The decoded data of a chunk, with variants for each supported dtype.
-///
-/// This is kept separate from `PyData` 1. because Python classes can't be defined as Rust enums and 2. so that we have a clean place to manage data types not supported by numpy.
-pub enum DataInner {
-    Bool(ArrayD<bool>),
-    Float16(ArrayD<half::f16>),
-    Float32(ArrayD<f32>),
-    Float64(ArrayD<f64>),
-    Int16(ArrayD<i16>),
-    Int32(ArrayD<i32>),
-    Int64(ArrayD<i64>),
-    Int8(ArrayD<i8>),
-    Uint16(ArrayD<u16>),
-    Uint32(ArrayD<u32>),
-    Uint64(ArrayD<u64>),
-    Uint8(ArrayD<u8>),
-}
-
-/// Invoke `$arm!(ZarrsDataType, DataInnerVariant, rust_elem_type)` once per
-/// supported dtype. The caller supplies an `arm!` macro that turns a single
-/// `(dtype, variant, elem)` triple into a retrieval (e.g. `retrieve_chunk` or
-/// `retrieve_array_subset`), keeping the dtype list defined in exactly one place.
-///
-/// The `ZarrsDataType` idents (e.g. `BoolDataType`) and `half::f16` are resolved
-/// in the caller's scope, so callers must have `use zarrs::array::data_type::*`.
-macro_rules! for_each_dtype {
-    ($arm:ident) => {
-        $arm!(BoolDataType, Bool, bool);
-        $arm!(Int8DataType, Int8, i8);
-        $arm!(Int16DataType, Int16, i16);
-        $arm!(Int32DataType, Int32, i32);
-        $arm!(Int64DataType, Int64, i64);
-        $arm!(UInt8DataType, Uint8, u8);
-        $arm!(UInt16DataType, Uint16, u16);
-        $arm!(UInt32DataType, Uint32, u32);
-        $arm!(UInt64DataType, Uint64, u64);
-        $arm!(Float16DataType, Float16, half::f16);
-        $arm!(Float32DataType, Float32, f32);
-        $arm!(Float64DataType, Float64, f64);
-    };
-}
-pub(crate) use for_each_dtype;
-
-/// Run `$body` against the inner `ArrayD`, with `$a` bound to it regardless of
-/// element type. Exhaustive, so a new `DataInner` variant is a compile error.
-macro_rules! with_array {
-    ($data:expr, $a:ident => $body:expr) => {{
-        use DataInner::*;
-        match $data {
-            Bool($a) => $body,
-            Float16($a) => $body,
-            Float32($a) => $body,
-            Float64($a) => $body,
-            Int16($a) => $body,
-            Int32($a) => $body,
-            Int64($a) => $body,
-            Int8($a) => $body,
-            Uint16($a) => $body,
-            Uint32($a) => $body,
-            Uint64($a) => $body,
-            Uint8($a) => $body,
-        }
-    }};
-}
-
-impl DataInner {
-    /// The PEP 3118 buffer-protocol format string for this dtype, or `None` if
-    /// the dtype has no buffer-protocol representation (e.g. dtypes with no
-    /// matching native layout). `None` is the signal that consumers must copy.
-    fn buffer_format(&self) -> Option<&'static CStr> {
-        use DataInner::*;
-
-        let format = match self {
-            Bool(_) => c"?",
-            Int8(_) => c"b",
-            Int16(_) => c"h",
-            Int32(_) => c"i",
-            Int64(_) => c"q",
-            Uint8(_) => c"B",
-            Uint16(_) => c"H",
-            Uint32(_) => c"I",
-            Uint64(_) => c"Q",
-            Float16(_) => c"e",
-            Float32(_) => c"f",
-            Float64(_) => c"d",
-        };
-        Some(format)
-    }
-
-    /// The size of a single element in bytes.
-    fn itemsize(&self) -> usize {
-        use DataInner::*;
-
-        match self {
-            Bool(_) | Int8(_) | Uint8(_) => 1,
-            Float16(_) | Int16(_) | Uint16(_) => 2,
-            Float32(_) | Int32(_) | Uint32(_) => 4,
-            Float64(_) | Int64(_) | Uint64(_) => 8,
-        }
-    }
-
-    /// A raw pointer to the first element of the backing buffer.
-    fn data_ptr(&self) -> *mut c_void {
-        with_array!(self, a => a.as_ptr() as *mut c_void)
-    }
-
-    /// Copy the decoded chunk into a fresh NumPy array. Used as the fallback for
-    /// dtypes that cannot be exposed via the buffer protocol.
-    fn to_numpy_with_copy<'py>(&self, py: Python<'py>) -> Bound<'py, PyAny> {
-        with_array!(&self, a => PyArray::from_array(py, a).into_any())
-    }
-}
-
-#[pyclass(module = "zarrista", frozen, name = "Data")]
-pub struct PyData {
-    inner: DataInner,
-    /// Shape in elements. Cached here so the buffer protocol can hand out a
-    /// pointer with a lifetime tied to this (frozen) object.
-    shape: Box<[isize]>,
-    /// Strides in bytes, cached for the same reason.
-    strides: Box<[isize]>,
-}
-
-impl From<DataInner> for PyData {
-    fn from(inner: DataInner) -> Self {
-        let itemsize = inner.itemsize() as isize;
-        let shape: Box<[isize]> =
-            with_array!(&inner, a => a.shape().iter().map(|&d| d as isize).collect());
-        let strides: Box<[isize]> = with_array!(&inner,
-            a => a.strides().iter().map(|&s| s * itemsize).collect());
-        Self {
-            inner,
-            shape,
-            strides,
-        }
-    }
-}
-
-#[pymethods]
-impl PyData {
-    /// Convert the decoded chunk into a NumPy array.
-    ///
-    /// When the dtype has a buffer-protocol representation this is a zero-copy
-    /// view (numpy reads our buffer directly). Otherwise the data is copied and
-    /// converted into a fresh array.
-    fn to_numpy<'py>(slf: Bound<'py, Self>) -> PyResult<Bound<'py, PyAny>> {
-        let py = slf.py();
-        if slf.borrow().inner.buffer_format().is_some() {
-            // Zero-copy: `np.asarray` views our buffer via `__getbuffer__`.
-            py.import("numpy")?.call_method1("asarray", (&slf,))
-        } else {
-            // Fallback: copy/convert into a new numpy array.
-            Ok(slf.borrow().inner.to_numpy_with_copy(py))
-        }
-    }
-
-    /// Buffer-protocol export (PEP 3118): lets any consumer (`memoryview`,
-    /// numpy, pyarrow, …) read the data without a copy.
-    ///
-    /// # Safety
-    /// `view` is a valid `Py_buffer` provided by the interpreter. We pin `self`
-    /// for the view's lifetime with `Py_INCREF`, and `shape`/`strides` are owned
-    /// by `self`, so all pointers stay valid until `__releasebuffer__`.
-    unsafe fn __getbuffer__(
-        slf: PyRef<'_, Self>,
-        view: *mut ffi::Py_buffer,
-        flags: c_int,
-    ) -> PyResult<()> {
-        if view.is_null() {
-            return Err(PyBufferError::new_err("null buffer view"));
-        }
-        let Some(format) = slf.inner.buffer_format() else {
-            return Err(PyBufferError::new_err(
-                "this data type has no buffer-protocol representation",
-            ));
-        };
-        // We only ever expose a read-only buffer.
-        if (flags & ffi::PyBUF_WRITABLE) == ffi::PyBUF_WRITABLE {
-            return Err(PyBufferError::new_err("buffer is read-only"));
-        }
-
-        let itemsize = slf.inner.itemsize() as isize;
-        let len = slf.shape.iter().product::<isize>() * itemsize;
-
-        (*view).buf = slf.inner.data_ptr();
-        (*view).len = len;
-        (*view).itemsize = itemsize;
-        (*view).readonly = 1;
-        (*view).ndim = slf.shape.len() as c_int;
-        (*view).format = if (flags & ffi::PyBUF_FORMAT) == ffi::PyBUF_FORMAT {
-            format.as_ptr() as *mut c_char
-        } else {
-            std::ptr::null_mut()
-        };
-        (*view).shape = slf.shape.as_ptr() as *mut isize;
-        (*view).strides = slf.strides.as_ptr() as *mut isize;
-        (*view).suboffsets = std::ptr::null_mut();
-        (*view).internal = std::ptr::null_mut();
-
-        // Keep `self` (and therefore the buffer) alive while the view exists;
-        // Python calls `Py_DECREF` after `__releasebuffer__` returns.
-        (*view).obj = slf.as_ptr();
-        ffi::Py_INCREF((*view).obj);
-
-        Ok(())
-    }
-
-    /// Required counterpart to `__getbuffer__`. Nothing to free: all memory is
-    /// owned by `self`, and Python handles the `Py_DECREF` on `view.obj`.
-    unsafe fn __releasebuffer__(&self, _view: *mut ffi::Py_buffer) {}
-}
-
-impl PyData {}
diff --git a/src/decoded_array.rs b/src/decoded_array.rs
new file mode 100644
index 0000000..81ca5cc
--- /dev/null
+++ b/src/decoded_array.rs
@@ -0,0 +1,245 @@
+//! Decoded array data exposed to Python.
+//!
+//! SPIKE: instead of decoding into a typed `ndarray::ArrayD<T>` (which copies
+//! every buffer via `bytemuck::pod_collect_to_vec`), we retrieve the raw
+//! post-codec [`ArrayBytes`] and wrap it zero-copy. Retrieval is a single
+//! generic call (`retrieve::<DecodedArray>`) — no per-dtype macro — because we
+//! implement [`FromArrayBytes`] on our own [`DecodedArray`] type.
+//!
+//! `ArrayBytes` has three layouts (`Fixed`, `Variable`, `Optional`), which we
+//! surface as four concrete Python classes so each exposes exactly the faces it
+//! can support:
+//!
+//! - [`PyTensor`] — fixed-width, dense. Buffer protocol + `to_numpy`.
+//! - [`PyVariableArray`] — variable-length (string/bytes). (skeleton)
+//! - [`PyMaskedTensor`] — fixed-width with a validity mask. (skeleton)
+//! - [`PyMaskedVariableArray`] — variable-length with a validity mask. (skeleton)
+//!
+//! Buffers are **not** aligned: numpy's `frombuffer` tolerates unaligned data
+//! (it sets `aligned=False`), and any consumer that materializes an owned array
+//! pays the alignment copy as part of the copy it was already doing.
+
+use std::borrow::Cow;
+
+use bytes::Bytes;
+use pyo3::exceptions::PyNotImplementedError;
+use pyo3::prelude::*;
+use pyo3::IntoPyObjectExt;
+use pyo3_bytes::PyBytes;
+use zarrs::array::{ArrayBytes, ArrayError, DataType, FromArrayBytes};
+
+use crate::dtype::PyDataType;
+
+/// Fixed-width, dense decoded data.
+///
+/// We don't use the upstream `Tensor` type because its bytes are not reference counted, and thus
+/// don't play nicely with buffer protocol export
+#[pyclass(module = "zarrista", frozen, name = "Tensor")]
+pub struct PyTensor {
+    bytes: Bytes,
+    data_type: DataType,
+    shape: Vec<u64>,
+}
+
+#[pymethods]
+impl PyTensor {
+    #[getter]
+    fn shape(&self) -> &[u64] {
+        &self.shape
+    }
+
+    #[getter]
+    fn dtype(&self) -> PyDataType {
+        self.data_type.clone().into()
+    }
+
+    /// The raw decoded bytes, zero-copy, as a buffer-protocol object.
+    // TODO: it might be nice for the Tensor itself to implement the buffer protocol, instead of
+    // having to call the buffer method? :shrug:
+    fn buffer(&self) -> PyBytes {
+        PyBytes::new(self.bytes.clone())
+    }
+
+    /// Reinterpret the raw bytes as a numpy array of this dtype and shape.
+    ///
+    /// Zero-copy view (`np.frombuffer`) — numpy tolerates an unaligned buffer.
+    fn to_numpy<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        // TODO: will the name_v3 always be understood by numpy?
+        let name = self.data_type.name_v3().ok_or_else(|| {
+            PyNotImplementedError::new_err(format!(
+                "data type {} has no zarr v3 name / numpy mapping",
+                self.data_type
+            ))
+        })?;
+        let np = py.import("numpy")?;
+        let flat = np.call_method1("frombuffer", (self.buffer(), name.into_owned()))?;
+        flat.call_method1("reshape", (&self.shape,))
+    }
+}
+
+/// Variable-length data (string/bytes). Skeleton: carries metadata only for now.
+#[pyclass(module = "zarrista", frozen, name = "VariableArray")]
+pub struct PyVariableArray {
+    #[expect(dead_code)]
+    bytes: Bytes,
+    #[expect(dead_code)]
+    offsets: Vec<usize>,
+    data_type: DataType,
+    shape: Vec<u64>,
+}
+
+#[pymethods]
+impl PyVariableArray {
+    #[getter]
+    fn shape(&self) -> &[u64] {
+        &self.shape
+    }
+
+    #[getter]
+    fn dtype(&self) -> PyDataType {
+        self.data_type.clone().into()
+    }
+}
+
+/// Fixed-width data with a validity mask. Skeleton.
+#[pyclass(module = "zarrista", frozen, name = "MaskedTensor")]
+pub struct PyMaskedTensor {
+    #[expect(dead_code)]
+    bytes: Bytes,
+    /// The mask is 1 byte per element where 0 = invalid/missing, non-zero = valid/present.
+    #[expect(dead_code)]
+    mask: Bytes,
+    data_type: DataType,
+    shape: Vec<u64>,
+}
+
+#[pymethods]
+impl PyMaskedTensor {
+    #[getter]
+    fn shape(&self) -> &[u64] {
+        &self.shape
+    }
+
+    #[getter]
+    fn dtype(&self) -> PyDataType {
+        self.data_type.clone().into()
+    }
+}
+
+/// Variable-length data with a validity mask. Skeleton.
+#[pyclass(module = "zarrista", frozen, name = "MaskedVariableArray")]
+pub struct PyMaskedVariableArray {
+    #[expect(dead_code)]
+    bytes: Bytes,
+    #[expect(dead_code)]
+    offsets: Vec<usize>,
+    /// The mask is 1 byte per element where 0 = invalid/missing, non-zero = valid/present.
+    #[expect(dead_code)]
+    mask: Bytes,
+    data_type: DataType,
+    shape: Vec<u64>,
+}
+
+#[pymethods]
+impl PyMaskedVariableArray {
+    #[getter]
+    fn shape(&self) -> &[u64] {
+        &self.shape
+    }
+
+    #[getter]
+    fn dtype(&self) -> PyDataType {
+        self.data_type.clone().into()
+    }
+}
+
+/// Internal decoded result, produced by our [`FromArrayBytes`] impl. Carries the
+/// post-codec bytes (zero-copy), the data type, and the region shape (which
+/// zarrs hands us, so we never have to re-derive it).
+pub enum DecodedArray {
+    Tensor(PyTensor),
+    Variable(PyVariableArray),
+    MaskedTensor(PyMaskedTensor),
+    MaskedVariable(PyMaskedVariableArray),
+}
+
+impl FromArrayBytes for DecodedArray {
+    fn from_array_bytes(
+        bytes: ArrayBytes<'static>,
+        shape: &[u64],
+        data_type: &DataType,
+    ) -> Result<Self, ArrayError> {
+        let shape = shape.to_vec();
+        let data_type = data_type.clone();
+        Ok(match bytes {
+            ArrayBytes::Fixed(bytes) => DecodedArray::Tensor(PyTensor {
+                bytes: cow_to_bytes(bytes),
+                data_type,
+                shape,
+            }),
+            ArrayBytes::Variable(v) => {
+                let (buf, offsets) = v.into_parts();
+                DecodedArray::Variable(PyVariableArray {
+                    bytes: cow_to_bytes(buf),
+                    // Ideally in the future we'll avoid a copy:
+                    // https://github.com/zarrs/zarrs/issues/406
+                    offsets: offsets.to_vec(),
+                    data_type,
+                    shape,
+                })
+            }
+            ArrayBytes::Optional(optional) => {
+                let (data, mask) = optional.into_parts();
+                match *data {
+                    ArrayBytes::Fixed(fixed) => DecodedArray::MaskedTensor(PyMaskedTensor {
+                        bytes: cow_to_bytes(fixed),
+                        mask: cow_to_bytes(mask),
+                        data_type,
+                        shape,
+                    }),
+                    ArrayBytes::Variable(variable) => {
+                        let (buf, offsets) = variable.into_parts();
+                        DecodedArray::MaskedVariable(PyMaskedVariableArray {
+                            bytes: cow_to_bytes(buf),
+                            // Ideally in the future we'll avoid a copy:
+                            // https://github.com/zarrs/zarrs/issues/406
+                            offsets: offsets.to_vec(),
+                            mask: cow_to_bytes(mask),
+                            data_type,
+                            shape,
+                        })
+                    }
+                    ArrayBytes::Optional(_) => {
+                        unreachable!("nested optional is not a valid layout")
+                    }
+                }
+            }
+        })
+    }
+}
+
+impl<'py> IntoPyObject<'py> for DecodedArray {
+    type Target = PyAny;
+    type Output = Bound<'py, PyAny>;
+    type Error = PyErr;
+
+    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
+        match self {
+            DecodedArray::Tensor(py_tensor) => py_tensor.into_bound_py_any(py),
+            DecodedArray::Variable(py_variable_array) => py_variable_array.into_bound_py_any(py),
+            DecodedArray::MaskedTensor(py_masked_tensor) => py_masked_tensor.into_bound_py_any(py),
+            DecodedArray::MaskedVariable(py_masked_variable_array) => {
+                py_masked_variable_array.into_bound_py_any(py)
+            }
+        }
+    }
+}
+
+/// Move a `'static` `Cow<[u8]>` into `bytes::Bytes`. Owned is a zero-copy move;
+/// borrowed (rare for retrieval) copies.
+fn cow_to_bytes(cow: Cow<'static, [u8]>) -> Bytes {
+    match cow {
+        Cow::Owned(v) => Bytes::from(v),
+        Cow::Borrowed(b) => Bytes::copy_from_slice(b),
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index c8f6d4f..42481c0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,7 +4,7 @@ mod array;
 mod array_bytes;
 mod chunks;
 mod codec;
-mod data;
+mod decoded_array;
 mod dtype;
 mod error;
 mod exceptions;
@@ -20,7 +20,7 @@ use crate::array::{PyArray, PyAsyncArray};
 use crate::array_bytes::PyArrayBytes;
 use crate::chunks::PyChunkGrid;
 use crate::codec::register_codec_module;
-use crate::data::PyData;
+use crate::decoded_array::{PyMaskedTensor, PyMaskedVariableArray, PyTensor, PyVariableArray};
 use crate::dtype::PyDataType;
 use crate::exceptions::register_exceptions_module;
 use crate::fill_value::PyFillValue;
@@ -37,7 +37,10 @@ fn _zarrista(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyAsyncArray>()?;
     m.add_class::<PyAsyncGroup>()?;
     m.add_class::<PyChunkGrid>()?;
-    m.add_class::<PyData>()?;
+    m.add_class::<PyTensor>()?;
+    m.add_class::<PyVariableArray>()?;
+    m.add_class::<PyMaskedTensor>()?;
+    m.add_class::<PyMaskedVariableArray>()?;
     m.add_class::<PyDataType>()?;
     m.add_class::<PyFillValue>()?;
     m.add_class::<PyFilesystemStore>()?;
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
index d33c18f..8d0010e 100644
--- a/tests/test_indexing.py
+++ b/tests/test_indexing.py
@@ -12,7 +12,7 @@
 import zarr
 from numpy.typing import NDArray
 from obstore.store import LocalStore
-from zarrista import Array, AsyncArray, FilesystemStore
+from zarrista import Array, AsyncArray, FilesystemStore, Tensor
 
 
 @pytest.fixture
@@ -38,6 +38,25 @@ def test_slice_read_matches_numpy(int32_array: tuple[Path, NDArray[np.int32]]):
     np.testing.assert_array_equal(result, data[0:2, :, 5:7])
 
 
+def test_fixed_dtype_returns_tensor(int32_array: tuple[Path, NDArray[np.int32]]):
+    """A fixed-width dtype decodes to a `Tensor` carrying `shape`/`dtype`; its raw
+    `buffer()` reinterprets to the same array as `to_numpy()`."""
+    path, data = int32_array
+    arr = Array.open(FilesystemStore(path))
+
+    tensor = arr.retrieve_array_subset((slice(0, 2), slice(None), slice(5, 7)))
+    assert isinstance(tensor, Tensor)
+    assert tensor.shape == [2, 64, 2]
+    assert tensor.dtype == arr.dtype
+
+    expected = data[0:2, :, 5:7]
+    np.testing.assert_array_equal(tensor.to_numpy(), expected)
+
+    # buffer() exposes the raw decoded bytes; reinterpreting matches to_numpy().
+    from_buffer = np.frombuffer(tensor.buffer(), dtype="int32").reshape(tensor.shape)
+    np.testing.assert_array_equal(from_buffer, expected)
+
+
 def test_getitem_matches_retrieve_array_subset(
     int32_array: tuple[Path, NDArray[np.int32]],
 ):