diff --git a/benchmarks/rabitq_poc/.gitignore b/benchmarks/rabitq_poc/.gitignore
deleted file mode 100644
index 33511340..00000000
--- a/benchmarks/rabitq_poc/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# Reproducible binary blobs — regenerate via `cargo run -p turbovec --example dump_state`
-rust_state/
diff --git a/benchmarks/rabitq_poc/apples_results.json b/benchmarks/rabitq_poc/apples_results.json
deleted file mode 100644
index d5b04460..00000000
--- a/benchmarks/rabitq_poc/apples_results.json
+++ /dev/null
@@ -1,122 +0,0 @@
-{
-  "glove-200_2bit": {
-    "baseline": {
-      "1": 0.5074,
-      "2": 0.6606,
-      "4": 0.7896,
-      "8": 0.8794,
-      "16": 0.9432,
-      "32": 0.9759,
-      "64": 0.9926
-    },
-    "form_B_paper": {
-      "1": 0.5544,
-      "2": 0.7041,
-      "4": 0.8278,
-      "8": 0.9097,
-      "16": 0.9627,
-      "32": 0.9852,
-      "64": 0.9963
-    }
-  },
-  "glove-200_4bit": {
-    "baseline": {
-      "1": 0.8231,
-      "2": 0.9395,
-      "4": 0.988,
-      "8": 0.9983,
-      "16": 0.9997,
-      "32": 1.0,
-      "64": 1.0
-    },
-    "form_B_paper": {
-      "1": 0.8577,
-      "2": 0.9583,
-      "4": 0.9948,
-      "8": 0.9998,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    }
-  },
-  "openai-1536_2bit": {
-    "baseline": {
-      "1": 0.872,
-      "2": 0.963,
-      "4": 0.999,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    },
-    "form_B_paper": {
-      "1": 0.904,
-      "2": 0.975,
-      "4": 0.998,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    }
-  },
-  "openai-1536_4bit": {
-    "baseline": {
-      "1": 0.957,
-      "2": 0.997,
-      "4": 1.0,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    },
-    "form_B_paper": {
-      "1": 0.975,
-      "2": 0.998,
-      "4": 1.0,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    }
-  },
-  "openai-3072_2bit": {
-    "baseline": {
-      "1": 0.915,
-      "2": 0.984,
-      "4": 1.0,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    },
-    "form_B_paper": {
-      "1": 0.922,
-      "2": 0.993,
-      "4": 0.999,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    }
-  },
-  "openai-3072_4bit": {
-    "baseline": {
-      "1": 0.973,
-      "2": 0.999,
-      "4": 1.0,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    },
-    "form_B_paper": {
-      "1": 0.982,
-      "2": 0.999,
-      "4": 1.0,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    }
-  }
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/bench_block_skip.py b/benchmarks/rabitq_poc/bench_block_skip.py
deleted file mode 100644
index 8c7773e3..00000000
--- a/benchmarks/rabitq_poc/bench_block_skip.py
+++ /dev/null
@@ -1,85 +0,0 @@
-"""
-Speed-test the block-level mask early-exit: 100K vectors, only the last 1K
-slots allowed.
-
-100K / 32 per block = 3125 blocks. The last 1K vectors occupy ~32 blocks
-at the end. With block-skip active, ~3093 of 3125 blocks (~99%) should be
-short-circuited. Without it (main), the masked search pays the full
-unmasked SIMD cost.
-
-Run this script twice — once on each wheel — to see the before/after.
-"""
-
-import os
-import time
-
-import numpy as np
-from turbovec import TurboQuantIndex
-
-DIM = 1536
-N_DB = 100_000
-N_ALLOWED = 1_000
-N_QUERIES = 100
-K = 10
-SEED = 42
-WARMUP = 3
-REPEATS = 5
-
-
-def main() -> None:
-    rng = np.random.RandomState(SEED)
-    database = rng.standard_normal((N_DB, DIM)).astype(np.float32)
-    database /= np.linalg.norm(database, axis=-1, keepdims=True)
-    queries = rng.standard_normal((N_QUERIES, DIM)).astype(np.float32)
-    queries /= np.linalg.norm(queries, axis=-1, keepdims=True)
-
-    # Allow only the last 1K slots.
-    mask = np.zeros(N_DB, dtype=bool)
-    mask[N_DB - N_ALLOWED:] = True
-
-    index = TurboQuantIndex(DIM, bit_width=4)
-    index.add(database)
-    index.prepare()
-
-    print(f"=== block-skip selectivity benchmark ===")
-    print(f"  db={N_DB}, dim={DIM}, queries={N_QUERIES}, k={K}")
-    print(f"  allowed slots: {N_ALLOWED} (last {N_ALLOWED}; "
-          f"{N_ALLOWED / N_DB * 100:.1f}% of index)")
-    print(f"  blocks total: {(N_DB + 31) // 32}, "
-          f"blocks containing allowed slots: ~{(N_ALLOWED + 31) // 32}")
-    print()
-
-    for _ in range(WARMUP):
-        index.search(queries, K)
-        index.search(queries, K, mask=mask)
-
-    unmasked_times = []
-    masked_times = []
-    for _ in range(REPEATS):
-        t0 = time.perf_counter()
-        index.search(queries, K)
-        unmasked_times.append((time.perf_counter() - t0) * 1000 / N_QUERIES)
-
-        t0 = time.perf_counter()
-        index.search(queries, K, mask=mask)
-        masked_times.append((time.perf_counter() - t0) * 1000 / N_QUERIES)
-
-    unmasked_ms = sorted(unmasked_times)[REPEATS // 2]
-    masked_ms = sorted(masked_times)[REPEATS // 2]
-
-    print(f"  unmasked search:  {unmasked_ms:.3f} ms / query (median of {REPEATS})")
-    print(f"  masked search:    {masked_ms:.3f} ms / query (median of {REPEATS})")
-    print(f"  speedup (unmasked / masked): {unmasked_ms / masked_ms:.2f}x")
-
-    if masked_ms < unmasked_ms * 0.5:
-        print("  -> block-skip appears active (>2x speedup at 1% selectivity)")
-    elif masked_ms < unmasked_ms * 0.95:
-        print("  -> some speedup but not large; block-skip may be partial or "
-              "post-kernel scan is dominant")
-    else:
-        print("  -> no measurable speedup; block-skip likely not active "
-              "(post-filter only)")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmarks/rabitq_poc/comparison.py b/benchmarks/rabitq_poc/comparison.py
deleted file mode 100644
index 50a451b6..00000000
--- a/benchmarks/rabitq_poc/comparison.py
+++ /dev/null
@@ -1,99 +0,0 @@
-"""Build the final published-vs-POC-vs-Rust-prototype comparison table + plot."""
-
-import json
-import os
-
-import matplotlib.pyplot as plt
-
-HERE = os.path.dirname(__file__)
-RESULTS_DIR = os.path.join(HERE, "..", "results")
-PROTO_DIR = os.path.join(HERE, "rust_results")
-POC_RESULTS = json.load(open(os.path.join(HERE, "results.json")))
-
-CELLS = [
-    ("glove_2bit", "glove-200_2bit", "GloVe-200 2-bit"),
-    ("glove_4bit", "glove-200_4bit", "GloVe-200 4-bit"),
-    ("d1536_2bit", "openai-1536_2bit", "OpenAI-1536 2-bit"),
-    ("d1536_4bit", "openai-1536_4bit", "OpenAI-1536 4-bit"),
-    ("d3072_2bit", "openai-3072_2bit", "OpenAI-3072 2-bit"),
-    ("d3072_4bit", "openai-3072_4bit", "OpenAI-3072 4-bit"),
-]
-K_VALUES = [1, 2, 4, 8, 16, 32, 64]
-
-
-def load_cell(fkey, pkey):
-    base = json.load(open(os.path.join(RESULTS_DIR, f"recall_{fkey}.json")))
-    proto = json.load(open(os.path.join(PROTO_DIR, f"recall_{fkey}.json")))
-    poc = POC_RESULTS[pkey]["recall_at_1_at_k"]
-    return base, proto, poc
-
-
-def summary_table():
-    print(f"{'cell':<22} {'old TV':>8} {'POC pred':>9} {'Rust proto':>11} {'Δ vs old':>10} {'FAISS':>8} {'beats FAISS':>12}")
-    print("-" * 86)
-    rows = []
-    for fkey, pkey, label in CELLS:
-        base, proto, poc = load_cell(fkey, pkey)
-        rb = base["tq_recalls"]["1"]
-        rp = proto["tq_recalls"]["1"]
-        rf = base["faiss_recalls"]["1"]
-        rp_poc = poc["form_B_paper"]["1"]
-        beats = "YES" if rp > rf else ("tie" if rp == rf else "no")
-        rows.append((label, rb, rp_poc, rp, rp - rb, rf, beats))
-        print(f"{label:<22} {rb:>8.4f} {rp_poc:>9.4f} {rp:>11.4f} {rp-rb:>+10.4f} {rf:>8.4f} {beats:>12}")
-    return rows
-
-
-def plot():
-    fig, axes = plt.subplots(2, 3, figsize=(15, 8), sharey=False)
-    bit_widths = [2, 4]
-    datasets = [
-        ("glove", "GloVe-200"),
-        ("d1536", "OpenAI-1536"),
-        ("d3072", "OpenAI-3072"),
-    ]
-    pkey_map = {
-        ("glove", 2): "glove-200_2bit",
-        ("glove", 4): "glove-200_4bit",
-        ("d1536", 2): "openai-1536_2bit",
-        ("d1536", 4): "openai-1536_4bit",
-        ("d3072", 2): "openai-3072_2bit",
-        ("d3072", 4): "openai-3072_4bit",
-    }
-
-    for row, bits in enumerate(bit_widths):
-        for col, (ds, label) in enumerate(datasets):
-            ax = axes[row, col]
-            fkey = f"{ds}_{bits}bit"
-            pkey = pkey_map[(ds, bits)]
-            base, proto, poc = load_cell(fkey, pkey)
-
-            x = K_VALUES
-            base_y = [base["tq_recalls"][str(k)] for k in x]
-            proto_y = [proto["tq_recalls"][str(k)] for k in x]
-            faiss_y = [base["faiss_recalls"][str(k)] for k in x]
-            poc_y = [poc["form_B_paper"][str(k)] for k in x]
-
-            ax.plot(x, base_y, marker="o", label="turbovec 0.4.3 (baseline)", color="C0", linewidth=2)
-            ax.plot(x, proto_y, marker="s", label="prototype (Rust, corrected)", color="C3", linewidth=2)
-            ax.plot(x, poc_y, marker="x", label="POC (numpy, corrected)", color="C2", linestyle="--", alpha=0.6)
-            ax.plot(x, faiss_y, marker="^", label="FAISS PQ", color="C7", alpha=0.7)
-
-            ax.set_xscale("log", base=2)
-            ax.set_xticks(K_VALUES)
-            ax.set_xticklabels([str(k) for k in K_VALUES])
-            ax.set_xlabel("k")
-            ax.set_ylabel(f"recall@1@k")
-            ax.set_title(f"{label}, {bits}-bit")
-            ax.grid(True, alpha=0.3)
-            if row == 0 and col == 0:
-                ax.legend(loc="lower right", fontsize=8)
-
-    plt.tight_layout()
-    plt.savefig(os.path.join(HERE, "rust_comparison.png"), dpi=120)
-    print(f"\nPlot saved to {os.path.join(HERE, 'rust_comparison.png')}")
-
-
-if __name__ == "__main__":
-    summary_table()
-    plot()
diff --git a/benchmarks/rabitq_poc/exact_vs_lut.json b/benchmarks/rabitq_poc/exact_vs_lut.json
deleted file mode 100644
index f39e5e04..00000000
--- a/benchmarks/rabitq_poc/exact_vs_lut.json
+++ /dev/null
@@ -1,158 +0,0 @@
-{
-  "glove-200_2bit": {
-    "dataset": "glove-200",
-    "dim": 200,
-    "bit_width": 2,
-    "seed": 42,
-    "n_db": 100000,
-    "n_queries": 10000,
-    "lut_kernel": {
-      "1": 0.5524,
-      "2": 0.7071,
-      "4": 0.8273,
-      "8": 0.91,
-      "16": 0.9624,
-      "32": 0.9852,
-      "64": 0.9957
-    },
-    "exact_kernel": {
-      "1": 0.5544,
-      "2": 0.7041,
-      "4": 0.8278,
-      "8": 0.9097,
-      "16": 0.9627,
-      "32": 0.9852,
-      "64": 0.9963
-    }
-  },
-  "glove-200_4bit": {
-    "dataset": "glove-200",
-    "dim": 200,
-    "bit_width": 4,
-    "seed": 42,
-    "n_db": 100000,
-    "n_queries": 10000,
-    "lut_kernel": {
-      "1": 0.844,
-      "2": 0.9556,
-      "4": 0.9932,
-      "8": 0.9997,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    },
-    "exact_kernel": {
-      "1": 0.8577,
-      "2": 0.9584,
-      "4": 0.9948,
-      "8": 0.9998,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    }
-  },
-  "openai-1536_2bit": {
-    "dataset": "openai-1536",
-    "dim": 1536,
-    "bit_width": 2,
-    "seed": 42,
-    "n_db": 100000,
-    "n_queries": 1000,
-    "lut_kernel": {
-      "1": 0.906,
-      "2": 0.976,
-      "4": 0.999,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    },
-    "exact_kernel": {
-      "1": 0.904,
-      "2": 0.975,
-      "4": 0.998,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    }
-  },
-  "openai-1536_4bit": {
-    "dataset": "openai-1536",
-    "dim": 1536,
-    "bit_width": 4,
-    "seed": 42,
-    "n_db": 100000,
-    "n_queries": 1000,
-    "lut_kernel": {
-      "1": 0.97,
-      "2": 0.997,
-      "4": 1.0,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    },
-    "exact_kernel": {
-      "1": 0.975,
-      "2": 0.998,
-      "4": 1.0,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    }
-  },
-  "openai-3072_2bit": {
-    "dataset": "openai-3072",
-    "dim": 3072,
-    "bit_width": 2,
-    "seed": 42,
-    "n_db": 100000,
-    "n_queries": 1000,
-    "lut_kernel": {
-      "1": 0.924,
-      "2": 0.992,
-      "4": 0.999,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    },
-    "exact_kernel": {
-      "1": 0.922,
-      "2": 0.993,
-      "4": 0.999,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    }
-  },
-  "openai-3072_4bit": {
-    "dataset": "openai-3072",
-    "dim": 3072,
-    "bit_width": 4,
-    "seed": 42,
-    "n_db": 100000,
-    "n_queries": 1000,
-    "lut_kernel": {
-      "1": 0.98,
-      "2": 1.0,
-      "4": 1.0,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    },
-    "exact_kernel": {
-      "1": 0.982,
-      "2": 1.0,
-      "4": 1.0,
-      "8": 1.0,
-      "16": 1.0,
-      "32": 1.0,
-      "64": 1.0
-    }
-  }
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/kernel_math_comparison.py b/benchmarks/rabitq_poc/kernel_math_comparison.py
deleted file mode 100644
index 11d027f5..00000000
--- a/benchmarks/rabitq_poc/kernel_math_comparison.py
+++ /dev/null
@@ -1,216 +0,0 @@
-"""
-Math-isolated ARM vs x86 kernel comparison — pure synthetic test.
-
-Generates random codes + random LUTs, runs four scoring variants on the
-SAME inputs, compares per-vector outputs. No dependency on rotation,
-centroids, or real datasets — the question we're answering ("do ARM and
-x86 kernels produce the same scores for the same LUT and codes?") is
-purely about kernel arithmetic.
-
-Variants:
-
-* `arm` — ARM NEON kernel math: per byte-group, compute `u8_sum = lo+hi`
-  (capped at 254 with max_lut=127, otherwise wraps modulo 256), accumulate
-  into u16, flush to f32 every FLUSH_EVERY=256 byte-groups.
-
-* `x86_current` — AVX2 kernel math: accumulate u8 lookups directly into
-  i16 lanes (FAISS even/odd-byte interleave), NO periodic flush. Per
-  nibble-half sum must fit in 16 bits, so effective `max_lut <=
-  65535 / n_byte_groups`. The implementation here collapses to:
-  `lo_sum_u16 + hi_sum_u16` computed mod 2^16 each.
-
-* `x86_with_flush` — hypothetical fix: same i16 accumulator BUT flushed
-  to f32 every 256 byte-groups, mirroring ARM. Per-flush max sum is
-  `flush_every * max_lut = 256 * 127 = 32512 <= 65535`, so this allows
-  max_lut=127 at any dim.
-
-* `exact_int` — bit-exact integer reference: pure-Python sum of LUT
-  lookups in unbounded ints. No modular wrap. The "what should happen
-  in real arithmetic" baseline.
-
-Usage:
-  python3 kernel_math_comparison.py [dim]  # default 3072
-"""
-
-import sys
-
-import numpy as np
-
-
-FLUSH_EVERY = 256
-SEED = 42
-
-
-# ─── Kernel simulations ──────────────────────────────────────────────────────
-
-def arm_kernel_score(codes_one_vec: np.ndarray, lut_u8: np.ndarray,
-                     scale: float, bias: float, vec_scale: float,
-                     flush_every: int = FLUSH_EVERY) -> float:
-    """NEON math: u8 sum lo+hi per byte-group → widen u16 → periodic flush."""
-    dim = codes_one_vec.shape[0]
-    n_byte_groups = dim // 2
-    fa = float(bias)
-    u16_accum = 0
-    flush_idx = 0
-    for g in range(n_byte_groups):
-        lo = lut_u8[2 * g, codes_one_vec[2 * g]]
-        hi = lut_u8[2 * g + 1, codes_one_vec[2 * g + 1]]
-        u8_sum = (lo + hi) & 0xFF  # explicit u8 modular wrap
-        u16_accum = (u16_accum + u8_sum) & 0xFFFF
-        flush_idx += 1
-        if flush_idx >= flush_every or g == n_byte_groups - 1:
-            fa += scale * float(u16_accum)
-            u16_accum = 0
-            flush_idx = 0
-    return fa * vec_scale
-
-
-def x86_kernel_score(codes_one_vec: np.ndarray, lut_u8: np.ndarray,
-                     scale: float, bias: float, vec_scale: float) -> float:
-    """AVX2 math: lo and hi sums accumulated independently into u16 lanes,
-    no flush. Each sum must fit in 16 bits; above that, modular wrap."""
-    dim = codes_one_vec.shape[0]
-    n_byte_groups = dim // 2
-    lo_sum_u16 = 0
-    hi_sum_u16 = 0
-    for g in range(n_byte_groups):
-        lo = lut_u8[2 * g, codes_one_vec[2 * g]]
-        hi = lut_u8[2 * g + 1, codes_one_vec[2 * g + 1]]
-        lo_sum_u16 = (int(lo_sum_u16) + int(lo)) & 0xFFFF
-        hi_sum_u16 = (int(hi_sum_u16) + int(hi)) & 0xFFFF
-    return (bias + scale * (int(lo_sum_u16) + int(hi_sum_u16))) * vec_scale
-
-
-def x86_with_flush_score(codes_one_vec: np.ndarray, lut_u8: np.ndarray,
-                         scale: float, bias: float, vec_scale: float,
-                         flush_every: int = FLUSH_EVERY) -> float:
-    """Hypothetical fix: same i16 accumulator structure, periodic flush."""
-    dim = codes_one_vec.shape[0]
-    n_byte_groups = dim // 2
-    fa = float(bias)
-    lo_sum_u16 = 0
-    hi_sum_u16 = 0
-    flush_idx = 0
-    for g in range(n_byte_groups):
-        lo = lut_u8[2 * g, codes_one_vec[2 * g]]
-        hi = lut_u8[2 * g + 1, codes_one_vec[2 * g + 1]]
-        lo_sum_u16 = (int(lo_sum_u16) + int(lo)) & 0xFFFF
-        hi_sum_u16 = (int(hi_sum_u16) + int(hi)) & 0xFFFF
-        flush_idx += 1
-        if flush_idx >= flush_every or g == n_byte_groups - 1:
-            fa += scale * (int(lo_sum_u16) + int(hi_sum_u16))
-            lo_sum_u16 = 0
-            hi_sum_u16 = 0
-            flush_idx = 0
-    return fa * vec_scale
-
-
-def exact_int_score(codes_one_vec: np.ndarray, lut_u8: np.ndarray,
-                    scale: float, bias: float, vec_scale: float) -> float:
-    """Reference: unbounded integer sum, no modular wrap."""
-    dim = codes_one_vec.shape[0]
-    n_byte_groups = dim // 2
-    total = 0
-    for g in range(n_byte_groups):
-        total += int(lut_u8[2 * g, codes_one_vec[2 * g]])
-        total += int(lut_u8[2 * g + 1, codes_one_vec[2 * g + 1]])
-    return (bias + scale * total) * vec_scale
-
-
-# ─── Driver ──────────────────────────────────────────────────────────────────
-
-def run(dim: int, n_vectors: int, max_lut: int, lut_distribution: str = "uniform"):
-    n_byte_groups = dim // 2
-    n_subs = dim
-
-    print(f"\n--- dim={dim} n_byte_groups={n_byte_groups} n_subs={n_subs} max_lut={max_lut} ---")
-    # x86 sum-fits-in-u16 constraint: max_lut * n_byte_groups <= 65535
-    sum_cap = n_byte_groups * max_lut
-    print(f"x86 per-half max sum: {n_byte_groups} * {max_lut} = {sum_cap}"
-          f"  ({'FITS in u16' if sum_cap <= 65535 else 'OVERFLOWS u16 (sum mod 2^16 corrupts result)'})")
-    # ARM per-flush sum: flush_every * (lo+hi cap)
-    arm_per_flush = FLUSH_EVERY * min(2 * max_lut, 255)  # u8 sum capped at 255
-    print(f"ARM per-flush u16 sum: min(2*max_lut, 255) * FLUSH_EVERY = "
-          f"{arm_per_flush}  ({'FITS' if arm_per_flush <= 65535 else 'OVERFLOWS u16'})")
-
-    rng = np.random.RandomState(SEED)
-    # Random codes 0..15
-    codes = rng.randint(0, 16, size=(n_vectors, dim), dtype=np.int8)
-
-    # Generate a per-sub-table LUT. "uniform" = all sub-tables span similar
-    # range (mimics TQ+ output), "skewed" = a few wide + many narrow (mimics
-    # raw GloVe-like distribution).
-    if lut_distribution == "uniform":
-        # Each sub-table: values uniformly distributed up to max_lut.
-        lut_u8 = rng.randint(0, max_lut + 1, size=(n_subs, 16), dtype=np.uint16)
-    elif lut_distribution == "skewed":
-        # 10% sub-tables span full range, 90% span ~10% of full range.
-        spans = np.where(rng.uniform(size=n_subs) < 0.1, max_lut, max_lut // 10 + 1)
-        lut_u8 = np.zeros((n_subs, 16), dtype=np.uint16)
-        for s in range(n_subs):
-            lut_u8[s] = rng.randint(0, spans[s] + 1, size=16)
-    else:
-        raise ValueError(lut_distribution)
-
-    scale = 0.01  # arbitrary; doesn't affect ranking
-    bias = 0.0
-    vec_scales = np.full(n_vectors, 1.0)
-
-    arm = np.zeros(n_vectors)
-    x86 = np.zeros(n_vectors)
-    x86_f = np.zeros(n_vectors)
-    exact = np.zeros(n_vectors)
-    for i in range(n_vectors):
-        c = codes[i]
-        arm[i] = arm_kernel_score(c, lut_u8, scale, bias, vec_scales[i])
-        x86[i] = x86_kernel_score(c, lut_u8, scale, bias, vec_scales[i])
-        x86_f[i] = x86_with_flush_score(c, lut_u8, scale, bias, vec_scales[i])
-        exact[i] = exact_int_score(c, lut_u8, scale, bias, vec_scales[i])
-
-    # Compare each variant to the exact reference.
-    def report(name, arr):
-        diff_count = int(np.sum(np.abs(arr - exact) > 1e-9))
-        max_diff = float(np.max(np.abs(arr - exact)))
-        ranks_arr = np.argsort(-arr)
-        ranks_exact = np.argsort(-exact)
-        # top-K agreement
-        K = 10
-        topk_arr = set(np.argpartition(-arr, K)[:K].tolist())
-        topk_exact = set(np.argpartition(-exact, K)[:K].tolist())
-        overlap = len(topk_arr & topk_exact)
-        print(f"  {name:<16} mismatches={diff_count:>5}/{n_vectors}   "
-              f"max|Δ|={max_diff:.4f}   top-{K} overlap with exact={overlap}/{K}")
-
-    report("exact_int", exact)
-    report("arm", arm)
-    report("x86_current", x86)
-    report("x86_with_flush", x86_f)
-
-
-def main():
-    dim = int(sys.argv[1]) if len(sys.argv) > 1 else 3072
-
-    # Sweep 1: each kernel at the max_lut value where IT would currently operate.
-    print("\n=== current production max_lut per arch ===")
-    n_byte_groups = dim // 2
-    x86_cap = min(127, 65535 // n_byte_groups // 1)  # see search.rs formula (n_byte_groups*2 in denom = n_subs)
-    arm_cap = 127
-    print(f"x86 cap derived from search.rs formula: min(127, 65535/{2*n_byte_groups}) "
-          f"= {min(127, 65535 // (2 * n_byte_groups))}")
-    print(f"ARM cap: 127")
-    run(dim, n_vectors=200, max_lut=min(127, 65535 // (2 * n_byte_groups)), lut_distribution="uniform")
-    # And ARM at its own cap
-    run(dim, n_vectors=200, max_lut=127, lut_distribution="uniform")
-
-    # Sweep 2: force x86 to use the ARM cap (max_lut=127) — does ARM math match
-    # x86_with_flush? Does x86_current overflow?
-    print("\n=== force max_lut=127 on both kernels (high precision regime) ===")
-    run(dim, n_vectors=200, max_lut=127, lut_distribution="uniform")
-
-    # Sweep 3: same at low dim where x86_current doesn't overflow.
-    print("\n=== sanity check: low dim where x86_current is fine ===")
-    run(dim=200, n_vectors=200, max_lut=127, lut_distribution="uniform")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmarks/rabitq_poc/poc.py b/benchmarks/rabitq_poc/poc.py
deleted file mode 100644
index aa5c5867..00000000
--- a/benchmarks/rabitq_poc/poc.py
+++ /dev/null
@@ -1,247 +0,0 @@
-"""
-RaBitQ-style scalar correction on top of turbovec's Lloyd-Max codebook.
-
-Tests whether a per-vector correction scalar — computed at encode time
-and applied at search time — recovers recall lost to systematic bias
-in turbovec's inner-product estimates.
-
-Pipeline (numpy reimplementation of turbovec/src/encode.rs):
-    1. Normalize each data vector v to unit u = v / ||v||
-    2. Rotate: u_rot = R @ u  (R = seeded random orthogonal)
-    3. Quantize: each coord of u_rot -> nearest Lloyd-Max centroid (Beta dist)
-    4. Reconstruct: x_hat = centroids[codes]
-    5. Baseline score: ||v|| * <x_hat, y_rot>
-    6. Corrected score: scalar_v * <x_hat, y_rot>
-       where scalar_v is one of three forms tested.
-
-Two correction forms tested:
-    A. Regression-optimal (JL projection):
-       scalar = ||v|| * cos(u_rot, x_hat) / ||x_hat||
-    B. Paper formula (RaBitQ Section 2.2.3):
-       scalar = ||v|| / (||x_hat|| * cos(u_rot, x_hat))
-
-Runs across 3 datasets x 2 bit widths = 6 operating points, plots
-recall@1@k.
-"""
-
-import json
-import os
-import time
-
-import h5py
-import matplotlib.pyplot as plt
-import numpy as np
-from scipy.stats import beta as beta_dist
-
-DATA_DIR = os.path.expanduser("~/data/py-turboquant")
-RESULTS_DIR = os.path.dirname(__file__)
-SEED = 42
-N_DB = 100_000
-K_VALUES = [1, 2, 4, 8, 16, 32, 64]
-
-
-def lloyd_max_codebook(bits, dim, max_iter=200, tol=1e-12):
-    """Match turbovec/src/codebook.rs — Lloyd-Max on Beta((d-1)/2, (d-1)/2)."""
-    a = (dim - 1) / 2.0
-    n_levels = 1 << bits
-    std_dev = np.sqrt(2.0 * a / ((2.0 * a + 1.0) * 4.0 * a))
-    spread = 3.0 * std_dev
-    centroids = np.linspace(-spread, spread, n_levels, dtype=np.float64)
-
-    for _ in range(max_iter):
-        midpoints = (centroids[:-1] + centroids[1:]) / 2.0
-        edges = np.concatenate(([-1.0], midpoints, [1.0]))
-
-        u_lo = (edges[:-1] + 1.0) / 2.0
-        u_hi = (edges[1:] + 1.0) / 2.0
-        prob = beta_dist.cdf(u_hi, a, a) - beta_dist.cdf(u_lo, a, a)
-
-        new_centroids = np.empty(n_levels, dtype=np.float64)
-        for i in range(n_levels):
-            if prob[i] < 1e-15:
-                new_centroids[i] = centroids[i]
-                continue
-            xs = np.linspace(edges[i], edges[i + 1], 2049)
-            pdf_xs = beta_dist.pdf((xs + 1) / 2, a, a) / 2
-            new_centroids[i] = np.trapz(xs * pdf_xs, xs) / prob[i]
-
-        if np.max(np.abs(centroids - new_centroids)) < tol:
-            centroids = new_centroids
-            break
-        centroids = new_centroids
-
-    boundaries = (centroids[:-1] + centroids[1:]) / 2.0
-    return boundaries.astype(np.float32), centroids.astype(np.float32)
-
-
-def random_rotation(dim, seed):
-    """Deterministic random orthogonal via QR (matches turbovec/src/rotation.rs)."""
-    rng = np.random.RandomState(seed)
-    g = rng.standard_normal((dim, dim)).astype(np.float64)
-    q, r = np.linalg.qr(g)
-    signs = np.sign(np.diag(r))
-    signs[signs == 0] = 1.0
-    q = q * signs
-    return q.astype(np.float32)
-
-
-def encode(vectors, rotation, boundaries, centroids):
-    norms = np.linalg.norm(vectors, axis=1).astype(np.float32)
-    unit = vectors / np.clip(norms[:, None], 1e-10, None)
-    rotated = unit @ rotation.T
-    codes_idx = np.searchsorted(boundaries, rotated)
-    x_hat = centroids[codes_idx]
-    return norms, rotated, x_hat
-
-
-def compute_corrections(rotated, x_hat, norms):
-    inner = np.einsum("nd,nd->n", rotated, x_hat)
-    xh_norm = np.linalg.norm(x_hat, axis=1)
-    cos_uv = inner / np.clip(xh_norm, 1e-10, None)
-    return {
-        "baseline": norms,
-        "form_A_regression": norms * cos_uv / np.clip(xh_norm, 1e-10, None),
-        "form_B_paper": norms / np.clip(xh_norm * cos_uv, 1e-10, None),
-    }, {
-        "cos_mean": float(cos_uv.mean()),
-        "cos_std": float(cos_uv.std()),
-        "xh_norm_mean": float(xh_norm.mean()),
-    }
-
-
-def score_and_topk(query_rot, x_hat_db, scalars, k):
-    raw = x_hat_db @ query_rot.T
-    scored = raw * scalars[:, None]
-    topk = np.argpartition(-scored, k, axis=0)[:k]
-    topk_scores = np.take_along_axis(scored, topk, axis=0)
-    order = np.argsort(-topk_scores, axis=0)
-    return np.take_along_axis(topk, order, axis=0).T
-
-
-def recall_at_1_at_k(true_top1, predicted, k):
-    return float(np.mean([true_top1[i] in predicted[i, :k] for i in range(len(true_top1))]))
-
-
-def load_openai(dim):
-    all_vecs = np.load(os.path.join(DATA_DIR, f"openai-{dim}.npy"))
-    rng = np.random.RandomState(SEED)
-    idx = rng.permutation(len(all_vecs))
-    db = all_vecs[idx[:N_DB]].astype(np.float32)
-    q = all_vecs[idx[N_DB : N_DB + 1_000]].astype(np.float32)
-    db /= np.linalg.norm(db, axis=-1, keepdims=True)
-    q /= np.linalg.norm(q, axis=-1, keepdims=True)
-    return db, q, dim
-
-
-def load_glove():
-    with h5py.File(os.path.join(DATA_DIR, "glove-200-angular.hdf5"), "r") as f:
-        all_train = f["train"][:].astype(np.float32)
-        queries = f["test"][:].astype(np.float32)
-    rng = np.random.RandomState(SEED)
-    idx = rng.choice(len(all_train), N_DB, replace=False)
-    db = all_train[idx]
-    db /= np.linalg.norm(db, axis=-1, keepdims=True)
-    queries /= np.linalg.norm(queries, axis=-1, keepdims=True)
-    return db, queries, 200
-
-
-DATASETS = {
-    "glove-200": load_glove,
-    "openai-1536": lambda: load_openai(1536),
-    "openai-3072": lambda: load_openai(3072),
-}
-
-
-def run(dataset, bit_width):
-    print(f"\n=== {dataset}, {bit_width}-bit, seed={SEED} ===")
-    t0 = time.time()
-    database, queries, dim = DATASETS[dataset]()
-    true_top1 = np.argmax(queries @ database.T, axis=1)
-    print(f"  data + ground truth ({len(queries)} queries x {len(database)} db): {time.time() - t0:.1f}s")
-
-    boundaries, centroids = lloyd_max_codebook(bit_width, dim)
-    rotation = random_rotation(dim, SEED)
-
-    t0 = time.time()
-    norms, rotated, x_hat = encode(database, rotation, boundaries, centroids)
-    print(f"  encode: {time.time() - t0:.1f}s")
-
-    scalars, stats = compute_corrections(rotated, x_hat, norms)
-    print(f"  cos(u, x_hat): mean={stats['cos_mean']:.4f} std={stats['cos_std']:.4f}; ||x_hat|| mean={stats['xh_norm_mean']:.4f}")
-
-    queries_rot = (queries @ rotation.T).astype(np.float32)
-    k_max = max(K_VALUES)
-    recalls = {}
-    for name, s in scalars.items():
-        t0 = time.time()
-        top = score_and_topk(queries_rot, x_hat, s, k_max)
-        recalls[name] = {str(k): round(recall_at_1_at_k(true_top1, top, k), 4) for k in K_VALUES}
-        print(f"  {name:<22} recall@1={recalls[name]['1']:.4f} ({time.time() - t0:.1f}s)")
-
-    return {
-        "dataset": dataset, "dim": dim, "bit_width": bit_width, "seed": SEED,
-        "n_db": len(database), "n_queries": len(queries),
-        "cos_stats": stats, "recall_at_1_at_k": recalls,
-    }
-
-
-def plot(results, out_path):
-    fig, axes = plt.subplots(2, 3, figsize=(15, 8), sharey=False)
-    datasets = ["glove-200", "openai-1536", "openai-3072"]
-    bit_widths = [2, 4]
-    series = [
-        ("baseline", "baseline (||v|| only)", "C0", "-"),
-        ("form_A_regression", "form A (cos / ||x_hat||)", "C1", "--"),
-        ("form_B_paper", "form B (1 / (||x_hat||·cos))", "C2", "-"),
-    ]
-    for row, bits in enumerate(bit_widths):
-        for col, ds in enumerate(datasets):
-            ax = axes[row, col]
-            key = f"{ds}_{bits}bit"
-            if key not in results:
-                ax.set_title(f"{ds} {bits}-bit (missing)")
-                continue
-            r = results[key]["recall_at_1_at_k"]
-            for name, label, color, ls in series:
-                ys = [r[name][str(k)] for k in K_VALUES]
-                ax.plot(K_VALUES, ys, marker="o", label=label, color=color, linestyle=ls)
-            ax.set_xscale("log", base=2)
-            ax.set_xticks(K_VALUES)
-            ax.set_xticklabels([str(k) for k in K_VALUES])
-            ax.set_xlabel("k")
-            ax.set_ylabel(f"recall@1@k ({bits}-bit)")
-            ax.set_title(f"{ds} ({bits}-bit, d={results[key]['dim']})")
-            ax.grid(True, alpha=0.3)
-            if row == 0 and col == 0:
-                ax.legend(loc="lower right", fontsize=9)
-    plt.tight_layout()
-    plt.savefig(out_path, dpi=120)
-    print(f"\nPlot saved to {out_path}")
-
-
-def summary_table(results):
-    """Print a table comparing recall@1 across all 6 cells."""
-    print("\n" + "=" * 80)
-    print(f"{'cell':<28} {'baseline':>10} {'form_A':>10} {'form_B':>10} {'Δ(B-base)':>10}")
-    print("-" * 80)
-    for key in sorted(results.keys()):
-        r = results[key]["recall_at_1_at_k"]
-        b = r["baseline"]["1"]
-        a = r["form_A_regression"]["1"]
-        bb = r["form_B_paper"]["1"]
-        print(f"{key:<28} {b:>10.4f} {a:>10.4f} {bb:>10.4f} {bb - b:>+10.4f}")
-
-
-if __name__ == "__main__":
-    results = {}
-    for dataset in ["glove-200", "openai-1536", "openai-3072"]:
-        for bits in (2, 4):
-            results[f"{dataset}_{bits}bit"] = run(dataset, bits)
-
-    out_json = os.path.join(RESULTS_DIR, "results.json")
-    with open(out_json, "w") as f:
-        json.dump(results, f, indent=2)
-    print(f"\nResults: {out_json}")
-
-    plot(results, os.path.join(RESULTS_DIR, "recall_grid.png"))
-    summary_table(results)
diff --git a/benchmarks/rabitq_poc/poc_apples_to_apples.py b/benchmarks/rabitq_poc/poc_apples_to_apples.py
deleted file mode 100644
index e87011f7..00000000
--- a/benchmarks/rabitq_poc/poc_apples_to_apples.py
+++ /dev/null
@@ -1,147 +0,0 @@
-"""
-Apples-to-apples POC: consumes Rust's EXACT rotation matrix, Lloyd-Max
-boundaries, and centroids — exported by `cargo run --example dump_state` —
-so the only thing left that can differ from the real Rust pipeline is the
-scoring kernel (numpy f32 matmul vs Rust bit-plane SIMD).
-
-If the corrected recall here matches the Rust prototype's recall to within
-the SIMD kernel noise floor, that proves the correction math is implemented
-identically in both pipelines.
-
-If they DON'T match, that's a real signal something is wrong in one of the
-two paths.
-
-Run prerequisites:
-    cargo run -p turbovec --example dump_state --release -- benchmarks/rabitq_poc/rust_state
-"""
-
-import json
-import os
-import time
-
-import h5py
-import numpy as np
-
-DATA_DIR = os.path.expanduser("~/data/py-turboquant")
-HERE = os.path.dirname(__file__)
-STATE_DIR = os.path.join(HERE, "rust_state")
-SEED = 42
-N_DB = 100_000
-K_VALUES = [1, 2, 4, 8, 16, 32, 64]
-
-
-def load_rust_state(dim, bits):
-    """Load rotation + boundaries + centroids dumped by examples/dump_state.rs."""
-    path = os.path.join(STATE_DIR, f"state_d{dim}_b{bits}.bin")
-    raw = np.fromfile(path, dtype="<f4")
-    n_rot = dim * dim
-    n_bnd = (1 << bits) - 1
-    n_cent = 1 << bits
-    assert len(raw) == n_rot + n_bnd + n_cent, f"size mismatch in {path}: {len(raw)} vs {n_rot + n_bnd + n_cent}"
-    rotation = raw[:n_rot].reshape(dim, dim).copy()
-    boundaries = raw[n_rot : n_rot + n_bnd].copy()
-    centroids = raw[n_rot + n_bnd :].copy()
-    return rotation, boundaries, centroids
-
-
-def encode(vectors, rotation, boundaries, centroids):
-    norms = np.linalg.norm(vectors, axis=1).astype(np.float32)
-    unit = vectors / np.clip(norms[:, None], 1e-10, None)
-    rotated = unit @ rotation.T
-    codes_idx = np.searchsorted(boundaries, rotated)
-    x_hat = centroids[codes_idx]
-    return norms, rotated, x_hat
-
-
-def compute_scales(rotated, x_hat, norms):
-    inner = np.einsum("nd,nd->n", rotated, x_hat)
-    inner = np.clip(inner, 1e-10, None)
-    return {
-        "baseline": norms,
-        "form_B_paper": norms / inner,
-    }
-
-
-def score_and_topk(query_rot, x_hat_db, scalars, k):
-    raw = x_hat_db @ query_rot.T
-    scored = raw * scalars[:, None]
-    topk = np.argpartition(-scored, k, axis=0)[:k]
-    topk_scores = np.take_along_axis(scored, topk, axis=0)
-    order = np.argsort(-topk_scores, axis=0)
-    return np.take_along_axis(topk, order, axis=0).T
-
-
-def recall_at_1_at_k(true_top1, predicted, k):
-    return float(np.mean([true_top1[i] in predicted[i, :k] for i in range(len(true_top1))]))
-
-
-def load_openai(dim):
-    all_vecs = np.load(os.path.join(DATA_DIR, f"openai-{dim}.npy"))
-    rng = np.random.RandomState(SEED)
-    idx = rng.permutation(len(all_vecs))
-    db = all_vecs[idx[:N_DB]].astype(np.float32)
-    q = all_vecs[idx[N_DB : N_DB + 1_000]].astype(np.float32)
-    db /= np.linalg.norm(db, axis=-1, keepdims=True)
-    q /= np.linalg.norm(q, axis=-1, keepdims=True)
-    return db, q, dim
-
-
-def load_glove():
-    with h5py.File(os.path.join(DATA_DIR, "glove-200-angular.hdf5"), "r") as f:
-        all_train = f["train"][:].astype(np.float32)
-        queries = f["test"][:].astype(np.float32)
-    rng = np.random.RandomState(SEED)
-    idx = rng.choice(len(all_train), N_DB, replace=False)
-    db = all_train[idx]
-    db /= np.linalg.norm(db, axis=-1, keepdims=True)
-    queries /= np.linalg.norm(queries, axis=-1, keepdims=True)
-    return db, queries, 200
-
-
-DATASETS = {
-    "glove-200":   load_glove,
-    "openai-1536": lambda: load_openai(1536),
-    "openai-3072": lambda: load_openai(3072),
-}
-
-
-def run(dataset_label, bits):
-    dim = {"glove-200": 200, "openai-1536": 1536, "openai-3072": 3072}[dataset_label]
-    print(f"\n=== {dataset_label}, {bits}-bit (apples-to-apples vs Rust) ===")
-
-    t0 = time.time()
-    database, queries, _ = DATASETS[dataset_label]()
-    true_top1 = np.argmax(queries @ database.T, axis=1)
-    print(f"  data + ground truth: {time.time() - t0:.1f}s")
-
-    rotation, boundaries, centroids = load_rust_state(dim, bits)
-    print(f"  loaded Rust state for d={dim} bits={bits}")
-    print(f"    rotation[:3,:3] = {rotation[:3,:3]}")
-    print(f"    centroids = {centroids}")
-
-    norms, rotated, x_hat = encode(database, rotation, boundaries, centroids)
-    print(f"  ||x_hat|| mean = {np.linalg.norm(x_hat, axis=1).mean():.4f}")
-
-    scalars = compute_scales(rotated, x_hat, norms)
-    queries_rot = (queries @ rotation.T).astype(np.float32)
-
-    out = {}
-    k_max = max(K_VALUES)
-    for name, s in scalars.items():
-        top = score_and_topk(queries_rot, x_hat, s, k_max)
-        recalls = {str(k): round(recall_at_1_at_k(true_top1, top, k), 4) for k in K_VALUES}
-        out[name] = recalls
-        print(f"  {name:<14} recall@1 = {recalls['1']:.4f}")
-    return out
-
-
-if __name__ == "__main__":
-    results = {}
-    for dataset in ["glove-200", "openai-1536", "openai-3072"]:
-        for bits in (2, 4):
-            results[f"{dataset}_{bits}bit"] = run(dataset, bits)
-
-    out_json = os.path.join(HERE, "apples_results.json")
-    with open(out_json, "w") as f:
-        json.dump(results, f, indent=2)
-    print(f"\nResults: {out_json}")
diff --git a/benchmarks/rabitq_poc/recall_grid.png b/benchmarks/rabitq_poc/recall_grid.png
deleted file mode 100644
index 39d79d4f..00000000
Binary files a/benchmarks/rabitq_poc/recall_grid.png and /dev/null differ
diff --git a/benchmarks/rabitq_poc/results.json b/benchmarks/rabitq_poc/results.json
deleted file mode 100644
index bd10edd7..00000000
--- a/benchmarks/rabitq_poc/results.json
+++ /dev/null
@@ -1,254 +0,0 @@
-{
-  "glove-200_2bit": {
-    "dataset": "glove-200",
-    "dim": 200,
-    "bit_width": 2,
-    "seed": 42,
-    "n_db": 100000,
-    "n_queries": 10000,
-    "cos_stats": {
-      "cos_mean": 0.9400802254676819,
-      "cos_std": 0.006636226084083319,
-      "xh_norm_mean": 0.9393569827079773
-    },
-    "recall_at_1_at_k": {
-      "baseline": {
-        "1": 0.5117,
-        "2": 0.6657,
-        "4": 0.7916,
-        "8": 0.8855,
-        "16": 0.9466,
-        "32": 0.9771,
-        "64": 0.9913
-      },
-      "form_A_regression": {
-        "1": 0.557,
-        "2": 0.7058,
-        "4": 0.8235,
-        "8": 0.9071,
-        "16": 0.9593,
-        "32": 0.9845,
-        "64": 0.9939
-      },
-      "form_B_paper": {
-        "1": 0.5609,
-        "2": 0.7128,
-        "4": 0.831,
-        "8": 0.9112,
-        "16": 0.9634,
-        "32": 0.9848,
-        "64": 0.9948
-      }
-    }
-  },
-  "glove-200_4bit": {
-    "dataset": "glove-200",
-    "dim": 200,
-    "bit_width": 4,
-    "seed": 42,
-    "n_db": 100000,
-    "n_queries": 10000,
-    "cos_stats": {
-      "cos_mean": 0.9953391551971436,
-      "cos_std": 0.0009531387477181852,
-      "xh_norm_mean": 0.9951646327972412
-    },
-    "recall_at_1_at_k": {
-      "baseline": {
-        "1": 0.8249,
-        "2": 0.9408,
-        "4": 0.9875,
-        "8": 0.9979,
-        "16": 0.9999,
-        "32": 1.0,
-        "64": 1.0
-      },
-      "form_A_regression": {
-        "1": 0.8607,
-        "2": 0.9586,
-        "4": 0.9928,
-        "8": 0.9991,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      },
-      "form_B_paper": {
-        "1": 0.86,
-        "2": 0.9595,
-        "4": 0.9928,
-        "8": 0.9994,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      }
-    }
-  },
-  "openai-1536_2bit": {
-    "dataset": "openai-1536",
-    "dim": 1536,
-    "bit_width": 2,
-    "seed": 42,
-    "n_db": 100000,
-    "n_queries": 1000,
-    "cos_stats": {
-      "cos_mean": 0.9395331144332886,
-      "cos_std": 0.0024137997534126043,
-      "xh_norm_mean": 0.9394603967666626
-    },
-    "recall_at_1_at_k": {
-      "baseline": {
-        "1": 0.862,
-        "2": 0.967,
-        "4": 0.995,
-        "8": 0.999,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      },
-      "form_A_regression": {
-        "1": 0.893,
-        "2": 0.974,
-        "4": 0.996,
-        "8": 1.0,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      },
-      "form_B_paper": {
-        "1": 0.909,
-        "2": 0.978,
-        "4": 0.996,
-        "8": 1.0,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      }
-    }
-  },
-  "openai-1536_4bit": {
-    "dataset": "openai-1536",
-    "dim": 1536,
-    "bit_width": 4,
-    "seed": 42,
-    "n_db": 100000,
-    "n_queries": 1000,
-    "cos_stats": {
-      "cos_mean": 0.9952533841133118,
-      "cos_std": 0.00035732678952626884,
-      "xh_norm_mean": 0.99527508020401
-    },
-    "recall_at_1_at_k": {
-      "baseline": {
-        "1": 0.963,
-        "2": 0.995,
-        "4": 1.0,
-        "8": 1.0,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      },
-      "form_A_regression": {
-        "1": 0.974,
-        "2": 0.996,
-        "4": 1.0,
-        "8": 1.0,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      },
-      "form_B_paper": {
-        "1": 0.974,
-        "2": 0.997,
-        "4": 1.0,
-        "8": 1.0,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      }
-    }
-  },
-  "openai-3072_2bit": {
-    "dataset": "openai-3072",
-    "dim": 3072,
-    "bit_width": 2,
-    "seed": 42,
-    "n_db": 100000,
-    "n_queries": 1000,
-    "cos_stats": {
-      "cos_mean": 0.9394387602806091,
-      "cos_std": 0.001708767144009471,
-      "xh_norm_mean": 0.9393318891525269
-    },
-    "recall_at_1_at_k": {
-      "baseline": {
-        "1": 0.906,
-        "2": 0.98,
-        "4": 0.998,
-        "8": 1.0,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      },
-      "form_A_regression": {
-        "1": 0.917,
-        "2": 0.984,
-        "4": 0.999,
-        "8": 1.0,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      },
-      "form_B_paper": {
-        "1": 0.913,
-        "2": 0.984,
-        "4": 0.999,
-        "8": 1.0,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      }
-    }
-  },
-  "openai-3072_4bit": {
-    "dataset": "openai-3072",
-    "dim": 3072,
-    "bit_width": 4,
-    "seed": 42,
-    "n_db": 100000,
-    "n_queries": 1000,
-    "cos_stats": {
-      "cos_mean": 0.9952409267425537,
-      "cos_std": 0.00025407798239029944,
-      "xh_norm_mean": 0.9952232837677002
-    },
-    "recall_at_1_at_k": {
-      "baseline": {
-        "1": 0.969,
-        "2": 1.0,
-        "4": 1.0,
-        "8": 1.0,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      },
-      "form_A_regression": {
-        "1": 0.98,
-        "2": 1.0,
-        "4": 1.0,
-        "8": 1.0,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      },
-      "form_B_paper": {
-        "1": 0.981,
-        "2": 1.0,
-        "4": 1.0,
-        "8": 1.0,
-        "16": 1.0,
-        "32": 1.0,
-        "64": 1.0
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/run_exact_vs_lut.py b/benchmarks/rabitq_poc/run_exact_vs_lut.py
deleted file mode 100644
index 66925c54..00000000
--- a/benchmarks/rabitq_poc/run_exact_vs_lut.py
+++ /dev/null
@@ -1,117 +0,0 @@
-"""
-Compare Rust LUT kernel vs Rust exact-math kernel on the same index.
-
-Both paths run inside the same Rust pipeline — same rotation, same Lloyd-Max
-codebook, same encoded codes, same per-vector scale. The only difference is
-how the inner product is computed:
-
-    - search()       : bit-plane SIMD popcount through u8 LUTs + calibration
-    - search_exact() : float32 x_hat reconstruction + BLAS matmul
-
-If they differ by more than ~0.1pp on recall, that gap is precisely the
-recall cost of the LUT-quantization noise in the SIMD kernel. Apples-to-
-apples at the implementation level.
-"""
-
-import json
-import os
-import time
-
-import h5py
-import numpy as np
-from turbovec import TurboQuantIndex
-
-DATA_DIR = os.path.expanduser("~/data/py-turboquant")
-HERE = os.path.dirname(__file__)
-SEED = 42
-N_DB = 100_000
-K = 64
-K_VALUES = [1, 2, 4, 8, 16, 32, 64]
-
-
-def load_openai(dim):
-    all_vecs = np.load(os.path.join(DATA_DIR, f"openai-{dim}.npy"))
-    rng = np.random.RandomState(SEED)
-    idx = rng.permutation(len(all_vecs))
-    db = all_vecs[idx[:N_DB]].astype(np.float32)
-    q = all_vecs[idx[N_DB : N_DB + 1_000]].astype(np.float32)
-    db /= np.linalg.norm(db, axis=-1, keepdims=True)
-    q /= np.linalg.norm(q, axis=-1, keepdims=True)
-    return db, q, dim
-
-
-def load_glove():
-    with h5py.File(os.path.join(DATA_DIR, "glove-200-angular.hdf5"), "r") as f:
-        all_train = f["train"][:].astype(np.float32)
-        queries = f["test"][:].astype(np.float32)
-    rng = np.random.RandomState(SEED)
-    idx = rng.choice(len(all_train), N_DB, replace=False)
-    db = all_train[idx]
-    db /= np.linalg.norm(db, axis=-1, keepdims=True)
-    queries /= np.linalg.norm(queries, axis=-1, keepdims=True)
-    return db, queries, 200
-
-
-DATASETS = {
-    "glove-200":   load_glove,
-    "openai-1536": lambda: load_openai(1536),
-    "openai-3072": lambda: load_openai(3072),
-}
-
-
-def recall_at_1_at_k(true_top1, predicted_indices, k):
-    return float(np.mean([true_top1[i] in predicted_indices[i, :k] for i in range(len(true_top1))]))
-
-
-def run(dataset, bits):
-    print(f"\n=== {dataset}, {bits}-bit ===")
-    t0 = time.time()
-    database, queries, dim = DATASETS[dataset]()
-    true_top1 = np.argmax(queries @ database.T, axis=1)
-    print(f"  data + ground truth: {time.time() - t0:.1f}s")
-
-    t0 = time.time()
-    index = TurboQuantIndex(dim, bit_width=bits)
-    index.add(database)
-    index.prepare()
-    print(f"  build + prepare: {time.time() - t0:.1f}s")
-
-    t0 = time.time()
-    _, lut_indices = index.search(queries, k=K)
-    lut_indices = np.array(lut_indices)
-    lut_recalls = {str(k): round(recall_at_1_at_k(true_top1, lut_indices, k), 4) for k in K_VALUES}
-    print(f"  LUT kernel:   recall@1 = {lut_recalls['1']:.4f} ({time.time() - t0:.1f}s)")
-
-    t0 = time.time()
-    _, exact_indices = index.search_exact(queries, k=K)
-    exact_indices = np.array(exact_indices)
-    exact_recalls = {str(k): round(recall_at_1_at_k(true_top1, exact_indices, k), 4) for k in K_VALUES}
-    print(f"  exact kernel: recall@1 = {exact_recalls['1']:.4f} ({time.time() - t0:.1f}s)")
-    print(f"  Δ (exact - LUT) recall@1 = {exact_recalls['1'] - lut_recalls['1']:+.4f}")
-
-    return {
-        "dataset": dataset, "dim": dim, "bit_width": bits, "seed": SEED,
-        "n_db": N_DB, "n_queries": len(queries),
-        "lut_kernel": lut_recalls,
-        "exact_kernel": exact_recalls,
-    }
-
-
-if __name__ == "__main__":
-    results = {}
-    for dataset in ["glove-200", "openai-1536", "openai-3072"]:
-        for bits in (2, 4):
-            results[f"{dataset}_{bits}bit"] = run(dataset, bits)
-
-    out_path = os.path.join(HERE, "exact_vs_lut.json")
-    with open(out_path, "w") as f:
-        json.dump(results, f, indent=2)
-    print(f"\nResults: {out_path}")
-
-    print("\n" + "=" * 72)
-    print(f"{'cell':<22} {'LUT':>10} {'exact':>10} {'Δ exact-LUT':>14}")
-    print("-" * 72)
-    for key, r in results.items():
-        lut = r["lut_kernel"]["1"]
-        exact = r["exact_kernel"]["1"]
-        print(f"{key:<22} {lut:>10.4f} {exact:>10.4f} {exact-lut:>+14.4f}")
diff --git a/benchmarks/rabitq_poc/rust_comparison.png b/benchmarks/rabitq_poc/rust_comparison.png
deleted file mode 100644
index 12011695..00000000
Binary files a/benchmarks/rabitq_poc/rust_comparison.png and /dev/null differ
diff --git a/benchmarks/rabitq_poc/rust_results/recall_d1536_2bit.json b/benchmarks/rabitq_poc/rust_results/recall_d1536_2bit.json
deleted file mode 100644
index 10fc3010..00000000
--- a/benchmarks/rabitq_poc/rust_results/recall_d1536_2bit.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-  "dataset": "openai-1536",
-  "dim": 1536,
-  "bit_width": 2,
-  "faiss_variant": "IndexPQ(m=384, nbits=8)",
-  "seed": 42,
-  "tq_recalls": {
-    "1": 0.906,
-    "2": 0.976,
-    "4": 0.999,
-    "8": 1.0,
-    "16": 1.0,
-    "32": 1.0,
-    "64": 1.0
-  },
-  "faiss_recalls": {
-    "1": 0.872,
-    "2": 0.977,
-    "4": 0.997,
-    "8": 1.0,
-    "16": 1.0,
-    "32": 1.0,
-    "64": 1.0
-  }
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/recall_d1536_4bit.json b/benchmarks/rabitq_poc/rust_results/recall_d1536_4bit.json
deleted file mode 100644
index e4930ae0..00000000
--- a/benchmarks/rabitq_poc/rust_results/recall_d1536_4bit.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-  "dataset": "openai-1536",
-  "dim": 1536,
-  "bit_width": 4,
-  "faiss_variant": "IndexPQ(m=768, nbits=8)",
-  "seed": 42,
-  "tq_recalls": {
-    "1": 0.97,
-    "2": 0.997,
-    "4": 1.0,
-    "8": 1.0,
-    "16": 1.0,
-    "32": 1.0,
-    "64": 1.0
-  },
-  "faiss_recalls": {
-    "1": 0.966,
-    "2": 0.998,
-    "4": 1.0,
-    "8": 1.0,
-    "16": 1.0,
-    "32": 1.0,
-    "64": 1.0
-  }
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/recall_d3072_2bit.json b/benchmarks/rabitq_poc/rust_results/recall_d3072_2bit.json
deleted file mode 100644
index a0207e66..00000000
--- a/benchmarks/rabitq_poc/rust_results/recall_d3072_2bit.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-  "dataset": "openai-3072",
-  "dim": 3072,
-  "bit_width": 2,
-  "faiss_variant": "IndexPQ(m=768, nbits=8)",
-  "seed": 42,
-  "tq_recalls": {
-    "1": 0.924,
-    "2": 0.992,
-    "4": 0.999,
-    "8": 1.0,
-    "16": 1.0,
-    "32": 1.0,
-    "64": 1.0
-  },
-  "faiss_recalls": {
-    "1": 0.912,
-    "2": 0.986,
-    "4": 1.0,
-    "8": 1.0,
-    "16": 1.0,
-    "32": 1.0,
-    "64": 1.0
-  }
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/recall_d3072_4bit.json b/benchmarks/rabitq_poc/rust_results/recall_d3072_4bit.json
deleted file mode 100644
index 4699ea77..00000000
--- a/benchmarks/rabitq_poc/rust_results/recall_d3072_4bit.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-  "dataset": "openai-3072",
-  "dim": 3072,
-  "bit_width": 4,
-  "faiss_variant": "IndexPQ(m=1536, nbits=8)",
-  "seed": 42,
-  "tq_recalls": {
-    "1": 0.98,
-    "2": 1.0,
-    "4": 1.0,
-    "8": 1.0,
-    "16": 1.0,
-    "32": 1.0,
-    "64": 1.0
-  },
-  "faiss_recalls": {
-    "1": 0.972,
-    "2": 0.998,
-    "4": 1.0,
-    "8": 1.0,
-    "16": 1.0,
-    "32": 1.0,
-    "64": 1.0
-  }
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/recall_glove_2bit.json b/benchmarks/rabitq_poc/rust_results/recall_glove_2bit.json
deleted file mode 100644
index f115b270..00000000
--- a/benchmarks/rabitq_poc/rust_results/recall_glove_2bit.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-  "dataset": "glove",
-  "dim": 200,
-  "bit_width": 2,
-  "faiss_variant": "IndexPQ(m=50, nbits=8)",
-  "seed": 42,
-  "tq_recalls": {
-    "1": 0.5524,
-    "2": 0.7071,
-    "4": 0.8273,
-    "8": 0.91,
-    "16": 0.9624,
-    "32": 0.9852,
-    "64": 0.9957
-  },
-  "faiss_recalls": {
-    "1": 0.5643,
-    "2": 0.7188,
-    "4": 0.8446,
-    "8": 0.9252,
-    "16": 0.97,
-    "32": 0.9908,
-    "64": 0.9981
-  }
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/recall_glove_4bit.json b/benchmarks/rabitq_poc/rust_results/recall_glove_4bit.json
deleted file mode 100644
index 1a377862..00000000
--- a/benchmarks/rabitq_poc/rust_results/recall_glove_4bit.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-  "dataset": "glove",
-  "dim": 200,
-  "bit_width": 4,
-  "faiss_variant": "IndexPQ(m=100, nbits=8)",
-  "seed": 42,
-  "tq_recalls": {
-    "1": 0.844,
-    "2": 0.9556,
-    "4": 0.9932,
-    "8": 0.9997,
-    "16": 1.0,
-    "32": 1.0,
-    "64": 1.0
-  },
-  "faiss_recalls": {
-    "1": 0.841,
-    "2": 0.9515,
-    "4": 0.9914,
-    "8": 0.9986,
-    "16": 1.0,
-    "32": 1.0,
-    "64": 1.0
-  }
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/speed_d1536_2bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results/speed_d1536_2bit_arm_mt.json
deleted file mode 100644
index 377e3e10..00000000
--- a/benchmarks/rabitq_poc/rust_results/speed_d1536_2bit_arm_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 2,
-  "arch": "arm",
-  "threading": "mt",
-  "tq_ms_per_query": 0.109,
-  "faiss_ms_per_query": 0.13
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/speed_d1536_2bit_arm_st.json b/benchmarks/rabitq_poc/rust_results/speed_d1536_2bit_arm_st.json
deleted file mode 100644
index 0271049f..00000000
--- a/benchmarks/rabitq_poc/rust_results/speed_d1536_2bit_arm_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 2,
-  "arch": "arm",
-  "threading": "st",
-  "tq_ms_per_query": 1.114,
-  "faiss_ms_per_query": 1.262
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/speed_d1536_4bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results/speed_d1536_4bit_arm_mt.json
deleted file mode 100644
index 1a043225..00000000
--- a/benchmarks/rabitq_poc/rust_results/speed_d1536_4bit_arm_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 4,
-  "arch": "arm",
-  "threading": "mt",
-  "tq_ms_per_query": 0.202,
-  "faiss_ms_per_query": 0.261
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/speed_d1536_4bit_arm_st.json b/benchmarks/rabitq_poc/rust_results/speed_d1536_4bit_arm_st.json
deleted file mode 100644
index 6da77e10..00000000
--- a/benchmarks/rabitq_poc/rust_results/speed_d1536_4bit_arm_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 4,
-  "arch": "arm",
-  "threading": "st",
-  "tq_ms_per_query": 2.079,
-  "faiss_ms_per_query": 2.525
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/speed_d3072_2bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results/speed_d3072_2bit_arm_mt.json
deleted file mode 100644
index f732e3fa..00000000
--- a/benchmarks/rabitq_poc/rust_results/speed_d3072_2bit_arm_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 2,
-  "arch": "arm",
-  "threading": "mt",
-  "tq_ms_per_query": 0.214,
-  "faiss_ms_per_query": 0.267
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/speed_d3072_2bit_arm_st.json b/benchmarks/rabitq_poc/rust_results/speed_d3072_2bit_arm_st.json
deleted file mode 100644
index 1920310f..00000000
--- a/benchmarks/rabitq_poc/rust_results/speed_d3072_2bit_arm_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 2,
-  "arch": "arm",
-  "threading": "st",
-  "tq_ms_per_query": 2.176,
-  "faiss_ms_per_query": 2.512
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/speed_d3072_4bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results/speed_d3072_4bit_arm_mt.json
deleted file mode 100644
index 6585c998..00000000
--- a/benchmarks/rabitq_poc/rust_results/speed_d3072_4bit_arm_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 4,
-  "arch": "arm",
-  "threading": "mt",
-  "tq_ms_per_query": 0.389,
-  "faiss_ms_per_query": 0.478
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results/speed_d3072_4bit_arm_st.json b/benchmarks/rabitq_poc/rust_results/speed_d3072_4bit_arm_st.json
deleted file mode 100644
index 9d58fadb..00000000
--- a/benchmarks/rabitq_poc/rust_results/speed_d3072_4bit_arm_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 4,
-  "arch": "arm",
-  "threading": "st",
-  "tq_ms_per_query": 4.083,
-  "faiss_ms_per_query": 5.06
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_2bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_2bit_arm_mt.json
deleted file mode 100644
index 505f5d9d..00000000
--- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_2bit_arm_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 2,
-  "arch": "arm",
-  "threading": "mt",
-  "tq_ms_per_query": 0.107,
-  "faiss_ms_per_query": 0.126
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_2bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_2bit_arm_st.json
deleted file mode 100644
index 68ffa404..00000000
--- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_2bit_arm_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 2,
-  "arch": "arm",
-  "threading": "st",
-  "tq_ms_per_query": 1.069,
-  "faiss_ms_per_query": 1.241
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_4bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_4bit_arm_mt.json
deleted file mode 100644
index df834670..00000000
--- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_4bit_arm_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 4,
-  "arch": "arm",
-  "threading": "mt",
-  "tq_ms_per_query": 0.192,
-  "faiss_ms_per_query": 0.244
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_4bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_4bit_arm_st.json
deleted file mode 100644
index f06c14d3..00000000
--- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_4bit_arm_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 4,
-  "arch": "arm",
-  "threading": "st",
-  "tq_ms_per_query": 2.017,
-  "faiss_ms_per_query": 2.518
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_2bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_2bit_arm_mt.json
deleted file mode 100644
index 4c5da68e..00000000
--- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_2bit_arm_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 2,
-  "arch": "arm",
-  "threading": "mt",
-  "tq_ms_per_query": 0.21,
-  "faiss_ms_per_query": 0.247
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_2bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_2bit_arm_st.json
deleted file mode 100644
index bec5918a..00000000
--- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_2bit_arm_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 2,
-  "arch": "arm",
-  "threading": "st",
-  "tq_ms_per_query": 2.145,
-  "faiss_ms_per_query": 2.482
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_4bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_4bit_arm_mt.json
deleted file mode 100644
index 363e311f..00000000
--- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_4bit_arm_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 4,
-  "arch": "arm",
-  "threading": "mt",
-  "tq_ms_per_query": 0.386,
-  "faiss_ms_per_query": 0.471
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_4bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_4bit_arm_st.json
deleted file mode 100644
index b3cc5af6..00000000
--- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_4bit_arm_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 4,
-  "arch": "arm",
-  "threading": "st",
-  "tq_ms_per_query": 4.02,
-  "faiss_ms_per_query": 4.996
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_2bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_2bit_arm_mt.json
deleted file mode 100644
index c122dfe3..00000000
--- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_2bit_arm_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 2,
-  "arch": "arm",
-  "threading": "mt",
-  "tq_ms_per_query": 0.106,
-  "faiss_ms_per_query": 0.12
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_2bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_2bit_arm_st.json
deleted file mode 100644
index e4372abb..00000000
--- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_2bit_arm_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 2,
-  "arch": "arm",
-  "threading": "st",
-  "tq_ms_per_query": 1.097,
-  "faiss_ms_per_query": 1.248
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_4bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_4bit_arm_mt.json
deleted file mode 100644
index 5cb4a352..00000000
--- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_4bit_arm_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 4,
-  "arch": "arm",
-  "threading": "mt",
-  "tq_ms_per_query": 0.19,
-  "faiss_ms_per_query": 0.234
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_4bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_4bit_arm_st.json
deleted file mode 100644
index 0ad0cb75..00000000
--- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_4bit_arm_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 4,
-  "arch": "arm",
-  "threading": "st",
-  "tq_ms_per_query": 2.017,
-  "faiss_ms_per_query": 2.479
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_2bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_2bit_arm_mt.json
deleted file mode 100644
index 2e3be1d3..00000000
--- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_2bit_arm_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 2,
-  "arch": "arm",
-  "threading": "mt",
-  "tq_ms_per_query": 0.218,
-  "faiss_ms_per_query": 0.256
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_2bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_2bit_arm_st.json
deleted file mode 100644
index 9e72f306..00000000
--- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_2bit_arm_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 2,
-  "arch": "arm",
-  "threading": "st",
-  "tq_ms_per_query": 2.151,
-  "faiss_ms_per_query": 2.477
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_4bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_4bit_arm_mt.json
deleted file mode 100644
index af45d503..00000000
--- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_4bit_arm_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 4,
-  "arch": "arm",
-  "threading": "mt",
-  "tq_ms_per_query": 0.396,
-  "faiss_ms_per_query": 0.484
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_4bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_4bit_arm_st.json
deleted file mode 100644
index 1402d3e7..00000000
--- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_4bit_arm_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 4,
-  "arch": "arm",
-  "threading": "st",
-  "tq_ms_per_query": 4.032,
-  "faiss_ms_per_query": 5.09
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_2bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_2bit_x86_mt.json
deleted file mode 100644
index d9390c6f..00000000
--- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_2bit_x86_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 2,
-  "arch": "x86",
-  "threading": "mt",
-  "tq_ms_per_query": 0.302,
-  "faiss_ms_per_query": 0.297
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_2bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_2bit_x86_st.json
deleted file mode 100644
index 087a9a88..00000000
--- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_2bit_x86_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 2,
-  "arch": "x86",
-  "threading": "st",
-  "tq_ms_per_query": 1.322,
-  "faiss_ms_per_query": 1.314
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_4bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_4bit_x86_mt.json
deleted file mode 100644
index ddccda71..00000000
--- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_4bit_x86_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 4,
-  "arch": "x86",
-  "threading": "mt",
-  "tq_ms_per_query": 0.567,
-  "faiss_ms_per_query": 0.589
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_4bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_4bit_x86_st.json
deleted file mode 100644
index 7e03309f..00000000
--- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_4bit_x86_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 4,
-  "arch": "x86",
-  "threading": "st",
-  "tq_ms_per_query": 2.533,
-  "faiss_ms_per_query": 2.554
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_2bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_2bit_x86_mt.json
deleted file mode 100644
index 6ab96961..00000000
--- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_2bit_x86_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 2,
-  "arch": "x86",
-  "threading": "mt",
-  "tq_ms_per_query": 0.614,
-  "faiss_ms_per_query": 0.588
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_2bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_2bit_x86_st.json
deleted file mode 100644
index 21eb78ee..00000000
--- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_2bit_x86_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 2,
-  "arch": "x86",
-  "threading": "st",
-  "tq_ms_per_query": 2.56,
-  "faiss_ms_per_query": 2.538
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_4bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_4bit_x86_mt.json
deleted file mode 100644
index 285a30e2..00000000
--- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_4bit_x86_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 4,
-  "arch": "x86",
-  "threading": "mt",
-  "tq_ms_per_query": 1.154,
-  "faiss_ms_per_query": 1.17
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_4bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_4bit_x86_st.json
deleted file mode 100644
index f6474fdb..00000000
--- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_4bit_x86_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 4,
-  "arch": "x86",
-  "threading": "st",
-  "tq_ms_per_query": 4.935,
-  "faiss_ms_per_query": 5.011
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_2bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_2bit_x86_mt.json
deleted file mode 100644
index 1cd89c94..00000000
--- a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_2bit_x86_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 2,
-  "arch": "x86",
-  "threading": "mt",
-  "tq_ms_per_query": 0.305,
-  "faiss_ms_per_query": 0.297
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_2bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_2bit_x86_st.json
deleted file mode 100644
index db4c36f9..00000000
--- a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_2bit_x86_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 2,
-  "arch": "x86",
-  "threading": "st",
-  "tq_ms_per_query": 1.344,
-  "faiss_ms_per_query": 1.272
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_4bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_4bit_x86_mt.json
deleted file mode 100644
index 9fefa5c7..00000000
--- a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_4bit_x86_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 4,
-  "arch": "x86",
-  "threading": "mt",
-  "tq_ms_per_query": 0.569,
-  "faiss_ms_per_query": 0.588
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_4bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_4bit_x86_st.json
deleted file mode 100644
index 6e664784..00000000
--- a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_4bit_x86_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 1536,
-  "bit_width": 4,
-  "arch": "x86",
-  "threading": "st",
-  "tq_ms_per_query": 2.545,
-  "faiss_ms_per_query": 2.566
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_2bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_2bit_x86_mt.json
deleted file mode 100644
index 6ab96961..00000000
--- a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_2bit_x86_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 2,
-  "arch": "x86",
-  "threading": "mt",
-  "tq_ms_per_query": 0.614,
-  "faiss_ms_per_query": 0.588
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_2bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_2bit_x86_st.json
deleted file mode 100644
index 21eb78ee..00000000
--- a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_2bit_x86_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 2,
-  "arch": "x86",
-  "threading": "st",
-  "tq_ms_per_query": 2.56,
-  "faiss_ms_per_query": 2.538
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_4bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_4bit_x86_mt.json
deleted file mode 100644
index 285a30e2..00000000
--- a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_4bit_x86_mt.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 4,
-  "arch": "x86",
-  "threading": "mt",
-  "tq_ms_per_query": 1.154,
-  "faiss_ms_per_query": 1.17
-}
\ No newline at end of file
diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_4bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_4bit_x86_st.json
deleted file mode 100644
index f6474fdb..00000000
--- a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_4bit_x86_st.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "dim": 3072,
-  "bit_width": 4,
-  "arch": "x86",
-  "threading": "st",
-  "tq_ms_per_query": 4.935,
-  "faiss_ms_per_query": 5.011
-}
\ No newline at end of file