diff --git a/benchmarks/rabitq_poc/.gitignore b/benchmarks/rabitq_poc/.gitignore deleted file mode 100644 index 33511340..00000000 --- a/benchmarks/rabitq_poc/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# Reproducible binary blobs — regenerate via `cargo run -p turbovec --example dump_state` -rust_state/ diff --git a/benchmarks/rabitq_poc/apples_results.json b/benchmarks/rabitq_poc/apples_results.json deleted file mode 100644 index d5b04460..00000000 --- a/benchmarks/rabitq_poc/apples_results.json +++ /dev/null @@ -1,122 +0,0 @@ -{ - "glove-200_2bit": { - "baseline": { - "1": 0.5074, - "2": 0.6606, - "4": 0.7896, - "8": 0.8794, - "16": 0.9432, - "32": 0.9759, - "64": 0.9926 - }, - "form_B_paper": { - "1": 0.5544, - "2": 0.7041, - "4": 0.8278, - "8": 0.9097, - "16": 0.9627, - "32": 0.9852, - "64": 0.9963 - } - }, - "glove-200_4bit": { - "baseline": { - "1": 0.8231, - "2": 0.9395, - "4": 0.988, - "8": 0.9983, - "16": 0.9997, - "32": 1.0, - "64": 1.0 - }, - "form_B_paper": { - "1": 0.8577, - "2": 0.9583, - "4": 0.9948, - "8": 0.9998, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - }, - "openai-1536_2bit": { - "baseline": { - "1": 0.872, - "2": 0.963, - "4": 0.999, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_B_paper": { - "1": 0.904, - "2": 0.975, - "4": 0.998, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - }, - "openai-1536_4bit": { - "baseline": { - "1": 0.957, - "2": 0.997, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_B_paper": { - "1": 0.975, - "2": 0.998, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - }, - "openai-3072_2bit": { - "baseline": { - "1": 0.915, - "2": 0.984, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_B_paper": { - "1": 0.922, - "2": 0.993, - "4": 0.999, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - }, - "openai-3072_4bit": { - "baseline": { - "1": 0.973, - "2": 0.999, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_B_paper": { - "1": 0.982, - "2": 0.999, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - } -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/bench_block_skip.py b/benchmarks/rabitq_poc/bench_block_skip.py deleted file mode 100644 index 8c7773e3..00000000 --- a/benchmarks/rabitq_poc/bench_block_skip.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Speed-test the block-level mask early-exit: 100K vectors, only the last 1K -slots allowed. - -100K / 32 per block = 3125 blocks. The last 1K vectors occupy ~32 blocks -at the end. With block-skip active, ~3093 of 3125 blocks (~99%) should be -short-circuited. Without it (main), the masked search pays the full -unmasked SIMD cost. - -Run this script twice — once on each wheel — to see the before/after. -""" - -import os -import time - -import numpy as np -from turbovec import TurboQuantIndex - -DIM = 1536 -N_DB = 100_000 -N_ALLOWED = 1_000 -N_QUERIES = 100 -K = 10 -SEED = 42 -WARMUP = 3 -REPEATS = 5 - - -def main() -> None: - rng = np.random.RandomState(SEED) - database = rng.standard_normal((N_DB, DIM)).astype(np.float32) - database /= np.linalg.norm(database, axis=-1, keepdims=True) - queries = rng.standard_normal((N_QUERIES, DIM)).astype(np.float32) - queries /= np.linalg.norm(queries, axis=-1, keepdims=True) - - # Allow only the last 1K slots. - mask = np.zeros(N_DB, dtype=bool) - mask[N_DB - N_ALLOWED:] = True - - index = TurboQuantIndex(DIM, bit_width=4) - index.add(database) - index.prepare() - - print(f"=== block-skip selectivity benchmark ===") - print(f" db={N_DB}, dim={DIM}, queries={N_QUERIES}, k={K}") - print(f" allowed slots: {N_ALLOWED} (last {N_ALLOWED}; " - f"{N_ALLOWED / N_DB * 100:.1f}% of index)") - print(f" blocks total: {(N_DB + 31) // 32}, " - f"blocks containing allowed slots: ~{(N_ALLOWED + 31) // 32}") - print() - - for _ in range(WARMUP): - index.search(queries, K) - index.search(queries, K, mask=mask) - - unmasked_times = [] - masked_times = [] - for _ in range(REPEATS): - t0 = time.perf_counter() - index.search(queries, K) - unmasked_times.append((time.perf_counter() - t0) * 1000 / N_QUERIES) - - t0 = time.perf_counter() - index.search(queries, K, mask=mask) - masked_times.append((time.perf_counter() - t0) * 1000 / N_QUERIES) - - unmasked_ms = sorted(unmasked_times)[REPEATS // 2] - masked_ms = sorted(masked_times)[REPEATS // 2] - - print(f" unmasked search: {unmasked_ms:.3f} ms / query (median of {REPEATS})") - print(f" masked search: {masked_ms:.3f} ms / query (median of {REPEATS})") - print(f" speedup (unmasked / masked): {unmasked_ms / masked_ms:.2f}x") - - if masked_ms < unmasked_ms * 0.5: - print(" -> block-skip appears active (>2x speedup at 1% selectivity)") - elif masked_ms < unmasked_ms * 0.95: - print(" -> some speedup but not large; block-skip may be partial or " - "post-kernel scan is dominant") - else: - print(" -> no measurable speedup; block-skip likely not active " - "(post-filter only)") - - -if __name__ == "__main__": - main() diff --git a/benchmarks/rabitq_poc/comparison.py b/benchmarks/rabitq_poc/comparison.py deleted file mode 100644 index 50a451b6..00000000 --- a/benchmarks/rabitq_poc/comparison.py +++ /dev/null @@ -1,99 +0,0 @@ -"""Build the final published-vs-POC-vs-Rust-prototype comparison table + plot.""" - -import json -import os - -import matplotlib.pyplot as plt - -HERE = os.path.dirname(__file__) -RESULTS_DIR = os.path.join(HERE, "..", "results") -PROTO_DIR = os.path.join(HERE, "rust_results") -POC_RESULTS = json.load(open(os.path.join(HERE, "results.json"))) - -CELLS = [ - ("glove_2bit", "glove-200_2bit", "GloVe-200 2-bit"), - ("glove_4bit", "glove-200_4bit", "GloVe-200 4-bit"), - ("d1536_2bit", "openai-1536_2bit", "OpenAI-1536 2-bit"), - ("d1536_4bit", "openai-1536_4bit", "OpenAI-1536 4-bit"), - ("d3072_2bit", "openai-3072_2bit", "OpenAI-3072 2-bit"), - ("d3072_4bit", "openai-3072_4bit", "OpenAI-3072 4-bit"), -] -K_VALUES = [1, 2, 4, 8, 16, 32, 64] - - -def load_cell(fkey, pkey): - base = json.load(open(os.path.join(RESULTS_DIR, f"recall_{fkey}.json"))) - proto = json.load(open(os.path.join(PROTO_DIR, f"recall_{fkey}.json"))) - poc = POC_RESULTS[pkey]["recall_at_1_at_k"] - return base, proto, poc - - -def summary_table(): - print(f"{'cell':<22} {'old TV':>8} {'POC pred':>9} {'Rust proto':>11} {'Δ vs old':>10} {'FAISS':>8} {'beats FAISS':>12}") - print("-" * 86) - rows = [] - for fkey, pkey, label in CELLS: - base, proto, poc = load_cell(fkey, pkey) - rb = base["tq_recalls"]["1"] - rp = proto["tq_recalls"]["1"] - rf = base["faiss_recalls"]["1"] - rp_poc = poc["form_B_paper"]["1"] - beats = "YES" if rp > rf else ("tie" if rp == rf else "no") - rows.append((label, rb, rp_poc, rp, rp - rb, rf, beats)) - print(f"{label:<22} {rb:>8.4f} {rp_poc:>9.4f} {rp:>11.4f} {rp-rb:>+10.4f} {rf:>8.4f} {beats:>12}") - return rows - - -def plot(): - fig, axes = plt.subplots(2, 3, figsize=(15, 8), sharey=False) - bit_widths = [2, 4] - datasets = [ - ("glove", "GloVe-200"), - ("d1536", "OpenAI-1536"), - ("d3072", "OpenAI-3072"), - ] - pkey_map = { - ("glove", 2): "glove-200_2bit", - ("glove", 4): "glove-200_4bit", - ("d1536", 2): "openai-1536_2bit", - ("d1536", 4): "openai-1536_4bit", - ("d3072", 2): "openai-3072_2bit", - ("d3072", 4): "openai-3072_4bit", - } - - for row, bits in enumerate(bit_widths): - for col, (ds, label) in enumerate(datasets): - ax = axes[row, col] - fkey = f"{ds}_{bits}bit" - pkey = pkey_map[(ds, bits)] - base, proto, poc = load_cell(fkey, pkey) - - x = K_VALUES - base_y = [base["tq_recalls"][str(k)] for k in x] - proto_y = [proto["tq_recalls"][str(k)] for k in x] - faiss_y = [base["faiss_recalls"][str(k)] for k in x] - poc_y = [poc["form_B_paper"][str(k)] for k in x] - - ax.plot(x, base_y, marker="o", label="turbovec 0.4.3 (baseline)", color="C0", linewidth=2) - ax.plot(x, proto_y, marker="s", label="prototype (Rust, corrected)", color="C3", linewidth=2) - ax.plot(x, poc_y, marker="x", label="POC (numpy, corrected)", color="C2", linestyle="--", alpha=0.6) - ax.plot(x, faiss_y, marker="^", label="FAISS PQ", color="C7", alpha=0.7) - - ax.set_xscale("log", base=2) - ax.set_xticks(K_VALUES) - ax.set_xticklabels([str(k) for k in K_VALUES]) - ax.set_xlabel("k") - ax.set_ylabel(f"recall@1@k") - ax.set_title(f"{label}, {bits}-bit") - ax.grid(True, alpha=0.3) - if row == 0 and col == 0: - ax.legend(loc="lower right", fontsize=8) - - plt.tight_layout() - plt.savefig(os.path.join(HERE, "rust_comparison.png"), dpi=120) - print(f"\nPlot saved to {os.path.join(HERE, 'rust_comparison.png')}") - - -if __name__ == "__main__": - summary_table() - plot() diff --git a/benchmarks/rabitq_poc/exact_vs_lut.json b/benchmarks/rabitq_poc/exact_vs_lut.json deleted file mode 100644 index f39e5e04..00000000 --- a/benchmarks/rabitq_poc/exact_vs_lut.json +++ /dev/null @@ -1,158 +0,0 @@ -{ - "glove-200_2bit": { - "dataset": "glove-200", - "dim": 200, - "bit_width": 2, - "seed": 42, - "n_db": 100000, - "n_queries": 10000, - "lut_kernel": { - "1": 0.5524, - "2": 0.7071, - "4": 0.8273, - "8": 0.91, - "16": 0.9624, - "32": 0.9852, - "64": 0.9957 - }, - "exact_kernel": { - "1": 0.5544, - "2": 0.7041, - "4": 0.8278, - "8": 0.9097, - "16": 0.9627, - "32": 0.9852, - "64": 0.9963 - } - }, - "glove-200_4bit": { - "dataset": "glove-200", - "dim": 200, - "bit_width": 4, - "seed": 42, - "n_db": 100000, - "n_queries": 10000, - "lut_kernel": { - "1": 0.844, - "2": 0.9556, - "4": 0.9932, - "8": 0.9997, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "exact_kernel": { - "1": 0.8577, - "2": 0.9584, - "4": 0.9948, - "8": 0.9998, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - }, - "openai-1536_2bit": { - "dataset": "openai-1536", - "dim": 1536, - "bit_width": 2, - "seed": 42, - "n_db": 100000, - "n_queries": 1000, - "lut_kernel": { - "1": 0.906, - "2": 0.976, - "4": 0.999, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "exact_kernel": { - "1": 0.904, - "2": 0.975, - "4": 0.998, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - }, - "openai-1536_4bit": { - "dataset": "openai-1536", - "dim": 1536, - "bit_width": 4, - "seed": 42, - "n_db": 100000, - "n_queries": 1000, - "lut_kernel": { - "1": 0.97, - "2": 0.997, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "exact_kernel": { - "1": 0.975, - "2": 0.998, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - }, - "openai-3072_2bit": { - "dataset": "openai-3072", - "dim": 3072, - "bit_width": 2, - "seed": 42, - "n_db": 100000, - "n_queries": 1000, - "lut_kernel": { - "1": 0.924, - "2": 0.992, - "4": 0.999, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "exact_kernel": { - "1": 0.922, - "2": 0.993, - "4": 0.999, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - }, - "openai-3072_4bit": { - "dataset": "openai-3072", - "dim": 3072, - "bit_width": 4, - "seed": 42, - "n_db": 100000, - "n_queries": 1000, - "lut_kernel": { - "1": 0.98, - "2": 1.0, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "exact_kernel": { - "1": 0.982, - "2": 1.0, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - } -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/kernel_math_comparison.py b/benchmarks/rabitq_poc/kernel_math_comparison.py deleted file mode 100644 index 11d027f5..00000000 --- a/benchmarks/rabitq_poc/kernel_math_comparison.py +++ /dev/null @@ -1,216 +0,0 @@ -""" -Math-isolated ARM vs x86 kernel comparison — pure synthetic test. - -Generates random codes + random LUTs, runs four scoring variants on the -SAME inputs, compares per-vector outputs. No dependency on rotation, -centroids, or real datasets — the question we're answering ("do ARM and -x86 kernels produce the same scores for the same LUT and codes?") is -purely about kernel arithmetic. - -Variants: - -* `arm` — ARM NEON kernel math: per byte-group, compute `u8_sum = lo+hi` - (capped at 254 with max_lut=127, otherwise wraps modulo 256), accumulate - into u16, flush to f32 every FLUSH_EVERY=256 byte-groups. - -* `x86_current` — AVX2 kernel math: accumulate u8 lookups directly into - i16 lanes (FAISS even/odd-byte interleave), NO periodic flush. Per - nibble-half sum must fit in 16 bits, so effective `max_lut <= - 65535 / n_byte_groups`. The implementation here collapses to: - `lo_sum_u16 + hi_sum_u16` computed mod 2^16 each. - -* `x86_with_flush` — hypothetical fix: same i16 accumulator BUT flushed - to f32 every 256 byte-groups, mirroring ARM. Per-flush max sum is - `flush_every * max_lut = 256 * 127 = 32512 <= 65535`, so this allows - max_lut=127 at any dim. - -* `exact_int` — bit-exact integer reference: pure-Python sum of LUT - lookups in unbounded ints. No modular wrap. The "what should happen - in real arithmetic" baseline. - -Usage: - python3 kernel_math_comparison.py [dim] # default 3072 -""" - -import sys - -import numpy as np - - -FLUSH_EVERY = 256 -SEED = 42 - - -# ─── Kernel simulations ────────────────────────────────────────────────────── - -def arm_kernel_score(codes_one_vec: np.ndarray, lut_u8: np.ndarray, - scale: float, bias: float, vec_scale: float, - flush_every: int = FLUSH_EVERY) -> float: - """NEON math: u8 sum lo+hi per byte-group → widen u16 → periodic flush.""" - dim = codes_one_vec.shape[0] - n_byte_groups = dim // 2 - fa = float(bias) - u16_accum = 0 - flush_idx = 0 - for g in range(n_byte_groups): - lo = lut_u8[2 * g, codes_one_vec[2 * g]] - hi = lut_u8[2 * g + 1, codes_one_vec[2 * g + 1]] - u8_sum = (lo + hi) & 0xFF # explicit u8 modular wrap - u16_accum = (u16_accum + u8_sum) & 0xFFFF - flush_idx += 1 - if flush_idx >= flush_every or g == n_byte_groups - 1: - fa += scale * float(u16_accum) - u16_accum = 0 - flush_idx = 0 - return fa * vec_scale - - -def x86_kernel_score(codes_one_vec: np.ndarray, lut_u8: np.ndarray, - scale: float, bias: float, vec_scale: float) -> float: - """AVX2 math: lo and hi sums accumulated independently into u16 lanes, - no flush. Each sum must fit in 16 bits; above that, modular wrap.""" - dim = codes_one_vec.shape[0] - n_byte_groups = dim // 2 - lo_sum_u16 = 0 - hi_sum_u16 = 0 - for g in range(n_byte_groups): - lo = lut_u8[2 * g, codes_one_vec[2 * g]] - hi = lut_u8[2 * g + 1, codes_one_vec[2 * g + 1]] - lo_sum_u16 = (int(lo_sum_u16) + int(lo)) & 0xFFFF - hi_sum_u16 = (int(hi_sum_u16) + int(hi)) & 0xFFFF - return (bias + scale * (int(lo_sum_u16) + int(hi_sum_u16))) * vec_scale - - -def x86_with_flush_score(codes_one_vec: np.ndarray, lut_u8: np.ndarray, - scale: float, bias: float, vec_scale: float, - flush_every: int = FLUSH_EVERY) -> float: - """Hypothetical fix: same i16 accumulator structure, periodic flush.""" - dim = codes_one_vec.shape[0] - n_byte_groups = dim // 2 - fa = float(bias) - lo_sum_u16 = 0 - hi_sum_u16 = 0 - flush_idx = 0 - for g in range(n_byte_groups): - lo = lut_u8[2 * g, codes_one_vec[2 * g]] - hi = lut_u8[2 * g + 1, codes_one_vec[2 * g + 1]] - lo_sum_u16 = (int(lo_sum_u16) + int(lo)) & 0xFFFF - hi_sum_u16 = (int(hi_sum_u16) + int(hi)) & 0xFFFF - flush_idx += 1 - if flush_idx >= flush_every or g == n_byte_groups - 1: - fa += scale * (int(lo_sum_u16) + int(hi_sum_u16)) - lo_sum_u16 = 0 - hi_sum_u16 = 0 - flush_idx = 0 - return fa * vec_scale - - -def exact_int_score(codes_one_vec: np.ndarray, lut_u8: np.ndarray, - scale: float, bias: float, vec_scale: float) -> float: - """Reference: unbounded integer sum, no modular wrap.""" - dim = codes_one_vec.shape[0] - n_byte_groups = dim // 2 - total = 0 - for g in range(n_byte_groups): - total += int(lut_u8[2 * g, codes_one_vec[2 * g]]) - total += int(lut_u8[2 * g + 1, codes_one_vec[2 * g + 1]]) - return (bias + scale * total) * vec_scale - - -# ─── Driver ────────────────────────────────────────────────────────────────── - -def run(dim: int, n_vectors: int, max_lut: int, lut_distribution: str = "uniform"): - n_byte_groups = dim // 2 - n_subs = dim - - print(f"\n--- dim={dim} n_byte_groups={n_byte_groups} n_subs={n_subs} max_lut={max_lut} ---") - # x86 sum-fits-in-u16 constraint: max_lut * n_byte_groups <= 65535 - sum_cap = n_byte_groups * max_lut - print(f"x86 per-half max sum: {n_byte_groups} * {max_lut} = {sum_cap}" - f" ({'FITS in u16' if sum_cap <= 65535 else 'OVERFLOWS u16 (sum mod 2^16 corrupts result)'})") - # ARM per-flush sum: flush_every * (lo+hi cap) - arm_per_flush = FLUSH_EVERY * min(2 * max_lut, 255) # u8 sum capped at 255 - print(f"ARM per-flush u16 sum: min(2*max_lut, 255) * FLUSH_EVERY = " - f"{arm_per_flush} ({'FITS' if arm_per_flush <= 65535 else 'OVERFLOWS u16'})") - - rng = np.random.RandomState(SEED) - # Random codes 0..15 - codes = rng.randint(0, 16, size=(n_vectors, dim), dtype=np.int8) - - # Generate a per-sub-table LUT. "uniform" = all sub-tables span similar - # range (mimics TQ+ output), "skewed" = a few wide + many narrow (mimics - # raw GloVe-like distribution). - if lut_distribution == "uniform": - # Each sub-table: values uniformly distributed up to max_lut. - lut_u8 = rng.randint(0, max_lut + 1, size=(n_subs, 16), dtype=np.uint16) - elif lut_distribution == "skewed": - # 10% sub-tables span full range, 90% span ~10% of full range. - spans = np.where(rng.uniform(size=n_subs) < 0.1, max_lut, max_lut // 10 + 1) - lut_u8 = np.zeros((n_subs, 16), dtype=np.uint16) - for s in range(n_subs): - lut_u8[s] = rng.randint(0, spans[s] + 1, size=16) - else: - raise ValueError(lut_distribution) - - scale = 0.01 # arbitrary; doesn't affect ranking - bias = 0.0 - vec_scales = np.full(n_vectors, 1.0) - - arm = np.zeros(n_vectors) - x86 = np.zeros(n_vectors) - x86_f = np.zeros(n_vectors) - exact = np.zeros(n_vectors) - for i in range(n_vectors): - c = codes[i] - arm[i] = arm_kernel_score(c, lut_u8, scale, bias, vec_scales[i]) - x86[i] = x86_kernel_score(c, lut_u8, scale, bias, vec_scales[i]) - x86_f[i] = x86_with_flush_score(c, lut_u8, scale, bias, vec_scales[i]) - exact[i] = exact_int_score(c, lut_u8, scale, bias, vec_scales[i]) - - # Compare each variant to the exact reference. - def report(name, arr): - diff_count = int(np.sum(np.abs(arr - exact) > 1e-9)) - max_diff = float(np.max(np.abs(arr - exact))) - ranks_arr = np.argsort(-arr) - ranks_exact = np.argsort(-exact) - # top-K agreement - K = 10 - topk_arr = set(np.argpartition(-arr, K)[:K].tolist()) - topk_exact = set(np.argpartition(-exact, K)[:K].tolist()) - overlap = len(topk_arr & topk_exact) - print(f" {name:<16} mismatches={diff_count:>5}/{n_vectors} " - f"max|Δ|={max_diff:.4f} top-{K} overlap with exact={overlap}/{K}") - - report("exact_int", exact) - report("arm", arm) - report("x86_current", x86) - report("x86_with_flush", x86_f) - - -def main(): - dim = int(sys.argv[1]) if len(sys.argv) > 1 else 3072 - - # Sweep 1: each kernel at the max_lut value where IT would currently operate. - print("\n=== current production max_lut per arch ===") - n_byte_groups = dim // 2 - x86_cap = min(127, 65535 // n_byte_groups // 1) # see search.rs formula (n_byte_groups*2 in denom = n_subs) - arm_cap = 127 - print(f"x86 cap derived from search.rs formula: min(127, 65535/{2*n_byte_groups}) " - f"= {min(127, 65535 // (2 * n_byte_groups))}") - print(f"ARM cap: 127") - run(dim, n_vectors=200, max_lut=min(127, 65535 // (2 * n_byte_groups)), lut_distribution="uniform") - # And ARM at its own cap - run(dim, n_vectors=200, max_lut=127, lut_distribution="uniform") - - # Sweep 2: force x86 to use the ARM cap (max_lut=127) — does ARM math match - # x86_with_flush? Does x86_current overflow? - print("\n=== force max_lut=127 on both kernels (high precision regime) ===") - run(dim, n_vectors=200, max_lut=127, lut_distribution="uniform") - - # Sweep 3: same at low dim where x86_current doesn't overflow. - print("\n=== sanity check: low dim where x86_current is fine ===") - run(dim=200, n_vectors=200, max_lut=127, lut_distribution="uniform") - - -if __name__ == "__main__": - main() diff --git a/benchmarks/rabitq_poc/poc.py b/benchmarks/rabitq_poc/poc.py deleted file mode 100644 index aa5c5867..00000000 --- a/benchmarks/rabitq_poc/poc.py +++ /dev/null @@ -1,247 +0,0 @@ -""" -RaBitQ-style scalar correction on top of turbovec's Lloyd-Max codebook. - -Tests whether a per-vector correction scalar — computed at encode time -and applied at search time — recovers recall lost to systematic bias -in turbovec's inner-product estimates. - -Pipeline (numpy reimplementation of turbovec/src/encode.rs): - 1. Normalize each data vector v to unit u = v / ||v|| - 2. Rotate: u_rot = R @ u (R = seeded random orthogonal) - 3. Quantize: each coord of u_rot -> nearest Lloyd-Max centroid (Beta dist) - 4. Reconstruct: x_hat = centroids[codes] - 5. Baseline score: ||v|| * - 6. Corrected score: scalar_v * - where scalar_v is one of three forms tested. - -Two correction forms tested: - A. Regression-optimal (JL projection): - scalar = ||v|| * cos(u_rot, x_hat) / ||x_hat|| - B. Paper formula (RaBitQ Section 2.2.3): - scalar = ||v|| / (||x_hat|| * cos(u_rot, x_hat)) - -Runs across 3 datasets x 2 bit widths = 6 operating points, plots -recall@1@k. -""" - -import json -import os -import time - -import h5py -import matplotlib.pyplot as plt -import numpy as np -from scipy.stats import beta as beta_dist - -DATA_DIR = os.path.expanduser("~/data/py-turboquant") -RESULTS_DIR = os.path.dirname(__file__) -SEED = 42 -N_DB = 100_000 -K_VALUES = [1, 2, 4, 8, 16, 32, 64] - - -def lloyd_max_codebook(bits, dim, max_iter=200, tol=1e-12): - """Match turbovec/src/codebook.rs — Lloyd-Max on Beta((d-1)/2, (d-1)/2).""" - a = (dim - 1) / 2.0 - n_levels = 1 << bits - std_dev = np.sqrt(2.0 * a / ((2.0 * a + 1.0) * 4.0 * a)) - spread = 3.0 * std_dev - centroids = np.linspace(-spread, spread, n_levels, dtype=np.float64) - - for _ in range(max_iter): - midpoints = (centroids[:-1] + centroids[1:]) / 2.0 - edges = np.concatenate(([-1.0], midpoints, [1.0])) - - u_lo = (edges[:-1] + 1.0) / 2.0 - u_hi = (edges[1:] + 1.0) / 2.0 - prob = beta_dist.cdf(u_hi, a, a) - beta_dist.cdf(u_lo, a, a) - - new_centroids = np.empty(n_levels, dtype=np.float64) - for i in range(n_levels): - if prob[i] < 1e-15: - new_centroids[i] = centroids[i] - continue - xs = np.linspace(edges[i], edges[i + 1], 2049) - pdf_xs = beta_dist.pdf((xs + 1) / 2, a, a) / 2 - new_centroids[i] = np.trapz(xs * pdf_xs, xs) / prob[i] - - if np.max(np.abs(centroids - new_centroids)) < tol: - centroids = new_centroids - break - centroids = new_centroids - - boundaries = (centroids[:-1] + centroids[1:]) / 2.0 - return boundaries.astype(np.float32), centroids.astype(np.float32) - - -def random_rotation(dim, seed): - """Deterministic random orthogonal via QR (matches turbovec/src/rotation.rs).""" - rng = np.random.RandomState(seed) - g = rng.standard_normal((dim, dim)).astype(np.float64) - q, r = np.linalg.qr(g) - signs = np.sign(np.diag(r)) - signs[signs == 0] = 1.0 - q = q * signs - return q.astype(np.float32) - - -def encode(vectors, rotation, boundaries, centroids): - norms = np.linalg.norm(vectors, axis=1).astype(np.float32) - unit = vectors / np.clip(norms[:, None], 1e-10, None) - rotated = unit @ rotation.T - codes_idx = np.searchsorted(boundaries, rotated) - x_hat = centroids[codes_idx] - return norms, rotated, x_hat - - -def compute_corrections(rotated, x_hat, norms): - inner = np.einsum("nd,nd->n", rotated, x_hat) - xh_norm = np.linalg.norm(x_hat, axis=1) - cos_uv = inner / np.clip(xh_norm, 1e-10, None) - return { - "baseline": norms, - "form_A_regression": norms * cos_uv / np.clip(xh_norm, 1e-10, None), - "form_B_paper": norms / np.clip(xh_norm * cos_uv, 1e-10, None), - }, { - "cos_mean": float(cos_uv.mean()), - "cos_std": float(cos_uv.std()), - "xh_norm_mean": float(xh_norm.mean()), - } - - -def score_and_topk(query_rot, x_hat_db, scalars, k): - raw = x_hat_db @ query_rot.T - scored = raw * scalars[:, None] - topk = np.argpartition(-scored, k, axis=0)[:k] - topk_scores = np.take_along_axis(scored, topk, axis=0) - order = np.argsort(-topk_scores, axis=0) - return np.take_along_axis(topk, order, axis=0).T - - -def recall_at_1_at_k(true_top1, predicted, k): - return float(np.mean([true_top1[i] in predicted[i, :k] for i in range(len(true_top1))])) - - -def load_openai(dim): - all_vecs = np.load(os.path.join(DATA_DIR, f"openai-{dim}.npy")) - rng = np.random.RandomState(SEED) - idx = rng.permutation(len(all_vecs)) - db = all_vecs[idx[:N_DB]].astype(np.float32) - q = all_vecs[idx[N_DB : N_DB + 1_000]].astype(np.float32) - db /= np.linalg.norm(db, axis=-1, keepdims=True) - q /= np.linalg.norm(q, axis=-1, keepdims=True) - return db, q, dim - - -def load_glove(): - with h5py.File(os.path.join(DATA_DIR, "glove-200-angular.hdf5"), "r") as f: - all_train = f["train"][:].astype(np.float32) - queries = f["test"][:].astype(np.float32) - rng = np.random.RandomState(SEED) - idx = rng.choice(len(all_train), N_DB, replace=False) - db = all_train[idx] - db /= np.linalg.norm(db, axis=-1, keepdims=True) - queries /= np.linalg.norm(queries, axis=-1, keepdims=True) - return db, queries, 200 - - -DATASETS = { - "glove-200": load_glove, - "openai-1536": lambda: load_openai(1536), - "openai-3072": lambda: load_openai(3072), -} - - -def run(dataset, bit_width): - print(f"\n=== {dataset}, {bit_width}-bit, seed={SEED} ===") - t0 = time.time() - database, queries, dim = DATASETS[dataset]() - true_top1 = np.argmax(queries @ database.T, axis=1) - print(f" data + ground truth ({len(queries)} queries x {len(database)} db): {time.time() - t0:.1f}s") - - boundaries, centroids = lloyd_max_codebook(bit_width, dim) - rotation = random_rotation(dim, SEED) - - t0 = time.time() - norms, rotated, x_hat = encode(database, rotation, boundaries, centroids) - print(f" encode: {time.time() - t0:.1f}s") - - scalars, stats = compute_corrections(rotated, x_hat, norms) - print(f" cos(u, x_hat): mean={stats['cos_mean']:.4f} std={stats['cos_std']:.4f}; ||x_hat|| mean={stats['xh_norm_mean']:.4f}") - - queries_rot = (queries @ rotation.T).astype(np.float32) - k_max = max(K_VALUES) - recalls = {} - for name, s in scalars.items(): - t0 = time.time() - top = score_and_topk(queries_rot, x_hat, s, k_max) - recalls[name] = {str(k): round(recall_at_1_at_k(true_top1, top, k), 4) for k in K_VALUES} - print(f" {name:<22} recall@1={recalls[name]['1']:.4f} ({time.time() - t0:.1f}s)") - - return { - "dataset": dataset, "dim": dim, "bit_width": bit_width, "seed": SEED, - "n_db": len(database), "n_queries": len(queries), - "cos_stats": stats, "recall_at_1_at_k": recalls, - } - - -def plot(results, out_path): - fig, axes = plt.subplots(2, 3, figsize=(15, 8), sharey=False) - datasets = ["glove-200", "openai-1536", "openai-3072"] - bit_widths = [2, 4] - series = [ - ("baseline", "baseline (||v|| only)", "C0", "-"), - ("form_A_regression", "form A (cos / ||x_hat||)", "C1", "--"), - ("form_B_paper", "form B (1 / (||x_hat||·cos))", "C2", "-"), - ] - for row, bits in enumerate(bit_widths): - for col, ds in enumerate(datasets): - ax = axes[row, col] - key = f"{ds}_{bits}bit" - if key not in results: - ax.set_title(f"{ds} {bits}-bit (missing)") - continue - r = results[key]["recall_at_1_at_k"] - for name, label, color, ls in series: - ys = [r[name][str(k)] for k in K_VALUES] - ax.plot(K_VALUES, ys, marker="o", label=label, color=color, linestyle=ls) - ax.set_xscale("log", base=2) - ax.set_xticks(K_VALUES) - ax.set_xticklabels([str(k) for k in K_VALUES]) - ax.set_xlabel("k") - ax.set_ylabel(f"recall@1@k ({bits}-bit)") - ax.set_title(f"{ds} ({bits}-bit, d={results[key]['dim']})") - ax.grid(True, alpha=0.3) - if row == 0 and col == 0: - ax.legend(loc="lower right", fontsize=9) - plt.tight_layout() - plt.savefig(out_path, dpi=120) - print(f"\nPlot saved to {out_path}") - - -def summary_table(results): - """Print a table comparing recall@1 across all 6 cells.""" - print("\n" + "=" * 80) - print(f"{'cell':<28} {'baseline':>10} {'form_A':>10} {'form_B':>10} {'Δ(B-base)':>10}") - print("-" * 80) - for key in sorted(results.keys()): - r = results[key]["recall_at_1_at_k"] - b = r["baseline"]["1"] - a = r["form_A_regression"]["1"] - bb = r["form_B_paper"]["1"] - print(f"{key:<28} {b:>10.4f} {a:>10.4f} {bb:>10.4f} {bb - b:>+10.4f}") - - -if __name__ == "__main__": - results = {} - for dataset in ["glove-200", "openai-1536", "openai-3072"]: - for bits in (2, 4): - results[f"{dataset}_{bits}bit"] = run(dataset, bits) - - out_json = os.path.join(RESULTS_DIR, "results.json") - with open(out_json, "w") as f: - json.dump(results, f, indent=2) - print(f"\nResults: {out_json}") - - plot(results, os.path.join(RESULTS_DIR, "recall_grid.png")) - summary_table(results) diff --git a/benchmarks/rabitq_poc/poc_apples_to_apples.py b/benchmarks/rabitq_poc/poc_apples_to_apples.py deleted file mode 100644 index e87011f7..00000000 --- a/benchmarks/rabitq_poc/poc_apples_to_apples.py +++ /dev/null @@ -1,147 +0,0 @@ -""" -Apples-to-apples POC: consumes Rust's EXACT rotation matrix, Lloyd-Max -boundaries, and centroids — exported by `cargo run --example dump_state` — -so the only thing left that can differ from the real Rust pipeline is the -scoring kernel (numpy f32 matmul vs Rust bit-plane SIMD). - -If the corrected recall here matches the Rust prototype's recall to within -the SIMD kernel noise floor, that proves the correction math is implemented -identically in both pipelines. - -If they DON'T match, that's a real signal something is wrong in one of the -two paths. - -Run prerequisites: - cargo run -p turbovec --example dump_state --release -- benchmarks/rabitq_poc/rust_state -""" - -import json -import os -import time - -import h5py -import numpy as np - -DATA_DIR = os.path.expanduser("~/data/py-turboquant") -HERE = os.path.dirname(__file__) -STATE_DIR = os.path.join(HERE, "rust_state") -SEED = 42 -N_DB = 100_000 -K_VALUES = [1, 2, 4, 8, 16, 32, 64] - - -def load_rust_state(dim, bits): - """Load rotation + boundaries + centroids dumped by examples/dump_state.rs.""" - path = os.path.join(STATE_DIR, f"state_d{dim}_b{bits}.bin") - raw = np.fromfile(path, dtype="n", rotated, x_hat) - inner = np.clip(inner, 1e-10, None) - return { - "baseline": norms, - "form_B_paper": norms / inner, - } - - -def score_and_topk(query_rot, x_hat_db, scalars, k): - raw = x_hat_db @ query_rot.T - scored = raw * scalars[:, None] - topk = np.argpartition(-scored, k, axis=0)[:k] - topk_scores = np.take_along_axis(scored, topk, axis=0) - order = np.argsort(-topk_scores, axis=0) - return np.take_along_axis(topk, order, axis=0).T - - -def recall_at_1_at_k(true_top1, predicted, k): - return float(np.mean([true_top1[i] in predicted[i, :k] for i in range(len(true_top1))])) - - -def load_openai(dim): - all_vecs = np.load(os.path.join(DATA_DIR, f"openai-{dim}.npy")) - rng = np.random.RandomState(SEED) - idx = rng.permutation(len(all_vecs)) - db = all_vecs[idx[:N_DB]].astype(np.float32) - q = all_vecs[idx[N_DB : N_DB + 1_000]].astype(np.float32) - db /= np.linalg.norm(db, axis=-1, keepdims=True) - q /= np.linalg.norm(q, axis=-1, keepdims=True) - return db, q, dim - - -def load_glove(): - with h5py.File(os.path.join(DATA_DIR, "glove-200-angular.hdf5"), "r") as f: - all_train = f["train"][:].astype(np.float32) - queries = f["test"][:].astype(np.float32) - rng = np.random.RandomState(SEED) - idx = rng.choice(len(all_train), N_DB, replace=False) - db = all_train[idx] - db /= np.linalg.norm(db, axis=-1, keepdims=True) - queries /= np.linalg.norm(queries, axis=-1, keepdims=True) - return db, queries, 200 - - -DATASETS = { - "glove-200": load_glove, - "openai-1536": lambda: load_openai(1536), - "openai-3072": lambda: load_openai(3072), -} - - -def run(dataset_label, bits): - dim = {"glove-200": 200, "openai-1536": 1536, "openai-3072": 3072}[dataset_label] - print(f"\n=== {dataset_label}, {bits}-bit (apples-to-apples vs Rust) ===") - - t0 = time.time() - database, queries, _ = DATASETS[dataset_label]() - true_top1 = np.argmax(queries @ database.T, axis=1) - print(f" data + ground truth: {time.time() - t0:.1f}s") - - rotation, boundaries, centroids = load_rust_state(dim, bits) - print(f" loaded Rust state for d={dim} bits={bits}") - print(f" rotation[:3,:3] = {rotation[:3,:3]}") - print(f" centroids = {centroids}") - - norms, rotated, x_hat = encode(database, rotation, boundaries, centroids) - print(f" ||x_hat|| mean = {np.linalg.norm(x_hat, axis=1).mean():.4f}") - - scalars = compute_scales(rotated, x_hat, norms) - queries_rot = (queries @ rotation.T).astype(np.float32) - - out = {} - k_max = max(K_VALUES) - for name, s in scalars.items(): - top = score_and_topk(queries_rot, x_hat, s, k_max) - recalls = {str(k): round(recall_at_1_at_k(true_top1, top, k), 4) for k in K_VALUES} - out[name] = recalls - print(f" {name:<14} recall@1 = {recalls['1']:.4f}") - return out - - -if __name__ == "__main__": - results = {} - for dataset in ["glove-200", "openai-1536", "openai-3072"]: - for bits in (2, 4): - results[f"{dataset}_{bits}bit"] = run(dataset, bits) - - out_json = os.path.join(HERE, "apples_results.json") - with open(out_json, "w") as f: - json.dump(results, f, indent=2) - print(f"\nResults: {out_json}") diff --git a/benchmarks/rabitq_poc/recall_grid.png b/benchmarks/rabitq_poc/recall_grid.png deleted file mode 100644 index 39d79d4f..00000000 Binary files a/benchmarks/rabitq_poc/recall_grid.png and /dev/null differ diff --git a/benchmarks/rabitq_poc/results.json b/benchmarks/rabitq_poc/results.json deleted file mode 100644 index bd10edd7..00000000 --- a/benchmarks/rabitq_poc/results.json +++ /dev/null @@ -1,254 +0,0 @@ -{ - "glove-200_2bit": { - "dataset": "glove-200", - "dim": 200, - "bit_width": 2, - "seed": 42, - "n_db": 100000, - "n_queries": 10000, - "cos_stats": { - "cos_mean": 0.9400802254676819, - "cos_std": 0.006636226084083319, - "xh_norm_mean": 0.9393569827079773 - }, - "recall_at_1_at_k": { - "baseline": { - "1": 0.5117, - "2": 0.6657, - "4": 0.7916, - "8": 0.8855, - "16": 0.9466, - "32": 0.9771, - "64": 0.9913 - }, - "form_A_regression": { - "1": 0.557, - "2": 0.7058, - "4": 0.8235, - "8": 0.9071, - "16": 0.9593, - "32": 0.9845, - "64": 0.9939 - }, - "form_B_paper": { - "1": 0.5609, - "2": 0.7128, - "4": 0.831, - "8": 0.9112, - "16": 0.9634, - "32": 0.9848, - "64": 0.9948 - } - } - }, - "glove-200_4bit": { - "dataset": "glove-200", - "dim": 200, - "bit_width": 4, - "seed": 42, - "n_db": 100000, - "n_queries": 10000, - "cos_stats": { - "cos_mean": 0.9953391551971436, - "cos_std": 0.0009531387477181852, - "xh_norm_mean": 0.9951646327972412 - }, - "recall_at_1_at_k": { - "baseline": { - "1": 0.8249, - "2": 0.9408, - "4": 0.9875, - "8": 0.9979, - "16": 0.9999, - "32": 1.0, - "64": 1.0 - }, - "form_A_regression": { - "1": 0.8607, - "2": 0.9586, - "4": 0.9928, - "8": 0.9991, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_B_paper": { - "1": 0.86, - "2": 0.9595, - "4": 0.9928, - "8": 0.9994, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - } - }, - "openai-1536_2bit": { - "dataset": "openai-1536", - "dim": 1536, - "bit_width": 2, - "seed": 42, - "n_db": 100000, - "n_queries": 1000, - "cos_stats": { - "cos_mean": 0.9395331144332886, - "cos_std": 0.0024137997534126043, - "xh_norm_mean": 0.9394603967666626 - }, - "recall_at_1_at_k": { - "baseline": { - "1": 0.862, - "2": 0.967, - "4": 0.995, - "8": 0.999, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_A_regression": { - "1": 0.893, - "2": 0.974, - "4": 0.996, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_B_paper": { - "1": 0.909, - "2": 0.978, - "4": 0.996, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - } - }, - "openai-1536_4bit": { - "dataset": "openai-1536", - "dim": 1536, - "bit_width": 4, - "seed": 42, - "n_db": 100000, - "n_queries": 1000, - "cos_stats": { - "cos_mean": 0.9952533841133118, - "cos_std": 0.00035732678952626884, - "xh_norm_mean": 0.99527508020401 - }, - "recall_at_1_at_k": { - "baseline": { - "1": 0.963, - "2": 0.995, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_A_regression": { - "1": 0.974, - "2": 0.996, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_B_paper": { - "1": 0.974, - "2": 0.997, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - } - }, - "openai-3072_2bit": { - "dataset": "openai-3072", - "dim": 3072, - "bit_width": 2, - "seed": 42, - "n_db": 100000, - "n_queries": 1000, - "cos_stats": { - "cos_mean": 0.9394387602806091, - "cos_std": 0.001708767144009471, - "xh_norm_mean": 0.9393318891525269 - }, - "recall_at_1_at_k": { - "baseline": { - "1": 0.906, - "2": 0.98, - "4": 0.998, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_A_regression": { - "1": 0.917, - "2": 0.984, - "4": 0.999, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_B_paper": { - "1": 0.913, - "2": 0.984, - "4": 0.999, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - } - }, - "openai-3072_4bit": { - "dataset": "openai-3072", - "dim": 3072, - "bit_width": 4, - "seed": 42, - "n_db": 100000, - "n_queries": 1000, - "cos_stats": { - "cos_mean": 0.9952409267425537, - "cos_std": 0.00025407798239029944, - "xh_norm_mean": 0.9952232837677002 - }, - "recall_at_1_at_k": { - "baseline": { - "1": 0.969, - "2": 1.0, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_A_regression": { - "1": 0.98, - "2": 1.0, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "form_B_paper": { - "1": 0.981, - "2": 1.0, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } - } - } -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/run_exact_vs_lut.py b/benchmarks/rabitq_poc/run_exact_vs_lut.py deleted file mode 100644 index 66925c54..00000000 --- a/benchmarks/rabitq_poc/run_exact_vs_lut.py +++ /dev/null @@ -1,117 +0,0 @@ -""" -Compare Rust LUT kernel vs Rust exact-math kernel on the same index. - -Both paths run inside the same Rust pipeline — same rotation, same Lloyd-Max -codebook, same encoded codes, same per-vector scale. The only difference is -how the inner product is computed: - - - search() : bit-plane SIMD popcount through u8 LUTs + calibration - - search_exact() : float32 x_hat reconstruction + BLAS matmul - -If they differ by more than ~0.1pp on recall, that gap is precisely the -recall cost of the LUT-quantization noise in the SIMD kernel. Apples-to- -apples at the implementation level. -""" - -import json -import os -import time - -import h5py -import numpy as np -from turbovec import TurboQuantIndex - -DATA_DIR = os.path.expanduser("~/data/py-turboquant") -HERE = os.path.dirname(__file__) -SEED = 42 -N_DB = 100_000 -K = 64 -K_VALUES = [1, 2, 4, 8, 16, 32, 64] - - -def load_openai(dim): - all_vecs = np.load(os.path.join(DATA_DIR, f"openai-{dim}.npy")) - rng = np.random.RandomState(SEED) - idx = rng.permutation(len(all_vecs)) - db = all_vecs[idx[:N_DB]].astype(np.float32) - q = all_vecs[idx[N_DB : N_DB + 1_000]].astype(np.float32) - db /= np.linalg.norm(db, axis=-1, keepdims=True) - q /= np.linalg.norm(q, axis=-1, keepdims=True) - return db, q, dim - - -def load_glove(): - with h5py.File(os.path.join(DATA_DIR, "glove-200-angular.hdf5"), "r") as f: - all_train = f["train"][:].astype(np.float32) - queries = f["test"][:].astype(np.float32) - rng = np.random.RandomState(SEED) - idx = rng.choice(len(all_train), N_DB, replace=False) - db = all_train[idx] - db /= np.linalg.norm(db, axis=-1, keepdims=True) - queries /= np.linalg.norm(queries, axis=-1, keepdims=True) - return db, queries, 200 - - -DATASETS = { - "glove-200": load_glove, - "openai-1536": lambda: load_openai(1536), - "openai-3072": lambda: load_openai(3072), -} - - -def recall_at_1_at_k(true_top1, predicted_indices, k): - return float(np.mean([true_top1[i] in predicted_indices[i, :k] for i in range(len(true_top1))])) - - -def run(dataset, bits): - print(f"\n=== {dataset}, {bits}-bit ===") - t0 = time.time() - database, queries, dim = DATASETS[dataset]() - true_top1 = np.argmax(queries @ database.T, axis=1) - print(f" data + ground truth: {time.time() - t0:.1f}s") - - t0 = time.time() - index = TurboQuantIndex(dim, bit_width=bits) - index.add(database) - index.prepare() - print(f" build + prepare: {time.time() - t0:.1f}s") - - t0 = time.time() - _, lut_indices = index.search(queries, k=K) - lut_indices = np.array(lut_indices) - lut_recalls = {str(k): round(recall_at_1_at_k(true_top1, lut_indices, k), 4) for k in K_VALUES} - print(f" LUT kernel: recall@1 = {lut_recalls['1']:.4f} ({time.time() - t0:.1f}s)") - - t0 = time.time() - _, exact_indices = index.search_exact(queries, k=K) - exact_indices = np.array(exact_indices) - exact_recalls = {str(k): round(recall_at_1_at_k(true_top1, exact_indices, k), 4) for k in K_VALUES} - print(f" exact kernel: recall@1 = {exact_recalls['1']:.4f} ({time.time() - t0:.1f}s)") - print(f" Δ (exact - LUT) recall@1 = {exact_recalls['1'] - lut_recalls['1']:+.4f}") - - return { - "dataset": dataset, "dim": dim, "bit_width": bits, "seed": SEED, - "n_db": N_DB, "n_queries": len(queries), - "lut_kernel": lut_recalls, - "exact_kernel": exact_recalls, - } - - -if __name__ == "__main__": - results = {} - for dataset in ["glove-200", "openai-1536", "openai-3072"]: - for bits in (2, 4): - results[f"{dataset}_{bits}bit"] = run(dataset, bits) - - out_path = os.path.join(HERE, "exact_vs_lut.json") - with open(out_path, "w") as f: - json.dump(results, f, indent=2) - print(f"\nResults: {out_path}") - - print("\n" + "=" * 72) - print(f"{'cell':<22} {'LUT':>10} {'exact':>10} {'Δ exact-LUT':>14}") - print("-" * 72) - for key, r in results.items(): - lut = r["lut_kernel"]["1"] - exact = r["exact_kernel"]["1"] - print(f"{key:<22} {lut:>10.4f} {exact:>10.4f} {exact-lut:>+14.4f}") diff --git a/benchmarks/rabitq_poc/rust_comparison.png b/benchmarks/rabitq_poc/rust_comparison.png deleted file mode 100644 index 12011695..00000000 Binary files a/benchmarks/rabitq_poc/rust_comparison.png and /dev/null differ diff --git a/benchmarks/rabitq_poc/rust_results/recall_d1536_2bit.json b/benchmarks/rabitq_poc/rust_results/recall_d1536_2bit.json deleted file mode 100644 index 10fc3010..00000000 --- a/benchmarks/rabitq_poc/rust_results/recall_d1536_2bit.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "dataset": "openai-1536", - "dim": 1536, - "bit_width": 2, - "faiss_variant": "IndexPQ(m=384, nbits=8)", - "seed": 42, - "tq_recalls": { - "1": 0.906, - "2": 0.976, - "4": 0.999, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "faiss_recalls": { - "1": 0.872, - "2": 0.977, - "4": 0.997, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/recall_d1536_4bit.json b/benchmarks/rabitq_poc/rust_results/recall_d1536_4bit.json deleted file mode 100644 index e4930ae0..00000000 --- a/benchmarks/rabitq_poc/rust_results/recall_d1536_4bit.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "dataset": "openai-1536", - "dim": 1536, - "bit_width": 4, - "faiss_variant": "IndexPQ(m=768, nbits=8)", - "seed": 42, - "tq_recalls": { - "1": 0.97, - "2": 0.997, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "faiss_recalls": { - "1": 0.966, - "2": 0.998, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/recall_d3072_2bit.json b/benchmarks/rabitq_poc/rust_results/recall_d3072_2bit.json deleted file mode 100644 index a0207e66..00000000 --- a/benchmarks/rabitq_poc/rust_results/recall_d3072_2bit.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "dataset": "openai-3072", - "dim": 3072, - "bit_width": 2, - "faiss_variant": "IndexPQ(m=768, nbits=8)", - "seed": 42, - "tq_recalls": { - "1": 0.924, - "2": 0.992, - "4": 0.999, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "faiss_recalls": { - "1": 0.912, - "2": 0.986, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/recall_d3072_4bit.json b/benchmarks/rabitq_poc/rust_results/recall_d3072_4bit.json deleted file mode 100644 index 4699ea77..00000000 --- a/benchmarks/rabitq_poc/rust_results/recall_d3072_4bit.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "dataset": "openai-3072", - "dim": 3072, - "bit_width": 4, - "faiss_variant": "IndexPQ(m=1536, nbits=8)", - "seed": 42, - "tq_recalls": { - "1": 0.98, - "2": 1.0, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "faiss_recalls": { - "1": 0.972, - "2": 0.998, - "4": 1.0, - "8": 1.0, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/recall_glove_2bit.json b/benchmarks/rabitq_poc/rust_results/recall_glove_2bit.json deleted file mode 100644 index f115b270..00000000 --- a/benchmarks/rabitq_poc/rust_results/recall_glove_2bit.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "dataset": "glove", - "dim": 200, - "bit_width": 2, - "faiss_variant": "IndexPQ(m=50, nbits=8)", - "seed": 42, - "tq_recalls": { - "1": 0.5524, - "2": 0.7071, - "4": 0.8273, - "8": 0.91, - "16": 0.9624, - "32": 0.9852, - "64": 0.9957 - }, - "faiss_recalls": { - "1": 0.5643, - "2": 0.7188, - "4": 0.8446, - "8": 0.9252, - "16": 0.97, - "32": 0.9908, - "64": 0.9981 - } -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/recall_glove_4bit.json b/benchmarks/rabitq_poc/rust_results/recall_glove_4bit.json deleted file mode 100644 index 1a377862..00000000 --- a/benchmarks/rabitq_poc/rust_results/recall_glove_4bit.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "dataset": "glove", - "dim": 200, - "bit_width": 4, - "faiss_variant": "IndexPQ(m=100, nbits=8)", - "seed": 42, - "tq_recalls": { - "1": 0.844, - "2": 0.9556, - "4": 0.9932, - "8": 0.9997, - "16": 1.0, - "32": 1.0, - "64": 1.0 - }, - "faiss_recalls": { - "1": 0.841, - "2": 0.9515, - "4": 0.9914, - "8": 0.9986, - "16": 1.0, - "32": 1.0, - "64": 1.0 - } -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/speed_d1536_2bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results/speed_d1536_2bit_arm_mt.json deleted file mode 100644 index 377e3e10..00000000 --- a/benchmarks/rabitq_poc/rust_results/speed_d1536_2bit_arm_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 2, - "arch": "arm", - "threading": "mt", - "tq_ms_per_query": 0.109, - "faiss_ms_per_query": 0.13 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/speed_d1536_2bit_arm_st.json b/benchmarks/rabitq_poc/rust_results/speed_d1536_2bit_arm_st.json deleted file mode 100644 index 0271049f..00000000 --- a/benchmarks/rabitq_poc/rust_results/speed_d1536_2bit_arm_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 2, - "arch": "arm", - "threading": "st", - "tq_ms_per_query": 1.114, - "faiss_ms_per_query": 1.262 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/speed_d1536_4bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results/speed_d1536_4bit_arm_mt.json deleted file mode 100644 index 1a043225..00000000 --- a/benchmarks/rabitq_poc/rust_results/speed_d1536_4bit_arm_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 4, - "arch": "arm", - "threading": "mt", - "tq_ms_per_query": 0.202, - "faiss_ms_per_query": 0.261 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/speed_d1536_4bit_arm_st.json b/benchmarks/rabitq_poc/rust_results/speed_d1536_4bit_arm_st.json deleted file mode 100644 index 6da77e10..00000000 --- a/benchmarks/rabitq_poc/rust_results/speed_d1536_4bit_arm_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 4, - "arch": "arm", - "threading": "st", - "tq_ms_per_query": 2.079, - "faiss_ms_per_query": 2.525 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/speed_d3072_2bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results/speed_d3072_2bit_arm_mt.json deleted file mode 100644 index f732e3fa..00000000 --- a/benchmarks/rabitq_poc/rust_results/speed_d3072_2bit_arm_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 2, - "arch": "arm", - "threading": "mt", - "tq_ms_per_query": 0.214, - "faiss_ms_per_query": 0.267 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/speed_d3072_2bit_arm_st.json b/benchmarks/rabitq_poc/rust_results/speed_d3072_2bit_arm_st.json deleted file mode 100644 index 1920310f..00000000 --- a/benchmarks/rabitq_poc/rust_results/speed_d3072_2bit_arm_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 2, - "arch": "arm", - "threading": "st", - "tq_ms_per_query": 2.176, - "faiss_ms_per_query": 2.512 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/speed_d3072_4bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results/speed_d3072_4bit_arm_mt.json deleted file mode 100644 index 6585c998..00000000 --- a/benchmarks/rabitq_poc/rust_results/speed_d3072_4bit_arm_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 4, - "arch": "arm", - "threading": "mt", - "tq_ms_per_query": 0.389, - "faiss_ms_per_query": 0.478 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results/speed_d3072_4bit_arm_st.json b/benchmarks/rabitq_poc/rust_results/speed_d3072_4bit_arm_st.json deleted file mode 100644 index 9d58fadb..00000000 --- a/benchmarks/rabitq_poc/rust_results/speed_d3072_4bit_arm_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 4, - "arch": "arm", - "threading": "st", - "tq_ms_per_query": 4.083, - "faiss_ms_per_query": 5.06 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_2bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_2bit_arm_mt.json deleted file mode 100644 index 505f5d9d..00000000 --- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_2bit_arm_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 2, - "arch": "arm", - "threading": "mt", - "tq_ms_per_query": 0.107, - "faiss_ms_per_query": 0.126 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_2bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_2bit_arm_st.json deleted file mode 100644 index 68ffa404..00000000 --- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_2bit_arm_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 2, - "arch": "arm", - "threading": "st", - "tq_ms_per_query": 1.069, - "faiss_ms_per_query": 1.241 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_4bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_4bit_arm_mt.json deleted file mode 100644 index df834670..00000000 --- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_4bit_arm_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 4, - "arch": "arm", - "threading": "mt", - "tq_ms_per_query": 0.192, - "faiss_ms_per_query": 0.244 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_4bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_4bit_arm_st.json deleted file mode 100644 index f06c14d3..00000000 --- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d1536_4bit_arm_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 4, - "arch": "arm", - "threading": "st", - "tq_ms_per_query": 2.017, - "faiss_ms_per_query": 2.518 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_2bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_2bit_arm_mt.json deleted file mode 100644 index 4c5da68e..00000000 --- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_2bit_arm_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 2, - "arch": "arm", - "threading": "mt", - "tq_ms_per_query": 0.21, - "faiss_ms_per_query": 0.247 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_2bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_2bit_arm_st.json deleted file mode 100644 index bec5918a..00000000 --- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_2bit_arm_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 2, - "arch": "arm", - "threading": "st", - "tq_ms_per_query": 2.145, - "faiss_ms_per_query": 2.482 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_4bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_4bit_arm_mt.json deleted file mode 100644 index 363e311f..00000000 --- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_4bit_arm_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 4, - "arch": "arm", - "threading": "mt", - "tq_ms_per_query": 0.386, - "faiss_ms_per_query": 0.471 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_4bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_4bit_arm_st.json deleted file mode 100644 index b3cc5af6..00000000 --- a/benchmarks/rabitq_poc/rust_results_baseline_speed/speed_d3072_4bit_arm_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 4, - "arch": "arm", - "threading": "st", - "tq_ms_per_query": 4.02, - "faiss_ms_per_query": 4.996 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_2bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_2bit_arm_mt.json deleted file mode 100644 index c122dfe3..00000000 --- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_2bit_arm_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 2, - "arch": "arm", - "threading": "mt", - "tq_ms_per_query": 0.106, - "faiss_ms_per_query": 0.12 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_2bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_2bit_arm_st.json deleted file mode 100644 index e4372abb..00000000 --- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_2bit_arm_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 2, - "arch": "arm", - "threading": "st", - "tq_ms_per_query": 1.097, - "faiss_ms_per_query": 1.248 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_4bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_4bit_arm_mt.json deleted file mode 100644 index 5cb4a352..00000000 --- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_4bit_arm_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 4, - "arch": "arm", - "threading": "mt", - "tq_ms_per_query": 0.19, - "faiss_ms_per_query": 0.234 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_4bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_4bit_arm_st.json deleted file mode 100644 index 0ad0cb75..00000000 --- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d1536_4bit_arm_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 4, - "arch": "arm", - "threading": "st", - "tq_ms_per_query": 2.017, - "faiss_ms_per_query": 2.479 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_2bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_2bit_arm_mt.json deleted file mode 100644 index 2e3be1d3..00000000 --- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_2bit_arm_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 2, - "arch": "arm", - "threading": "mt", - "tq_ms_per_query": 0.218, - "faiss_ms_per_query": 0.256 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_2bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_2bit_arm_st.json deleted file mode 100644 index 9e72f306..00000000 --- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_2bit_arm_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 2, - "arch": "arm", - "threading": "st", - "tq_ms_per_query": 2.151, - "faiss_ms_per_query": 2.477 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_4bit_arm_mt.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_4bit_arm_mt.json deleted file mode 100644 index af45d503..00000000 --- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_4bit_arm_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 4, - "arch": "arm", - "threading": "mt", - "tq_ms_per_query": 0.396, - "faiss_ms_per_query": 0.484 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_4bit_arm_st.json b/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_4bit_arm_st.json deleted file mode 100644 index 1402d3e7..00000000 --- a/benchmarks/rabitq_poc/rust_results_proto_speed/speed_d3072_4bit_arm_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 4, - "arch": "arm", - "threading": "st", - "tq_ms_per_query": 4.032, - "faiss_ms_per_query": 5.09 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_2bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_2bit_x86_mt.json deleted file mode 100644 index d9390c6f..00000000 --- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_2bit_x86_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 2, - "arch": "x86", - "threading": "mt", - "tq_ms_per_query": 0.302, - "faiss_ms_per_query": 0.297 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_2bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_2bit_x86_st.json deleted file mode 100644 index 087a9a88..00000000 --- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_2bit_x86_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 2, - "arch": "x86", - "threading": "st", - "tq_ms_per_query": 1.322, - "faiss_ms_per_query": 1.314 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_4bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_4bit_x86_mt.json deleted file mode 100644 index ddccda71..00000000 --- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_4bit_x86_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 4, - "arch": "x86", - "threading": "mt", - "tq_ms_per_query": 0.567, - "faiss_ms_per_query": 0.589 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_4bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_4bit_x86_st.json deleted file mode 100644 index 7e03309f..00000000 --- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d1536_4bit_x86_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 4, - "arch": "x86", - "threading": "st", - "tq_ms_per_query": 2.533, - "faiss_ms_per_query": 2.554 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_2bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_2bit_x86_mt.json deleted file mode 100644 index 6ab96961..00000000 --- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_2bit_x86_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 2, - "arch": "x86", - "threading": "mt", - "tq_ms_per_query": 0.614, - "faiss_ms_per_query": 0.588 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_2bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_2bit_x86_st.json deleted file mode 100644 index 21eb78ee..00000000 --- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_2bit_x86_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 2, - "arch": "x86", - "threading": "st", - "tq_ms_per_query": 2.56, - "faiss_ms_per_query": 2.538 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_4bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_4bit_x86_mt.json deleted file mode 100644 index 285a30e2..00000000 --- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_4bit_x86_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 4, - "arch": "x86", - "threading": "mt", - "tq_ms_per_query": 1.154, - "faiss_ms_per_query": 1.17 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_4bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_4bit_x86_st.json deleted file mode 100644 index f6474fdb..00000000 --- a/benchmarks/rabitq_poc/x86_results_baseline/speed_d3072_4bit_x86_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 4, - "arch": "x86", - "threading": "st", - "tq_ms_per_query": 4.935, - "faiss_ms_per_query": 5.011 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_2bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_2bit_x86_mt.json deleted file mode 100644 index 1cd89c94..00000000 --- a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_2bit_x86_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 2, - "arch": "x86", - "threading": "mt", - "tq_ms_per_query": 0.305, - "faiss_ms_per_query": 0.297 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_2bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_2bit_x86_st.json deleted file mode 100644 index db4c36f9..00000000 --- a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_2bit_x86_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 2, - "arch": "x86", - "threading": "st", - "tq_ms_per_query": 1.344, - "faiss_ms_per_query": 1.272 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_4bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_4bit_x86_mt.json deleted file mode 100644 index 9fefa5c7..00000000 --- a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_4bit_x86_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 4, - "arch": "x86", - "threading": "mt", - "tq_ms_per_query": 0.569, - "faiss_ms_per_query": 0.588 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_4bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_4bit_x86_st.json deleted file mode 100644 index 6e664784..00000000 --- a/benchmarks/rabitq_poc/x86_results_proto/speed_d1536_4bit_x86_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 1536, - "bit_width": 4, - "arch": "x86", - "threading": "st", - "tq_ms_per_query": 2.545, - "faiss_ms_per_query": 2.566 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_2bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_2bit_x86_mt.json deleted file mode 100644 index 6ab96961..00000000 --- a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_2bit_x86_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 2, - "arch": "x86", - "threading": "mt", - "tq_ms_per_query": 0.614, - "faiss_ms_per_query": 0.588 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_2bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_2bit_x86_st.json deleted file mode 100644 index 21eb78ee..00000000 --- a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_2bit_x86_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 2, - "arch": "x86", - "threading": "st", - "tq_ms_per_query": 2.56, - "faiss_ms_per_query": 2.538 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_4bit_x86_mt.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_4bit_x86_mt.json deleted file mode 100644 index 285a30e2..00000000 --- a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_4bit_x86_mt.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 4, - "arch": "x86", - "threading": "mt", - "tq_ms_per_query": 1.154, - "faiss_ms_per_query": 1.17 -} \ No newline at end of file diff --git a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_4bit_x86_st.json b/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_4bit_x86_st.json deleted file mode 100644 index f6474fdb..00000000 --- a/benchmarks/rabitq_poc/x86_results_proto/speed_d3072_4bit_x86_st.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "dim": 3072, - "bit_width": 4, - "arch": "x86", - "threading": "st", - "tq_ms_per_query": 4.935, - "faiss_ms_per_query": 5.011 -} \ No newline at end of file