diff --git a/records/track_10min_16mb/2026-04-02_Scylla_Byte_Accounting_Audit/README.md b/records/track_10min_16mb/2026-04-02_Scylla_Byte_Accounting_Audit/README.md new file mode 100644 index 0000000000..5d15d5b624 --- /dev/null +++ b/records/track_10min_16mb/2026-04-02_Scylla_Byte_Accounting_Audit/README.md @@ -0,0 +1,108 @@ +## What this is + +An audit of PR #1184's Scylla tokenizer byte accounting. I ran their exact +code with corrected `candidate.meta.npz` and proper val data. The result: +**1.1289 BPB, not 0.9485.** + +The sub-1.0 claim was a measurement error. + +## The bug + +PR #1184's `candidate.meta.npz` has 27 byte-fallback tokens (IDs 75-101) +with `base_bytes=3` instead of 1. These tokens represent single raw bytes +but are counted as 3 bytes each. This overcounts the byte denominator in +the BPB formula, making the score look ~4% better than it actually is. + +This was originally flagged by @dexhunter on PR #1143 (the earlier Scylla +submission, which was closed for exactly this reason). PR #1184 reuses the +same buggy `candidate.meta.npz`. + +## My test + +I ran PR #1184's **exact, unmodified `train_gpt.py`** with every env var +matching their README: + +```bash +VOCAB_SIZE=998 XSA_LAST_N=11 USE_GPTQ=1 GPTQ_RESERVE_MS=9000 +BIGRAM_VOCAB_SIZE=2816 BIGRAM_DIM=112 TTT_ENABLED=0 +``` + +The only change: a corrected `candidate.meta.npz` that fixes the 27 +byte-fallback tokens from `base_bytes=3` to `base_bytes=1`. Everything else +is identical — same architecture, same optimizer, same GPTQ, same data. + +I also retokenized the val shard directly from `docs_selected.jsonl` (no +SP1024 roundtrip) using the official split: shuffle with seed 1337, last +50K docs = val. This produced 62.6M val tokens, close to their 62.4M. 
+ +## Results + +| | PR #1184 (buggy meta) | Me (corrected meta) | +|---|:---:|:---:| +| Val tokens | 62,363,648 | 62,609,408 | +| Val NLL | 1.928 | 1.916 | +| **Sliding BPB** | **0.9491** | **1.1289** | +| Train shards | 194 | 207 | +| Code | Their exact train_gpt.py | Same | + +The NLL is nearly identical (1.928 vs 1.916 — my model is actually slightly +*better*). The entire 0.18 BPB gap comes from different byte accounting. + +## Decomposing the gap + +I decomposed the BPB formula `(NLL / ln2) × (tokens / bytes)` to isolate +what's driving the difference: + +| Factor | BPB impact | +|--------|:---------:| +| Model quality (NLL difference) | +0.010 | +| **Byte accounting difference** | **+0.133** | +| Val text/token boundary differences | +0.037 | +| **Total** | **+0.180** | + +**93% of the gap is byte accounting, not model quality.** The Scylla +tokenizer doesn't make the model predict better; the buggy meta just +makes the denominator bigger, which makes BPB smaller. + +## What this means + +With corrected accounting, the Scylla stack lands at ~1.13 BPB. This is +essentially the same as the SP1024 stack at ~1.11-1.12. The tokenizer +itself provides no meaningful advantage. + +I'd like to flag PR #1184 for review. 
+ +## Corrected files included + +- `correct_meta.npz` — fixes only the 27 byte-fallback tokens, leaves + everything else unchanged (has_leading_space=0, is_boundary=0) +- `retokenize_proper.py` — retokenizes from raw `docs_selected.jsonl` + with proper train/val split (shuffle seed 1337, last 50K = val) + +## Reproducing + +```bash +# Create corrected meta (only byte-fallback fix) +python3 -c " +import numpy as np +orig = np.load('candidate.meta.npz') +bb = orig['base_bytes'].copy() +for i in range(75, 102): bb[i] = 1 +np.savez('correct_meta.npz', **{k: orig[k] for k in orig if k != 'base_bytes'}, base_bytes=bb) +" + +# Retokenize from raw docs (proper split) +python3 retokenize_proper.py + +# Train (PR #1184's exact code + corrected meta) +SEED=1337 VOCAB_SIZE=998 XSA_LAST_N=11 USE_GPTQ=1 GPTQ_RESERVE_MS=9000 \ +BIGRAM_VOCAB_SIZE=2816 BIGRAM_DIM=112 TTT_ENABLED=0 \ +DATA_PATH=./fineweb_scylla TOKENIZER_PATH=./candidate.vocab \ +TOKENIZER_META_PATH=./correct_meta.npz \ +torchrun --standalone --nproc_per_node=8 train_gpt.py +``` + +## Request for review + +@0hq @valerio-oai PR #1184 should be re-evaluated with corrected byte +accounting before being merged. \ No newline at end of file diff --git a/records/track_10min_16mb/2026-04-02_Scylla_Byte_Accounting_Audit/correct_meta.npz b/records/track_10min_16mb/2026-04-02_Scylla_Byte_Accounting_Audit/correct_meta.npz new file mode 100644 index 0000000000..567ed9cf8b Binary files /dev/null and b/records/track_10min_16mb/2026-04-02_Scylla_Byte_Accounting_Audit/correct_meta.npz differ diff --git a/records/track_10min_16mb/2026-04-02_Scylla_Byte_Accounting_Audit/retokenize_proper.py b/records/track_10min_16mb/2026-04-02_Scylla_Byte_Accounting_Audit/retokenize_proper.py new file mode 100644 index 0000000000..9a65d9343e --- /dev/null +++ b/records/track_10min_16mb/2026-04-02_Scylla_Byte_Accounting_Audit/retokenize_proper.py @@ -0,0 +1,157 @@ +"""Proper Scylla retokenization: split train/val from raw docs, no SP1024 roundtrip. 
+Matches the official manifest: shuffle with seed 1337, last 50K docs = val. +Memory-efficient: workers read from disk, not from in-memory lists.""" +import json +import os +import sys +import time +import random +import math +import numpy as np +from pathlib import Path +from multiprocessing import Process, Queue + +HEADER_INTS = 256 +HEADER_MAGIC = 20240520 +HEADER_VERSION = 1 +TOKENS_PER_SHARD = 100_000_000 +NUM_VAL_DOCS = 50000 +SHUFFLE_SEED = 1337 + + +def write_shard(path, tokens): + header = np.zeros(HEADER_INTS, dtype="= TOKENS_PER_SHARD: + shard_tokens = np.array(buffer[:TOKENS_PER_SHARD], dtype=np.uint16) + write_shard( + Path(out_dir) / f"fineweb_train_w{worker_id:02d}_{shard_count:04d}.bin", + shard_tokens, + ) + buffer = buffer[TOKENS_PER_SHARD:] + shard_count += 1 + + if buffer: + write_shard( + Path(out_dir) / f"fineweb_train_w{worker_id:02d}_{shard_count:04d}.bin", + np.array(buffer, dtype=np.uint16), + ) + shard_count += 1 + + result_queue.put((worker_id, shard_count)) + print(f" Worker {worker_id}: {shard_count} shards", flush=True) + + +def main(): + vocab_path = os.environ.get("VOCAB_PATH", "/workspace/candidate.vocab") + docs_path = os.environ.get("DOCS_PATH", "/workspace/raw_docs/datasets/docs_selected.jsonl") + out_dir = Path(os.environ.get("OUTPUT_DIR", "/workspace/fineweb_scylla")) + num_workers = int(os.environ.get("NUM_WORKERS", "16")) + + out_dir.mkdir(parents=True, exist_ok=True) + + # --- Step 1: Count lines and determine split --- + print("Counting docs...", flush=True) + t0 = time.time() + total_lines = 0 + with open(docs_path, "r") as f: + for _ in f: + total_lines += 1 + print(f"Total: {total_lines} docs in {time.time()-t0:.0f}s", flush=True) + + # Shuffle indices (matching official manifest) + print(f"Shuffling with seed {SHUFFLE_SEED}...", flush=True) + indices = list(range(total_lines)) + random.seed(SHUFFLE_SEED) + random.shuffle(indices) + + val_indices = set(indices[-NUM_VAL_DOCS:]) + train_indices = indices[:-NUM_VAL_DOCS] + 
print(f"Train: {len(train_indices)} docs, Val: {len(val_indices)} docs", flush=True) + + # --- Step 2: Tokenize val (single process) --- + print("Tokenizing val docs...", flush=True) + import tokenmonster + vocab = tokenmonster.load(vocab_path) + val_tokens = [] + with open(docs_path, "r") as f: + for line_num, line in enumerate(f): + if line_num not in val_indices: + continue + doc = json.loads(line) + text = doc.get("text", "") + if text: + val_tokens.extend(vocab.tokenize(text)) + write_shard(out_dir / "fineweb_val_000000.bin", np.array(val_tokens, dtype=np.uint16)) + print(f"Val: {len(val_tokens)} tokens", flush=True) + del val_tokens, vocab + + # --- Step 3: Split train indices into contiguous chunks for workers --- + # Sort train indices so each worker processes docs in file order (fast sequential read) + train_indices.sort() + chunk_size = math.ceil(len(train_indices) / num_workers) + chunks = [train_indices[i*chunk_size:(i+1)*chunk_size] for i in range(num_workers)] + del train_indices + + # --- Step 4: Launch parallel workers --- + print(f"Tokenizing train with {num_workers} workers...", flush=True) + t0 = time.time() + result_queue = Queue() + workers = [] + for i in range(num_workers): + p = Process(target=worker_fn, args=(i, chunks[i], docs_path, vocab_path, str(out_dir), result_queue)) + p.start() + workers.append(p) + + for p in workers: + p.join() + + results = [] + while not result_queue.empty(): + results.append(result_queue.get()) + results.sort() + total_shards = sum(r[1] for r in results) + print(f"Workers done: {total_shards} shards in {time.time()-t0:.0f}s", flush=True) + + # --- Step 5: Rename to sequential --- + shard_files = [] + for wid in range(num_workers): + worker_files = sorted(out_dir.glob(f"fineweb_train_w{wid:02d}_*.bin")) + shard_files.extend(worker_files) + + for idx, f in enumerate(shard_files): + f.rename(f.parent / f"fineweb_train_{idx:06d}.bin") + + print(f"Done! 
{len(shard_files)} train + 1 val shards", flush=True) + + +if __name__ == "__main__": + main()