diff --git a/bench/2026-05-25_ee_n_sweep/PLAN.md b/bench/2026-05-25_ee_n_sweep/PLAN.md new file mode 100644 index 000000000..5828cce6c --- /dev/null +++ b/bench/2026-05-25_ee_n_sweep/PLAN.md @@ -0,0 +1,131 @@ +# ee N-sweep Plan: baseline / ee3 / ee5 / ee7 + +Scope: NIAH @ 32K/64K/128K + 5-client multi-turn bandit session, all four conditions. +Goal: find the smallest N where quality and accept_rate hold vs ee7. + +## Trigger + +User says "GPU is free, run the ee_n sweep." + +Pre-flight GPU check: + + flock -n /tmp/lucebox-gpu.lock echo ok + +Exit 0 means GPU is free. Exit 1 means another process holds the lock — wait or +ask the user. + +## Pre-flight: generate NIAH case files (once, CPU only) + +Run once before the NIAH sweep. Requires `transformers` and `Qwen3-0.6B` tokenizer. + + python3 pflash/tests/niah_gen.py --context 32768 --n 3 -o /tmp/niah_32768.jsonl + python3 pflash/tests/niah_gen.py --context 65536 --n 3 -o /tmp/niah_65536.jsonl + python3 pflash/tests/niah_gen.py --context 131072 --n 3 -o /tmp/niah_131072.jsonl + +Check niah_gen.py's flags first — --context / --n / -o are the expected interface +but verify before running if in doubt: + + python3 pflash/tests/niah_gen.py --help + +## Commands (literal copy-paste, run from worktree root) + + cd /home/peppi/Dev/lucebox-hub/.claude/worktrees/drafter-fastpath + + # Step 1: NIAH sweep (~50 min total, serialized) + dflash/bench/run_ee_n_sweep.sh + + # Step 2: multi-client (~140 min total, serialized) + dflash/bench/run_ee_n_multiclient.sh + +Both scripts accept an optional output_dir as $1. 
+ +## Expected output layout + + dflash/bench/results/2026-05-25_ee_n_sweep/ + raw_results.json + SUMMARY.md # written by the sweep script + baseline_32768_case{0,1,2}_server.log + baseline_65536_case{0,1,2}_server.log + baseline_131072_case{0,1,2}_server.log + ee3_*.log ee5_*.log ee7_*.log + + .claude/worktrees/drafter-fastpath/bench/results/2026-05-25_ee_n_multiclient/ + baseline/ + claude_code.csv claude_code.log claude_code_server.log + hermes.csv hermes.log hermes_server.log + opencode.csv opencode.log opencode_server.log + pi.csv pi.log pi_server.log + codex.csv codex.log codex_server.log + ee3/ ee5/ ee7/ (same structure) + +## Decision gate + +Smallest N where ALL hold vs ee7: + +1. NIAH @ 32K and 64K: within +-1 needle (e.g. 2/3 or 3/3 both acceptable) +2. accept_rate across 5 clients: mean within +-2 pp of ee7 +3. drafter wall: <= ee7 wall at each context (smaller N must be faster) +4. No crashes: zero ggml_view_3d asserts, zero server OOM + +Outcome mapping: +- ee3 passes all -> propose ee3 as new production default (follow-up PR after #274 merges) +- ee5 passes, ee3 fails -> ee5 +- neither passes -> ee7 stays default, close the N-reduction investigation + +## Estimated cost + +- GPU wall: ~3 hr serialized (NIAH 50 min + multi-client 140 min) +- Disk: ~50 MB +- Compute cost: $0 (local RTX 3090) + +## Hard stops + +- Any condition raises ggml_view_3d assert -> STOP, Bug #42 regressed, file issue +- flock wait exceeds 30 min -> STOP, something holds the GPU lock unexpectedly +- A condition's NIAH drops to 0/3 at 32K or 64K -> STOP, do not run further +- Server OOM on baseline (no early-exit) at 128K -> expected if VRAM too tight; + note it and continue with ee conditions + +## Out of scope (deferred) + +- 1K-16K NIAH: already 3/3 on ee7 per 2026-05-21_ee7_broad results; ee3/ee5 speedup + at short context is negligible, gate is long-context quality +- ee10, ee14, ee2: user explicitly scoped to baseline + ee3 + ee5 + ee7 +- Cross-family drafter (SmolLM2): 
loader-ready but kernel not generalized yet + +## Flag mismatch notes (for next session) + +### NIAH script (run_niah_ee7_longctx.py) + +The existing `run_niah_ee7_longctx.py` only understands conditions "baseline", +"ee14", "ee7" — it hardcodes env var logic for those three. It does NOT accept +"ee3" or "ee5". + +Resolution: the new `run_ee_n_sweep_niah.py` script was written for this sweep. +It takes N for each condition from its CONDITION_SPECS dict (baseline/ee3/ee5/ee7) and injects the env vars +directly. Do NOT call run_niah_ee7_longctx.py for the N-sweep. + +### bandit-session required flags + +`client_test_runner.py bandit-session` requires --target, --draft, --bin (no +defaults). The multi-client script passes all three hardcoded to: + --target /home/peppi/models/qwen3.6-27b-q4km/Qwen3.6-27B-Q4_K_M.gguf + --draft /home/peppi/models/Qwen3-0.6B-BF16.gguf + --bin dflash/build/dflash_server + +If model paths have changed, edit the TARGET / DRAFTER / BIN vars at the +top of run_ee_n_multiclient.sh before running. + +### niah_gen.py interface + +Verify niah_gen.py flags before generating cases: + python3 pflash/tests/niah_gen.py --help + +The expected interface is --context / --n / -o. If the flags differ, adjust +the generate commands above. + +### early-exit env vars for baseline in multi-client script + +The script exports PFLASH_DRAFTER_EARLY_EXIT_N / PFLASH_DRAFTER_SCORE_LAYERS before each +run. For baseline (N=0) it unsets both instead, so the server runs the full drafter +even if the calling shell had them set. diff --git a/dflash/README.md b/dflash/README.md index 35bc2a2b7..03e9ee1c6 100644 --- a/dflash/README.md +++ b/dflash/README.md @@ -353,6 +353,22 @@ python3 scripts/bench_llm.py # HE + GSM8K + Math python3 scripts/bench_he.py --n-gen 256 --ddtree-budget 22 # minimal HE bench ``` +**Early-exit drafter (requires PR #274 — `PFLASH_DRAFTER_EARLY_EXIT_N`):** + +Truncate the drafter forward at layer N instead of running all 28 layers. 
N=3 is the validated production default on RTX 3090 + Qwen3-0.6B-BF16:
+
+```bash
+PFLASH_DRAFTER_EARLY_EXIT_N=3 PFLASH_DRAFTER_SCORE_LAYERS=3 \
+    build/dflash_server ...
+```
+
+Headline numbers vs baseline (RTX 3090, Q4_K_M target, 0.6B-BF16 drafter):
+- 6.9× drafter speedup at 32K, 24.3× at 128K
+- accept_rate delta vs ee7: +1.2 pp (within ±2 pp gate across all 5 clients)
+- NIAH 3/3 at 32K, 64K, 128K (Bug #42 fix included)
+
+Reproduce: `dflash/bench/run_ee_n_sweep.sh` (NIAH N-sweep) and `dflash/bench/run_ee_n_multiclient.sh` (5-client accept_rate comparison).
+
 **Long-context mode (up to 256K):**
 ```bash
 DFLASH27B_KV_TQ3=1 DFLASH27B_PREFILL_UBATCH=16 \
diff --git a/dflash/bench/run_ee_n_multiclient.sh b/dflash/bench/run_ee_n_multiclient.sh
new file mode 100755
index 000000000..35b51e8eb
--- /dev/null
+++ b/dflash/bench/run_ee_n_multiclient.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+# Multi-client bandit-session × {baseline, ee3, ee5, ee7} × {claude_code, hermes, opencode, pi, codex}
+# Each server boot is flock-serialized on /tmp/lucebox-gpu.lock.
+#
+# Usage: dflash/bench/run_ee_n_multiclient.sh [output_dir]
+#
+# Exit status: 0 if every (condition, client) run succeeded, 1 otherwise.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+WORKTREE="/home/peppi/Dev/lucebox-hub/.claude/worktrees/harness-adapters"
+DRIVER="$WORKTREE/harness/client_test_runner.py"
+
+TARGET="/home/peppi/models/qwen3.6-27b-q4km/Qwen3.6-27B-Q4_K_M.gguf"
+DRAFTER="/home/peppi/models/Qwen3-0.6B-BF16.gguf"
+BIN="$ROOT/dflash/build/dflash_server"
+
+OUTDIR="${1:-$ROOT/bench/results/2026-05-25_ee_n_multiclient}"
+mkdir -p "$OUTDIR"
+
+# Verify driver exists.
+if [[ ! -f "$DRIVER" ]]; then
+    echo "ERROR: driver not found: $DRIVER" >&2
+    echo "Check that the harness-adapters worktree is checked out." >&2
+    exit 1
+fi
+
+# Fail before waiting on the GPU lock if the server binary or a model is missing.
+if [[ ! -x "$BIN" ]]; then
+    echo "ERROR: server binary not found or not executable: $BIN" >&2
+    exit 1
+fi
+for model in "$TARGET" "$DRAFTER"; do
+    if [[ ! -f "$model" ]]; then
+        echo "ERROR: model file not found: $model" >&2
+        exit 1
+    fi
+done
+
+CLIENTS=(claude_code hermes opencode pi codex)
+# format: name:EARLY_EXIT_N:SCORE_LAYERS (0 means unset -> full drafter layers)
+CONDITIONS=("baseline:0:0" "ee3:3:3" "ee5:5:5" "ee7:7:7")
+
+# DFLASH_SERVER_BIN: overrides harness default cpp binary path.
+# PYTHONPATH: needed for 'from harness.metrics_parser import ...' inside driver.
+export DFLASH_SERVER_BIN="$BIN"
+export PYTHONPATH="$WORKTREE${PYTHONPATH:+:$PYTHONPATH}"
+
+# Touched before every run so only a server log written after that point is captured.
+run_marker="$(mktemp)"
+trap 'rm -f "$run_marker"' EXIT
+
+failures=0
+
+echo "=== ee N-sweep multi-client start ($(date)) ==="
+
+for cond_spec in "${CONDITIONS[@]}"; do
+    IFS=: read -r name early_n score_n <<< "$cond_spec"
+
+    cond_dir="$OUTDIR/$name"
+    mkdir -p "$cond_dir"
+
+    # Early-exit env vars depend only on the condition (unset for baseline).
+    if [[ "$early_n" != "0" ]]; then
+        export PFLASH_DRAFTER_EARLY_EXIT_N="$early_n"
+        export PFLASH_DRAFTER_SCORE_LAYERS="$score_n"
+    else
+        unset PFLASH_DRAFTER_EARLY_EXIT_N PFLASH_DRAFTER_SCORE_LAYERS 2>/dev/null || true
+    fi
+
+    for client in "${CLIENTS[@]}"; do
+        echo "=== $name x $client ($(date)) ==="
+
+        touch "$run_marker"
+        if ! flock -w 1800 /tmp/lucebox-gpu.lock \
+            python3 "$DRIVER" bandit-session \
+                --client "$client" \
+                --turns 3 \
+                --target "$TARGET" \
+                --draft "$DRAFTER" \
+                --bin "$BIN" \
+                --output "$cond_dir/${client}.csv" \
+                2>&1 | tee "$cond_dir/${client}.log"; then
+            echo "FAIL: $name x $client (see $cond_dir/${client}.log)"
+            failures=$((failures + 1))
+        fi
+
+        # Capture the server log only if one was written since this run started;
+        # an older log in the evidence dir belongs to some other run.
+        latest_server_log=""
+        for candidate in "$WORKTREE"/dflash/bench/results/*_adaptive_evidence/server.log; do
+            [[ -f "$candidate" && "$candidate" -nt "$run_marker" ]] || continue
+            if [[ -z "$latest_server_log" || "$candidate" -nt "$latest_server_log" ]]; then
+                latest_server_log="$candidate"
+            fi
+        done
+        if [[ -n "$latest_server_log" ]]; then
+            cp "$latest_server_log" "$cond_dir/${client}_server.log"
+        fi
+    done
+done
+
+if (( failures > 0 )); then
+    echo "=== multi-client finished with $failures failed run(s); see logs under $OUTDIR ($(date)) ===" >&2
+    exit 1
+fi
+
+echo "=== multi-client done. 
Results under $OUTDIR ($(date)) ==="
diff --git a/dflash/bench/run_ee_n_sweep.sh b/dflash/bench/run_ee_n_sweep.sh
new file mode 100755
index 000000000..d454471a9
--- /dev/null
+++ b/dflash/bench/run_ee_n_sweep.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# N-sweep NIAH bench: baseline + ee3 + ee5 + ee7 @ 32K / 64K / 128K
+#
+# The whole sweep runs under a single hold of /tmp/lucebox-gpu.lock; the Python
+# driver then boots its servers one at a time (one per case).
+# Pre-flight: NIAH case files must exist under CASES_DIR (see PLAN.md for gen command).
+#
+# Usage: dflash/bench/run_ee_n_sweep.sh [output_dir] [cases_dir]
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+OUTDIR="${1:-$ROOT/dflash/bench/results/2026-05-25_ee_n_sweep}"
+CASES_DIR="${2:-/tmp}"
+
+mkdir -p "$OUTDIR"
+
+echo "=== ee N-sweep NIAH start ($(date)) ==="
+echo " out-dir: $OUTDIR"
+echo " cases-dir: $CASES_DIR"
+
+# Verify case files exist before acquiring the GPU lock.
+for ctx in 32768 65536 131072; do
+    f="$CASES_DIR/niah_${ctx}.jsonl"
+    if [[ ! -f "$f" ]]; then
+        echo "ERROR: missing $f" >&2
+        echo "Generate with:" >&2
+        echo " python3 $ROOT/pflash/tests/niah_gen.py --context $ctx --n 3 -o $f" >&2
+        exit 1
+    fi
+done
+
+# Run all conditions under a single hold of the GPU lock.
+# flock runs the command directly (no `-c` shell string), so paths containing
+# spaces or quote characters reach the Python script as single arguments.
+flock -w 1800 /tmp/lucebox-gpu.lock \
+    python3 "$ROOT/dflash/bench/run_ee_n_sweep_niah.py" \
+        --out-dir "$OUTDIR" \
+        --cases-dir "$CASES_DIR" \
+    || { echo "FAIL: GPU lock timeout or sweep error" >&2; exit 1; }
+
+echo "=== N-sweep NIAH done. Results under $OUTDIR ($(date)) ==="
diff --git a/dflash/bench/run_ee_n_sweep_niah.py b/dflash/bench/run_ee_n_sweep_niah.py
new file mode 100755
index 000000000..2f02c58a2
--- /dev/null
+++ b/dflash/bench/run_ee_n_sweep_niah.py
@@ -0,0 +1,379 @@
+#!/usr/bin/env python3
+"""
+N-sweep NIAH: baseline + ee3 + ee5 + ee7 at 32K / 64K / 128K.
+
+Each condition sets PFLASH_DRAFTER_EARLY_EXIT_N / PFLASH_DRAFTER_SCORE_LAYERS in the
+server environment. One server boot per (condition, context, case) triplet to avoid
+the Bug #42 ggml_view_3d assert that fires when a single server sees >1 sequence length.
+
+Conditions:
+  baseline -- no early-exit (full 28 drafter layers)
+  ee3 -- EARLY_EXIT_N=3 SCORE_LAYERS=3
+  ee5 -- EARLY_EXIT_N=5 SCORE_LAYERS=5
+  ee7 -- EARLY_EXIT_N=7 SCORE_LAYERS=7 (production reference)
+
+Usage:
+  python3 dflash/bench/run_ee_n_sweep_niah.py [--out-dir PATH] [--cases-dir PATH] \
+      [--conditions COND ...] [--contexts CTX ...]
+
+Case files:
+  {cases_dir}/niah_32768.jsonl
+  {cases_dir}/niah_65536.jsonl
+  {cases_dir}/niah_131072.jsonl
+
+Generate them once via:
+  python3 pflash/tests/niah_gen.py --context 32768 --n 3 -o /tmp/niah_32768.jsonl
+  python3 pflash/tests/niah_gen.py --context 65536 --n 3 -o /tmp/niah_65536.jsonl
+  python3 pflash/tests/niah_gen.py --context 131072 --n 3 -o /tmp/niah_131072.jsonl
+
+Note: requests are sent with "stream": False, so the recorded ttft_s is the wall
+time of the whole request, not a first-token latency.
+"""
+import argparse
+import json
+import os
+import re
+import socket
+import subprocess
+import sys
+import time
+from pathlib import Path
+from statistics import median
+
+import requests
+
+REPO = Path(__file__).resolve().parents[2]
+SERVER_BIN = REPO / "dflash/build/dflash_server"
+TARGET = Path("/home/peppi/models/qwen3.6-27b-q4km/Qwen3.6-27B-Q4_K_M.gguf")
+DRAFTER = Path("/home/peppi/models/Qwen3-0.6B-BF16.gguf")
+PORT = 18099
+BASE_URL = f"http://127.0.0.1:{PORT}"
+# 131072 + 8192; passed as --max-ctx on every boot, whatever the case's context.
+MAX_CTX = 139264
+
+# N=0 → baseline (unset, full layers)
+CONDITION_SPECS = {
+    "baseline": (0, 0),
+    "ee3": (3, 3),
+    "ee5": (5, 5),
+    "ee7": (7, 7),
+}
+
+DEFAULT_CONTEXTS = [32768, 65536, 131072]
+DEFAULT_CONDITIONS = ["baseline", "ee3", "ee5", "ee7"]
+
+# Timing lines scraped from the server log, compiled once.
+METRIC_PATTERNS = {
+    "drafter_fwd_s": re.compile(r"\[drafter\]\s+forward\+score in ([\d.]+)s"),
+    "tail_score_s": re.compile(r"tail.?score\s+([\d.]+)s", re.IGNORECASE),
+    "a_compute_s": re.compile(r"A_compute\s+([\d.]+)s"),
+    "fp_s": re.compile(r"\bFP\b\s+([\d.]+)s"),
+}
+
+
+def port_in_use(port: int = PORT) -> bool:
+    """Return True if something already accepts TCP connections on 127.0.0.1:port."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
+        probe.settimeout(1)
+        return probe.connect_ex(("127.0.0.1", port)) == 0
+
+
+def start_server(condition: str, ctx: int, log_path: Path):
+    """Launch dflash_server for one condition and return its Popen handle.
+
+    ctx is currently unused: every boot gets the same --max-ctx (MAX_CTX).
+    stdout and stderr both go to log_path, which extract_metrics() reads later.
+    """
+    early_n, score_n = CONDITION_SPECS[condition]
+    env = os.environ.copy()
+    env["GGML_CUDA_NO_VMM"] = "1"
+    env["DFLASH27B_KV_K"] = "tq3_0"
+    env["DFLASH27B_KV_V"] = "tq3_0"
+    # Drop values inherited from the caller so baseline really runs without early-exit.
+    env.pop("PFLASH_DRAFTER_EARLY_EXIT_N", None)
+    env.pop("PFLASH_DRAFTER_SCORE_LAYERS", None)
+    if early_n > 0:
+        env["PFLASH_DRAFTER_EARLY_EXIT_N"] = str(early_n)
+        env["PFLASH_DRAFTER_SCORE_LAYERS"] = str(score_n)
+
+    cmd = [
+        str(SERVER_BIN), str(TARGET),
+        "--host", "127.0.0.1",
+        "--port", str(PORT),
+        "--max-ctx", str(MAX_CTX),
+        "--prefill-compression", "always",
+        "--prefill-keep-ratio", "0.05",
+        "--prefill-drafter", str(DRAFTER),
+    ]
+    # The child keeps its own copy of the descriptor; the parent's can close at once.
+    with open(log_path, "w") as f:
+        return subprocess.Popen(cmd, stdout=f, stderr=subprocess.STDOUT, env=env)
+
+
+def wait_server(proc, timeout=180):
+    """Poll /health until the server answers 200.
+
+    Returns False if the process exits first or timeout (seconds) elapses.
+    """
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        # A server that died during startup will never answer; stop waiting for it.
+        if proc.poll() is not None:
+            return False
+        try:
+            if requests.get(f"{BASE_URL}/health", timeout=2).status_code == 200:
+                return True
+        except requests.RequestException:
+            pass
+        time.sleep(1)
+    return False
+
+
+def stop_server(proc):
+    """Terminate the server, escalating to kill after 20 s, then pause 3 s."""
+    if proc.poll() is None:
+        proc.terminate()
+        try:
+            proc.wait(timeout=20)
+        except subprocess.TimeoutExpired:
+            proc.kill()
+            proc.wait()
+    time.sleep(3)
+
+
+def extract_metrics(log_path: Path) -> dict:
+    """Scrape timing metrics from a server log.
+
+    Returns one entry per METRIC_PATTERNS key. The last match in the file wins;
+    a metric that never appears, or a log that cannot be read, yields None.
+    """
+    metrics = dict.fromkeys(METRIC_PATTERNS)
+    try:
+        # errors="replace": one undecodable byte must not discard every metric.
+        with open(log_path, errors="replace") as f:
+            for line in f:
+                for key, pattern in METRIC_PATTERNS.items():
+                    m = pattern.search(line)
+                    if not m:
+                        continue
+                    try:
+                        metrics[key] = float(m.group(1))
+                    except ValueError:
+                        # [\d.]+ also matches strings such as "1.2.3"; ignore them.
+                        pass
+    except OSError:
+        pass
+    return metrics
+
+
+def run_one_case(condition: str, ctx: int, case: dict, case_idx: int, results_dir: Path) -> dict:
+    """Boot a fresh server, send one NIAH prompt, and report the outcome.
+
+    The result holds ttft_s (request wall time), the first 300 characters of the
+    reply, found (answer is a substring of the reply), error, and the metrics
+    scraped from this case's server log.
+    """
+    log_path = results_dir / f"{condition}_{ctx}_case{case_idx}_server.log"
+    result = {
+        "ttft_s": None, "text": "", "found": False, "error": None,
+        "drafter_fwd_s": None, "tail_score_s": None,
+        "a_compute_s": None, "fp_s": None,
+    }
+    # A leftover server on PORT would answer /health and the prompt in place of
+    # the one started here, so the case would measure the wrong condition.
+    if port_in_use():
+        result["error"] = f"port_in_use: 127.0.0.1:{PORT} already has a listener"
+        return result
+
+    proc = start_server(condition, ctx, log_path)
+    try:
+        if not wait_server(proc, timeout=180):
+            tail = ""
+            try:
+                with open(log_path, errors="replace") as f:
+                    tail = "".join(f.readlines()[-30:])
+            except OSError:
+                pass
+            result["error"] = f"server_start_failed: {tail[:500]}"
+            return result
+
+        t0 = time.perf_counter()
+        try:
+            payload = {
+                "model": "dflash",
+                "messages": [{"role": "user", "content": case["prompt"]}],
+                "max_tokens": 64,
+                "stream": False,
+                "temperature": 0.0,
+            }
+            r = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=600)
+            result["ttft_s"] = time.perf_counter() - t0
+            r.raise_for_status()
+            text = r.json()["choices"][0]["message"]["content"] or ""
+            result["text"] = text[:300]
+            result["found"] = case["answer"] in text
+        except Exception as e:
+            # Deliberately broad: record whatever went wrong and let the sweep continue.
+            result["ttft_s"] = time.perf_counter() - t0
+            result["error"] = f"{type(e).__name__}: {e}"
+    finally:
+        stop_server(proc)
+
+    result.update(extract_metrics(log_path))
+    return result
+
+
+def _present(case_results: list, key: str) -> list:
+    """Collect case_results[*][key], skipping cases where it is None."""
+    return [r[key] for r in case_results if r[key] is not None]
+
+
+def _p50(values: list):
+    """Median of values, or None when there are none."""
+    return median(values) if values else None
+
+
+def _fmt_s(value, digits: int = 3) -> str:
+    """Format seconds as e.g. '1.234s'; None becomes 'N/A'."""
+    return "N/A" if value is None else f"{value:.{digits}f}s"
+
+
+def run_condition_ctx(condition: str, ctx: int, cases: list, results_dir: Path) -> dict:
+    """Run every case of one (condition, context) pair and aggregate the results."""
+    print(f"\n[sweep] condition={condition} ctx={ctx} ({len(cases)} cases)", flush=True)
+    case_results = []
+    for i, case in enumerate(cases):
+        print(f" case {i}: ans={case.get('answer', '?')}", flush=True)
+        r = run_one_case(condition, ctx, case, i, results_dir)
+        case_results.append(r)
+        status = "OK" if r["found"] else "FAIL"
+        print(
+            f" case {i}: ttft={_fmt_s(r['ttft_s'], 2)} "
+            f"drafter={_fmt_s(r['drafter_fwd_s'])} [{status}]",
+            flush=True,
+        )
+        if r.get("error"):
+            print(f" case {i}: error={r['error'][:200]}", flush=True)
+
+    return {
+        "condition": condition,
+        "ctx": ctx,
+        "case_results": case_results,
+        "drafter_p50_s": _p50(_present(case_results, "drafter_fwd_s")),
+        "tail_score_p50_s": _p50(_present(case_results, "tail_score_s")),
+        "ttft_p50_s": _p50(_present(case_results, "ttft_s")),
+        "a_compute_p50_s": _p50(_present(case_results, "a_compute_s")),
+        "fp_p50_s": _p50(_present(case_results, "fp_s")),
+        "niah_pass": sum(1 for c in case_results if c["found"]),
+        "niah_total": len(cases),
+    }
+
+
+def load_cases(path: Path) -> list:
+    """Read one NIAH case per non-blank JSONL line.
+
+    Exits with an error if a case lacks "prompt" or "answer", or if the file
+    holds no cases, so a bad file is caught before any server is started.
+    """
+    cases = []
+    with open(path) as f:
+        for lineno, line in enumerate(f, 1):
+            if not line.strip():
+                continue
+            case = json.loads(line)
+            if not isinstance(case, dict) or "prompt" not in case or "answer" not in case:
+                sys.exit(f"[error] {path}:{lineno}: case needs 'prompt' and 'answer' keys")
+            cases.append(case)
+    if not cases:
+        sys.exit(f"[error] {path} contains no cases")
+    return cases
+
+
+def main():
+    """Parse arguments, run the sweep, and write raw_results.json and SUMMARY.md."""
+    ap = argparse.ArgumentParser(
+        description="N-sweep NIAH bench: baseline/ee3/ee5/ee7 × 32K/64K/128K"
+    )
+    # Default is anchored to the repo so it does not depend on the working directory.
+    ap.add_argument("--out-dir",
+                    default=str(REPO / "dflash/bench/results/2026-05-25_ee_n_sweep"))
+    ap.add_argument("--cases-dir", default="/tmp",
+                    help="Directory containing niah_{ctx}.jsonl files")
+    ap.add_argument("--conditions", nargs="+", default=DEFAULT_CONDITIONS,
+                    choices=list(CONDITION_SPECS.keys()),
+                    help="Conditions to run (default: all four)")
+    ap.add_argument("--contexts", nargs="+", type=int, default=DEFAULT_CONTEXTS)
+    args = ap.parse_args()
+
+    # Fail before any GPU time is spent if the binary or a model is missing.
+    for required in (SERVER_BIN, TARGET, DRAFTER):
+        if not required.exists():
+            sys.exit(f"[error] missing {required}")
+
+    results_dir = Path(args.out_dir)
+    results_dir.mkdir(parents=True, exist_ok=True)
+
+    cases_by_ctx = {}
+    for ctx in args.contexts:
+        f_path = Path(args.cases_dir) / f"niah_{ctx}.jsonl"
+        if not f_path.exists():
+            sys.exit(
+                f"[error] missing {f_path}\n"
+                f"Generate with: python3 pflash/tests/niah_gen.py --context {ctx} --n 3 -o {f_path}"
+            )
+        cases_by_ctx[ctx] = load_cases(f_path)
+        print(f"[init] {len(cases_by_ctx[ctx])} cases for ctx={ctx}", flush=True)
+
+    all_results = []
+    for condition in args.conditions:
+        for ctx in args.contexts:
+            all_results.append(run_condition_ctx(condition, ctx, cases_by_ctx[ctx], results_dir))
+            # Rewritten after every pair so a crash mid-sweep keeps the finished results.
+            with open(results_dir / "raw_results.json", "w") as f:
+                json.dump(all_results, f, indent=2)
+
+    baseline_times = {
+        r["ctx"]: r["drafter_p50_s"]
+        for r in all_results
+        if r["condition"] == "baseline" and r.get("drafter_p50_s")
+    }
+
+    print("\n=== SUMMARY TABLE ===")
+    print(f"{'ctx':>7} {'condition':>10} {'drafter_p50':>12} {'tail_score':>10} {'NIAH':>6} {'speedup':>8}")
+
+    rows = []
+    for r in all_results:
+        ctx, cond = r["ctx"], r["condition"]
+        dp50 = r.get("drafter_p50_s")
+        niah = f"{r.get('niah_pass', 0)}/{r.get('niah_total', 0)}"
+        if cond == "baseline":
+            speedup = "1.00x"
+        elif dp50 and ctx in baseline_times:
+            # dp50 is truthy here, so the division cannot hit zero.
+            speedup = f"{baseline_times[ctx] / dp50:.2f}x"
+        else:
+            speedup = "N/A"
+        dp50_s = _fmt_s(dp50)
+        ts50_s = _fmt_s(r.get("tail_score_p50_s"))
+        print(f"{ctx:>7} {cond:>10} {dp50_s:>12} {ts50_s:>10} {niah:>6} {speedup:>8}")
+        rows.append({
+            "ctx": ctx, "condition": cond,
+            "drafter_fwd_p50": dp50_s, "tail_score": ts50_s,
+            "NIAH": niah, "speedup": speedup,
+        })
+
+    with open(results_dir / "SUMMARY.md", "w") as f:
+        f.write("# ee N-sweep NIAH: baseline / ee3 / ee5 / ee7 @ 32K / 64K / 128K\n\n")
+        f.write("## Results\n\n")
+        f.write("| ctx | condition | drafter_p50 | tail_score | NIAH | speedup_vs_baseline |\n")
+        f.write("|---|---|---|---|---|---|\n")
+        for row in rows:
+            f.write(f"| {row['ctx']} | {row['condition']} | {row['drafter_fwd_p50']} "
+                    f"| {row['tail_score']} | {row['NIAH']} | {row['speedup']} |\n")
+        f.write("\n## Verdict\n\n_Filled in post-run._\n")
+
+    print(f"\n[done] {results_dir}/SUMMARY.md", flush=True)
+
+
+if __name__ == "__main__":
+    main()