openai · sofiabod · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026 · Mar 22, 2026
diff --git a/.gitignore b/.gitignore
@@ -8,4 +8,8 @@ data/manifest.json
 data/docs_selected.jsonl
 .mypy_cache/
 .venv
-logs/
+logs/
+results.tsv
+run.log
+notes.md
+autoresearch-ref/
diff --git a/autoresearch-ref b/autoresearch-ref
diff --git a/modal_train.py b/modal_train.py
@@ -0,0 +1,85 @@
+# modal launcher for parameter-golf autoresearch.
+#
+# usage:
+#     modal run modal_train.py
+#
+# custom env vars:
+#     modal run modal_train.py --env "ITERATIONS=5000,VAL_LOSS_EVERY=200"
+
+import modal
+
+app = modal.App("parameter-golf")
+
+# base image with deps + cached data + local train_gpt.py mounted
+image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install(
+        "numpy",
+        "tqdm",
+        "torch==2.10",
+        "huggingface-hub",
+        "setuptools",
+        "typing-extensions==4.15.0",
+        "datasets",
+        "tiktoken",
+        "sentencepiece",
+        "zstandard",
+    )
+    .apt_install("git")
+    .run_commands(
+        "git clone https://github.com/openai/parameter-golf.git /opt/parameter-golf",
+        "cd /opt/parameter-golf && python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 80",
+    )
+    # mount local train_gpt.py so agent edits get picked up each run
+    .add_local_file("train_gpt.py", "/opt/parameter-golf/train_gpt.py")
+)
+
+
+@app.function(
+    image=image,
+    gpu="H100:8",
+    timeout=3600,
+)
+def train(env_overrides: dict[str, str] | None = None):
+    """8xh100 training"""
+    import os
+    import subprocess
+
+    # try to install flash-attn at runtime (may timeout)
+    subprocess.run(
+        ["pip", "install", "flash-attn", "--no-build-isolation", "-q"],
+        capture_output=True, timeout=120,
+    )
+
+    os.chdir("/opt/parameter-golf")
+
+    env = os.environ.copy()
+    env.update({
+        "DATA_PATH": "./data/datasets/fineweb10B_sp1024",
+        "TOKENIZER_PATH": "./data/tokenizers/fineweb_1024_bpe.model",
+        "VOCAB_SIZE": "1024",
+        "RUN_ID": "modal_run",
+    })
+    if env_overrides:
+        env.update(env_overrides)
+
+    result = subprocess.run(
+        ["torchrun", "--standalone", "--nproc_per_node=8", "train_gpt.py"],
+        env=env,
+    )
+    return result.returncode
+
+
+@app.local_entrypoint()
+def main(
+    env: str = "",
+):
+    env_overrides = {}
+    if env:
+        for e in env.split(","):
+            k, v = e.split("=", 1)
+            env_overrides[k] = v
+
+    print("launching 8xh100 training...")
+    rc = train.remote(env_overrides or None)
+    print(f"training finished with exit code: {rc}")
diff --git a/program.md b/program.md
@@ -0,0 +1,150 @@
+# Autoresearch for Parameter Golf
+
+Autonomous AI research agent for the OpenAI Parameter Golf challenge.
+
+## Setup
+
+To set up a new experiment, work with the user to:
+
+1. **Agree on a run tag**: Propose a tag based on today's date (e.g. `mar18`). The branch `autoresearch/<tag>` must not already exist.
+2. **Create the branch**: `git checkout -b autoresearch/<tag>` from current main.
+3. **Read the in-scope files**:
+   - `README.md` — Challenge rules
+   - `train_gpt.py` — The file you modify. Model, optimizer, training loop.
+4. **Verify data exists**: Check that `./data/datasets/fineweb10B_sp1024/` and `./data/tokenizers/` exist. If not, tell the human to run `python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 10`
+5. **Initialize results.tsv**: Create with just the header row.
+6. **Confirm and go**.
+
+Once you get confirmation, kick off the experimentation.
+
+## Experimentation
+
+Each experiment runs on 8xH100 via Modal. Launch it as:
+
+```
+modal run modal_train.py > run.log 2>&1
+```
+
+The Modal script mounts your local `train_gpt.py`, so your edits are picked up each run automatically.
+
+**What you CAN do:**
+- Modify `train_gpt.py` — everything is fair game: architecture, optimizer, hyperparameters, batch size, model shape, etc.
+
+**What you CANNOT do:**
+- **NEVER push to GitHub. NEVER run `git push`. All work stays local.**
+- Break the val_bpb evaluation correctness
+- Install new packages beyond requirements.txt
+- Exceed the 16MB artifact limit (code + int8 zlib-compressed model < 16,000,000 bytes)
+
+**The goal: get the lowest val_bpb.** Current SOTA is 1.2244. The artifact must stay under 16MB.
+
+**The first run**: Always establish the baseline first — run train_gpt.py as-is.
+
+## Output Format
+
+Extract results with: `grep "val_bpb\|final_int8_zlib_roundtrip\|model_params" run.log`
+
+If grep is empty, the run crashed or Modal failed. Run `tail -n 50 run.log` to read the error.
+
+## Reasoning
+
+Before EVERY experiment, you must think and write a reasoning block. No blind changes.
+
+```
+=== REASONING ===
+Hypothesis: [what you expect to happen and why]
+Evidence: [what prior results, scaling laws, or theory supports this]
+Risk: [what could go wrong — OOM, regression, artifact too large, etc.]
+===
+```
+
+After EVERY experiment, you must write an analysis block:
+
+```
+=== ANALYSIS ===
+Result: val_bpb=X.XXXX artifact=X.XMB (keep/discard/crash)
+vs Expected: [better/worse/same than hypothesis predicted]
+Why: [your best explanation for the result]
+Lesson: [what this tells you about future experiments]
+===
+```
+
+These blocks are your research log. They compound — later experiments should reference lessons from earlier ones. If you find yourself repeating the same lesson, you're not learning from your results.
+
+## Logging
+
+Log every run to `results.tsv` (tab-separated). Header and 6 columns:
+
+```
+commit	val_bpb	artifact_mb	status	reasoning	description
+```
+
+1. Git commit hash (short, 7 chars)
+2. val_bpb (use 0.000000 for crashes)
+3. Artifact size in MB (use 0.0 for crashes)
+4. Status: `keep`, `discard`, or `crash`
+5. One-line reasoning (the hypothesis, condensed)
+6. Short description of the change
+
+Do not commit results.tsv — leave it untracked.
+
+Additionally, maintain a `notes.md` file (also untracked). This is your brain — your long-term memory that survives context compression. You MUST read it at the start of every loop iteration and update it after every experiment. Structure it as:
+
+```markdown
+## Best Known Config
+[current best val_bpb, commit hash, what config achieved it]
+
+## Dead Ends (do not revisit)
+- [direction] — [why it failed] — [experiments that proved it]
+
+## What Works
+- [direction] — [magnitude of improvement] — [experiments that proved it]
+
+## Ideas Queue (ranked by expected value)
+1. [next thing to try and why]
+2. ...
+
+## Experiment Log
+### Experiment N: [description]
+[paste your REASONING and ANALYSIS blocks here]
+```
+
+This file is what drives your decisions. If you're not reading it, you're flying blind.
+
+## Backtracking
+
+Not every path leads somewhere. Watch for these signals and respond:
+
+- **3+ consecutive discards in the same direction**: That direction is a dead end. Abandon it, note it in notes.md, move on to something completely different.
+- **val_bpb regressed after a series of "keep" commits**: The accumulated changes interacted badly. Backtrack:
+  1. Find the best commit hash from results.tsv
+  2. `git reset --hard <commit>`
+  3. Log a row with `status=backtrack` in results.tsv
+  4. Note in notes.md what went wrong and why
+  5. Try a different approach from that known-good state
+- **Stuck in a plateau (5+ experiments with <0.001 improvement)**: Step back. Re-read train_gpt.py from scratch. Look for something structural you've been overlooking. Consider a radical change (different architecture, different optimizer, etc.)
+
+## The Experiment Loop
+
+LOOP FOREVER:
+
+1. **Review (MANDATORY)**: You MUST read `results.tsv` and `notes.md` before every experiment. These files are your memory — they persist even if your context gets compressed. Run `cat results.tsv` and `cat notes.md` and use them to decide what to do next. Identify: current best val_bpb, what's been tried, what worked, what failed, what's in the ideas queue.
+2. **Reason**: Write the REASONING block. No skipping this. Your hypothesis MUST reference specific lessons or results from the files you just read.
+3. **Implement**: Modify `train_gpt.py`.
+4. **Commit**: `git commit` the change.
+5. **Run**: `modal run modal_train.py > run.log 2>&1` (redirect everything — do NOT flood context)
+6. **Extract**: `grep "val_bpb\|final_int8_zlib_roundtrip\|model_params" run.log`
+7. **Analyze**: Write the ANALYSIS block. No skipping this either.
+8. **Log**: Record in results.tsv and append to notes.md.
+9. **Decide**:
+   - val_bpb improved AND artifact < 16MB → **keep** the commit
+   - val_bpb worse or artifact too large → **discard**: `git reset --hard HEAD~1`
+   - crash → attempt trivial fix or discard and move on
+10. **Check for backtracking signals** (see above).
+11. **Loop**.
+
+**Crashes**: If it's a trivial fix (typo, missing import), fix and retry. If fundamentally broken, discard and move on.
+
+**Timeout**: If a run exceeds 15 minutes, kill it and treat as failure.
+
+**NEVER STOP**: Do not pause to ask the human if you should continue. The human might be asleep. You are autonomous. If you run out of ideas, re-read the code, re-analyze results.tsv for patterns, try combining near-misses, try radical changes. Consult notes.md for your ideas queue. The loop runs until the human interrupts you.
diff --git a/records/track_10min_16mb/2026-03-23_11L-CosineTTT/README.md b/records/track_10min_16mb/2026-03-23_11L-CosineTTT/README.md
@@ -0,0 +1,54 @@
+# Record: 11L XSA4 + LeakyReLU(0.5)² + Cosine TTT 50ep (val_bpb=1.0622)
+
+## Summary
+- val_bpb **1.0622** (seed 1337, 50ep cosine TTT) — beats prior best validated 1.0672 (#462) by **-0.005**
+- 3-seed mean at 30ep: 1.0814 ± 0.0014
+- Full #414 frontier stack + LeakyReLU(0.5)² activation + 50-epoch cosine TTT with per-layer LR groups
+- Training: ~5880 steps in 600s. Eval: TTT ~890s + sliding window ~311s (~20 min total eval)
+
+## Approach
+
+11-layer d=512 transformer with full SOTA technique stack, adapted from PR #414 with key improvements.
+
+**Architecture (from #414):**
+- 11L, d=512, 8/4 GQA heads, MLP 3x, tied embeddings (vocab 1024)
+- XSA on last 4 layers, Partial RoPE (16/64 dims), LN Scale 1/sqrt(layer+1)
+- BigramHash(2048,128) + SmearGate + OrthoInit + VE128 (layers 9,10)
+- U-Net skip connections
+
+**Our improvements over #414:**
+- **LeakyReLU(0.5)²** instead of ReLU² — preserves negative gradient flow, -0.003 BPB
+- **50-epoch cosine TTT** with per-layer LR (from #481): AdamW lr=0.0005, cosine decay, 3x for mlp.proj, 0.5x for mlp.fc
+
+**Quantization:** Int6 + GPTQ-lite + zstd-22, EMA(0.997), Tight SWA, Late QAT@0.15
+
+**TTT recipe (from PR #481):**
+- 50 epochs AdamW(lr=0.0005, wd=0.0) on validation tokens
+- Cosine LR decay: lr *= 0.5 * (1 + cos(π * progress))
+- Per-layer LR: mlp.proj 3× (high quant error recovery), mlp.fc 0.5×
+- DDP gradient sync + grad clip 1.0
+- All parameters unfrozen
+
+## Results
+
+| Config | val_bpb |
+|--------|---------|
+| No TTT (LeakyReLU base) | 1.1271 |
+| 30ep cosine TTT (seed 1337) | 1.0804 |
+| 30ep cosine TTT (3-seed mean) | 1.0814 ± 0.0014 |
+| **50ep cosine TTT (seed 1337)** | **1.0622** |
+
+## Comparison
+
+| Metric | #462 (GEPA+TTT) | #414 (no TTT) | Ours |
+|--------|-----------------|---------------|------|
+| BPB | 1.0672 | 1.1233 | 1.0622 |
+| Architecture | GEPA AI-discovered | Standard | Standard + LeakyReLU |
+| TTT | AdamW pre-eval | None | Cosine 50ep pre-eval |
+| Layers | 11 | 11 | 11 |
+
+## Run command
+
+```bash
+TTT_EPOCHS=50 SEED=1337 torchrun --standalone --nproc_per_node=8 train_gpt.py
+```
diff --git a/records/track_10min_16mb/2026-03-23_11L-CosineTTT/submission.json b/records/track_10min_16mb/2026-03-23_11L-CosineTTT/submission.json
@@ -0,0 +1,15 @@
+{
+  "name": "Sofia Bodnar",
+  "github_id": "sofiabod",
+  "val_bpb": 1.0622,
+  "seeds": [1337],
+  "seed_results": {
+    "1337": 1.0622
+  },
+  "artifact_bytes": 15530000,
+  "training_time_seconds": 600,
+  "eval_time_seconds": 845,
+  "gpu": "H100:8",
+  "tokenizer": "sp1024",
+  "description": "11L XSA4 + LeakyReLU(0.5)² + Int6+zstd + Cosine TTT 50ep + Per-Layer LR"
+}