aiming-lab · DermotOBrien-EC · May 28, 2026 · May 29, 2026
diff --git a/config.researchclaw.example.yaml b/config.researchclaw.example.yaml
@@ -71,6 +71,13 @@ experiment:
     python_path: ".venv/bin/python3"
     gpu_required: false
     max_memory_mb: 4096
+    # Filesystem path under which torchvision dataset raw files are pre-staged.
+    # Generated experiment code is instructed to load datasets from this path
+    # with download=False; sandbox mode runs with network_policy="none" so no
+    # downloads are possible. If a dataset file is missing the experiment is
+    # required to raise FileNotFoundError and exit non-zero rather than fall
+    # back to synthetic data. Default matches the production sandbox image.
+    dataset_cache_root: "/opt/datasets"
   # Docker sandbox settings (only used when mode: "docker")
   # Build image first: docker build -t researchclaw/experiment:latest researchclaw/docker/
   docker:

diff --git a/researchclaw/config.py b/researchclaw/config.py
@@ -226,6 +226,12 @@ class SandboxConfig:
         "sklearn",
     )
     max_memory_mb: int = 4096
+    # Filesystem path under which torchvision dataset raw files are pre-staged
+    # for sandbox mode (network_policy="none"). Generated experiment code is
+    # instructed to load datasets with root=<this path>, download=False, and
+    # to raise FileNotFoundError rather than fall back to synthetic data if
+    # the cache is missing. Default matches the production sandbox image.
+    dataset_cache_root: str = "/opt/datasets"
 
 
 @dataclass(frozen=True)
@@ -1251,6 +1257,9 @@ def _parse_experiment_config(data: dict[str, Any]) -> ExperimentConfig:
                 sandbox_data.get("allowed_imports", SandboxConfig.allowed_imports)
             ),
             max_memory_mb=_safe_int(sandbox_data.get("max_memory_mb"), 4096),
+            dataset_cache_root=sandbox_data.get(
+                "dataset_cache_root", SandboxConfig.dataset_cache_root
+            ),
         ),
         docker=DockerSandboxConfig(
             image=docker_data.get("image", "researchclaw/experiment:latest"),

diff --git a/researchclaw/pipeline/_helpers.py b/researchclaw/pipeline/_helpers.py
@@ -694,17 +694,39 @@ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]:
     - ``UCB (Stochastic) cumulative_regret: 361.9233``
     - ``condition=name metric=value`` (per-condition output)
     - ``condition=name/metric_name metric=value``
+    - ``PER_SEED [...] (cond|condition)=NAME seed=N <key>: <value> ...``
+      (multi-metric per-seed lines from structured experiment output)
+    - ``CONDITION_SUMMARY [...] (cond|condition)=NAME <key>: <value> ...``
+    - ``GAP_TO_BN [...] (cond|condition)=NAME <key>: <value> ...``
+
+    Returns a flat dict of metric_name -> value. Per-condition / per-seed
+    metrics are namespaced ``"<cond>/<seed>/<key>"`` or ``"<cond>/<key>"``
+    so they cannot collide with bare ``<key>: <value>`` simple-pair output.
 
-    Returns a flat dict of metric_name -> value.
     Filters out log/status lines using :func:`is_metric_name`.
     """
     # BUG-173: regex for condition=name metric=value format
     _CONDITION_RE = re.compile(
         r"^condition=(\S+)\s+metric=([0-9eE.+-]+)\s*$"
     )
+    # Structured per-condition / per-seed line patterns. The cond=/condition=
+    # alias is needed because the stage-10 prompt names the token "condition="
+    # but real generated code often emits "cond=" (observed in the Phase-2
+    # sandbox readiness trial).
+    _STRUCTURED_PREFIXES = ("PER_SEED", "CONDITION_SUMMARY", "GAP_TO_BN")
+    _COND_KEY_RE = re.compile(r"\b(?:cond|condition)=(\S+)")
+    _SEED_KEY_RE = re.compile(r"\bseed=(\d+)")
+    # Matches "<word>: <number>" with optional sign / decimal / exponent.
+    _METRIC_PAIR_RE = re.compile(
+        r"\b(\w+):\s*([\-+]?\d+(?:\.\d+)?(?:[eE][\-+]?\d+)?)"
+    )
+    # Line-tagging metadata, not metric keys: skipped when a line emits them in
+    # "<word>: <number>" form (e.g. "seed: 3") rather than as "seed=3" tags.
+    _STRUCTURED_NON_METRIC_KEYS = {"cond", "condition", "seed", "dataset"}
+
     metrics: dict[str, Any] = {}
-    for line in stdout.splitlines():
-        line = line.strip()
+    for raw_line in stdout.splitlines():
+        line = raw_line.strip()
         # --- Format 2: condition=xxx metric=yyy ---
         m = _CONDITION_RE.match(line)
         if m:
@@ -715,6 +737,23 @@ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]:
             except (ValueError, TypeError):
                 pass
             continue
+        # --- Format 3: structured per-condition lines ---
+        if line.startswith(_STRUCTURED_PREFIXES):
+            cond_match = _COND_KEY_RE.search(line)
+            if cond_match:
+                cond = cond_match.group(1)
+                seed_match = _SEED_KEY_RE.search(line)
+                seed_part = f"/{seed_match.group(1)}" if seed_match else ""
+                for key, val in _METRIC_PAIR_RE.findall(line):
+                    if key in _STRUCTURED_NON_METRIC_KEYS:
+                        continue
+                    try:
+                        metrics[f"{cond}{seed_part}/{key}"] = float(val)
+                    except (ValueError, TypeError):
+                        pass
+            # Structured lines are NOT also processed as simple <key>: <value>
+            # to avoid the leading prefix word being consumed as a metric name.
+            continue
         # --- Format 1: name: value ---
         if ":" not in line:
             continue
@@ -736,6 +775,71 @@ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]:
     return metrics
 
 
+def _flatten_structured_metrics(sr: Any) -> dict[str, float]:
+    """Flatten a sandbox-written ``results.json`` into a metric dict.
+
+    Handles four observed schemas:
+
+    1. ``{"metrics": {<key>: <number>}}`` — the auto-fallback shape.
+    2. ``{"conditions": [{"name": ..., "metrics": {...}}]}`` — list shape.
+    3. ``{"conditions": {<name>: {"metrics": {...}}}}`` — dict shape.
+    4. ``{"per_condition": ...}`` / ``{"condition_summaries": ...}`` — synonyms.
+
+    Per-condition keys are namespaced ``"<cond_name>/<metric_key>"``; top-level
+    ``metrics`` keys stay bare. In either dict, one-level-shallow recursion picks
+    up nested stat blocks like ``{"accuracy": {"mean": 0.92, "std": 0.01}}`` —
+    those become ``"<cond>/accuracy_mean"``, ``"<cond>/accuracy_std"``. Deeper
+    nesting is intentionally ignored to keep the helper predictable. Only the
+    first truthy container alias is read; a condition entry with no ``metrics``
+    dict is scraped directly. Non-numeric values and bools are silently dropped.
+    """
+    out: dict[str, float] = {}
+    if not isinstance(sr, dict):
+        return out
+
+    def _scrape_metrics_dict(d: dict, prefix: str) -> None:
+        """Append numeric leaves from a metric-dict to ``out`` with ``prefix/``."""
+        for key, val in d.items():
+            full = f"{prefix}/{key}" if prefix else str(key)
+            if isinstance(val, (int, float)) and not isinstance(val, bool):
+                out[full] = float(val)
+            elif isinstance(val, dict):
+                # One-level-shallow recursion: ``accuracy: {mean: 0.92, std: 0.01}``
+                # → ``<prefix>/accuracy_mean``, ``<prefix>/accuracy_std``.
+                for sub_key, sub_val in val.items():
+                    if isinstance(sub_val, (int, float)) and not isinstance(sub_val, bool):
+                        out[f"{full}_{sub_key}"] = float(sub_val)
+
+    # Shape 1: top-level metrics dict
+    top_metrics = sr.get("metrics")
+    if isinstance(top_metrics, dict):
+        _scrape_metrics_dict(top_metrics, prefix="")
+
+    # Shapes 2-4: per-condition container under one of several aliases.
+    container = (
+        sr.get("conditions")
+        or sr.get("per_condition")
+        or sr.get("condition_summaries")
+    )
+    items: list[tuple[str, dict]] = []
+    if isinstance(container, dict):
+        items = [(str(k), v) for k, v in container.items() if isinstance(v, dict)]
+    elif isinstance(container, list):
+        for entry in container:
+            if isinstance(entry, dict):
+                name = str(entry.get("name") or entry.get("id") or "unknown")
+                items.append((name, entry))
+
+    for cond_name, cond_data in items:
+        # Prefer an explicit ``metrics`` sub-dict; fall back to the whole entry.
+        cond_metrics = cond_data.get("metrics")
+        if not isinstance(cond_metrics, dict):
+            cond_metrics = cond_data
+        _scrape_metrics_dict(cond_metrics, prefix=cond_name)
+
+    return out
+
+
 # ---------------------------------------------------------------------------
 # LLM helpers
 # ---------------------------------------------------------------------------

diff --git a/researchclaw/pipeline/stage_impls/_code_generation.py b/researchclaw/pipeline/stage_impls/_code_generation.py
@@ -321,9 +321,14 @@ def _execute_code_generation(
             else "none"  # sandbox mode has no network
         )
         if _net_policy == "none":
-            # Network disabled: inject strict offline-only guidance
+            # Network disabled: inject strict offline-only guidance.
+            # Pass the configured dataset cache root so the prompt points the
+            # model at the right pre-staged path; defaults to /opt/datasets.
             try:
-                extra_guidance += _pm.block("network_disabled_guidance")
+                extra_guidance += _pm.block(
+                    "network_disabled_guidance",
+                    dataset_cache_root=config.experiment.sandbox.dataset_cache_root,
+                )
             except Exception:  # noqa: BLE001
                 pass
         elif _net_policy == "full":

diff --git a/researchclaw/pipeline/stage_impls/_execution.py b/researchclaw/pipeline/stage_impls/_execution.py
@@ -6,6 +6,7 @@
 import logging
 import math
 import re
+import shutil
 import time as _time
 from pathlib import Path
 from typing import Any
@@ -26,6 +27,7 @@
     _ensure_sandbox_deps,
     _extract_code_block,
     _extract_multi_file_blocks,
+    _flatten_structured_metrics,
     _get_evolution_overlay,
     _load_hardware_profile,
     _parse_metrics_from_stdout,
@@ -312,7 +314,17 @@ def _execute_experiment_run(
                         pass
             _ensure_sandbox_deps(_all_code, config.experiment.sandbox.python_path)
 
-        sandbox = create_sandbox(config.experiment, runs_dir / "sandbox")
+        # Clear the sandbox dir before run so stale results.json from a prior
+        # failed run can't contaminate the discovery glob below. Fail loudly
+        # on cleanup failure — silently ignoring would let the mtime anchor
+        # be old enough to admit stale files.
+        _sandbox_dir = runs_dir / "sandbox"
+        if _sandbox_dir.exists():
+            shutil.rmtree(_sandbox_dir)
+        _sandbox_dir.mkdir(parents=True)
+        _run_anchor_mtime = _sandbox_dir.stat().st_mtime
+
+        sandbox = create_sandbox(config.experiment, _sandbox_dir)
         # Use run_project for multi-file, run for single-file
         if exp_dir_path and Path(exp_dir_path).is_dir():
             result = sandbox.run_project(
@@ -322,11 +334,24 @@ def _execute_experiment_run(
             result = sandbox.run(
                 code_text, timeout_sec=config.experiment.time_budget_sec
             )
-        # Try to read structured results.json from sandbox working dir
+        # The sandbox subprocess writes to _project, _project_1, _project_2, …
+        # (an auto-suffixed working dir per run). Generated experiment code
+        # commonly writes results.json at either the project root (`_project/`)
+        # or under a results subdir (`_project/results/`). Check both; only
+        # accept candidates written after the current run started.
         structured_results: dict[str, Any] | None = None
-        sandbox_project = runs_dir / "sandbox" / "_project"
-        results_json_path = sandbox_project / "results.json"
-        if results_json_path.exists():
+        _candidates: list[Path] = []
+        for _proj in _sandbox_dir.glob("_project*"):
+            for _rel in ("results.json", "results/results.json"):
+                _rj = _proj / _rel
+                if _rj.is_file() and _rj.stat().st_mtime >= _run_anchor_mtime:
+                    _candidates.append(_rj)
+        results_json_path = (
+            max(_candidates, key=lambda p: p.stat().st_mtime)
+            if _candidates
+            else None
+        )
+        if results_json_path is not None:
             try:
                 structured_results = json.loads(
                     results_json_path.read_text(encoding="utf-8")
@@ -390,6 +415,14 @@ def _execute_experiment_run(
         }
         if structured_results is not None:
             run_payload["structured_results"] = structured_results
+            # Promote flattened results.json metrics into the canonical metrics
+            # dict so downstream stages see them by default. The two sources are
+            # merged: on a key collision the results.json value wins, and
+            # stdout-parsed metrics are kept for any key it does not provide.
+            _sr_flat = _flatten_structured_metrics(structured_results)
+            if _sr_flat:
+                effective_metrics = {**(effective_metrics or {}), **_sr_flat}
+                run_payload["metrics"] = effective_metrics
         # Auto-generate results.json from parsed metrics if sandbox didn't produce one
         if structured_results is None and effective_metrics:
             auto_results = {"source": "stdout_parsed", "metrics": effective_metrics}

diff --git a/researchclaw/prompts/shared.py b/researchclaw/prompts/shared.py
@@ -288,20 +288,29 @@
         "\n## ⚠️ NO NETWORK ACCESS — CRITICAL CONSTRAINT ⚠️\n"
         "This experiment runs with network_policy='none'. There is NO network access\n"
         "at ANY phase (no pip install, no dataset downloads, no HTTP requests).\n\n"
-        "### ONLY these pre-cached datasets are available:\n"
-        "- `torchvision.datasets.CIFAR10(root='/opt/datasets', train=True/False, download=False)`\n"
-        "- `torchvision.datasets.CIFAR100(root='/opt/datasets', train=True/False, download=False)`\n"
-        "- `torchvision.datasets.MNIST(root='/opt/datasets', train=True/False, download=False)`\n"
-        "- `torchvision.datasets.FashionMNIST(root='/opt/datasets', train=True/False, download=False)`\n"
-        "- `torchvision.datasets.STL10(root='/opt/datasets', split='train'/'test', download=False)`\n"
-        "- `torchvision.datasets.SVHN(root='/opt/datasets', split='train'/'test', download=False)`\n\n"
+        "### Datasets are pre-cached at `{dataset_cache_root}`:\n"
+        "- `torchvision.datasets.CIFAR10(root='{dataset_cache_root}', train=True/False, download=False)`\n"
+        "- `torchvision.datasets.CIFAR100(root='{dataset_cache_root}', train=True/False, download=False)`\n"
+        "- `torchvision.datasets.MNIST(root='{dataset_cache_root}', train=True/False, download=False)`\n"
+        "- `torchvision.datasets.FashionMNIST(root='{dataset_cache_root}', train=True/False, download=False)`\n"
+        "- `torchvision.datasets.STL10(root='{dataset_cache_root}', split='train'/'test', download=False)`\n"
+        "- `torchvision.datasets.SVHN(root='{dataset_cache_root}', split='train'/'test', download=False)`\n\n"
         "### FORBIDDEN (will cause runtime failure):\n"
         "- Do NOT create setup.py (it cannot run without network)\n"
         "- Do NOT create requirements.txt (pip install is unavailable)\n"
         "- Do NOT use `download=True` on any dataset\n"
         "- Do NOT use `urllib`, `requests`, `httpx`, or any HTTP library\n"
         "- Do NOT use `datasets.load_dataset()` from HuggingFace (requires download)\n"
-        "- Do NOT import packages not pre-installed in the Docker image\n\n"
+        "- Do NOT import packages not pre-installed in the Docker image\n"
+        "- Do NOT silently fall back to synthetic data if a dataset file is missing —\n"
+        "  raise FileNotFoundError and exit non-zero instead. A failed dataset load is a\n"
+        "  real failure to surface, not something to paper over with random tensors.\n\n"
+        "### Required: dataset provenance stamp\n"
+        "After your code successfully loads a dataset, print a single line to stdout of\n"
+        "the form `DATASET_USED: <name>` (e.g. `DATASET_USED: MNIST`). Emit this stamp\n"
+        "exactly once, with no surrounding decoration. Downstream metric parsing relies\n"
+        "on this line as a dataset-provenance signal independent of the JSON results\n"
+        "schema your code chooses to produce.\n\n"
         "### Available pre-installed packages:\n"
         "torch, torchvision, torchaudio, numpy, scipy, sklearn, matplotlib, seaborn,\n"
         "pandas, tqdm, gymnasium, networkx, PyYAML, Pillow, timm, einops, torchmetrics,\n"

diff --git a/tests/test_rc_config.py b/tests/test_rc_config.py
@@ -284,6 +284,35 @@ def test_sandbox_config_defaults_match_expected_values():
     assert defaults.gpu_required is False
     assert defaults.max_memory_mb == 4096
     assert "numpy" in defaults.allowed_imports
+    # New field defaults to the production sandbox image path; preserves the
+    # prior hardcoded value that lived only inside the prompt block.
+    assert defaults.dataset_cache_root == "/opt/datasets"
+
+
+def test_sandbox_config_dataset_cache_root_overrides_default(tmp_path: Path):
+    data = _valid_config_data()
+    data["experiment"] = {
+        "mode": "sandbox",
+        "sandbox": {"dataset_cache_root": "/tmp/arc_sandbox_trial/datasets"},
+    }
+
+    config = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
+
+    assert (
+        config.experiment.sandbox.dataset_cache_root
+        == "/tmp/arc_sandbox_trial/datasets"
+    )
+
+
+def test_sandbox_config_dataset_cache_root_falls_back_to_default(tmp_path: Path):
+    data = _valid_config_data()
+    # Omit dataset_cache_root entirely — the loader should fall back to the
+    # SandboxConfig default rather than raising or returning None.
+    data["experiment"] = {"mode": "sandbox", "sandbox": {}}
+
+    config = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
+
+    assert config.experiment.sandbox.dataset_cache_root == "/opt/datasets"
 
 
 def test_to_dict_roundtrip_rehydrates_equivalent_rcconfig(tmp_path: Path):