diff --git a/config.researchclaw.example.yaml b/config.researchclaw.example.yaml
index 8377f9a7..fead43eb 100644
--- a/config.researchclaw.example.yaml
+++ b/config.researchclaw.example.yaml
@@ -71,6 +71,13 @@ experiment:
     python_path: ".venv/bin/python3"
     gpu_required: false
     max_memory_mb: 4096
+    # Filesystem path under which torchvision dataset raw files are pre-staged.
+    # Generated experiment code is instructed to load datasets from this path
+    # with download=False; sandbox mode runs with network_policy="none" so no
+    # downloads are possible. If a dataset file is missing the experiment is
+    # required to raise FileNotFoundError and exit non-zero rather than fall
+    # back to synthetic data. Default matches the production sandbox image.
+    dataset_cache_root: "/opt/datasets"
   # Docker sandbox settings (only used when mode: "docker")
   # Build image first: docker build -t researchclaw/experiment:latest researchclaw/docker/
   docker:
diff --git a/researchclaw/config.py b/researchclaw/config.py
index 3f620a67..560cc718 100644
--- a/researchclaw/config.py
+++ b/researchclaw/config.py
@@ -226,6 +226,12 @@ class SandboxConfig:
         "sklearn",
     )
     max_memory_mb: int = 4096
+    # Filesystem path under which torchvision dataset raw files are pre-staged
+    # for sandbox mode (network_policy="none"). Generated experiment code is
+    # instructed to load datasets with root=<this path>, download=False, and
+    # to raise FileNotFoundError rather than fall back to synthetic data if
+    # the cache is missing. Default matches the production sandbox image.
+    dataset_cache_root: str = "/opt/datasets"
 
 
 @dataclass(frozen=True)
@@ -1251,6 +1257,9 @@ def _parse_experiment_config(data: dict[str, Any]) -> ExperimentConfig:
                 sandbox_data.get("allowed_imports", SandboxConfig.allowed_imports)
             ),
             max_memory_mb=_safe_int(sandbox_data.get("max_memory_mb"), 4096),
+            dataset_cache_root=sandbox_data.get(
+                "dataset_cache_root", SandboxConfig.dataset_cache_root
+            ),
         ),
         docker=DockerSandboxConfig(
             image=docker_data.get("image", "researchclaw/experiment:latest"),
diff --git a/researchclaw/pipeline/_helpers.py b/researchclaw/pipeline/_helpers.py
index 18bd9ca8..4605aedd 100644
--- a/researchclaw/pipeline/_helpers.py
+++ b/researchclaw/pipeline/_helpers.py
@@ -694,17 +694,39 @@ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]:
     - ``UCB (Stochastic) cumulative_regret: 361.9233``
     - ``condition=name metric=value`` (per-condition output)
     - ``condition=name/metric_name metric=value``
+    - ``PER_SEED [...] (cond|condition)=NAME seed=N <key>: <value> ...``
+      (multi-metric per-seed lines from structured experiment output)
+    - ``CONDITION_SUMMARY [...] (cond|condition)=NAME <key>: <value> ...``
+    - ``GAP_TO_BN [...] (cond|condition)=NAME <key>: <value> ...``
+
+    Returns a flat dict of metric_name -> value. Per-condition / per-seed
+    metrics are namespaced ``"<cond>/<seed>/<key>"`` or ``"<cond>/<key>"``
+    so they cannot collide with bare ``<key>: <value>`` simple-pair output.
 
-    Returns a flat dict of metric_name -> value.
     Filters out log/status lines using :func:`is_metric_name`.
     """
     # BUG-173: regex for condition=name metric=value format
     _CONDITION_RE = re.compile(
         r"^condition=(\S+)\s+metric=([0-9eE.+-]+)\s*$"
     )
+    # Structured per-condition / per-seed line patterns. The cond=/condition=
+    # alias is needed because the stage-10 prompt names the token "condition="
+    # but real generated code often emits "cond=" (observed in the Phase-2
+    # sandbox readiness trial).
+    _STRUCTURED_PREFIXES = ("PER_SEED", "CONDITION_SUMMARY", "GAP_TO_BN")
+    _COND_KEY_RE = re.compile(r"\b(?:cond|condition)=(\S+)")
+    _SEED_KEY_RE = re.compile(r"\bseed=(\d+)")
+    # Matches "<word>: <number>" with optional sign / decimal / exponent.
+    _METRIC_PAIR_RE = re.compile(
+        r"\b(\w+):\s*([\-+]?\d+(?:\.\d+)?(?:[eE][\-+]?\d+)?)"
+    )
+    # Tokens that are line-tagging metadata, not metric keys; skipped in case
+    # generated code emits them as "<word>: <value>" (e.g. "seed: 0").
+    _STRUCTURED_NON_METRIC_KEYS = {"cond", "condition", "seed", "dataset"}
+
     metrics: dict[str, Any] = {}
-    for line in stdout.splitlines():
-        line = line.strip()
+    for raw_line in stdout.splitlines():
+        line = raw_line.strip()
         # --- Format 2: condition=xxx metric=yyy ---
         m = _CONDITION_RE.match(line)
         if m:
@@ -715,6 +737,23 @@ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]:
             except (ValueError, TypeError):
                 pass
             continue
+        # --- Format 3: structured per-condition lines ---
+        if line.startswith(_STRUCTURED_PREFIXES):
+            cond_match = _COND_KEY_RE.search(line)
+            if cond_match:
+                cond = cond_match.group(1)
+                seed_match = _SEED_KEY_RE.search(line)
+                seed_part = f"/{seed_match.group(1)}" if seed_match else ""
+                for key, val in _METRIC_PAIR_RE.findall(line):
+                    if key in _STRUCTURED_NON_METRIC_KEYS:
+                        continue
+                    try:
+                        metrics[f"{cond}{seed_part}/{key}"] = float(val)
+                    except (ValueError, TypeError):
+                        pass
+            # Structured lines are NOT also processed as simple <key>: <value>
+            # to avoid the leading prefix word being consumed as a metric name.
+            continue
         # --- Format 1: name: value ---
         if ":" not in line:
             continue
@@ -736,6 +775,71 @@ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]:
     return metrics
 
 
+def _flatten_structured_metrics(sr: Any) -> dict[str, float]:
+    """Flatten a sandbox-written ``results.json`` into a metric dict.
+
+    Handles four observed schemas:
+
+    1. ``{"metrics": {<key>: <number>}}`` — the auto-fallback shape.
+    2. ``{"conditions": [{"name": ..., "metrics": {...}}]}`` — list shape.
+    3. ``{"conditions": {<name>: {"metrics": {...}}}}`` — dict shape.
+    4. ``{"per_condition": ...}`` / ``{"condition_summaries": ...}`` — synonyms.
+
+    Per-condition keys are namespaced ``"<cond_name>/<metric_key>"``. The
+    first alias holding at least one dict entry wins, so a bare name list
+    (``"conditions": ["a", "b"]``) cannot hide data under ``per_condition``.
+    Stat blocks like ``{"accuracy": {"mean": 0.92}}`` are flattened one level
+    deep to ``<prefix>/accuracy_mean``; deeper nesting is intentionally ignored.
+
+    Non-numeric values (bools included) are silently dropped.
+    """
+    out: dict[str, float] = {}
+    if not isinstance(sr, dict):
+        return out
+
+    def _scrape_metrics_dict(d: dict, prefix: str) -> None:
+        """Append numeric leaves from a metric-dict to ``out`` with ``prefix/``."""
+        for key, val in d.items():
+            full = f"{prefix}/{key}" if prefix else str(key)
+            if isinstance(val, (int, float)) and not isinstance(val, bool):
+                out[full] = float(val)
+            elif isinstance(val, dict):
+                # One level only: ``accuracy: {mean: ..}`` → ``accuracy_mean``.
+                for sub_key, sub_val in val.items():
+                    if isinstance(sub_val, (int, float)) and not isinstance(sub_val, bool):
+                        out[f"{full}_{sub_key}"] = float(sub_val)
+
+    def _entry_name(entry: dict) -> str:
+        # Explicit None/"" checks so a legitimate ``"id": 0`` is not discarded.
+        named = [entry[f] for f in ("name", "id") if entry.get(f) not in (None, "")]
+        return str(named[0]) if named else "unknown"
+
+    # Shape 1: top-level metrics dict
+    top_metrics = sr.get("metrics")
+    if isinstance(top_metrics, dict):
+        _scrape_metrics_dict(top_metrics, prefix="")
+
+    # Shapes 2-4: first alias that actually contains per-condition dicts.
+    items: list[tuple[str, dict]] = []
+    for alias in ("conditions", "per_condition", "condition_summaries"):
+        container = sr.get(alias)
+        if isinstance(container, dict):
+            items = [(str(k), v) for k, v in container.items() if isinstance(v, dict)]
+        elif isinstance(container, list):
+            items = [(_entry_name(e), e) for e in container if isinstance(e, dict)]
+        if items:
+            break
+
+    for cond_name, cond_data in items:
+        # Prefer an explicit ``metrics`` sub-dict; fall back to the whole entry.
+        cond_metrics = cond_data.get("metrics")
+        if not isinstance(cond_metrics, dict):
+            cond_metrics = cond_data
+        _scrape_metrics_dict(cond_metrics, prefix=cond_name)
+
+    return out
+
+
 # ---------------------------------------------------------------------------
 # LLM helpers
 # ---------------------------------------------------------------------------
diff --git a/researchclaw/pipeline/stage_impls/_code_generation.py b/researchclaw/pipeline/stage_impls/_code_generation.py
index e6911d92..2af51f5c 100644
--- a/researchclaw/pipeline/stage_impls/_code_generation.py
+++ b/researchclaw/pipeline/stage_impls/_code_generation.py
@@ -321,9 +321,14 @@ def _execute_code_generation(
             else "none"  # sandbox mode has no network
         )
         if _net_policy == "none":
-            # Network disabled: inject strict offline-only guidance
+            # Network disabled: inject strict offline-only guidance.
+            # Pass the configured dataset cache root so the prompt points the
+            # model at the right pre-staged path; defaults to /opt/datasets.
             try:
-                extra_guidance += _pm.block("network_disabled_guidance")
+                extra_guidance += _pm.block(
+                    "network_disabled_guidance",
+                    dataset_cache_root=config.experiment.sandbox.dataset_cache_root,
+                )
             except Exception:  # noqa: BLE001
                 pass
         elif _net_policy == "full":
diff --git a/researchclaw/pipeline/stage_impls/_execution.py b/researchclaw/pipeline/stage_impls/_execution.py
index 711844d2..35eca97e 100644
--- a/researchclaw/pipeline/stage_impls/_execution.py
+++ b/researchclaw/pipeline/stage_impls/_execution.py
@@ -6,6 +6,7 @@
 import logging
 import math
 import re
+import shutil
 import time as _time
 from pathlib import Path
 from typing import Any
@@ -26,6 +27,7 @@
     _ensure_sandbox_deps,
     _extract_code_block,
     _extract_multi_file_blocks,
+    _flatten_structured_metrics,
     _get_evolution_overlay,
     _load_hardware_profile,
     _parse_metrics_from_stdout,
@@ -312,7 +314,17 @@ def _execute_experiment_run(
                         pass
             _ensure_sandbox_deps(_all_code, config.experiment.sandbox.python_path)
 
-        sandbox = create_sandbox(config.experiment, runs_dir / "sandbox")
+        # Clear the sandbox dir before run so stale results.json from a prior
+        # failed run can't contaminate the discovery glob below. Fail loudly
+        # on cleanup failure — silently ignoring would let the mtime anchor
+        # be old enough to admit stale files.
+        _sandbox_dir = runs_dir / "sandbox"
+        if _sandbox_dir.exists():
+            shutil.rmtree(_sandbox_dir)
+        _sandbox_dir.mkdir(parents=True)
+        _run_anchor_mtime = _sandbox_dir.stat().st_mtime
+
+        sandbox = create_sandbox(config.experiment, _sandbox_dir)
         # Use run_project for multi-file, run for single-file
         if exp_dir_path and Path(exp_dir_path).is_dir():
             result = sandbox.run_project(
@@ -322,11 +334,24 @@ def _execute_experiment_run(
             result = sandbox.run(
                 code_text, timeout_sec=config.experiment.time_budget_sec
             )
-        # Try to read structured results.json from sandbox working dir
+        # The sandbox subprocess writes to _project, _project_1, _project_2, …
+        # (an auto-suffixed working dir per run). Generated experiment code
+        # commonly writes results.json at either the project root (`_project/`)
+        # or under a results subdir (`_project/results/`). Check both; only
+        # accept candidates written after the current run started.
         structured_results: dict[str, Any] | None = None
-        sandbox_project = runs_dir / "sandbox" / "_project"
-        results_json_path = sandbox_project / "results.json"
-        if results_json_path.exists():
+        _candidates: list[Path] = []
+        for _proj in _sandbox_dir.glob("_project*"):
+            for _rel in ("results.json", "results/results.json"):
+                _rj = _proj / _rel
+                if _rj.is_file() and _rj.stat().st_mtime >= _run_anchor_mtime:
+                    _candidates.append(_rj)
+        results_json_path = (
+            max(_candidates, key=lambda p: p.stat().st_mtime)
+            if _candidates
+            else None
+        )
+        if results_json_path is not None:
             try:
                 structured_results = json.loads(
                     results_json_path.read_text(encoding="utf-8")
@@ -390,6 +415,14 @@ def _execute_experiment_run(
         }
         if structured_results is not None:
             run_payload["structured_results"] = structured_results
+            # Promote flattened per-condition metrics into the canonical metrics
+            # dict so downstream stages see them by default. Stdout-parsed
+            # metrics are kept; on a key collision the experiment's own
+            # results.json value wins because it is the authoritative source.
+            _sr_flat = _flatten_structured_metrics(structured_results)
+            if _sr_flat:
+                effective_metrics = {**(effective_metrics or {}), **_sr_flat}
+                run_payload["metrics"] = effective_metrics
         # Auto-generate results.json from parsed metrics if sandbox didn't produce one
         if structured_results is None and effective_metrics:
             auto_results = {"source": "stdout_parsed", "metrics": effective_metrics}
diff --git a/researchclaw/prompts/shared.py b/researchclaw/prompts/shared.py
index d42822ab..ab42b54e 100644
--- a/researchclaw/prompts/shared.py
+++ b/researchclaw/prompts/shared.py
@@ -288,20 +288,29 @@
         "\n## ⚠️ NO NETWORK ACCESS — CRITICAL CONSTRAINT ⚠️\n"
         "This experiment runs with network_policy='none'. There is NO network access\n"
         "at ANY phase (no pip install, no dataset downloads, no HTTP requests).\n\n"
-        "### ONLY these pre-cached datasets are available:\n"
-        "- `torchvision.datasets.CIFAR10(root='/opt/datasets', train=True/False, download=False)`\n"
-        "- `torchvision.datasets.CIFAR100(root='/opt/datasets', train=True/False, download=False)`\n"
-        "- `torchvision.datasets.MNIST(root='/opt/datasets', train=True/False, download=False)`\n"
-        "- `torchvision.datasets.FashionMNIST(root='/opt/datasets', train=True/False, download=False)`\n"
-        "- `torchvision.datasets.STL10(root='/opt/datasets', split='train'/'test', download=False)`\n"
-        "- `torchvision.datasets.SVHN(root='/opt/datasets', split='train'/'test', download=False)`\n\n"
+        "### Datasets are pre-cached at `{dataset_cache_root}`:\n"
+        "- `torchvision.datasets.CIFAR10(root='{dataset_cache_root}', train=True/False, download=False)`\n"
+        "- `torchvision.datasets.CIFAR100(root='{dataset_cache_root}', train=True/False, download=False)`\n"
+        "- `torchvision.datasets.MNIST(root='{dataset_cache_root}', train=True/False, download=False)`\n"
+        "- `torchvision.datasets.FashionMNIST(root='{dataset_cache_root}', train=True/False, download=False)`\n"
+        "- `torchvision.datasets.STL10(root='{dataset_cache_root}', split='train'/'test', download=False)`\n"
+        "- `torchvision.datasets.SVHN(root='{dataset_cache_root}', split='train'/'test', download=False)`\n\n"
         "### FORBIDDEN (will cause runtime failure):\n"
         "- Do NOT create setup.py (it cannot run without network)\n"
         "- Do NOT create requirements.txt (pip install is unavailable)\n"
         "- Do NOT use `download=True` on any dataset\n"
         "- Do NOT use `urllib`, `requests`, `httpx`, or any HTTP library\n"
         "- Do NOT use `datasets.load_dataset()` from HuggingFace (requires download)\n"
-        "- Do NOT import packages not pre-installed in the Docker image\n\n"
+        "- Do NOT import packages not pre-installed in the Docker image\n"
+        "- Do NOT silently fall back to synthetic data if a dataset file is missing —\n"
+        "  raise FileNotFoundError and exit non-zero instead. A failed dataset load is a\n"
+        "  real failure to surface, not something to paper over with random tensors.\n\n"
+        "### Required: dataset provenance stamp\n"
+        "After your code successfully loads a dataset, print a single line to stdout of\n"
+        "the form `DATASET_USED: <name>` (e.g. `DATASET_USED: MNIST`). Emit this stamp\n"
+        "exactly once, with no surrounding decoration. Downstream metric parsing relies\n"
+        "on this line as a dataset-provenance signal independent of the JSON results\n"
+        "schema your code chooses to produce.\n\n"
         "### Available pre-installed packages:\n"
         "torch, torchvision, torchaudio, numpy, scipy, sklearn, matplotlib, seaborn,\n"
         "pandas, tqdm, gymnasium, networkx, PyYAML, Pillow, timm, einops, torchmetrics,\n"
diff --git a/tests/test_rc_config.py b/tests/test_rc_config.py
index 2fa7afbe..ad406a8d 100644
--- a/tests/test_rc_config.py
+++ b/tests/test_rc_config.py
@@ -284,6 +284,35 @@ def test_sandbox_config_defaults_match_expected_values():
     assert defaults.gpu_required is False
     assert defaults.max_memory_mb == 4096
     assert "numpy" in defaults.allowed_imports
+    # New field defaults to the production sandbox image path; preserves the
+    # prior hardcoded value that lived only inside the prompt block.
+    assert defaults.dataset_cache_root == "/opt/datasets"
+
+
+def test_sandbox_config_dataset_cache_root_overrides_default(tmp_path: Path):
+    data = _valid_config_data()
+    data["experiment"] = {
+        "mode": "sandbox",
+        "sandbox": {"dataset_cache_root": "/tmp/arc_sandbox_trial/datasets"},
+    }
+
+    config = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
+
+    assert (
+        config.experiment.sandbox.dataset_cache_root
+        == "/tmp/arc_sandbox_trial/datasets"
+    )
+
+
+def test_sandbox_config_dataset_cache_root_falls_back_to_default(tmp_path: Path):
+    data = _valid_config_data()
+    # Omit dataset_cache_root entirely — the loader should fall back to the
+    # SandboxConfig default rather than raising or returning None.
+    data["experiment"] = {"mode": "sandbox", "sandbox": {}}
+
+    config = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
+
+    assert config.experiment.sandbox.dataset_cache_root == "/opt/datasets"
 
 
 def test_to_dict_roundtrip_rehydrates_equivalent_rcconfig(tmp_path: Path):
diff --git a/tests/test_rc_executor.py b/tests/test_rc_executor.py
index 133718c8..a9c3a027 100644
--- a/tests/test_rc_executor.py
+++ b/tests/test_rc_executor.py
@@ -1613,6 +1613,421 @@ def test_filters_long_name_lines(self) -> None:
         metrics = _parse_metrics_from_stdout(stdout)
         assert len(metrics) == 0
 
+    def test_parses_per_seed_with_cond_alias(self) -> None:
+        from researchclaw.pipeline.executor import _parse_metrics_from_stdout
+
+        stdout = (
+            "PER_SEED dataset=mnist cond=baseline_bn_mlp seed=0 "
+            "test_accuracy: 0.92 step_time_ms: 1.7"
+        )
+        metrics = _parse_metrics_from_stdout(stdout)
+        assert metrics["baseline_bn_mlp/0/test_accuracy"] == pytest.approx(0.92)
+        assert metrics["baseline_bn_mlp/0/step_time_ms"] == pytest.approx(1.7)
+        # Metadata tokens must NOT be captured as metrics.
+        assert "cond" not in metrics
+        assert "seed" not in metrics
+        assert "dataset" not in metrics
+
+    def test_parses_per_seed_with_condition_alias(self) -> None:
+        from researchclaw.pipeline.executor import _parse_metrics_from_stdout
+
+        stdout = (
+            "PER_SEED dataset=mnist condition=baseline_bn_mlp seed=1 "
+            "test_accuracy: 0.91"
+        )
+        metrics = _parse_metrics_from_stdout(stdout)
+        assert metrics["baseline_bn_mlp/1/test_accuracy"] == pytest.approx(0.91)
+
+    def test_parses_condition_summary(self) -> None:
+        from researchclaw.pipeline.executor import _parse_metrics_from_stdout
+
+        stdout = (
+            "CONDITION_SUMMARY dataset=mnist condition=bn_mlp "
+            "accuracy_mean: 0.92 accuracy_std: 0.005 success_rate: 1.0"
+        )
+        metrics = _parse_metrics_from_stdout(stdout)
+        assert metrics["bn_mlp/accuracy_mean"] == pytest.approx(0.92)
+        assert metrics["bn_mlp/accuracy_std"] == pytest.approx(0.005)
+        assert metrics["bn_mlp/success_rate"] == pytest.approx(1.0)
+
+    def test_parses_gap_to_bn_signed(self) -> None:
+        from researchclaw.pipeline.executor import _parse_metrics_from_stdout
+
+        stdout = (
+            "GAP_TO_BN dataset=mnist cond=rmsnorm_mlp "
+            "accuracy_gap_to_bn_baseline_mean: -0.01"
+        )
+        metrics = _parse_metrics_from_stdout(stdout)
+        assert metrics["rmsnorm_mlp/accuracy_gap_to_bn_baseline_mean"] == pytest.approx(-0.01)
+
+    def test_structured_line_without_cond_is_skipped(self) -> None:
+        from researchclaw.pipeline.executor import _parse_metrics_from_stdout
+
+        stdout = "PER_SEED seed=0 test_accuracy: 0.92"
+        metrics = _parse_metrics_from_stdout(stdout)
+        # No cond= or condition= token → row produces no metrics, no crash.
+        assert metrics == {}
+
+    def test_simple_pair_still_works_alongside_structured(self) -> None:
+        """Regression: bare ``key: value`` still parses even when the same
+        stdout also has structured PER_SEED lines."""
+        from researchclaw.pipeline.executor import _parse_metrics_from_stdout
+
+        stdout = (
+            "ANNEAL_STEP_THRESHOLD: 163\n"
+            "PER_SEED cond=bn_mlp seed=0 test_accuracy: 0.92"
+        )
+        metrics = _parse_metrics_from_stdout(stdout)
+        assert metrics["ANNEAL_STEP_THRESHOLD"] == pytest.approx(163.0)
+        assert metrics["bn_mlp/0/test_accuracy"] == pytest.approx(0.92)
+
+
+class TestFlattenStructuredMetrics:
+    """Tests for _flatten_structured_metrics() helper."""
+
+    def test_top_level_metrics_dict(self) -> None:
+        from researchclaw.pipeline._helpers import _flatten_structured_metrics
+
+        sr = {"metrics": {"acc": 0.92, "loss": 0.03}}
+        flat = _flatten_structured_metrics(sr)
+        assert flat == {"acc": 0.92, "loss": 0.03}
+
+    def test_conditions_dict_with_metrics_subkey(self) -> None:
+        """Schema 3: ``{"conditions": {<name>: {"metrics": {...}}}}``."""
+        from researchclaw.pipeline._helpers import _flatten_structured_metrics
+
+        sr = {
+            "conditions": {
+                "baseline_bn_mlp": {"metrics": {"accuracy": 0.92, "step_time_ms": 1.7}},
+                "baseline_rmsnorm_mlp": {"metrics": {"accuracy": 0.91}},
+            }
+        }
+        flat = _flatten_structured_metrics(sr)
+        assert flat["baseline_bn_mlp/accuracy"] == 0.92
+        assert flat["baseline_bn_mlp/step_time_ms"] == 1.7
+        assert flat["baseline_rmsnorm_mlp/accuracy"] == 0.91
+
+    def test_conditions_list_with_name(self) -> None:
+        """Schema 2: ``{"conditions": [{"name": ..., "metrics": {...}}]}``."""
+        from researchclaw.pipeline._helpers import _flatten_structured_metrics
+
+        sr = {
+            "conditions": [
+                {"name": "bn", "metrics": {"acc": 0.92}},
+                {"name": "rmsnorm", "metrics": {"acc": 0.91}},
+            ]
+        }
+        flat = _flatten_structured_metrics(sr)
+        assert flat == {"bn/acc": 0.92, "rmsnorm/acc": 0.91}
+
+    def test_condition_summaries_synonym(self) -> None:
+        """Schema 4: ``condition_summaries`` alias for ``conditions``."""
+        from researchclaw.pipeline._helpers import _flatten_structured_metrics
+
+        sr = {
+            "condition_summaries": {
+                "bn": {"metrics": {"acc": 0.92}},
+            }
+        }
+        flat = _flatten_structured_metrics(sr)
+        assert flat == {"bn/acc": 0.92}
+
+    def test_nested_stat_block_one_level_recursion(self) -> None:
+        """Shallow recursion: ``accuracy: {mean: 0.92, std: 0.01}`` becomes
+        ``<cond>/accuracy_mean`` and ``<cond>/accuracy_std``."""
+        from researchclaw.pipeline._helpers import _flatten_structured_metrics
+
+        sr = {
+            "conditions": {
+                "bn": {
+                    "metrics": {
+                        "accuracy": {"mean": 0.92, "std": 0.01},
+                        "loss": 0.03,
+                    }
+                }
+            }
+        }
+        flat = _flatten_structured_metrics(sr)
+        assert flat["bn/accuracy_mean"] == 0.92
+        assert flat["bn/accuracy_std"] == 0.01
+        assert flat["bn/loss"] == 0.03
+
+    def test_non_numeric_values_silently_dropped(self) -> None:
+        from researchclaw.pipeline._helpers import _flatten_structured_metrics
+
+        sr = {
+            "conditions": {
+                "bn": {
+                    "metrics": {
+                        "acc": 0.92,
+                        "device": "mps",  # string — dropped
+                        "converged": True,  # bool — dropped (bool is int subclass)
+                        "history": [0.1, 0.2],  # list — dropped
+                    }
+                }
+            }
+        }
+        flat = _flatten_structured_metrics(sr)
+        assert flat == {"bn/acc": 0.92}
+
+    def test_falls_back_to_whole_entry_when_no_metrics_subkey(self) -> None:
+        """If a condition entry has no ``metrics`` sub-dict, treat the whole
+        entry as the metric dict (drops non-numeric fields automatically)."""
+        from researchclaw.pipeline._helpers import _flatten_structured_metrics
+
+        sr = {
+            "conditions": {
+                "bn": {"acc": 0.92, "name": "bn"},  # no nested "metrics" key
+            }
+        }
+        flat = _flatten_structured_metrics(sr)
+        assert flat["bn/acc"] == 0.92
+        assert "bn/name" not in flat  # non-numeric, dropped
+
+    def test_non_dict_input_returns_empty(self) -> None:
+        from researchclaw.pipeline._helpers import _flatten_structured_metrics
+
+        assert _flatten_structured_metrics(None) == {}
+        assert _flatten_structured_metrics([1, 2, 3]) == {}
+        assert _flatten_structured_metrics("not-a-dict") == {}
+
+    def test_empty_dict_returns_empty(self) -> None:
+        from researchclaw.pipeline._helpers import _flatten_structured_metrics
+
+        assert _flatten_structured_metrics({}) == {}
+
+
+class TestSandboxResultsDiscovery:
+    """Tests for Patch A: sandbox cleanup + _project*/results.json discovery.
+
+    These exercise _execute_experiment_run's pre-run cleanup and structured
+    results lookup via a stub sandbox object, avoiding any reliance on real
+    subprocess execution.
+    """
+
+    def _make_stub_sandbox(
+        self,
+        results_payload: dict,
+        project_suffix: str = "_project_1",
+        results_relpath: str = "results.json",
+    ):
+        """Return a class that mimics enough of the sandbox protocol to
+        exercise the post-run results.json discovery glob.
+
+        On ``run_project``, the stub creates
+        ``<workdir>/<project_suffix>/<results_relpath>`` with the given payload
+        and returns a benign result object. ``results_relpath`` defaults to
+        ``"results.json"`` (project-root write) but may also be e.g.
+        ``"results/results.json"`` to exercise the subdir case.
+        """
+        from types import SimpleNamespace
+
+        class _StubSandbox:
+            def __init__(_self, workdir):
+                _self.workdir = workdir
+
+            def run_project(_self, project_path, timeout_sec):
+                target_path = _self.workdir / project_suffix / results_relpath
+                target_path.parent.mkdir(parents=True, exist_ok=True)
+                target_path.write_text(
+                    json.dumps(results_payload), encoding="utf-8"
+                )
+                return SimpleNamespace(
+                    metrics={},
+                    stdout="",
+                    stderr="",
+                    returncode=0,
+                    elapsed_sec=0.1,
+                    timed_out=False,
+                )
+
+            def run(_self, code, timeout_sec):  # pragma: no cover (multi-file path used)
+                return _self.run_project(None, timeout_sec)
+
+        return _StubSandbox
+
+    def _setup_run_dir(self, tmp_path):
+        """Build a minimal run dir with stage-09 exp_plan.yaml + stage-10 experiment/."""
+        run_dir = tmp_path / "run"
+        run_dir.mkdir()
+        s9 = run_dir / "stage-09"
+        s9.mkdir()
+        (s9 / "exp_plan.yaml").write_text("topic: t\nbaselines: [a]\n", encoding="utf-8")
+        s10 = run_dir / "stage-10" / "experiment"
+        s10.mkdir(parents=True)
+        (s10 / "main.py").write_text("print('hi')\n", encoding="utf-8")
+        # Stage 11 schedule (resource planning output, optional but harmless).
+        s11 = run_dir / "stage-11"
+        s11.mkdir()
+        (s11 / "schedule.json").write_text(
+            json.dumps({"tasks": [{"id": "t1"}]}), encoding="utf-8"
+        )
+        return run_dir
+
+    def _stub_config(self):
+        from researchclaw.config import (
+            ExperimentConfig, SandboxConfig, ExperimentRepairConfig,
+            OpenCodeConfig, CodeAgentConfig,
+        )
+
+        # Construct an ExperimentConfig with mode=sandbox and a python path.
+        sandbox = SandboxConfig(python_path="/usr/bin/env python3")
+        exp = ExperimentConfig(
+            mode="sandbox",
+            time_budget_sec=60,
+            max_iterations=1,
+            sandbox=sandbox,
+            repair=ExperimentRepairConfig(enabled=False),
+            opencode=OpenCodeConfig(enabled=False),
+            code_agent=CodeAgentConfig(enabled=False),
+        )
+        return exp
+
+    def test_discovers_results_in_project_1_subdir(
+        self, tmp_path, monkeypatch, rc_config, adapters
+    ):
+        """Primary correctness: even when the sandbox writes to ``_project_1``
+        (suffixed), discovery finds it and promotes structured metrics into
+        the run-1.json payload."""
+        run_dir = self._setup_run_dir(tmp_path)
+        stage_dir = run_dir / "stage-12"
+        stage_dir.mkdir()
+
+        # Stub create_sandbox to return our writer.
+        results_payload = {
+            "device": "cpu",
+            "dataset_used": "MNIST",
+            "conditions": {
+                "bn": {"metrics": {"acc": 0.92}},
+                "rmsnorm": {"metrics": {"acc": 0.91}},
+            },
+        }
+        StubSandbox = self._make_stub_sandbox(results_payload, project_suffix="_project_1")
+        monkeypatch.setattr(
+            "researchclaw.experiment.factory.create_sandbox",
+            lambda cfg, workdir: StubSandbox(workdir),
+        )
+        # Avoid the dep-install subprocess.
+        monkeypatch.setattr(
+            "researchclaw.pipeline.stage_impls._execution._ensure_sandbox_deps",
+            lambda code, py: [],
+        )
+
+        # Use a real-ish RCConfig but swap in our sandbox experiment config.
+        cfg = rc_config
+        cfg = cfg.__class__(
+            **{**cfg.__dict__, "experiment": self._stub_config()}
+        )
+
+        result = rc_executor._execute_experiment_run(
+            stage_dir, run_dir, cfg, adapters, llm=None
+        )
+
+        assert result.status == StageStatus.DONE
+        run1 = json.loads((stage_dir / "runs" / "run-1.json").read_text())
+        # Structured metrics are promoted into the canonical metrics dict.
+        assert run1["metrics"]["bn/acc"] == pytest.approx(0.92)
+        assert run1["metrics"]["rmsnorm/acc"] == pytest.approx(0.91)
+        # And the raw structured_results is also attached for downstream consumers.
+        assert run1["structured_results"]["dataset_used"] == "MNIST"
+        # runs/results.json was copied from the experiment-authored source
+        # (NOT the "stdout_parsed" auto-fallback).
+        results = json.loads((stage_dir / "runs" / "results.json").read_text())
+        assert results.get("source") != "stdout_parsed"
+        assert results["dataset_used"] == "MNIST"
+
+    def test_discovers_results_in_results_subdir(
+        self, tmp_path, monkeypatch, rc_config, adapters
+    ):
+        """Real generated experiments often write to a ``results/`` subdir
+        (observed in the Phase-2 sandbox readiness trial). Discovery must
+        check both ``<proj>/results.json`` and ``<proj>/results/results.json``."""
+        run_dir = self._setup_run_dir(tmp_path)
+        stage_dir = run_dir / "stage-12"
+        stage_dir.mkdir()
+
+        results_payload = {
+            "dataset_used": "MNIST",
+            "per_condition": {  # one of the schemas the flattener handles
+                "bn": {"accuracy_mean": 0.92, "accuracy_std": 0.005},
+            },
+        }
+        StubSandbox = self._make_stub_sandbox(
+            results_payload,
+            project_suffix="_project_1",
+            results_relpath="results/results.json",  # NESTED
+        )
+        monkeypatch.setattr(
+            "researchclaw.experiment.factory.create_sandbox",
+            lambda cfg, workdir: StubSandbox(workdir),
+        )
+        monkeypatch.setattr(
+            "researchclaw.pipeline.stage_impls._execution._ensure_sandbox_deps",
+            lambda code, py: [],
+        )
+
+        cfg = rc_config.__class__(
+            **{**rc_config.__dict__, "experiment": self._stub_config()}
+        )
+
+        result = rc_executor._execute_experiment_run(
+            stage_dir, run_dir, cfg, adapters, llm=None
+        )
+
+        assert result.status == StageStatus.DONE
+        run1 = json.loads((stage_dir / "runs" / "run-1.json").read_text())
+        # per_condition entry without an explicit "metrics" sub-dict — the
+        # flattener falls back to treating the entry as the metrics dict.
+        assert run1["metrics"]["bn/accuracy_mean"] == pytest.approx(0.92)
+        assert run1["metrics"]["bn/accuracy_std"] == pytest.approx(0.005)
+        results = json.loads((stage_dir / "runs" / "results.json").read_text())
+        assert results.get("source") != "stdout_parsed"
+        assert results["dataset_used"] == "MNIST"
+
+    def test_cleanup_removes_stale_sibling_before_run(
+        self, tmp_path, monkeypatch, rc_config, adapters
+    ):
+        """Regression: a leftover ``_project/results.json`` from a prior run
+        must NOT be returned by the discovery (sandbox dir is freshly created
+        before run, so no pre-existing files survive)."""
+        run_dir = self._setup_run_dir(tmp_path)
+        stage_dir = run_dir / "stage-12"
+        stage_dir.mkdir()
+        runs_dir = stage_dir / "runs"
+        # Plant a stale results.json from a "prior run" before the test invokes
+        # the executor. The cleanup at the top of the sandbox branch must wipe it.
+        stale_sandbox = runs_dir / "sandbox" / "_project"
+        stale_sandbox.mkdir(parents=True)
+        (stale_sandbox / "results.json").write_text(
+            json.dumps({"metrics": {"stale": 999.0}}), encoding="utf-8"
+        )
+
+        # Stub sandbox writes a FRESH results.json into _project_1 with different content.
+        fresh_payload = {"conditions": {"bn": {"metrics": {"acc": 0.5}}}}
+        StubSandbox = self._make_stub_sandbox(fresh_payload, project_suffix="_project_1")
+        monkeypatch.setattr(
+            "researchclaw.experiment.factory.create_sandbox",
+            lambda cfg, workdir: StubSandbox(workdir),
+        )
+        monkeypatch.setattr(
+            "researchclaw.pipeline.stage_impls._execution._ensure_sandbox_deps",
+            lambda code, py: [],
+        )
+
+        cfg = rc_config.__class__(
+            **{**rc_config.__dict__, "experiment": self._stub_config()}
+        )
+
+        result = rc_executor._execute_experiment_run(
+            stage_dir, run_dir, cfg, adapters, llm=None
+        )
+
+        assert result.status == StageStatus.DONE
+        run1 = json.loads((stage_dir / "runs" / "run-1.json").read_text())
+        # The planted "stale" metric must NOT appear; only fresh_payload's metrics should.
+        assert "stale" not in run1["metrics"]
+        assert run1["metrics"]["bn/acc"] == pytest.approx(0.5)
+
 
 class TestDetectRuntimeIssues:
     """Tests for _detect_runtime_issues() helper."""
diff --git a/tests/test_rc_prompts.py b/tests/test_rc_prompts.py
index 947ade60..cbb1ce00 100644
--- a/tests/test_rc_prompts.py
+++ b/tests/test_rc_prompts.py
@@ -148,6 +148,65 @@ def test_block_pkg_hint(self) -> None:
         assert "numpy" in block
         assert "torch" in block  # mentioned as prohibited
 
+    def test_block_network_disabled_guidance_uses_default_root(self) -> None:
+        # When the caller passes the SandboxConfig default, the rendered block
+        # still names /opt/datasets, matching the prior hardcoded value.
+        pm = PromptManager()
+        block = pm.block(
+            "network_disabled_guidance", dataset_cache_root="/opt/datasets"
+        )
+        # All six dataset examples should reference the configured root.
+        assert block.count("/opt/datasets") >= 6
+        # Critical invariants preserved.
+        assert "network_policy='none'" in block
+        assert "download=False" in block
+        assert "Do NOT use `download=True`" in block
+        # The unrendered placeholder must not leak through.
+        assert "{dataset_cache_root}" not in block
+
+    def test_block_network_disabled_guidance_uses_custom_root(self) -> None:
+        # A custom dataset cache root flows through every dataset example.
+        pm = PromptManager()
+        block = pm.block(
+            "network_disabled_guidance",
+            dataset_cache_root="/tmp/arc_sandbox_trial/datasets",
+        )
+        assert block.count("/tmp/arc_sandbox_trial/datasets") >= 6
+        # The previous hardcoded path must no longer appear.
+        assert "/opt/datasets" not in block
+        # Invariants still hold.
+        assert "download=False" in block
+        assert "Do NOT use `download=True`" in block
+        assert "{dataset_cache_root}" not in block
+
+    def test_block_network_disabled_guidance_forbids_synthetic_fallback(
+        self,
+    ) -> None:
+        # The fail-loud bullet is an intentional behavior change shipped with
+        # the dataset_cache_root parameterisation: missing pre-cached data must
+        # raise FileNotFoundError rather than silently substituting synthetic
+        # tensors. Lock it in for both default and custom cache roots.
+        pm = PromptManager()
+        for root in ("/opt/datasets", "/tmp/arc_sandbox_trial/datasets"):
+            block = pm.block("network_disabled_guidance", dataset_cache_root=root)
+            lowered = block.lower()
+            assert "fall back to synthetic data" in lowered
+            assert "filenotfounderror" in lowered
+            assert "exit non-zero" in lowered
+
+    def test_block_network_disabled_guidance_requires_dataset_used_stamp(
+        self,
+    ) -> None:
+        # Downstream metric capture relies on a single-line stdout stamp of
+        # the form `DATASET_USED: <name>` as a provenance signal independent
+        # of whatever JSON result schema the generated code chooses.
+        pm = PromptManager()
+        for root in ("/opt/datasets", "/tmp/arc_sandbox_trial/datasets"):
+            block = pm.block("network_disabled_guidance", dataset_cache_root=root)
+            assert "DATASET_USED:" in block
+            assert "dataset provenance stamp" in block.lower()
+            assert "exactly once" in block.lower()
+
     def test_sub_prompt_code_repair(self) -> None:
         pm = PromptManager()
         rp = pm.sub_prompt(