diff --git a/config.researchclaw.example.yaml b/config.researchclaw.example.yaml index 8377f9a7..fead43eb 100644 --- a/config.researchclaw.example.yaml +++ b/config.researchclaw.example.yaml @@ -71,6 +71,13 @@ experiment: python_path: ".venv/bin/python3" gpu_required: false max_memory_mb: 4096 + # Filesystem path under which torchvision dataset raw files are pre-staged. + # Generated experiment code is instructed to load datasets from this path + # with download=False; sandbox mode runs with network_policy="none" so no + # downloads are possible. If a dataset file is missing the experiment is + # required to raise FileNotFoundError and exit non-zero rather than fall + # back to synthetic data. Default matches the production sandbox image. + dataset_cache_root: "/opt/datasets" # Docker sandbox settings (only used when mode: "docker") # Build image first: docker build -t researchclaw/experiment:latest researchclaw/docker/ docker: diff --git a/researchclaw/config.py b/researchclaw/config.py index 3f620a67..560cc718 100644 --- a/researchclaw/config.py +++ b/researchclaw/config.py @@ -226,6 +226,12 @@ class SandboxConfig: "sklearn", ) max_memory_mb: int = 4096 + # Filesystem path under which torchvision dataset raw files are pre-staged + # for sandbox mode (network_policy="none"). Generated experiment code is + # instructed to load datasets with root=, download=False, and + # to raise FileNotFoundError rather than fall back to synthetic data if + # the cache is missing. Default matches the production sandbox image. + dataset_cache_root: str = "/opt/datasets" @dataclass(frozen=True) @@ -1251,6 +1257,9 @@ def _parse_experiment_config(data: dict[str, Any]) -> ExperimentConfig: sandbox_data.get("allowed_imports", SandboxConfig.allowed_imports) ), max_memory_mb=_safe_int(sandbox_data.get("max_memory_mb"), 4096), + dataset_cache_root=sandbox_data.get( + "dataset_cache_root", SandboxConfig.dataset_cache_root + ), ), docker=DockerSandboxConfig( image=docker_data.get("image", "researchclaw/experiment:latest"), diff --git a/researchclaw/pipeline/_helpers.py b/researchclaw/pipeline/_helpers.py index 18bd9ca8..4605aedd 100644 --- a/researchclaw/pipeline/_helpers.py +++ b/researchclaw/pipeline/_helpers.py @@ -694,17 +694,39 @@ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]: - ``UCB (Stochastic) cumulative_regret: 361.9233`` - ``condition=name metric=value`` (per-condition output) - ``condition=name/metric_name metric=value`` + - ``PER_SEED [...] (cond|condition)=NAME seed=N : ...`` + (multi-metric per-seed lines from structured experiment output) + - ``CONDITION_SUMMARY [...] (cond|condition)=NAME : ...`` + - ``GAP_TO_BN [...] (cond|condition)=NAME : ...`` + + Returns a flat dict of metric_name -> value. Per-condition / per-seed + metrics are namespaced ``"//"`` or ``"/"`` + so they cannot collide with bare ``: `` simple-pair output. - Returns a flat dict of metric_name -> value. Filters out log/status lines using :func:`is_metric_name`. """ # BUG-173: regex for condition=name metric=value format _CONDITION_RE = re.compile( r"^condition=(\S+)\s+metric=([0-9eE.+-]+)\s*$" ) + # Structured per-condition / per-seed line patterns. The cond=/condition= + # alias is needed because the stage-10 prompt names the token "condition=" + # but real generated code often emits "cond=" (observed in the Phase-2 + # sandbox readiness trial). + _STRUCTURED_PREFIXES = ("PER_SEED", "CONDITION_SUMMARY", "GAP_TO_BN") + _COND_KEY_RE = re.compile(r"\b(?:cond|condition)=(\S+)") + _SEED_KEY_RE = re.compile(r"\bseed=(\d+)") + # Matches ": " with optional sign / decimal / exponent. + _METRIC_PAIR_RE = re.compile( + r"\b(\w+):\s*([\-+]?\d+(?:\.\d+)?(?:[eE][\-+]?\d+)?)" + ) + # Tokens that are line-tagging metadata, not metric keys, even though they + # appear in the same ": " shape. + _STRUCTURED_NON_METRIC_KEYS = {"cond", "condition", "seed", "dataset"} + metrics: dict[str, Any] = {} - for line in stdout.splitlines(): - line = line.strip() + for raw_line in stdout.splitlines(): + line = raw_line.strip() # --- Format 2: condition=xxx metric=yyy --- m = _CONDITION_RE.match(line) if m: @@ -715,6 +737,23 @@ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]: except (ValueError, TypeError): pass continue + # --- Format 3: structured per-condition lines --- + if line.startswith(_STRUCTURED_PREFIXES): + cond_match = _COND_KEY_RE.search(line) + if cond_match: + cond = cond_match.group(1) + seed_match = _SEED_KEY_RE.search(line) + seed_part = f"/{seed_match.group(1)}" if seed_match else "" + for key, val in _METRIC_PAIR_RE.findall(line): + if key in _STRUCTURED_NON_METRIC_KEYS: + continue + try: + metrics[f"{cond}{seed_part}/{key}"] = float(val) + except (ValueError, TypeError): + pass + # Structured lines are NOT also processed as simple : + # to avoid the leading prefix word being consumed as a metric name. + continue # --- Format 1: name: value --- if ":" not in line: continue @@ -736,6 +775,71 @@ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]: return metrics +def _flatten_structured_metrics(sr: Any) -> dict[str, float]: + """Flatten a sandbox-written ``results.json`` into a metric dict. + + Handles four observed schemas: + + 1. ``{"metrics": {: }}`` — the auto-fallback shape. + 2. ``{"conditions": [{"name": ..., "metrics": {...}}]}`` — list shape. + 3. ``{"conditions": {: {"metrics": {...}}}}`` — dict shape. + 4. ``{"per_condition": ...}`` / ``{"condition_summaries": ...}`` — synonyms. + + Per-condition keys are namespaced ``"/"``. Inside + each condition's ``metrics`` dict, one-level-shallow recursion picks up + nested stat blocks like ``{"accuracy": {"mean": 0.92, "std": 0.01}}`` — + those become ``"/accuracy_mean"``, ``"/accuracy_std"``. + Deeper nesting is intentionally ignored to keep the helper predictable. + + Non-numeric values are silently dropped. + """ + out: dict[str, float] = {} + if not isinstance(sr, dict): + return out + + def _scrape_metrics_dict(d: dict, prefix: str) -> None: + """Append numeric leaves from a metric-dict to ``out`` with ``prefix/``.""" + for key, val in d.items(): + full = f"{prefix}/{key}" if prefix else str(key) + if isinstance(val, (int, float)) and not isinstance(val, bool): + out[full] = float(val) + elif isinstance(val, dict): + # One-level-shallow recursion: ``accuracy: {mean: 0.92, std: 0.01}`` + # → ``/accuracy_mean``, ``/accuracy_std``. + for sub_key, sub_val in val.items(): + if isinstance(sub_val, (int, float)) and not isinstance(sub_val, bool): + out[f"{full}_{sub_key}"] = float(sub_val) + + # Shape 1: top-level metrics dict + top_metrics = sr.get("metrics") + if isinstance(top_metrics, dict): + _scrape_metrics_dict(top_metrics, prefix="") + + # Shapes 2-4: per-condition container under one of several aliases. + container = ( + sr.get("conditions") + or sr.get("per_condition") + or sr.get("condition_summaries") + ) + items: list[tuple[str, dict]] = [] + if isinstance(container, dict): + items = [(str(k), v) for k, v in container.items() if isinstance(v, dict)] + elif isinstance(container, list): + for entry in container: + if isinstance(entry, dict): + name = str(entry.get("name") or entry.get("id") or "unknown") + items.append((name, entry)) + + for cond_name, cond_data in items: + # Prefer an explicit ``metrics`` sub-dict; fall back to the whole entry. + cond_metrics = cond_data.get("metrics") + if not isinstance(cond_metrics, dict): + cond_metrics = cond_data + _scrape_metrics_dict(cond_metrics, prefix=cond_name) + + return out + + # --------------------------------------------------------------------------- # LLM helpers # --------------------------------------------------------------------------- diff --git a/researchclaw/pipeline/stage_impls/_code_generation.py b/researchclaw/pipeline/stage_impls/_code_generation.py index e6911d92..2af51f5c 100644 --- a/researchclaw/pipeline/stage_impls/_code_generation.py +++ b/researchclaw/pipeline/stage_impls/_code_generation.py @@ -321,9 +321,14 @@ def _execute_code_generation( else "none" # sandbox mode has no network ) if _net_policy == "none": - # Network disabled: inject strict offline-only guidance + # Network disabled: inject strict offline-only guidance. + # Pass the configured dataset cache root so the prompt points the + # model at the right pre-staged path; defaults to /opt/datasets. try: - extra_guidance += _pm.block("network_disabled_guidance") + extra_guidance += _pm.block( + "network_disabled_guidance", + dataset_cache_root=config.experiment.sandbox.dataset_cache_root, + ) except Exception: # noqa: BLE001 pass elif _net_policy == "full": diff --git a/researchclaw/pipeline/stage_impls/_execution.py b/researchclaw/pipeline/stage_impls/_execution.py index 711844d2..35eca97e 100644 --- a/researchclaw/pipeline/stage_impls/_execution.py +++ b/researchclaw/pipeline/stage_impls/_execution.py @@ -6,6 +6,7 @@ import logging import math import re +import shutil import time as _time from pathlib import Path from typing import Any @@ -26,6 +27,7 @@ _ensure_sandbox_deps, _extract_code_block, _extract_multi_file_blocks, + _flatten_structured_metrics, _get_evolution_overlay, _load_hardware_profile, _parse_metrics_from_stdout, @@ -312,7 +314,17 @@ def _execute_experiment_run( pass _ensure_sandbox_deps(_all_code, config.experiment.sandbox.python_path) - sandbox = create_sandbox(config.experiment, runs_dir / "sandbox") + # Clear the sandbox dir before run so stale results.json from a prior + # failed run can't contaminate the discovery glob below. Fail loudly + # on cleanup failure — silently ignoring would let the mtime anchor + # be old enough to admit stale files. + _sandbox_dir = runs_dir / "sandbox" + if _sandbox_dir.exists(): + shutil.rmtree(_sandbox_dir) + _sandbox_dir.mkdir(parents=True) + _run_anchor_mtime = _sandbox_dir.stat().st_mtime + + sandbox = create_sandbox(config.experiment, _sandbox_dir) # Use run_project for multi-file, run for single-file if exp_dir_path and Path(exp_dir_path).is_dir(): result = sandbox.run_project( @@ -322,11 +334,24 @@ def _execute_experiment_run( result = sandbox.run( code_text, timeout_sec=config.experiment.time_budget_sec ) - # Try to read structured results.json from sandbox working dir + # The sandbox subprocess writes to _project, _project_1, _project_2, … + # (an auto-suffixed working dir per run). Generated experiment code + # commonly writes results.json at either the project root (`_project/`) + # or under a results subdir (`_project/results/`). Check both; only + # accept candidates written after the current run started. structured_results: dict[str, Any] | None = None - sandbox_project = runs_dir / "sandbox" / "_project" - results_json_path = sandbox_project / "results.json" - if results_json_path.exists(): + _candidates: list[Path] = [] + for _proj in _sandbox_dir.glob("_project*"): + for _rel in ("results.json", "results/results.json"): + _rj = _proj / _rel + if _rj.is_file() and _rj.stat().st_mtime >= _run_anchor_mtime: + _candidates.append(_rj) + results_json_path = ( + max(_candidates, key=lambda p: p.stat().st_mtime) + if _candidates + else None + ) + if results_json_path is not None: try: structured_results = json.loads( results_json_path.read_text(encoding="utf-8") @@ -390,6 +415,14 @@ def _execute_experiment_run( } if structured_results is not None: run_payload["structured_results"] = structured_results + # Promote flattened per-condition metrics into the canonical metrics + # dict so downstream stages see them by default. The experiment's + # own results.json is the authoritative source; stdout-parsed + # metrics are a fallback only. + _sr_flat = _flatten_structured_metrics(structured_results) + if _sr_flat: + effective_metrics = {**(effective_metrics or {}), **_sr_flat} + run_payload["metrics"] = effective_metrics # Auto-generate results.json from parsed metrics if sandbox didn't produce one if structured_results is None and effective_metrics: auto_results = {"source": "stdout_parsed", "metrics": effective_metrics} diff --git a/researchclaw/prompts/shared.py b/researchclaw/prompts/shared.py index d42822ab..ab42b54e 100644 --- a/researchclaw/prompts/shared.py +++ b/researchclaw/prompts/shared.py @@ -288,20 +288,29 @@ "\n## ⚠️ NO NETWORK ACCESS — CRITICAL CONSTRAINT ⚠️\n" "This experiment runs with network_policy='none'. There is NO network access\n" "at ANY phase (no pip install, no dataset downloads, no HTTP requests).\n\n" - "### ONLY these pre-cached datasets are available:\n" - "- `torchvision.datasets.CIFAR10(root='/opt/datasets', train=True/False, download=False)`\n" - "- `torchvision.datasets.CIFAR100(root='/opt/datasets', train=True/False, download=False)`\n" - "- `torchvision.datasets.MNIST(root='/opt/datasets', train=True/False, download=False)`\n" - "- `torchvision.datasets.FashionMNIST(root='/opt/datasets', train=True/False, download=False)`\n" - "- `torchvision.datasets.STL10(root='/opt/datasets', split='train'/'test', download=False)`\n" - "- `torchvision.datasets.SVHN(root='/opt/datasets', split='train'/'test', download=False)`\n\n" + "### Datasets are pre-cached at `{dataset_cache_root}`:\n" + "- `torchvision.datasets.CIFAR10(root='{dataset_cache_root}', train=True/False, download=False)`\n" + "- `torchvision.datasets.CIFAR100(root='{dataset_cache_root}', train=True/False, download=False)`\n" + "- `torchvision.datasets.MNIST(root='{dataset_cache_root}', train=True/False, download=False)`\n" + "- `torchvision.datasets.FashionMNIST(root='{dataset_cache_root}', train=True/False, download=False)`\n" + "- `torchvision.datasets.STL10(root='{dataset_cache_root}', split='train'/'test', download=False)`\n" + "- `torchvision.datasets.SVHN(root='{dataset_cache_root}', split='train'/'test', download=False)`\n\n" "### FORBIDDEN (will cause runtime failure):\n" "- Do NOT create setup.py (it cannot run without network)\n" "- Do NOT create requirements.txt (pip install is unavailable)\n" "- Do NOT use `download=True` on any dataset\n" "- Do NOT use `urllib`, `requests`, `httpx`, or any HTTP library\n" "- Do NOT use `datasets.load_dataset()` from HuggingFace (requires download)\n" - "- Do NOT import packages not pre-installed in the Docker image\n\n" + "- Do NOT import packages not pre-installed in the Docker image\n" + "- Do NOT silently fall back to synthetic data if a dataset file is missing —\n" + " raise FileNotFoundError and exit non-zero instead. A failed dataset load is a\n" + " real failure to surface, not something to paper over with random tensors.\n\n" + "### Required: dataset provenance stamp\n" + "After your code successfully loads a dataset, print a single line to stdout of\n" + "the form `DATASET_USED: ` (e.g. `DATASET_USED: MNIST`). Emit this stamp\n" + "exactly once, with no surrounding decoration. Downstream metric parsing relies\n" + "on this line as a dataset-provenance signal independent of the JSON results\n" + "schema your code chooses to produce.\n\n" "### Available pre-installed packages:\n" "torch, torchvision, torchaudio, numpy, scipy, sklearn, matplotlib, seaborn,\n" "pandas, tqdm, gymnasium, networkx, PyYAML, Pillow, timm, einops, torchmetrics,\n" diff --git a/tests/test_rc_config.py b/tests/test_rc_config.py index 2fa7afbe..ad406a8d 100644 --- a/tests/test_rc_config.py +++ b/tests/test_rc_config.py @@ -284,6 +284,35 @@ def test_sandbox_config_defaults_match_expected_values(): assert defaults.gpu_required is False assert defaults.max_memory_mb == 4096 assert "numpy" in defaults.allowed_imports + # New field defaults to the production sandbox image path; preserves the + # prior hardcoded value that lived only inside the prompt block. + assert defaults.dataset_cache_root == "/opt/datasets" + + +def test_sandbox_config_dataset_cache_root_overrides_default(tmp_path: Path): + data = _valid_config_data() + data["experiment"] = { + "mode": "sandbox", + "sandbox": {"dataset_cache_root": "/tmp/arc_sandbox_trial/datasets"}, + } + + config = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) + + assert ( + config.experiment.sandbox.dataset_cache_root + == "/tmp/arc_sandbox_trial/datasets" + ) + + +def test_sandbox_config_dataset_cache_root_falls_back_to_default(tmp_path: Path): + data = _valid_config_data() + # Omit dataset_cache_root entirely — the loader should fall back to the + # SandboxConfig default rather than raising or returning None. + data["experiment"] = {"mode": "sandbox", "sandbox": {}} + + config = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) + + assert config.experiment.sandbox.dataset_cache_root == "/opt/datasets" def test_to_dict_roundtrip_rehydrates_equivalent_rcconfig(tmp_path: Path): diff --git a/tests/test_rc_executor.py b/tests/test_rc_executor.py index 133718c8..a9c3a027 100644 --- a/tests/test_rc_executor.py +++ b/tests/test_rc_executor.py @@ -1613,6 +1613,421 @@ def test_filters_long_name_lines(self) -> None: metrics = _parse_metrics_from_stdout(stdout) assert len(metrics) == 0 + def test_parses_per_seed_with_cond_alias(self) -> None: + from researchclaw.pipeline.executor import _parse_metrics_from_stdout + + stdout = ( + "PER_SEED dataset=mnist cond=baseline_bn_mlp seed=0 " + "test_accuracy: 0.92 step_time_ms: 1.7" + ) + metrics = _parse_metrics_from_stdout(stdout) + assert metrics["baseline_bn_mlp/0/test_accuracy"] == pytest.approx(0.92) + assert metrics["baseline_bn_mlp/0/step_time_ms"] == pytest.approx(1.7) + # Metadata tokens must NOT be captured as metrics. + assert "cond" not in metrics + assert "seed" not in metrics + assert "dataset" not in metrics + + def test_parses_per_seed_with_condition_alias(self) -> None: + from researchclaw.pipeline.executor import _parse_metrics_from_stdout + + stdout = ( + "PER_SEED dataset=mnist condition=baseline_bn_mlp seed=1 " + "test_accuracy: 0.91" + ) + metrics = _parse_metrics_from_stdout(stdout) + assert metrics["baseline_bn_mlp/1/test_accuracy"] == pytest.approx(0.91) + + def test_parses_condition_summary(self) -> None: + from researchclaw.pipeline.executor import _parse_metrics_from_stdout + + stdout = ( + "CONDITION_SUMMARY dataset=mnist condition=bn_mlp " + "accuracy_mean: 0.92 accuracy_std: 0.005 success_rate: 1.0" + ) + metrics = _parse_metrics_from_stdout(stdout) + assert metrics["bn_mlp/accuracy_mean"] == pytest.approx(0.92) + assert metrics["bn_mlp/accuracy_std"] == pytest.approx(0.005) + assert metrics["bn_mlp/success_rate"] == pytest.approx(1.0) + + def test_parses_gap_to_bn_signed(self) -> None: + from researchclaw.pipeline.executor import _parse_metrics_from_stdout + + stdout = ( + "GAP_TO_BN dataset=mnist cond=rmsnorm_mlp " + "accuracy_gap_to_bn_baseline_mean: -0.01" + ) + metrics = _parse_metrics_from_stdout(stdout) + assert metrics["rmsnorm_mlp/accuracy_gap_to_bn_baseline_mean"] == pytest.approx(-0.01) + + def test_structured_line_without_cond_is_skipped(self) -> None: + from researchclaw.pipeline.executor import _parse_metrics_from_stdout + + stdout = "PER_SEED seed=0 test_accuracy: 0.92" + metrics = _parse_metrics_from_stdout(stdout) + # No cond= or condition= token → row produces no metrics, no crash. + assert metrics == {} + + def test_simple_pair_still_works_alongside_structured(self) -> None: + """Regression: bare ``key: value`` still parses even when the same + stdout also has structured PER_SEED lines.""" + from researchclaw.pipeline.executor import _parse_metrics_from_stdout + + stdout = ( + "ANNEAL_STEP_THRESHOLD: 163\n" + "PER_SEED cond=bn_mlp seed=0 test_accuracy: 0.92" + ) + metrics = _parse_metrics_from_stdout(stdout) + assert metrics["ANNEAL_STEP_THRESHOLD"] == pytest.approx(163.0) + assert metrics["bn_mlp/0/test_accuracy"] == pytest.approx(0.92) + + +class TestFlattenStructuredMetrics: + """Tests for _flatten_structured_metrics() helper.""" + + def test_top_level_metrics_dict(self) -> None: + from researchclaw.pipeline._helpers import _flatten_structured_metrics + + sr = {"metrics": {"acc": 0.92, "loss": 0.03}} + flat = _flatten_structured_metrics(sr) + assert flat == {"acc": 0.92, "loss": 0.03} + + def test_conditions_dict_with_metrics_subkey(self) -> None: + """Schema 3: ``{"conditions": {: {"metrics": {...}}}}``.""" + from researchclaw.pipeline._helpers import _flatten_structured_metrics + + sr = { + "conditions": { + "baseline_bn_mlp": {"metrics": {"accuracy": 0.92, "step_time_ms": 1.7}}, + "baseline_rmsnorm_mlp": {"metrics": {"accuracy": 0.91}}, + } + } + flat = _flatten_structured_metrics(sr) + assert flat["baseline_bn_mlp/accuracy"] == 0.92 + assert flat["baseline_bn_mlp/step_time_ms"] == 1.7 + assert flat["baseline_rmsnorm_mlp/accuracy"] == 0.91 + + def test_conditions_list_with_name(self) -> None: + """Schema 2: ``{"conditions": [{"name": ..., "metrics": {...}}]}``.""" + from researchclaw.pipeline._helpers import _flatten_structured_metrics + + sr = { + "conditions": [ + {"name": "bn", "metrics": {"acc": 0.92}}, + {"name": "rmsnorm", "metrics": {"acc": 0.91}}, + ] + } + flat = _flatten_structured_metrics(sr) + assert flat == {"bn/acc": 0.92, "rmsnorm/acc": 0.91} + + def test_condition_summaries_synonym(self) -> None: + """Schema 4: ``condition_summaries`` alias for ``conditions``.""" + from researchclaw.pipeline._helpers import _flatten_structured_metrics + + sr = { + "condition_summaries": { + "bn": {"metrics": {"acc": 0.92}}, + } + } + flat = _flatten_structured_metrics(sr) + assert flat == {"bn/acc": 0.92} + + def test_nested_stat_block_one_level_recursion(self) -> None: + """Shallow recursion: ``accuracy: {mean: 0.92, std: 0.01}`` becomes + ``/accuracy_mean`` and ``/accuracy_std``.""" + from researchclaw.pipeline._helpers import _flatten_structured_metrics + + sr = { + "conditions": { + "bn": { + "metrics": { + "accuracy": {"mean": 0.92, "std": 0.01}, + "loss": 0.03, + } + } + } + } + flat = _flatten_structured_metrics(sr) + assert flat["bn/accuracy_mean"] == 0.92 + assert flat["bn/accuracy_std"] == 0.01 + assert flat["bn/loss"] == 0.03 + + def test_non_numeric_values_silently_dropped(self) -> None: + from researchclaw.pipeline._helpers import _flatten_structured_metrics + + sr = { + "conditions": { + "bn": { + "metrics": { + "acc": 0.92, + "device": "mps", # string — dropped + "converged": True, # bool — dropped (bool is int subclass) + "history": [0.1, 0.2], # list — dropped + } + } + } + } + flat = _flatten_structured_metrics(sr) + assert flat == {"bn/acc": 0.92} + + def test_falls_back_to_whole_entry_when_no_metrics_subkey(self) -> None: + """If a condition entry has no ``metrics`` sub-dict, treat the whole + entry as the metric dict (drops non-numeric fields automatically).""" + from researchclaw.pipeline._helpers import _flatten_structured_metrics + + sr = { + "conditions": { + "bn": {"acc": 0.92, "name": "bn"}, # no nested "metrics" key + } + } + flat = _flatten_structured_metrics(sr) + assert flat["bn/acc"] == 0.92 + assert "bn/name" not in flat # non-numeric, dropped + + def test_non_dict_input_returns_empty(self) -> None: + from researchclaw.pipeline._helpers import _flatten_structured_metrics + + assert _flatten_structured_metrics(None) == {} + assert _flatten_structured_metrics([1, 2, 3]) == {} + assert _flatten_structured_metrics("not-a-dict") == {} + + def test_empty_dict_returns_empty(self) -> None: + from researchclaw.pipeline._helpers import _flatten_structured_metrics + + assert _flatten_structured_metrics({}) == {} + + +class TestSandboxResultsDiscovery: + """Tests for Patch A: sandbox cleanup + _project*/results.json discovery. + + These exercise _execute_experiment_run's pre-run cleanup and structured + results lookup via a stub sandbox object, avoiding any reliance on real + subprocess execution. + """ + + def _make_stub_sandbox( + self, + results_payload: dict, + project_suffix: str = "_project_1", + results_relpath: str = "results.json", + ): + """Return a class that mimics enough of the sandbox protocol to + exercise the post-run results.json discovery glob. + + On ``run_project``, the stub creates + ``//`` with the given payload + and returns a benign result object. ``results_relpath`` defaults to + ``"results.json"`` (project-root write) but may also be e.g. + ``"results/results.json"`` to exercise the subdir case. + """ + from types import SimpleNamespace + + class _StubSandbox: + def __init__(_self, workdir): + _self.workdir = workdir + + def run_project(_self, project_path, timeout_sec): + target_path = _self.workdir / project_suffix / results_relpath + target_path.parent.mkdir(parents=True, exist_ok=True) + target_path.write_text( + json.dumps(results_payload), encoding="utf-8" + ) + return SimpleNamespace( + metrics={}, + stdout="", + stderr="", + returncode=0, + elapsed_sec=0.1, + timed_out=False, + ) + + def run(_self, code, timeout_sec): # pragma: no cover (multi-file path used) + return _self.run_project(None, timeout_sec) + + return _StubSandbox + + def _setup_run_dir(self, tmp_path): + """Build a minimal run dir with stage-09 exp_plan.yaml + stage-10 experiment/.""" + run_dir = tmp_path / "run" + run_dir.mkdir() + s9 = run_dir / "stage-09" + s9.mkdir() + (s9 / "exp_plan.yaml").write_text("topic: t\nbaselines: [a]\n", encoding="utf-8") + s10 = run_dir / "stage-10" / "experiment" + s10.mkdir(parents=True) + (s10 / "main.py").write_text("print('hi')\n", encoding="utf-8") + # Stage 11 schedule (resource planning output, optional but harmless). + s11 = run_dir / "stage-11" + s11.mkdir() + (s11 / "schedule.json").write_text( + json.dumps({"tasks": [{"id": "t1"}]}), encoding="utf-8" + ) + return run_dir + + def _stub_config(self): + from researchclaw.config import ( + ExperimentConfig, SandboxConfig, ExperimentRepairConfig, + OpenCodeConfig, CodeAgentConfig, + ) + + # Construct an ExperimentConfig with mode=sandbox and a python path. + sandbox = SandboxConfig(python_path="/usr/bin/env python3") + exp = ExperimentConfig( + mode="sandbox", + time_budget_sec=60, + max_iterations=1, + sandbox=sandbox, + repair=ExperimentRepairConfig(enabled=False), + opencode=OpenCodeConfig(enabled=False), + code_agent=CodeAgentConfig(enabled=False), + ) + return exp + + def test_discovers_results_in_project_1_subdir( + self, tmp_path, monkeypatch, rc_config, adapters + ): + """Primary correctness: even when the sandbox writes to ``_project_1`` + (suffixed), discovery finds it and promotes structured metrics into + the run-1.json payload.""" + run_dir = self._setup_run_dir(tmp_path) + stage_dir = run_dir / "stage-12" + stage_dir.mkdir() + + # Stub create_sandbox to return our writer. + results_payload = { + "device": "cpu", + "dataset_used": "MNIST", + "conditions": { + "bn": {"metrics": {"acc": 0.92}}, + "rmsnorm": {"metrics": {"acc": 0.91}}, + }, + } + StubSandbox = self._make_stub_sandbox(results_payload, project_suffix="_project_1") + monkeypatch.setattr( + "researchclaw.experiment.factory.create_sandbox", + lambda cfg, workdir: StubSandbox(workdir), + ) + # Avoid the dep-install subprocess. + monkeypatch.setattr( + "researchclaw.pipeline.stage_impls._execution._ensure_sandbox_deps", + lambda code, py: [], + ) + + # Use a real-ish RCConfig but swap in our sandbox experiment config. + cfg = rc_config + cfg = cfg.__class__( + **{**cfg.__dict__, "experiment": self._stub_config()} + ) + + result = rc_executor._execute_experiment_run( + stage_dir, run_dir, cfg, adapters, llm=None + ) + + assert result.status == StageStatus.DONE + run1 = json.loads((stage_dir / "runs" / "run-1.json").read_text()) + # Structured metrics are promoted into the canonical metrics dict. + assert run1["metrics"]["bn/acc"] == pytest.approx(0.92) + assert run1["metrics"]["rmsnorm/acc"] == pytest.approx(0.91) + # And the raw structured_results is also attached for downstream consumers. + assert run1["structured_results"]["dataset_used"] == "MNIST" + # runs/results.json was copied from the experiment-authored source + # (NOT the "stdout_parsed" auto-fallback). + results = json.loads((stage_dir / "runs" / "results.json").read_text()) + assert results.get("source") != "stdout_parsed" + assert results["dataset_used"] == "MNIST" + + def test_discovers_results_in_results_subdir( + self, tmp_path, monkeypatch, rc_config, adapters + ): + """Real generated experiments often write to a ``results/`` subdir + (observed in the Phase-2 sandbox readiness trial). Discovery must + check both ``/results.json`` and ``/results/results.json``.""" + run_dir = self._setup_run_dir(tmp_path) + stage_dir = run_dir / "stage-12" + stage_dir.mkdir() + + results_payload = { + "dataset_used": "MNIST", + "per_condition": { # one of the schemas the flattener handles + "bn": {"accuracy_mean": 0.92, "accuracy_std": 0.005}, + }, + } + StubSandbox = self._make_stub_sandbox( + results_payload, + project_suffix="_project_1", + results_relpath="results/results.json", # NESTED + ) + monkeypatch.setattr( + "researchclaw.experiment.factory.create_sandbox", + lambda cfg, workdir: StubSandbox(workdir), + ) + monkeypatch.setattr( + "researchclaw.pipeline.stage_impls._execution._ensure_sandbox_deps", + lambda code, py: [], + ) + + cfg = rc_config.__class__( + **{**rc_config.__dict__, "experiment": self._stub_config()} + ) + + result = rc_executor._execute_experiment_run( + stage_dir, run_dir, cfg, adapters, llm=None + ) + + assert result.status == StageStatus.DONE + run1 = json.loads((stage_dir / "runs" / "run-1.json").read_text()) + # per_condition entry without an explicit "metrics" sub-dict — the + # flattener falls back to treating the entry as the metrics dict. + assert run1["metrics"]["bn/accuracy_mean"] == pytest.approx(0.92) + assert run1["metrics"]["bn/accuracy_std"] == pytest.approx(0.005) + results = json.loads((stage_dir / "runs" / "results.json").read_text()) + assert results.get("source") != "stdout_parsed" + assert results["dataset_used"] == "MNIST" + + def test_cleanup_removes_stale_sibling_before_run( + self, tmp_path, monkeypatch, rc_config, adapters + ): + """Regression: a leftover ``_project/results.json`` from a prior run + must NOT be returned by the discovery (sandbox dir is freshly created + before run, so no pre-existing files survive).""" + run_dir = self._setup_run_dir(tmp_path) + stage_dir = run_dir / "stage-12" + stage_dir.mkdir() + runs_dir = stage_dir / "runs" + # Plant a stale results.json from a "prior run" before the test invokes + # the executor. The cleanup at the top of the sandbox branch must wipe it. + stale_sandbox = runs_dir / "sandbox" / "_project" + stale_sandbox.mkdir(parents=True) + (stale_sandbox / "results.json").write_text( + json.dumps({"metrics": {"stale": 999.0}}), encoding="utf-8" + ) + + # Stub sandbox writes a FRESH results.json into _project_1 with different content. + fresh_payload = {"conditions": {"bn": {"metrics": {"acc": 0.5}}}} + StubSandbox = self._make_stub_sandbox(fresh_payload, project_suffix="_project_1") + monkeypatch.setattr( + "researchclaw.experiment.factory.create_sandbox", + lambda cfg, workdir: StubSandbox(workdir), + ) + monkeypatch.setattr( + "researchclaw.pipeline.stage_impls._execution._ensure_sandbox_deps", + lambda code, py: [], + ) + + cfg = rc_config.__class__( + **{**rc_config.__dict__, "experiment": self._stub_config()} + ) + + result = rc_executor._execute_experiment_run( + stage_dir, run_dir, cfg, adapters, llm=None + ) + + assert result.status == StageStatus.DONE + run1 = json.loads((stage_dir / "runs" / "run-1.json").read_text()) + # The stale "stale: 999.0" must NOT appear; only the fresh fresh_payload should. + assert "stale" not in run1["metrics"] + assert run1["metrics"]["bn/acc"] == pytest.approx(0.5) + class TestDetectRuntimeIssues: """Tests for _detect_runtime_issues() helper.""" diff --git a/tests/test_rc_prompts.py b/tests/test_rc_prompts.py index 947ade60..cbb1ce00 100644 --- a/tests/test_rc_prompts.py +++ b/tests/test_rc_prompts.py @@ -148,6 +148,65 @@ def test_block_pkg_hint(self) -> None: assert "numpy" in block assert "torch" in block # mentioned as prohibited + def test_block_network_disabled_guidance_uses_default_root(self) -> None: + # When the caller passes the SandboxConfig default the rendered block + # still names /opt/datasets, matching the prior hardcoded value. + pm = PromptManager() + block = pm.block( + "network_disabled_guidance", dataset_cache_root="/opt/datasets" + ) + # All six dataset examples should reference the configured root. + assert block.count("/opt/datasets") >= 6 + # Critical invariants preserved. + assert "network_policy='none'" in block + assert "download=False" in block + assert "Do NOT use `download=True`" in block + # The unrendered placeholder must not leak through. + assert "{dataset_cache_root}" not in block + + def test_block_network_disabled_guidance_uses_custom_root(self) -> None: + # A custom dataset cache root flows through every dataset example. + pm = PromptManager() + block = pm.block( + "network_disabled_guidance", + dataset_cache_root="/tmp/arc_sandbox_trial/datasets", + ) + assert block.count("/tmp/arc_sandbox_trial/datasets") >= 6 + # The previous hardcoded path must no longer appear. + assert "/opt/datasets" not in block + # Invariants still hold. + assert "download=False" in block + assert "Do NOT use `download=True`" in block + assert "{dataset_cache_root}" not in block + + def test_block_network_disabled_guidance_forbids_synthetic_fallback( + self, + ) -> None: + # The fail-loud bullet is an intentional behavior change shipped with + # the dataset_cache_root parameterisation: missing pre-cached data must + # raise FileNotFoundError rather than silently substituting synthetic + # tensors. Lock it in for both default and custom cache roots. + pm = PromptManager() + for root in ("/opt/datasets", "/tmp/arc_sandbox_trial/datasets"): + block = pm.block("network_disabled_guidance", dataset_cache_root=root) + lowered = block.lower() + assert "fall back to synthetic data" in lowered + assert "filenotfounderror" in lowered + assert "exit non-zero" in lowered + + def test_block_network_disabled_guidance_requires_dataset_used_stamp( + self, + ) -> None: + # Downstream metric capture relies on a single-line stdout stamp of + # the form `DATASET_USED: ` as a provenance signal independent + # of whatever JSON result schema the generated code chooses. + pm = PromptManager() + for root in ("/opt/datasets", "/tmp/arc_sandbox_trial/datasets"): + block = pm.block("network_disabled_guidance", dataset_cache_root=root) + assert "DATASET_USED:" in block + assert "dataset provenance stamp" in block.lower() + assert "exactly once" in block.lower() + def test_sub_prompt_code_repair(self) -> None: pm = PromptManager() rp = pm.sub_prompt(