Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions config.researchclaw.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ experiment:
python_path: ".venv/bin/python3"
gpu_required: false
max_memory_mb: 4096
# Filesystem path under which torchvision dataset raw files are pre-staged.
# Generated experiment code is instructed to load datasets from this path
# with download=False; sandbox mode runs with network_policy="none" so no
# downloads are possible. If a dataset file is missing the experiment is
# required to raise FileNotFoundError and exit non-zero rather than fall
# back to synthetic data. Default matches the production sandbox image.
dataset_cache_root: "/opt/datasets"
# Docker sandbox settings (only used when mode: "docker")
# Build image first: docker build -t researchclaw/experiment:latest researchclaw/docker/
docker:
Expand Down
9 changes: 9 additions & 0 deletions researchclaw/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,12 @@ class SandboxConfig:
"sklearn",
)
max_memory_mb: int = 4096
# Filesystem path under which torchvision dataset raw files are pre-staged
# for sandbox mode (network_policy="none"). Generated experiment code is
# instructed to load datasets with root=<this path>, download=False, and
# to raise FileNotFoundError rather than fall back to synthetic data if
# the cache is missing. Default matches the production sandbox image.
dataset_cache_root: str = "/opt/datasets"


@dataclass(frozen=True)
Expand Down Expand Up @@ -1251,6 +1257,9 @@ def _parse_experiment_config(data: dict[str, Any]) -> ExperimentConfig:
sandbox_data.get("allowed_imports", SandboxConfig.allowed_imports)
),
max_memory_mb=_safe_int(sandbox_data.get("max_memory_mb"), 4096),
dataset_cache_root=sandbox_data.get(
"dataset_cache_root", SandboxConfig.dataset_cache_root
),
),
docker=DockerSandboxConfig(
image=docker_data.get("image", "researchclaw/experiment:latest"),
Expand Down
110 changes: 107 additions & 3 deletions researchclaw/pipeline/_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,17 +694,39 @@ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]:
- ``UCB (Stochastic) cumulative_regret: 361.9233``
- ``condition=name metric=value`` (per-condition output)
- ``condition=name/metric_name metric=value``
- ``PER_SEED [...] (cond|condition)=NAME seed=N <key>: <value> ...``
(multi-metric per-seed lines from structured experiment output)
- ``CONDITION_SUMMARY [...] (cond|condition)=NAME <key>: <value> ...``
- ``GAP_TO_BN [...] (cond|condition)=NAME <key>: <value> ...``

Returns a flat dict of metric_name -> value. Per-condition / per-seed
metrics are namespaced ``"<cond>/<seed>/<key>"`` or ``"<cond>/<key>"``
so they cannot collide with bare ``<key>: <value>`` simple-pair output.

Returns a flat dict of metric_name -> value.
Filters out log/status lines using :func:`is_metric_name`.
"""
# BUG-173: regex for condition=name metric=value format
_CONDITION_RE = re.compile(
r"^condition=(\S+)\s+metric=([0-9eE.+-]+)\s*$"
)
# Structured per-condition / per-seed line patterns. The cond=/condition=
# alias is needed because the stage-10 prompt names the token "condition="
# but real generated code often emits "cond=" (observed in the Phase-2
# sandbox readiness trial).
_STRUCTURED_PREFIXES = ("PER_SEED", "CONDITION_SUMMARY", "GAP_TO_BN")
_COND_KEY_RE = re.compile(r"\b(?:cond|condition)=(\S+)")
_SEED_KEY_RE = re.compile(r"\bseed=(\d+)")
# Matches "<word>: <number>" with optional sign / decimal / exponent.
_METRIC_PAIR_RE = re.compile(
r"\b(\w+):\s*([\-+]?\d+(?:\.\d+)?(?:[eE][\-+]?\d+)?)"
)
# Tokens that are line-tagging metadata, not metric keys, even though they
# appear in the same "<word>: <value>" shape.
_STRUCTURED_NON_METRIC_KEYS = {"cond", "condition", "seed", "dataset"}

metrics: dict[str, Any] = {}
for line in stdout.splitlines():
line = line.strip()
for raw_line in stdout.splitlines():
line = raw_line.strip()
# --- Format 2: condition=xxx metric=yyy ---
m = _CONDITION_RE.match(line)
if m:
Expand All @@ -715,6 +737,23 @@ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]:
except (ValueError, TypeError):
pass
continue
# --- Format 3: structured per-condition lines ---
if line.startswith(_STRUCTURED_PREFIXES):
cond_match = _COND_KEY_RE.search(line)
if cond_match:
cond = cond_match.group(1)
seed_match = _SEED_KEY_RE.search(line)
seed_part = f"/{seed_match.group(1)}" if seed_match else ""
for key, val in _METRIC_PAIR_RE.findall(line):
if key in _STRUCTURED_NON_METRIC_KEYS:
continue
try:
metrics[f"{cond}{seed_part}/{key}"] = float(val)
except (ValueError, TypeError):
pass
# Structured lines are NOT also processed as simple <key>: <value>
# to avoid the leading prefix word being consumed as a metric name.
continue
# --- Format 1: name: value ---
if ":" not in line:
continue
Expand All @@ -736,6 +775,71 @@ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]:
return metrics


def _flatten_structured_metrics(sr: Any) -> dict[str, float]:
"""Flatten a sandbox-written ``results.json`` into a metric dict.

Handles four observed schemas:

1. ``{"metrics": {<key>: <number>}}`` — the auto-fallback shape.
2. ``{"conditions": [{"name": ..., "metrics": {...}}]}`` — list shape.
3. ``{"conditions": {<name>: {"metrics": {...}}}}`` — dict shape.
4. ``{"per_condition": ...}`` / ``{"condition_summaries": ...}`` — synonyms.

Per-condition keys are namespaced ``"<cond_name>/<metric_key>"``. Inside
each condition's ``metrics`` dict, one-level-shallow recursion picks up
nested stat blocks like ``{"accuracy": {"mean": 0.92, "std": 0.01}}`` —
those become ``"<cond>/accuracy_mean"``, ``"<cond>/accuracy_std"``.
Deeper nesting is intentionally ignored to keep the helper predictable.

Non-numeric values are silently dropped.
"""
out: dict[str, float] = {}
if not isinstance(sr, dict):
return out

def _scrape_metrics_dict(d: dict, prefix: str) -> None:
"""Append numeric leaves from a metric-dict to ``out`` with ``prefix/``."""
for key, val in d.items():
full = f"{prefix}/{key}" if prefix else str(key)
if isinstance(val, (int, float)) and not isinstance(val, bool):
out[full] = float(val)
elif isinstance(val, dict):
# One-level-shallow recursion: ``accuracy: {mean: 0.92, std: 0.01}``
# → ``<prefix>/accuracy_mean``, ``<prefix>/accuracy_std``.
for sub_key, sub_val in val.items():
if isinstance(sub_val, (int, float)) and not isinstance(sub_val, bool):
out[f"{full}_{sub_key}"] = float(sub_val)

# Shape 1: top-level metrics dict
top_metrics = sr.get("metrics")
if isinstance(top_metrics, dict):
_scrape_metrics_dict(top_metrics, prefix="")

# Shapes 2-4: per-condition container under one of several aliases.
container = (
sr.get("conditions")
or sr.get("per_condition")
or sr.get("condition_summaries")
)
items: list[tuple[str, dict]] = []
if isinstance(container, dict):
items = [(str(k), v) for k, v in container.items() if isinstance(v, dict)]
elif isinstance(container, list):
for entry in container:
if isinstance(entry, dict):
name = str(entry.get("name") or entry.get("id") or "unknown")
items.append((name, entry))

for cond_name, cond_data in items:
# Prefer an explicit ``metrics`` sub-dict; fall back to the whole entry.
cond_metrics = cond_data.get("metrics")
if not isinstance(cond_metrics, dict):
cond_metrics = cond_data
_scrape_metrics_dict(cond_metrics, prefix=cond_name)

return out


# ---------------------------------------------------------------------------
# LLM helpers
# ---------------------------------------------------------------------------
Expand Down
9 changes: 7 additions & 2 deletions researchclaw/pipeline/stage_impls/_code_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,9 +321,14 @@ def _execute_code_generation(
else "none" # sandbox mode has no network
)
if _net_policy == "none":
# Network disabled: inject strict offline-only guidance
# Network disabled: inject strict offline-only guidance.
# Pass the configured dataset cache root so the prompt points the
# model at the right pre-staged path; defaults to /opt/datasets.
try:
extra_guidance += _pm.block("network_disabled_guidance")
extra_guidance += _pm.block(
"network_disabled_guidance",
dataset_cache_root=config.experiment.sandbox.dataset_cache_root,
)
except Exception: # noqa: BLE001
pass
elif _net_policy == "full":
Expand Down
43 changes: 38 additions & 5 deletions researchclaw/pipeline/stage_impls/_execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import logging
import math
import re
import shutil
import time as _time
from pathlib import Path
from typing import Any
Expand All @@ -26,6 +27,7 @@
_ensure_sandbox_deps,
_extract_code_block,
_extract_multi_file_blocks,
_flatten_structured_metrics,
_get_evolution_overlay,
_load_hardware_profile,
_parse_metrics_from_stdout,
Expand Down Expand Up @@ -312,7 +314,17 @@ def _execute_experiment_run(
pass
_ensure_sandbox_deps(_all_code, config.experiment.sandbox.python_path)

sandbox = create_sandbox(config.experiment, runs_dir / "sandbox")
# Clear the sandbox dir before run so stale results.json from a prior
# failed run can't contaminate the discovery glob below. Fail loudly
# on cleanup failure — silently ignoring would let the mtime anchor
# be old enough to admit stale files.
_sandbox_dir = runs_dir / "sandbox"
if _sandbox_dir.exists():
shutil.rmtree(_sandbox_dir)
_sandbox_dir.mkdir(parents=True)
_run_anchor_mtime = _sandbox_dir.stat().st_mtime

sandbox = create_sandbox(config.experiment, _sandbox_dir)
# Use run_project for multi-file, run for single-file
if exp_dir_path and Path(exp_dir_path).is_dir():
result = sandbox.run_project(
Expand All @@ -322,11 +334,24 @@ def _execute_experiment_run(
result = sandbox.run(
code_text, timeout_sec=config.experiment.time_budget_sec
)
# Try to read structured results.json from sandbox working dir
# The sandbox subprocess writes to _project, _project_1, _project_2, …
# (an auto-suffixed working dir per run). Generated experiment code
# commonly writes results.json at either the project root (`_project/`)
# or under a results subdir (`_project/results/`). Check both; only
# accept candidates written after the current run started.
structured_results: dict[str, Any] | None = None
sandbox_project = runs_dir / "sandbox" / "_project"
results_json_path = sandbox_project / "results.json"
if results_json_path.exists():
_candidates: list[Path] = []
for _proj in _sandbox_dir.glob("_project*"):
for _rel in ("results.json", "results/results.json"):
_rj = _proj / _rel
if _rj.is_file() and _rj.stat().st_mtime >= _run_anchor_mtime:
_candidates.append(_rj)
results_json_path = (
max(_candidates, key=lambda p: p.stat().st_mtime)
if _candidates
else None
)
if results_json_path is not None:
try:
structured_results = json.loads(
results_json_path.read_text(encoding="utf-8")
Expand Down Expand Up @@ -390,6 +415,14 @@ def _execute_experiment_run(
}
if structured_results is not None:
run_payload["structured_results"] = structured_results
# Promote flattened per-condition metrics into the canonical metrics
# dict so downstream stages see them by default. The experiment's
# own results.json is the authoritative source; stdout-parsed
# metrics are a fallback only.
_sr_flat = _flatten_structured_metrics(structured_results)
if _sr_flat:
effective_metrics = {**(effective_metrics or {}), **_sr_flat}
run_payload["metrics"] = effective_metrics
# Auto-generate results.json from parsed metrics if sandbox didn't produce one
if structured_results is None and effective_metrics:
auto_results = {"source": "stdout_parsed", "metrics": effective_metrics}
Expand Down
25 changes: 17 additions & 8 deletions researchclaw/prompts/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,20 +288,29 @@
"\n## ⚠️ NO NETWORK ACCESS — CRITICAL CONSTRAINT ⚠️\n"
"This experiment runs with network_policy='none'. There is NO network access\n"
"at ANY phase (no pip install, no dataset downloads, no HTTP requests).\n\n"
"### ONLY these pre-cached datasets are available:\n"
"- `torchvision.datasets.CIFAR10(root='/opt/datasets', train=True/False, download=False)`\n"
"- `torchvision.datasets.CIFAR100(root='/opt/datasets', train=True/False, download=False)`\n"
"- `torchvision.datasets.MNIST(root='/opt/datasets', train=True/False, download=False)`\n"
"- `torchvision.datasets.FashionMNIST(root='/opt/datasets', train=True/False, download=False)`\n"
"- `torchvision.datasets.STL10(root='/opt/datasets', split='train'/'test', download=False)`\n"
"- `torchvision.datasets.SVHN(root='/opt/datasets', split='train'/'test', download=False)`\n\n"
"### Datasets are pre-cached at `{dataset_cache_root}`:\n"
"- `torchvision.datasets.CIFAR10(root='{dataset_cache_root}', train=True/False, download=False)`\n"
"- `torchvision.datasets.CIFAR100(root='{dataset_cache_root}', train=True/False, download=False)`\n"
"- `torchvision.datasets.MNIST(root='{dataset_cache_root}', train=True/False, download=False)`\n"
"- `torchvision.datasets.FashionMNIST(root='{dataset_cache_root}', train=True/False, download=False)`\n"
"- `torchvision.datasets.STL10(root='{dataset_cache_root}', split='train'/'test', download=False)`\n"
"- `torchvision.datasets.SVHN(root='{dataset_cache_root}', split='train'/'test', download=False)`\n\n"
"### FORBIDDEN (will cause runtime failure):\n"
"- Do NOT create setup.py (it cannot run without network)\n"
"- Do NOT create requirements.txt (pip install is unavailable)\n"
"- Do NOT use `download=True` on any dataset\n"
"- Do NOT use `urllib`, `requests`, `httpx`, or any HTTP library\n"
"- Do NOT use `datasets.load_dataset()` from HuggingFace (requires download)\n"
"- Do NOT import packages not pre-installed in the Docker image\n\n"
"- Do NOT import packages not pre-installed in the Docker image\n"
"- Do NOT silently fall back to synthetic data if a dataset file is missing —\n"
" raise FileNotFoundError and exit non-zero instead. A failed dataset load is a\n"
" real failure to surface, not something to paper over with random tensors.\n\n"
"### Required: dataset provenance stamp\n"
"After your code successfully loads a dataset, print a single line to stdout of\n"
"the form `DATASET_USED: <name>` (e.g. `DATASET_USED: MNIST`). Emit this stamp\n"
"exactly once, with no surrounding decoration. Downstream metric parsing relies\n"
"on this line as a dataset-provenance signal independent of the JSON results\n"
"schema your code chooses to produce.\n\n"
"### Available pre-installed packages:\n"
"torch, torchvision, torchaudio, numpy, scipy, sklearn, matplotlib, seaborn,\n"
"pandas, tqdm, gymnasium, networkx, PyYAML, Pillow, timm, einops, torchmetrics,\n"
Expand Down
29 changes: 29 additions & 0 deletions tests/test_rc_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,35 @@ def test_sandbox_config_defaults_match_expected_values():
assert defaults.gpu_required is False
assert defaults.max_memory_mb == 4096
assert "numpy" in defaults.allowed_imports
# New field defaults to the production sandbox image path; preserves the
# prior hardcoded value that lived only inside the prompt block.
assert defaults.dataset_cache_root == "/opt/datasets"


def test_sandbox_config_dataset_cache_root_overrides_default(tmp_path: Path):
    """An explicit ``dataset_cache_root`` in the sandbox section wins over the default."""
    sandbox_section = {"dataset_cache_root": "/tmp/arc_sandbox_trial/datasets"}
    raw = _valid_config_data()
    raw["experiment"] = {"mode": "sandbox", "sandbox": sandbox_section}

    loaded = RCConfig.from_dict(raw, project_root=tmp_path, check_paths=False)

    cache_root = loaded.experiment.sandbox.dataset_cache_root
    assert cache_root == "/tmp/arc_sandbox_trial/datasets"


def test_sandbox_config_dataset_cache_root_falls_back_to_default(tmp_path: Path):
    """A sandbox section without ``dataset_cache_root`` yields the built-in default."""
    raw = _valid_config_data()
    # The key is left out on purpose: loading must neither raise nor produce
    # None, but use the SandboxConfig default.
    raw["experiment"] = {"mode": "sandbox", "sandbox": {}}

    loaded = RCConfig.from_dict(raw, project_root=tmp_path, check_paths=False)

    cache_root = loaded.experiment.sandbox.dataset_cache_root
    assert cache_root == "/opt/datasets"


def test_to_dict_roundtrip_rehydrates_equivalent_rcconfig(tmp_path: Path):
Expand Down
Loading