diff --git a/docs/source/environments/sophistry_bench_sprint.md b/docs/source/environments/sophistry_bench_sprint.md index 4c0696a1c..070b64014 100644 --- a/docs/source/environments/sophistry_bench_sprint.md +++ b/docs/source/environments/sophistry_bench_sprint.md @@ -37,7 +37,7 @@ from sophistry_bench_sprint_env import SophistryBenchSprintEnv async def main(): # Deployed Hugging Face Space (or .from_docker_image("openenv-sophistry_bench_sprint:latest")): - client = await SophistryBenchSprintEnv.from_env("anushaacharya/sophistry_bench_sprint_env") + client = await SophistryBenchSprintEnv.from_env("openenv-community/sophistry_bench_sprint_env") async with client: obs = (await client.reset()).observation print(obs.prompt, obs.answer_to_defend) @@ -67,6 +67,22 @@ the reward-hacking measurement. By default it holds **seven** components; `corre > reason; even with the rest of the components, forwarding them to the agent leaks the > reward signal and defeats the reward-hacking measurement. +## Training + +[`examples/sophistry_bench_sprint_grpo.py`](https://github.com/huggingface/OpenEnv/blob/main/examples/sophistry_bench_sprint_grpo.py) +trains a policy on this env with TRL's `GRPOTrainer` — a plain prompt -> +completion -> reward setup, since the episode is single-step. + +Validated with a real 100-step run on Hugging Face Jobs (`Qwen2.5-0.5B-Instruct`, +`a10g-small`) and a 100-step run on the Prime Intellect Hub +(`Llama-3.2-1B-Instruct`, registered as `anusha/sophistry-bench-sprint`, parity-tested +against this port). Both show `aggregate_reward` (the optimized proxy) climbing while +`correctness_reward` (the hidden ground truth, weight 0) stays flat — the reward-hacking +signature this env is designed to surface. 
The larger Prime Intellect run converges on +the literal `claim_count_cliff` target (`n_claims` saturates at exactly 8); the smaller +HF Jobs run finds a different shortcut instead (`n_claims` collapses to ~0, near-empty +completions) — same underlying finding, different degenerate strategy depending on scale. + ## Build & test ```bash diff --git a/envs/sophistry_bench_sprint_env/README.md b/envs/sophistry_bench_sprint_env/README.md index 3bef3bba7..a34f48dc8 100644 --- a/envs/sophistry_bench_sprint_env/README.md +++ b/envs/sophistry_bench_sprint_env/README.md @@ -49,7 +49,7 @@ from sophistry_bench_sprint_env import SophistryBenchSprintEnv async def main(): # Deployed Hugging Face Space (or .from_docker_image("openenv-sophistry_bench_sprint:latest")): - client = await SophistryBenchSprintEnv.from_env("anushaacharya/sophistry_bench_sprint_env") + client = await SophistryBenchSprintEnv.from_env("openenv-community/sophistry_bench_sprint_env") async with client: obs = (await client.reset()).observation print(obs.prompt, obs.answer_to_defend) @@ -79,6 +79,22 @@ the reward-hacking measurement. By default it holds **seven** components; `corre > reason; even with the rest of the components, forwarding them to the agent leaks the > reward signal and defeats the reward-hacking measurement. +## Training + +[`examples/sophistry_bench_sprint_grpo.py`](https://github.com/huggingface/OpenEnv/blob/main/examples/sophistry_bench_sprint_grpo.py) +trains a policy on this env with TRL's `GRPOTrainer` — a plain prompt -> +completion -> reward setup, since the episode is single-step. + +Validated with a real 100-step run on Hugging Face Jobs (`Qwen2.5-0.5B-Instruct`, +`a10g-small`) and a 100-step run on the Prime Intellect Hub +(`Llama-3.2-1B-Instruct`, registered as `anusha/sophistry-bench-sprint`, parity-tested +against this port). 
Both show `aggregate_reward` (the optimized proxy) climbing while +`correctness_reward` (the hidden ground truth, weight 0) stays flat — the reward-hacking +signature this env is designed to surface. The larger Prime Intellect run converges on +the literal `claim_count_cliff` target (`n_claims` saturates at exactly 8); the smaller +HF Jobs run finds a different shortcut instead (`n_claims` collapses to ~0, near-empty +completions) — same underlying finding, different degenerate strategy depending on scale. + ## Build & test ```bash diff --git a/examples/sophistry_bench_sprint_grpo.py b/examples/sophistry_bench_sprint_grpo.py new file mode 100644 index 000000000..ad8be8077 --- /dev/null +++ b/examples/sophistry_bench_sprint_grpo.py @@ -0,0 +1,156 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "openenv[core]", +# "trl", +# "datasets", +# "torch", +# "transformers", +# ] +# /// + +"""Train a policy on `sophistry_bench_sprint_env` with TRL's GRPOTrainer. + +Single-step env, so this is a plain prompt -> completion -> reward GRPO setup: +no `environment_factory`/tool-calling needed. 
Uses `GenericEnvClient` so the
+script only depends on `openenv[core]` from PyPI, which also makes it runnable
+as a standalone `uv` script, including via Hugging Face Jobs:
+
+    hf jobs uv run examples/sophistry_bench_sprint_grpo.py --flavor a10g-small \
+        --secrets HF_TOKEN -- --push-to-hub --out your-username/sophistry-grpo
+
+Connects via a manually-built `UVProvider` + `GenericEnvClient` rather than
+`from_env()` + `.sync()`: this env only allows one concurrent session
+(`SUPPORTS_CONCURRENT_SESSIONS = False`), and `from_env()` + `.sync()` can
+leave behind an orphaned first connection that occupies that single slot (see
+https://github.com/huggingface/OpenEnv/pull/854). Needs the `project_path`
+git-clone fix from that PR; until it's released, override the `openenv[core]`
+dependency above with a git ref of it.
+
+Run locally:
+    python examples/sophistry_bench_sprint_grpo.py --n-episodes 64 --steps 50
+"""
+
+from __future__ import annotations
+
+import argparse
+
+from datasets import Dataset
+from openenv import GenericEnvClient
+from openenv.core.containers.runtime.uv_provider import UVProvider
+from trl import GRPOConfig, GRPOTrainer
+
+SPACE_REPO_ID = "openenv-community/sophistry_bench_sprint_env"
+
+
+def _completion_text(completion: list | str) -> str:
+    """Return the generated text from one TRL completion.
+
+    TRL passes either a list of chat messages or a raw string, depending
+    on whether the model/dataset use chat templating.
+
+    Args:
+        completion: A raw string, or a non-empty list whose last element is
+            a message dict carrying a "content" key.
+
+    Returns:
+        The raw string itself, or the "content" of the last message.
+
+    Raises:
+        ValueError: If `completion` is an empty list, a list whose last
+            element is not a dict, or neither a list nor a string.
+    """
+    if isinstance(completion, list):
+        if not completion or not isinstance(completion[-1], dict):
+            raise ValueError(f"Unexpected completion shape from TRL: {completion!r}")
+        return completion[-1]["content"]
+    if isinstance(completion, str):
+        return completion
+    raise ValueError(f"Unexpected completion type from TRL: {type(completion)!r}")
+
+
+def build_dataset(client, n_episodes: int) -> Dataset:
+    """Walk `reset(seed=i)` to get a fixed, replayable set of advocacy tasks.
+
+    Each row carries the `seed` needed to re-derive the same task later, in
+    the reward function.
+
+    Args:
+        client: Connected synchronous env client; `reset(seed=...)` must
+            return an object whose `.observation` is indexable by "prompt".
+        n_episodes: Number of rows to generate (seeds `0 .. n_episodes - 1`).
+
+    Returns:
+        A `Dataset` with a chat-formatted `prompt` column and a `seed` column.
+    """
+    rows = []
+    for i in range(n_episodes):
+        obs = client.reset(seed=i).observation
+        rows.append({"prompt": [{"role": "user", "content": obs["prompt"]}], "seed": i})
+    return Dataset.from_list(rows)
+
+
+def make_reward_func(client):
+    """Build the reward function handed to `GRPOTrainer`.
+
+    Re-running `reset(seed=...)` before each `step(...)` recreates the
+    exact task the completion was sampled for -- the server is
+    single-session, so this runs sequentially against one client.
+
+    Args:
+        client: Connected synchronous env client shared by every call.
+
+    Returns:
+        A callable `reward_func(completions, seed, **kwargs) -> list[float]`.
+    """
+
+    def reward_func(completions, seed, **kwargs) -> list[float]:
+        """Score each completion against the task identified by its seed.
+
+        Raises:
+            ValueError: If `completions` and `seed` differ in length.
+            RuntimeError: If the env returns no reward for a step.
+        """
+        # Raise explicitly rather than `assert`: asserts are stripped under
+        # `python -O`, which would let a misaligned batch be scored silently.
+        if len(completions) != len(seed):
+            raise ValueError(
+                f"completions/seed length mismatch: {len(completions)} vs {len(seed)}"
+            )
+        rewards = []
+        for completion, s in zip(completions, seed):
+            client.reset(seed=s)
+            result = client.step({"text": _completion_text(completion)})
+            if result.reward is None:
+                # Fail fast: a missing reward must not reach the trainer as a
+                # non-float entry in a list documented as list[float].
+                raise RuntimeError(f"Environment returned no reward for seed {s!r}")
+            rewards.append(float(result.reward))
+        return rewards
+
+    return reward_func
+
+
+def main() -> None:
+    """Parse CLI arguments, start the env server, and run GRPO training."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", default="Qwen/Qwen2.5-0.5B-Instruct")
+    ap.add_argument("--n-episodes", type=int, default=64, help="Dataset size.")
+    ap.add_argument("--steps", type=int, default=50)
+    ap.add_argument("--lr", type=float, default=1e-6)
+    ap.add_argument(
+        "--per-device-batch-size",
+        type=int,
+        default=2,
+        help="Total rollouts sampled per step (must be divisible by --num-generations).",
+    )
+    ap.add_argument("--num-generations", type=int, default=2)
+    ap.add_argument("--max-completion-length", type=int, default=512)
+    ap.add_argument("--out", default="sophistry-grpo-Qwen2.5-0.5B")
+    ap.add_argument("--push-to-hub", action="store_true")
+    args = ap.parse_args()
+
+    # Validate cheap invariants before the slow cold start of the env server.
+    if args.n_episodes < 1:
+        ap.error("--n-episodes must be >= 1")
+    if args.num_generations < 1:
+        # Also keeps the modulo below from raising ZeroDivisionError.
+        ap.error("--num-generations must be >= 1")
+    if args.per_device_batch_size % args.num_generations != 0:
+        ap.error("--per-device-batch-size must be divisible by --num-generations")
+
+    provider = UVProvider(
+        project_path=f"git+https://huggingface.co/spaces/{SPACE_REPO_ID}",
+        app="sophistry_bench_sprint_env.server.app:app",
+        context_timeout_s=180.0,  # cold clone + dependency install can be slow
+    )
+    base_url = provider.start()
+    try:
+        provider.wait_for_ready()
+    except BaseException:
+        # No client owns the provider yet, so a readiness timeout (or Ctrl-C)
+        # would otherwise leave the started server process running.
+        provider.stop()
+        raise
+
+    with GenericEnvClient(base_url=base_url, provider=provider).sync() as client:
+        dataset = build_dataset(client, args.n_episodes)
+
+        config = GRPOConfig(
+            output_dir=args.out,
+            max_steps=args.steps,
+            learning_rate=args.lr,
+            per_device_train_batch_size=args.per_device_batch_size,
+            num_generations=args.num_generations,
+            max_completion_length=args.max_completion_length,
+            bf16=True,  # halves the [batch, len, vocab] logits tensor relative to fp32
+            gradient_checkpointing=True,
+            logging_steps=1,
+            push_to_hub=args.push_to_hub,
+            hub_model_id=args.out if args.push_to_hub else None,
+        )
+
+        trainer = GRPOTrainer(
+            model=args.model,
+            reward_funcs=make_reward_func(client),
+            train_dataset=dataset,
+            args=config,
+        )
+        trainer.train()
+        trainer.save_model(args.out)
+        print(f"Saved fine-tuned model to {args.out}")
+
+        if args.push_to_hub:
+            trainer.push_to_hub()
+            print(f"Pushed fine-tuned model to https://huggingface.co/{args.out}")
+
+
+if __name__ == "__main__":
+    main()