From 8eb9bd128dc05e3632b27dd47a4f39ebacc37eb1 Mon Sep 17 00:00:00 2001 From: prakashUXtech Date: Thu, 21 May 2026 14:46:28 +0530 Subject: [PATCH 1/2] feat(eval): score prompts and skill outputs, not just souls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a third eval case mode, `prompt`, that scores a plain prompt or a skill's output instead of a seeded soul. A `mode: prompt` case skips the soul: nothing is birthed, the `seed` block is ignored, and the case's `message` goes straight to the scorer. New optional `CaseInputs.reference` field holds the text a skill was originally given. When set, the `judge` scorer shows it to the LLM under its own "Reference input" heading so criteria can ask whether a candidate output improved on where it started. No new scoring kind — prompt and skill outputs reuse `JudgeScoring`. No new CLI command or flag; `soul eval` runs a prompt-mode spec the same way it runs a soul spec. New reference spec tests/eval_examples/humanizer_skill.yaml scores the workspace /humanize skill: one deterministic regex gate plus four judge cases checking a humanized rewrite shed its AI tells and kept the meaning. Addresses qbtrix/paw-workspace#47. 
--- CHANGELOG.md | 4 +- docs/api-reference.md | 15 +++ docs/cli-reference.md | 11 +- docs/eval-format.md | 72 +++++++++-- src/soul_protocol/eval/runner.py | 17 ++- src/soul_protocol/eval/schema.py | 27 +++- src/soul_protocol/eval/scoring.py | 49 +++++++- tests/eval_examples/humanizer_skill.yaml | 151 +++++++++++++++++++++++ tests/test_eval/test_cli.py | 89 +++++++++++++ tests/test_eval/test_runner.py | 100 +++++++++++++++ tests/test_eval/test_schema.py | 45 +++++++ 11 files changed, 562 insertions(+), 18 deletions(-) create mode 100644 tests/eval_examples/humanizer_skill.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 340a552..2bf14cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,9 @@ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ### Added -- **Soul optimize / autoresearch (#142)** — autonomous self-improvement loop that pairs with the soul-aware eval framework (#160). The soul runs an eval against itself, identifies failing cases, proposes targeted changes to its own behaviour-shaping "knobs" (OCEAN traits, persona text, memory thresholds, bond strength), keeps changes that improve the eval score, and reverts the rest. New `soul_protocol.optimize` module: `optimize()` entry point, `OptimizeRunner` class with custom knob registration, `Knob` protocol plus four built-in knobs (`OceanTraitKnob` ±0.1/±0.2 within [0,1] per OCEAN dimension; `PersonaTextKnob` LLM-driven persona rephrasings with heuristic no-op fallback; `SignificanceThresholdKnob` for `MemorySettings.importance_threshold` ±1 plus the `skip_deep_processing_on_low_significance` flip; `BondThresholdKnob` for default bond strength ±5/±10), `Proposer` (LLM-assisted with heuristic fallback when no engine or unparseable response), `OptimizeResult`/`OptimizeStep` Pydantic models. Defaults to dry-run (`apply=False`) — every change applied during the run is reverted at the end and no trust chain entries are written; the soul stays byte-identical. 
With `apply=True` the runner keeps the winning trajectory and appends one `soul.optimize.applied` trust chain entry per kept change with payload `{knob_name, before, after, score_delta}`. Reverted proposals never write entries either way. New `soul optimize ` CLI command (`--iterations`, `--target`, `--apply`, `--engine`, `--json`) and `soul_optimize` MCP tool with the same surface. Pairs naturally with #160 — without the eval, "improvement" is a vibe; with the eval, it's a number that goes up. Full doc at `docs/soul-optimize.md`. +- **`prompt` eval case mode — score prompts and skill outputs (paw-workspace#47)** — the eval framework used to evaluate one thing: a seeded soul. It now also scores a plain prompt or the output of a skill. A case with `mode: prompt` skips the soul completely — no birth, no context, the `seed` block is ignored — and hands the case's `message` straight to the scorer. The new optional `CaseInputs.reference` field holds the text a skill was originally given; when it is set, the `judge` scorer puts it in front of the LLM as its own "Reference input" block, so the criteria can ask whether a candidate output improved on where it started rather than judging it cold. There is no new scoring kind: prompt and skill outputs go through the existing `JudgeScoring`. There is no new CLI command or flag either — `soul eval` runs a prompt-mode spec the same way it runs a soul spec. New reference spec `tests/eval_examples/humanizer_skill.yaml` scores the workspace `/humanize` skill: a deterministic `regex` gate that runs with no engine, plus four `judge` cases that check a humanized rewrite dropped its AI tells and kept the meaning. Docs: `eval-format.md` gains a "Prompt mode" section; `cli-reference.md` and `api-reference.md` cover the new mode and a Case modes table. This is the read-side of the workspace prompt-evaluation pair — a way to catch it when an edit to a tracked skill makes its output worse. 
+ +- **Soul optimize / autoresearch (#142)** — autonomous self-improvement loop that pairs with the soul-aware eval framework (#160). The soul runs an eval against itself, identifies failing cases, proposes targeted changes to its own behaviour-shaping "knobs" (OCEAN traits, persona text, memory thresholds, bond strength), keeps changes that improve the eval score, and reverts the rest. New `soul_protocol.optimize` module: `optimize()` entry point, `OptimizeRunner` class with custom knob registration, `Knob` protocol plus four built-in knobs (`OceanTraitKnob` ±0.1/±0.2 within [0,1] per OCEAN dimension; `PersonaTextKnob` LLM-driven persona rephrasings with heuristic no-op fallback; `SignificanceThresholdKnob` for `MemorySettings.importance_threshold` ±1 plus the `skip_deep_processing_on_low_significance` flip; `BondThresholdKnob` for default bond strength ±5/±10), `Proposer` (LLM-assisted with heuristic fallback when no engine or unparseable response), `OptimizeResult`/`OptimizeStep` Pydantic models. Defaults to dry-run (`apply=False`) — every change applied during the run is reverted at the end and no trust chain entries are written; the soul stays byte-identical. With `apply=True` the runner keeps the winning trajectory and appends one `soul.optimize.applied` trust chain entry per kept change with payload `{knob_name, before, after, score_delta}`. Reverted proposals never write entries either way. New `soul optimize ` CLI command (`--iterations`, `--target`, `--apply`, `--engine`, `--json`) and `soul_optimize` MCP tool with the same surface. Pairs naturally with #160 — without the eval, "improvement" is a vibe; with the eval, it's a number that goes up. Full doc at `docs/soul-optimize.md`. 
- **Graph traversal + typed entity ontology (#108, #190)** — entities now carry one of eight built-in kinds (`person`, `place`, `org`, `concept`, `tool`, `document`, `event`, `relation`) plus open-string extension. Eight matching relation predicates (`mentions`, `related`, `depends_on`, `contributes_to`, `causes`, `follows`, `supersedes`, `owned_by`) ship as `RelationType` with the same open contract. The cognitive engine's `extract_entities` prompt asks for the typed ontology plus a `relations` array per entity with `{target, relation, weight}` triples; heuristic-only souls keep working through a translation table that maps legacy types. New `Soul.graph` returns a `GraphView` with `nodes()`, `edges()`, `neighbors()`, `path()`, `subgraph()`, `to_mermaid()`, `reachable()`, `stats()`. `Soul.recall` accepts `graph_walk={"start": entity_id, "depth": 2, "edge_types": [...]}` plus `page_token` and `token_budget` for pagination + L0-abstract fallback under budget pressure; new `RecallResults` list subclass carries `next_page_token`, `total_estimate`, `truncated_for_budget` (legacy callers still get `list[MemoryEntry]`). Trust chain hooks: `Soul.observe()` appends `graph.entity_added` and `graph.relation_added` entries for net-new entities/edges. New `soul graph` CLI group (`nodes`/`edges`/`neighbors`/`path`/`mermaid`, all with `--json`) and `soul_graph_query` MCP tool. In-memory dict + adjacency-list storage with `to_dict`/`from_dict` round-trip; pre-0.5.0 graphs load cleanly. Heuristic third-person relation edges (e.g. "Alice knows Bob") now flow through to the graph instead of being dropped. diff --git a/docs/api-reference.md b/docs/api-reference.md index 80504c5..9c84101 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -16,6 +16,9 @@ soul_protocol.eval module — EvalSpec, EvalCase, EvalResult, CaseResult, the five scoring kinds (keyword/regex/semantic/judge/structural), and run_eval / run_eval_against_soul / run_eval_file entry points. 
+ Updated: 2026-05-21 (paw-workspace#47): Added the Case modes table to the + Evaluation section — documents the new `prompt` mode (scores a verbatim + prompt or skill output, soul skipped) and the `reference` input field. Updated: 2026-04-27 — Documented user-driven memory update primitives: Soul.forget_one (audited single-id delete), Soul.supersede (write new memory + link old.superseded_by), Soul.supersede_audit property. Rewrote stale soul.forget() entry to match the real @@ -1704,6 +1707,18 @@ from soul_protocol.eval import ( - `run_eval_file(path, *, engine=None, case_filter=None) -> EvalResult` — convenience wrapper that loads then runs. - `run_eval_against_soul(spec, soul, *, engine=None, case_filter=None) -> EvalResult` — run cases against an existing `Soul` without re-birthing. Used by the `soul_eval` MCP tool. The `seed` block is ignored — the soul's live state is the seed. +### Case modes + +`CaseInputs.mode` selects how the runner produces the text the scorer sees: + +| Mode | What runs | Output scored | +|------|-----------|---------------| +| `respond` (default) | Soul produces a reply via `context_for` + the engine | The reply | +| `recall` | `Soul.recall(query=message, ...)` | The recalled memories, rendered as text | +| `prompt` | Nothing — the soul is skipped, `seed` is ignored | `inputs.message`, verbatim | + +`prompt` mode scores a standalone prompt or skill output. Set `inputs.reference` (prompt-mode only) to the pre-transform text and a `judge` case compares the candidate against it. See [eval-format.md](eval-format.md#prompt-mode-scoring-prompts-and-skills). + ### Result models `EvalResult`: diff --git a/docs/cli-reference.md b/docs/cli-reference.md index f0450be..65ca0b3 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -18,6 +18,9 @@ Runs cases against a soul seeded with explicit state (memories, OCEAN, bonds, mood, energy). Supports keyword / regex / semantic / judge / structural scoring. 
--json, --filter, --judge-engine, --verbose options. Exits 1 on any failure. Count: 47 → 48. + Updated: 2026-05-21 (paw-workspace#47): `soul eval` also scores prompts and skill + outputs — a `mode: prompt` case skips the soul and scores the case text verbatim. + No new command or flag; the `humanizer_skill.yaml` reference spec evaluates /humanize. Updated: 2026-04-29 — v0.4.0 (#42): Added `soul verify` and `soul audit` for trust-chain integrity checks and signed-action timelines. Both support --json. `soul verify` exits 1 on a tampered chain. Count: 45 → 47. @@ -1776,7 +1779,9 @@ Payloads are stored as hashes only — the table shows *what changed when*, not ### `soul eval` -Run YAML-driven soul-aware evals against a freshly seeded soul. The eval framework lets you pin the soul's state (memories, OCEAN, bonds, mood, energy) before each test runs, so you can measure memory-driven behaviour rather than just stateless input-output. See [eval-format.md](eval-format.md) for the full schema. +Run YAML-driven soul-aware evals against a freshly seeded soul. The eval framework lets you pin the soul's state (memories, OCEAN, bonds, mood, energy) before each test runs, so you can measure memory-driven behaviour rather than just stateless input-output. + +It also scores plain prompts and skill outputs. A case with `mode: prompt` skips the soul and scores the case text verbatim — point it at a workspace prompt or a skill's output (for example `/humanize`) to catch regressions when that prompt or skill changes. See [eval-format.md](eval-format.md) for the full schema, including the `prompt` mode and the `humanizer_skill.yaml` reference spec. ```bash soul eval @@ -1809,6 +1814,10 @@ soul eval tests/eval_examples/ # all .yaml in d soul eval tests/eval_examples/ --filter "creative" soul eval my_eval.yaml --json | jq '.specs[].cases' soul eval my_eval.yaml --judge-engine my_module:make_engine + +# Score the /humanize skill (prompt-mode spec). 
The judge cases need an +# engine; without one they SKIP and only the deterministic checks run. +soul eval tests/eval_examples/humanizer_skill.yaml --judge-engine my_module:make_engine ``` **Output:** one Rich table per spec (Case, Status, Score, Time, optional Details), plus a summary footer with totals. `--json` returns `{specs: [...], duration_ms, pass_count, fail_count, skip_count, error_count}`. diff --git a/docs/eval-format.md b/docs/eval-format.md index 067419a..587ac90 100644 --- a/docs/eval-format.md +++ b/docs/eval-format.md @@ -2,7 +2,11 @@ Created: 2026-04-29 — Documents the YAML schema, scoring kinds, runner contract, and CLI / MCP entry points for soul-aware evals. Companion to docs/api-reference.md (EvalSpec, EvalResult, - run_eval) and docs/cli-reference.md (`soul eval`). --> + run_eval) and docs/cli-reference.md (`soul eval`). + Updated: 2026-05-21 (paw-workspace#47) — Documented the `prompt` + case mode, which scores a verbatim prompt or skill output without + a soul. Used to evaluate workspace prompts and skills (/humanize). + Companion example: tests/eval_examples/humanizer_skill.yaml. --> # Soul-aware Eval Format @@ -17,6 +21,12 @@ interactions, current mood and energy. The same prompt to a soul that's than to one that's "energetic with high bond strength" — and that's the entire point of the protocol. +The format also handles a stateless case. A `prompt`-mode case scores a +verbatim prompt or skill output directly, with no soul involved. That is +how you point the same harness at workspace prompts and skills — for +example scoring the `/humanize` skill's output for AI tells. See +[Prompt mode](#prompt-mode-scoring-prompts-and-skills) below. + This page documents the schema and the runner. For the CLI command see [cli-reference.md](cli-reference.md#soul-eval). For the MCP tool see [mcp-server.md](mcp-server.md#soul_eval). 
For Python API access see @@ -97,10 +107,11 @@ EvalSpec │ ├── message: str # required │ ├── user_id: str | null │ ├── domain: str | null - │ ├── mode: "respond" | "recall" + │ ├── mode: "respond" | "recall" | "prompt" │ ├── observe: bool # default false │ ├── recall_limit: int # default 5 - │ └── recall_layer: str | null + │ ├── recall_layer: str | null + │ └── reference: str | null # prompt mode — the pre-transform text └── scoring: Scoring # see below ``` @@ -130,16 +141,62 @@ also queryable via `inputs.recall_layer`. A case has three parts: 1. **Mode** — `respond` (the soul produces a reply via context_for + the - engine) or `recall` (`Soul.recall(query=message, ...)`). + engine), `recall` (`Soul.recall(query=message, ...)`), or `prompt` + (the soul is skipped; `message` is scored verbatim — see + [Prompt mode](#prompt-mode-scoring-prompts-and-skills)). 2. **Inputs** — message, optional `user_id` (multi-user routing), - optional `domain` (for v0.4.0 domain isolation), and recall knobs. + optional `domain` (for v0.4.0 domain isolation), recall knobs, and the + prompt-mode `reference`. 3. **Scoring** — one of the five kinds below. The `kind` field is the discriminator; Pydantic resolves the right scorer at parse time. `observe: true` runs `Soul.observe()` after producing the response, so the soul's state mutates. By default `observe: false` keeps the state identical to the seed across cases — recommended for deterministic -evals. +evals. `observe` does nothing in `prompt` mode (there is no soul to +observe). + +### Prompt mode — scoring prompts and skills + +Most cases drive a soul. A `prompt`-mode case does not: it takes the +case's `message` as a verbatim string — a prompt, or the output of a +skill — and hands it straight to the scorer. No soul is birthed, no +context is built, the `seed` block is ignored. + +This is the path for evaluating the workspace's own prompts and skills. 
+The motivating case is `/humanize`: feed the skill's output in as +`message`, describe the qualities a good humanized text should have in a +`judge` block, and the eval tells you whether an edit to the skill made +its output better or worse. + +```yaml +cases: + - name: "rewrite drops the puffery" + inputs: + mode: prompt + # `reference` — the original text the skill was given. + reference: | + Version 2.0 stands as an enduring testament to our commitment. + # `message` — the candidate output to score. + message: | + Version 2.0 shipped Tuesday with offline mode. + scoring: + kind: judge + criteria: | + The candidate output should state plainly what changed, with no + significance-inflation language. It should keep the facts and + stay shorter than the reference. +``` + +`reference` is optional and prompt-mode only. When set, the `judge` +scorer shows it to the LLM as a separate "Reference input" block, so +criteria can ask whether the candidate improved on the original rather +than judging it in isolation. Any scoring kind works in prompt mode — +`regex` and `keyword` give you deterministic gates that run without an +engine — but `judge` is the natural fit for "is this output good." + +The shipped reference spec is +[`tests/eval_examples/humanizer_skill.yaml`](../tests/eval_examples/humanizer_skill.yaml). ## Scoring kinds @@ -309,4 +366,5 @@ a follow-up the optimizer would benefit from, file an issue against it. 
- [api-reference.md](api-reference.md#evaluation) — Python API - [cli-reference.md](cli-reference.md#soul-eval) — `soul eval` command - [mcp-server.md](mcp-server.md#soul_eval) — `soul_eval` MCP tool -- `tests/eval_examples/` — five shipped example specs +- `tests/eval_examples/` — shipped example specs, including + `humanizer_skill.yaml` for the prompt-mode `/humanize` eval diff --git a/src/soul_protocol/eval/runner.py b/src/soul_protocol/eval/runner.py index deec711..54e53b3 100644 --- a/src/soul_protocol/eval/runner.py +++ b/src/soul_protocol/eval/runner.py @@ -4,6 +4,11 @@ # either drives the soul into producing a response (mode="respond") or # calls Soul.recall() (mode="recall"), captures state snapshots, and # delegates to the scoring module. +# Updated: 2026-05-21 (paw-workspace#47) — Added the "prompt" case mode. +# Prompt-mode cases skip the soul entirely: the runner takes the case's +# `message` as verbatim text (a prompt or a skill output) and hands it to +# the scorer. This lets the framework score workspace prompts and skills +# such as /humanize, scored via the existing JudgeScoring kind. # # The "respond" path is the interesting one. soul-protocol does not own a # response generator — that's the consumer's job — so the runner builds the @@ -213,7 +218,17 @@ async def _run_case( mood_before = soul.state.mood energy_before = soul.state.energy - if inputs.mode == "recall": + if inputs.mode == "prompt": + # Prompt mode — the soul is not involved at all. The case's + # ``message`` is the verbatim text under evaluation (a prompt, or a + # skill's output). Hand it straight to the scorer. The judge scorer + # picks up ``inputs.reference`` separately when present. 
+ execution = CaseExecution( + output_text=inputs.message, + mood_before=mood_before, + energy_before=energy_before, + ) + elif inputs.mode == "recall": layer = inputs.recall_layer mtypes: list[MemoryType] | None = None if layer: diff --git a/src/soul_protocol/eval/schema.py b/src/soul_protocol/eval/schema.py index 7123410..6a4e68b 100644 --- a/src/soul_protocol/eval/schema.py +++ b/src/soul_protocol/eval/schema.py @@ -3,6 +3,13 @@ # union (keyword | regex | semantic | judge | structural). Evals are written # in YAML; this module parses and validates them. The runner consumes the # resulting Pydantic models and drives Soul.observe/recall/respond. +# Updated: 2026-05-21 (paw-workspace#47) — Added a third case mode, "prompt". +# In prompt mode the case input is a verbatim prompt or skill output (not a +# soul recall/respond); the runner scores that text directly without +# touching the soul. Lets the eval framework score workspace prompts and +# skills (e.g. /humanize) alongside seeded-soul behaviour. The optional +# `reference` field carries the original input a skill transformed, so a +# judge case can compare a candidate output against where it started. # # Design note: we keep the schema deliberately small. Anything the soul # already exposes (Personality, Mood, MemoryType) is referenced directly so @@ -248,7 +255,7 @@ class StructuralScoring(_ScoringBase): class CaseInputs(BaseModel): """Input for a single case. - Two modes: + Three modes: - ``mode="respond"`` (default) — runner builds a system prompt + context block from the soul, asks the engine for a reply to ``message``, and @@ -256,12 +263,24 @@ class CaseInputs(BaseModel): - ``mode="recall"`` — runner calls ``Soul.recall(query=message, ...)`` and hands the result list to the scorer (rendered as one entry per line for keyword/semantic/judge; full list for structural). + - ``mode="prompt"`` — the soul is left untouched. 
``message`` is treated + as a verbatim prompt or skill output and handed straight to the + scorer. This is how the framework evaluates workspace prompts and + skills (e.g. ``/humanize``): the YAML carries the text under test and + a :class:`JudgeScoring` block describes the qualities a good output + should have. The ``seed`` block is ignored for prompt-mode cases. + + ``reference`` — optional. In prompt mode it carries the *original* text + a skill was meant to transform (e.g. the AI-slop input before + ``/humanize`` ran). The judge scorer shows it as a "Reference input" + block so criteria can ask whether the candidate improved on it. + Ignored outside prompt mode. ``observe`` (default false) — when true, the runner additionally calls ``Soul.observe()`` after generating the response, so subsequent cases in the same spec see the updated state. Defaults to false because evals should be deterministic and memory mutations between cases make that - harder. + harder. ``observe`` has no effect in prompt mode (no soul interaction). """ model_config = ConfigDict(extra="forbid") @@ -269,11 +288,13 @@ class CaseInputs(BaseModel): message: str user_id: str | None = None domain: str | None = None - mode: Literal["respond", "recall"] = "respond" + mode: Literal["respond", "recall", "prompt"] = "respond" observe: bool = False # recall-mode specific knobs recall_limit: int = 5 recall_layer: str | None = None + # prompt-mode specific knob — the original text a skill transformed + reference: str | None = None class EvalCase(BaseModel): diff --git a/src/soul_protocol/eval/scoring.py b/src/soul_protocol/eval/scoring.py index a655da1..e5aab1a 100644 --- a/src/soul_protocol/eval/scoring.py +++ b/src/soul_protocol/eval/scoring.py @@ -4,6 +4,12 @@ # functions of (soul, case, output) — no side effects on the soul. The # judge scorer is the only one that requires an engine; it returns a # "skipped" outcome when no engine is configured rather than failing. 
+# Updated: 2026-05-21 (paw-workspace#47) — The judge scorer now adds a +# "Reference input" block to its prompt when the case carries +# `inputs.reference` (set by prompt-mode cases). This lets a /humanize or +# skill eval ask the judge to compare a candidate output against the +# original text it was meant to transform. No new scoring kind is added — +# prompt/skill outputs reuse JudgeScoring. from __future__ import annotations @@ -191,6 +197,26 @@ def score_semantic( """ +# Used for prompt-mode cases that carry a `reference` — the original text a +# skill was meant to transform. The judge compares the candidate against it. +_JUDGE_PROMPT_WITH_REFERENCE = """You are evaluating the output of a text-processing prompt or skill. + +Criteria: +{criteria} + +Reference input (the original text the skill was given): +{reference} + +Candidate output (the text to score): +{output} + +Score the candidate output from 0.0 (does not meet criteria at all) to +1.0 (fully meets criteria). Return JSON only — no other text: + +{{"score": <0.0-1.0>, "reasoning": ""}} +""" + + _JSON_RE = re.compile(r"\{.*?\}", re.DOTALL) @@ -206,6 +232,11 @@ async def score_judge( failed) so a CI run that lacks API credentials can still validate the rest of the eval suite. When the judge call fails or returns unparseable output, score 0.0 with details explaining why. + + When the case carries ``inputs.reference`` (a prompt-mode case scoring + a skill output against the text it transformed), the judge prompt + shows that reference as a separate block so the criteria can ask the + judge to compare candidate against original. 
""" if engine is None: return ScoreOutcome( @@ -217,11 +248,19 @@ async def score_judge( }, ) - prompt = _JUDGE_PROMPT.format( - criteria=spec.criteria.strip(), - message=case.inputs.message, - output=execution.output_text, - ) + reference = case.inputs.reference + if reference: + prompt = _JUDGE_PROMPT_WITH_REFERENCE.format( + criteria=spec.criteria.strip(), + reference=reference, + output=execution.output_text, + ) + else: + prompt = _JUDGE_PROMPT.format( + criteria=spec.criteria.strip(), + message=case.inputs.message, + output=execution.output_text, + ) try: raw = await engine.think(prompt) except Exception as e: # pragma: no cover — network / engine errors diff --git a/tests/eval_examples/humanizer_skill.yaml b/tests/eval_examples/humanizer_skill.yaml new file mode 100644 index 0000000..862f40d --- /dev/null +++ b/tests/eval_examples/humanizer_skill.yaml @@ -0,0 +1,151 @@ +# humanizer_skill.yaml — Scores the workspace /humanize skill's output. +# Created: 2026-05-21 (paw-workspace#47) — First reference spec for the +# prompt case mode. Each case carries a humanized rewrite as `message` +# and the original AI-slop text as `reference`; JudgeScoring checks the +# rewrite kept the meaning while shedding AI tells. One deterministic +# regex case runs without an engine so the spec still has a real pass +# when no judge engine is wired (judge cases skip cleanly in that case). +# +# This is a regression harness: if an edit to .claude/skills/humanizer +# degrades the skill's output, the judge scores here drop and the eval +# fails. Run with `soul eval tests/eval_examples/humanizer_skill.yaml +# --judge-engine module:attr` to get live judge scores. + +name: "Humanizer skill — outputs read as human, not AI" +description: | + Evaluates the /humanize skill (.claude/skills/humanizer/SKILL.md). 
The + skill takes AI-generated text and rewrites it to sound human: it strips + significance inflation, promotional language, em-dash overuse, the rule + of three, chatbot artifacts, and the rest of the catalogue in the skill. + + Cases use `mode: prompt`, so no soul is involved. The `message` field + is a candidate humanized rewrite; `reference` is the original slop the + skill was given. The judge compares the two against criteria that + describe what a good rewrite looks like. Judge cases skip when no + engine is wired; the regex case runs everywhere. + +cases: + - name: "regex_no_curly_quotes_or_emoji" + description: | + Deterministic gate, runs with no engine. A humanized output must + use straight quotes and carry no emoji — two of the skill's hard + rules (patterns 18 and 19). The negative lookahead fails the case + if either a curly quote or a rocket/bulb/check emoji slipped + through. This keeps the spec honest even on a no-engine CI run. + inputs: + mode: prompt + message: | + We shipped the new planner this week. It is faster on the cases + we measured and it no longer drops tasks when the queue is long. + There is still a rough edge around retries that we want to fix + before the next release. + scoring: + kind: regex + pattern: '^(?!.*[“”‘’\U0001F680\U0001F4A1✅]).*$' + threshold: 1.0 + + - name: "judge_strips_significance_inflation" + description: | + The skill's first content pattern: kill "testament", "pivotal + moment", "evolving landscape", "vital role". A good rewrite states + what happened plainly and keeps the facts. + inputs: + mode: prompt + reference: | + The release of version 2.0 stands as an enduring testament to + the team's commitment to excellence, marking a pivotal moment in + the evolving landscape of the product and underscoring its vital + role in the broader ecosystem. + message: | + Version 2.0 shipped on Tuesday. It adds offline mode and cuts + cold-start time roughly in half. 
+ scoring: + kind: judge + criteria: | + The candidate output should state plainly what version 2.0 is + and what changed, with no significance-inflation language — + nothing like "testament", "pivotal moment", "evolving + landscape", "vital role", or "broader ecosystem". It should + keep concrete facts and stay shorter than the reference. Score + high when the puffery is gone and the meaning survives; score + low if the rewrite still reaches for grand framing. + + - name: "judge_removes_chatbot_artifacts" + description: | + Pattern 20 — collaborative-communication artifacts. Text meant as + chat ("Great question!", "I hope this helps!", "Let me know if") + should not survive into prose. + inputs: + mode: prompt + reference: | + Great question! Here is a summary of how the cache works. The + cache stores responses for five minutes. I hope this helps! Let + me know if you would like me to expand on any section. + message: | + The cache stores responses for five minutes, then evicts them on + the next read. + scoring: + kind: judge + criteria: | + The candidate output must read as standalone prose with no + chatbot correspondence artifacts: no "Great question!", no + "Here is a...", no "I hope this helps!", no "Let me know if you + would like...". The factual claim about the five-minute cache + must still be present. Score high when the output is clean + prose; score low if any chat-assistant phrasing remains. + + - name: "judge_kills_rule_of_three_and_em_dashes" + description: | + Patterns 10 and 14 — forced triplets and em-dash overuse. A good + rewrite breaks the triad and uses commas or periods instead of + stacked em dashes. + inputs: + mode: prompt + reference: | + The conference offers keynotes, panels, and workshops — sessions + that inform, inspire, and connect — bringing together speakers, + sponsors, and attendees from across the industry. + message: | + The conference has keynote talks and panels, plus hands-on + workshops. 
There is also time between sessions for people to + meet each other. + scoring: + kind: judge + criteria: | + The candidate output should describe the conference without + forcing ideas into groups of three and without em dashes ("—"). + It is fine for the rewrite to drop the triplet structure + entirely and simply list what the conference includes. Score + high when the rule-of-three cadence and em dashes are gone and + the description still makes sense; score low if either pattern + survives. + + - name: "judge_keeps_voice_not_soulless" + description: | + The skill's PERSONALITY AND SOUL section: clean is not enough, the + rewrite should still have a human behind it — an opinion, varied + rhythm, a first-person take where it fits. + inputs: + mode: prompt + reference: | + The experiment produced interesting results. The agents + generated three million lines of code. Some developers were + impressed while others were skeptical. The implications remain + unclear. + message: | + I honestly don't know what to make of this one. Three million + lines of code, written while everyone slept. Half the people I + follow think it changes everything; the other half are busy + explaining why it doesn't count. I keep getting stuck on the + same thought: those agents running all night with nobody + watching. + scoring: + kind: judge + criteria: | + The candidate output should read like a person with an opinion, + not a neutral report. It should vary sentence rhythm, admit + mixed feelings or uncertainty, and use first person where it + fits — while still covering the same facts as the reference + (three million lines of code, a split reaction). Score high + when the rewrite has a clear voice and keeps the substance; + score low if it reads flat and committee-written. 
diff --git a/tests/test_eval/test_cli.py b/tests/test_eval/test_cli.py index 65f1938..af6b08b 100644 --- a/tests/test_eval/test_cli.py +++ b/tests/test_eval/test_cli.py @@ -3,6 +3,10 @@ # against tempdir fixtures and the shipped example YAMLs. Validates # exit codes (0 on all-pass, 1 on any-fail), --json output shape, and # --filter narrowing. +# Updated: 2026-05-21 (paw-workspace#47) — Added an end-to-end test that +# runs the humanizer_skill.yaml prompt-mode spec through the CLI with a +# deterministic judge engine. `make_fake_judge_engine` is module-level +# so the CLI's `--judge-engine module:attr` can import it. from __future__ import annotations @@ -17,6 +21,28 @@ EXAMPLES_DIR = Path(__file__).resolve().parents[1] / "eval_examples" +# --------------------------------------------------------------------------- +# Deterministic judge engine — importable via --judge-engine module:attr +# --------------------------------------------------------------------------- + + +class _FakeJudgeEngine: + """CognitiveEngine stand-in that always returns a passing judge verdict. + + Lets the CLI exercise judge-mode cases without API credentials. The + canned JSON scores above every threshold in the shipped specs, so a + structurally sound spec run reports all-pass. + """ + + async def think(self, prompt: str) -> str: + return '{"score": 0.92, "reasoning": "meets the criteria"}' + + +def make_fake_judge_engine() -> _FakeJudgeEngine: + """Factory the CLI resolves from `--judge-engine ...:make_fake_judge_engine`.""" + return _FakeJudgeEngine() + + def _write_spec(tmp: Path, name: str, body: str) -> Path: """Drop a YAML spec into ``tmp`` and return the path.""" path = tmp / name @@ -262,3 +288,66 @@ def test_eval_heuristic_engine_skips_judge() -> None: # the judge will return non-JSON and fail. The other 3 cases pass, # so exit code is 1 (any failure). 
assert result.exit_code == 1 + + +# --------------------------------------------------------------------------- +# Prompt-mode skill eval — humanizer_skill.yaml end-to-end (paw-workspace#47) +# --------------------------------------------------------------------------- + +HUMANIZER_SPEC = EXAMPLES_DIR / "humanizer_skill.yaml" + +# Module path the CLI uses to import the deterministic judge engine. +_FAKE_ENGINE_REF = "tests.test_eval.test_cli:make_fake_judge_engine" + + +def test_humanizer_spec_no_engine_skips_judge_exits_zero() -> None: + """The humanizer spec runs with no engine: the regex case passes, the + judge cases skip, and the run exits 0 (skips do not fail a run).""" + runner = CliRunner() + result = runner.invoke(cli, ["eval", str(HUMANIZER_SPEC)]) + assert result.exit_code == 0, result.output + assert "PASS" in result.output # the regex gate + assert "SKIP" in result.output # the judge cases + + +def test_humanizer_spec_with_judge_engine_all_pass_exits_zero() -> None: + """Wired with a deterministic judge engine, every case in the humanizer + spec passes and the CLI exits 0 — the success path of a skill eval.""" + runner = CliRunner() + result = runner.invoke( + cli, + ["eval", str(HUMANIZER_SPEC), "--judge-engine", _FAKE_ENGINE_REF, "--json"], + ) + assert result.exit_code == 0, result.output + payload = json.loads(result.output) + assert payload["fail_count"] == 0 + assert payload["error_count"] == 0 + # 5 cases: 1 regex + 4 judge, all of them score a pass with the engine. 
+ assert payload["pass_count"] == 5 + assert payload["skip_count"] == 0 + spec = payload["specs"][0] + assert spec["spec_name"].startswith("Humanizer skill") + judge_cases = [c for c in spec["cases"] if c["name"].startswith("judge_")] + assert len(judge_cases) == 4 + assert all(c["passed"] for c in judge_cases) + + +def test_humanizer_spec_filter_runs_single_case() -> None: + """`--filter` narrows the humanizer spec to one case end-to-end.""" + runner = CliRunner() + result = runner.invoke( + cli, + [ + "eval", + str(HUMANIZER_SPEC), + "--filter", + "regex_no_curly", + "--json", + ], + ) + assert result.exit_code == 0, result.output + payload = json.loads(result.output) + cases = payload["specs"][0]["cases"] + assert len(cases) == 1 + assert cases[0]["name"] == "regex_no_curly_quotes_or_emoji" + assert cases[0]["passed"] diff --git a/tests/test_eval/test_runner.py b/tests/test_eval/test_runner.py index 8f9b4f7..a787652 100644 --- a/tests/test_eval/test_runner.py +++ b/tests/test_eval/test_runner.py @@ -3,6 +3,9 @@ # per-case execution (respond + recall modes), all five scoring kinds, # and the no-engine fallback path. Uses an in-memory FakeEngine for # judge-scoring tests so they don't need API credentials. +# Updated: 2026-05-21 (paw-workspace#47) — Added coverage for the "prompt" +# case mode: the runner scores the case message verbatim without +# touching the soul, the judge picks up the optional `reference` block. from __future__ import annotations @@ -395,3 +398,100 @@ async def test_seed_failure_reports_in_result() -> None: assert result.error is None assert result.cases == [] assert result.all_passed + + +# --------------------------------------------------------------------------- +# Prompt case mode (paw-workspace#47) +# --------------------------------------------------------------------------- + + +class CapturingEngine: + """FakeEngine variant that records every prompt it was asked to think on. 
+ + Used to assert what the runner actually puts in front of the judge — + e.g. that prompt-mode cases score the case message verbatim and that a + `reference` is surfaced as its own block. + """ + + def __init__(self, response: str) -> None: + self._response = response + self.prompts: list[str] = [] + + async def think(self, prompt: str) -> str: + self.prompts.append(prompt) + return self._response + + +@pytest.mark.asyncio +async def test_prompt_mode_dispatches_without_engine() -> None: + """A prompt-mode case runs through the runner with no engine. + + The judge scorer skips (no engine), and crucially the runner never + touches the soul — so no birth/recall/respond machinery can raise. + """ + spec = _spec_with( + CaseInputs(message="some candidate text", mode="prompt"), + JudgeScoring(criteria="is the text free of AI tells?"), + ) + result = await run_eval(spec) + assert result.error is None + assert result.cases[0].skipped + assert result.cases[0].error is None + + +@pytest.mark.asyncio +async def test_prompt_mode_scores_message_verbatim() -> None: + """Prompt mode hands the case message straight to the scorer. + + A keyword scorer over the verbatim message passes — proving the + runner did not run the soul fallback (which would prepend its own + "[soul-eval fallback response]" text and other content). + """ + spec = _spec_with( + CaseInputs(message="the planner now retries failed tasks", mode="prompt"), + KeywordScoring(expected=["planner", "retries"], mode="all", threshold=1.0), + ) + result = await run_eval(spec) + assert result.cases[0].passed, result.cases[0].details + # The output the scorer saw is exactly the message — no soul fallback. 
+ assert result.cases[0].output == "the planner now retries failed tasks" + assert "fallback" not in result.cases[0].output + + +@pytest.mark.asyncio +async def test_prompt_mode_judge_scores_with_engine() -> None: + """With an engine wired, a prompt-mode judge case scores and passes.""" + spec = _spec_with( + CaseInputs(message="a clean, human-sounding rewrite", mode="prompt"), + JudgeScoring(criteria="does the text read as human?", threshold=0.5), + ) + engine = FakeEngine('{"score": 0.8, "reasoning": "natural voice"}') + result = await run_eval(spec, engine=engine) + assert result.cases[0].passed + assert result.cases[0].score == pytest.approx(0.8) + assert not result.cases[0].skipped + + +@pytest.mark.asyncio +async def test_prompt_mode_reference_reaches_judge() -> None: + """When a prompt-mode case carries `reference`, the judge sees it. + + The judge prompt must include the reference under its own block so + criteria can compare a candidate output against the original text. + """ + spec = _spec_with( + CaseInputs( + message="Version 2 ships Tuesday with offline mode.", + mode="prompt", + reference="Version 2 stands as a testament to our enduring commitment.", + ), + JudgeScoring(criteria="did the rewrite drop the puffery?", threshold=0.5), + ) + engine = CapturingEngine('{"score": 0.9, "reasoning": "puffery gone"}') + result = await run_eval(spec, engine=engine) + assert result.cases[0].passed + assert len(engine.prompts) == 1 + judge_prompt = engine.prompts[0] + assert "Reference input" in judge_prompt + assert "testament to our enduring commitment" in judge_prompt + assert "Version 2 ships Tuesday with offline mode." 
in judge_prompt diff --git a/tests/test_eval/test_schema.py b/tests/test_eval/test_schema.py index 6fa52a4..8713cb8 100644 --- a/tests/test_eval/test_schema.py +++ b/tests/test_eval/test_schema.py @@ -2,6 +2,8 @@ # Created: 2026-04-29 — Covers EvalSpec parsing, scoring discriminator, # error cases (invalid threshold, unknown scoring kind, missing required # fields). Validates that all five scoring kinds round-trip cleanly. +# Updated: 2026-05-21 (paw-workspace#47) — Added smoke coverage for the +# "prompt" case mode and the optional `reference` input field. from __future__ import annotations @@ -188,6 +190,49 @@ def test_state_seed_mood_invalid_raises() -> None: parse_eval_spec(data) +# --------------------------------------------------------------------------- +# Prompt case mode (paw-workspace#47) +# --------------------------------------------------------------------------- + + +def test_prompt_mode_parses() -> None: + """A case with mode=prompt validates and keeps the mode.""" + data = _minimal_dict({"kind": "judge", "criteria": "is it humanized?"}) + data["cases"][0]["inputs"] = {"message": "some prompt text", "mode": "prompt"} + spec = parse_eval_spec(data) + assert spec.cases[0].inputs.mode == "prompt" + assert spec.cases[0].inputs.reference is None + + +def test_prompt_mode_reference_field_parses() -> None: + """The optional `reference` field round-trips on a prompt-mode case.""" + data = _minimal_dict({"kind": "judge", "criteria": "did it improve the text?"}) + data["cases"][0]["inputs"] = { + "message": "the humanized rewrite", + "mode": "prompt", + "reference": "the original AI-slop input", + } + spec = parse_eval_spec(data) + inputs = spec.cases[0].inputs + assert inputs.mode == "prompt" + assert inputs.reference == "the original AI-slop input" + + +def test_default_mode_is_respond() -> None: + """Existing specs that omit `mode` still default to respond.""" + spec = parse_eval_spec(_minimal_dict({"kind": "keyword", "expected": ["x"]})) + assert 
spec.cases[0].inputs.mode == "respond" + assert spec.cases[0].inputs.reference is None + + +def test_unknown_mode_rejected() -> None: + """A bogus mode value is a validation error, not a silent pass.""" + data = _minimal_dict({"kind": "keyword", "expected": ["x"]}) + data["cases"][0]["inputs"] = {"message": "hi", "mode": "teleport"} + with pytest.raises(SchemaValidationError): + parse_eval_spec(data) + + # --------------------------------------------------------------------------- # load_eval_spec end-to-end # --------------------------------------------------------------------------- From 02ac96fbb87f0d650a485c8f3944d24fde575d99 Mon Sep 17 00:00:00 2001 From: prakashUXtech Date: Thu, 21 May 2026 17:54:04 +0530 Subject: [PATCH 2/2] fix(eval): gate the reference field to prompt-mode cases score_judge picked the reference judge template whenever case.inputs.reference was truthy, ignoring the mode. CaseInputs.reference is documented as prompt-mode-only, so a respond/recall case carrying reference would silently use a prompt that omits the user's message. Gate the field on mode == "prompt" so the docstring contract holds. Also drop a sentence duplicated back-to-back in the CHANGELOG soul optimize / autoresearch entry, and add a regression test: a respond-mode case with reference set must still get the plain judge prompt. --- CHANGELOG.md | 2 +- src/soul_protocol/eval/scoring.py | 10 ++++++++- tests/test_eval/test_runner.py | 37 +++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bf14cf..91edb0e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - **`prompt` eval case mode — score prompts and skill outputs (paw-workspace#47)** — the eval framework used to evaluate one thing: a seeded soul. It now also scores a plain prompt or the output of a skill. 
A case with `mode: prompt` skips the soul completely — no birth, no context, the `seed` block is ignored — and hands the case's `message` straight to the scorer. The new optional `CaseInputs.reference` field holds the text a skill was originally given; when it is set, the `judge` scorer puts it in front of the LLM as its own "Reference input" block, so the criteria can ask whether a candidate output improved on where it started rather than judging it cold. There is no new scoring kind: prompt and skill outputs go through the existing `JudgeScoring`. There is no new CLI command or flag either — `soul eval` runs a prompt-mode spec the same way it runs a soul spec. New reference spec `tests/eval_examples/humanizer_skill.yaml` scores the workspace `/humanize` skill: a deterministic `regex` gate that runs with no engine, plus four `judge` cases that check a humanized rewrite dropped its AI tells and kept the meaning. Docs: `eval-format.md` gains a "Prompt mode" section; `cli-reference.md` and `api-reference.md` cover the new mode and a Case modes table. This is the read-side of the workspace prompt-evaluation pair — a way to catch it when an edit to a tracked skill makes its output worse. -- **Soul optimize / autoresearch (#142)** — autonomous self-improvement loop that pairs with the soul-aware eval framework (#160). The soul runs an eval against itself, identifies failing cases, proposes targeted changes to its own behaviour-shaping "knobs" (OCEAN traits, persona text, memory thresholds, bond strength), keeps changes that improve the eval score, and reverts the rest. The soul runs an eval against itself, identifies failing cases, proposes targeted changes to its own behaviour-shaping "knobs" (OCEAN traits, persona text, memory thresholds, bond strength), keeps changes that improve the eval score, and reverts the rest. 
New `soul_protocol.optimize` module: `optimize()` entry point, `OptimizeRunner` class with custom knob registration, `Knob` protocol plus four built-in knobs (`OceanTraitKnob` ±0.1/±0.2 within [0,1] per OCEAN dimension; `PersonaTextKnob` LLM-driven persona rephrasings with heuristic no-op fallback; `SignificanceThresholdKnob` for `MemorySettings.importance_threshold` ±1 plus the `skip_deep_processing_on_low_significance` flip; `BondThresholdKnob` for default bond strength ±5/±10), `Proposer` (LLM-assisted with heuristic fallback when no engine or unparseable response), `OptimizeResult`/`OptimizeStep` Pydantic models. Defaults to dry-run (`apply=False`) — every change applied during the run is reverted at the end and no trust chain entries are written; the soul stays byte-identical. With `apply=True` the runner keeps the winning trajectory and appends one `soul.optimize.applied` trust chain entry per kept change with payload `{knob_name, before, after, score_delta}`. Reverted proposals never write entries either way. New `soul optimize ` CLI command (`--iterations`, `--target`, `--apply`, `--engine`, `--json`) and `soul_optimize` MCP tool with the same surface. Pairs naturally with #160 — without the eval, "improvement" is a vibe; with the eval, it's a number that goes up. Full doc at `docs/soul-optimize.md`. +- **Soul optimize / autoresearch (#142)** — autonomous self-improvement loop that pairs with the soul-aware eval framework (#160). The soul runs an eval against itself, identifies failing cases, proposes targeted changes to its own behaviour-shaping "knobs" (OCEAN traits, persona text, memory thresholds, bond strength), keeps changes that improve the eval score, and reverts the rest. 
New `soul_protocol.optimize` module: `optimize()` entry point, `OptimizeRunner` class with custom knob registration, `Knob` protocol plus four built-in knobs (`OceanTraitKnob` ±0.1/±0.2 within [0,1] per OCEAN dimension; `PersonaTextKnob` LLM-driven persona rephrasings with heuristic no-op fallback; `SignificanceThresholdKnob` for `MemorySettings.importance_threshold` ±1 plus the `skip_deep_processing_on_low_significance` flip; `BondThresholdKnob` for default bond strength ±5/±10), `Proposer` (LLM-assisted with heuristic fallback when no engine or unparseable response), `OptimizeResult`/`OptimizeStep` Pydantic models. Defaults to dry-run (`apply=False`) — every change applied during the run is reverted at the end and no trust chain entries are written; the soul stays byte-identical. With `apply=True` the runner keeps the winning trajectory and appends one `soul.optimize.applied` trust chain entry per kept change with payload `{knob_name, before, after, score_delta}`. Reverted proposals never write entries either way. New `soul optimize ` CLI command (`--iterations`, `--target`, `--apply`, `--engine`, `--json`) and `soul_optimize` MCP tool with the same surface. Pairs naturally with #160 — without the eval, "improvement" is a vibe; with the eval, it's a number that goes up. Full doc at `docs/soul-optimize.md`. - **Graph traversal + typed entity ontology (#108, #190)** — entities now carry one of eight built-in kinds (`person`, `place`, `org`, `concept`, `tool`, `document`, `event`, `relation`) plus open-string extension. Eight matching relation predicates (`mentions`, `related`, `depends_on`, `contributes_to`, `causes`, `follows`, `supersedes`, `owned_by`) ship as `RelationType` with the same open contract. The cognitive engine's `extract_entities` prompt asks for the typed ontology plus a `relations` array per entity with `{target, relation, weight}` triples; heuristic-only souls keep working through a translation table that maps legacy types. 
New `Soul.graph` returns a `GraphView` with `nodes()`, `edges()`, `neighbors()`, `path()`, `subgraph()`, `to_mermaid()`, `reachable()`, `stats()`. `Soul.recall` accepts `graph_walk={"start": entity_id, "depth": 2, "edge_types": [...]}` plus `page_token` and `token_budget` for pagination + L0-abstract fallback under budget pressure; new `RecallResults` list subclass carries `next_page_token`, `total_estimate`, `truncated_for_budget` (legacy callers still get `list[MemoryEntry]`). Trust chain hooks: `Soul.observe()` appends `graph.entity_added` and `graph.relation_added` entries for net-new entities/edges. New `soul graph` CLI group (`nodes`/`edges`/`neighbors`/`path`/`mermaid`, all with `--json`) and `soul_graph_query` MCP tool. In-memory dict + adjacency-list storage with `to_dict`/`from_dict` round-trip; pre-0.5.0 graphs load cleanly. Heuristic third-person relation edges (e.g. "Alice knows Bob") now flow through to the graph instead of being dropped. diff --git a/src/soul_protocol/eval/scoring.py b/src/soul_protocol/eval/scoring.py index e5aab1a..de5eaf7 100644 --- a/src/soul_protocol/eval/scoring.py +++ b/src/soul_protocol/eval/scoring.py @@ -10,6 +10,10 @@ # skill eval ask the judge to compare a candidate output against the # original text it was meant to transform. No new scoring kind is added — # prompt/skill outputs reuse JudgeScoring. +# Updated: 2026-05-21 — Gate `inputs.reference` to prompt-mode cases. The +# field's docstring says it is ignored outside prompt mode; score_judge +# now honors that, so a respond/recall case carrying `reference` no +# longer silently drops the user message from the judge prompt. from __future__ import annotations @@ -248,7 +252,11 @@ async def score_judge( }, ) - reference = case.inputs.reference + # `reference` is a prompt-mode-only field — its docstring on + # CaseInputs says so. 
Gate it on the mode so a respond/recall case + # that happens to set `reference` does not silently switch to the + # reference template (which omits the user's actual message). + reference = case.inputs.reference if case.inputs.mode == "prompt" else None if reference: prompt = _JUDGE_PROMPT_WITH_REFERENCE.format( criteria=spec.criteria.strip(), diff --git a/tests/test_eval/test_runner.py b/tests/test_eval/test_runner.py index a787652..c5daccb 100644 --- a/tests/test_eval/test_runner.py +++ b/tests/test_eval/test_runner.py @@ -6,6 +6,10 @@ # Updated: 2026-05-21 (paw-workspace#47) — Added coverage for the "prompt" # case mode: the runner scores the case message verbatim without # touching the soul, the judge picks up the optional `reference` block. +# Updated: 2026-05-21 — Added a regression test asserting `reference` is +# ignored outside prompt mode: a respond-mode case carrying `reference` +# still gets the plain judge prompt (user message present, no reference +# block). from __future__ import annotations @@ -495,3 +499,36 @@ async def test_prompt_mode_reference_reaches_judge() -> None: assert "Reference input" in judge_prompt assert "testament to our enduring commitment" in judge_prompt assert "Version 2 ships Tuesday with offline mode." in judge_prompt + + +@pytest.mark.asyncio +async def test_reference_ignored_outside_prompt_mode() -> None: + """A non-prompt case that sets `reference` must not use it. + + `CaseInputs.reference` is documented as ignored outside prompt mode. + A respond-mode case that carries `reference` must still get the plain + judge prompt — the one that includes the user's actual message — not + the reference template (which omits the message entirely). 
+ """ + spec = _spec_with( + CaseInputs( + message="how is the rollout going?", + mode="respond", + reference="Our rollout stands as a testament to enduring commitment.", + ), + JudgeScoring(criteria="is the response useful?", threshold=0.5), + ) + engine = CapturingEngine('{"score": 0.8, "reasoning": "useful"}') + result = await run_eval(spec, engine=engine) + assert result.cases[0].passed, result.cases[0].details + # respond mode calls the engine twice: once to generate the response, + # once for the judge. The judge prompt is the one carrying the scoring + # rubric — pick it by that signature rather than by index. + judge_prompts = [p for p in engine.prompts if "Score the output" in p] + assert len(judge_prompts) == 1 + judge_prompt = judge_prompts[0] + # The plain judge template is used: user message present, no reference. + assert "Agent input:" in judge_prompt + assert "how is the rollout going?" in judge_prompt + assert "Reference input" not in judge_prompt + assert "testament to enduring commitment" not in judge_prompt