From df19255cf125858d5658e0212234e22f51eb8145 Mon Sep 17 00:00:00 2001 From: David Cramer Date: Sun, 3 May 2026 14:34:52 -0700 Subject: [PATCH] feat(judges): Add harness context to judge API Pass configured harness context into automatic and explicit judge calls so rubric judges can reuse the suite prompt seam without duplicating provider setup. Register fixture run context for matcher assertions, including raw output and session objects, while keeping explicit matcher overrides available for manual values. Make harness prompt configuration required and keep judge prompting on context.harness.prompt(...) so the API does not split judge capabilities across harness and runtime objects. Fixes GH-45 Co-Authored-By: OpenAI Codex --- README.md | 8 +- apps/demo-ai-sdk/evals/shared.ts | 7 + apps/demo-pi/evals/refund.eval.ts | 7 +- apps/demo-pi/evals/refund.fail.eval.ts | 7 +- apps/demo-pi/src/refundAgent.ts | 26 ++ docs/architecture.md | 24 +- docs/custom-scorers.md | 20 +- docs/development-guide.md | 9 +- docs/harness-first-rfc.md | 3 + docs/testing.md | 1 + packages/harness-ai-sdk/README.md | 11 + packages/harness-ai-sdk/src/index.test.ts | 21 ++ packages/harness-ai-sdk/src/index.ts | 2 +- packages/harness-pi-ai/README.md | 8 + packages/harness-pi-ai/src/index.test.ts | 32 ++ packages/harness-pi-ai/src/index.ts | 3 + packages/vitest-evals/README.md | 29 +- packages/vitest-evals/src/harness.test.ts | 310 ++++++++++++++++- packages/vitest-evals/src/harness.ts | 9 +- packages/vitest-evals/src/index.ts | 328 ++++++++++++++---- packages/vitest-evals/src/judges/index.ts | 7 +- .../src/judges/structuredOutputJudge.ts | 25 +- .../vitest-evals/src/judges/toolCallJudge.ts | 20 +- packages/vitest-evals/src/judges/types.ts | 53 +-- policies/README.md | 2 +- policies/api-design.md | 29 ++ 26 files changed, 860 insertions(+), 141 deletions(-) create mode 100644 policies/api-design.md diff --git a/README.md b/README.md index 808a77b..93c1e58 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ import { toolCalls, type JudgeContext, } from "vitest-evals"; -import { createRefundAgent } from "../src/refundAgent"; +import { createRefundAgent, judgePrompt } from "../src/refundAgent"; type RefundEvalMetadata = { expectedStatus: "approved" | "denied"; @@ -110,6 +110,7 @@ describeEval( { harness: piAiHarness({ createAgent: () => createRefundAgent(), + prompt: judgePrompt, }), judges: [FactualityJudge], }, @@ -143,13 +144,16 @@ Harness-backed suites stay close to plain Vitest: - tests call `run(...)` explicitly - ordinary `expect(...)` assertions stay first-class - judges layer in through `expect(...).toSatisfyJudge(...)` +- every judge receives `JudgeContext`, including the configured harness with its + required `prompt` function - per-run judge parameters should usually live under `metadata` - reporter output, replay, usage, and tool traces come from the normalized run Built-in judges like `StructuredOutputJudge()` are still available for deterministic contract checks, but the more realistic explicit-judge path is a custom factuality or rubric judge over `output`, with `JudgeContext` available -when the judge needs richer run/session data. +when the judge needs richer run/session data or the suite's configured model +prompt seam. Tool replay is available for opt-in tools in the first-party harnesses. Configure it globally in Vitest and then mark individual tools with diff --git a/apps/demo-ai-sdk/evals/shared.ts b/apps/demo-ai-sdk/evals/shared.ts index c5f936e..9dde534 100644 --- a/apps/demo-ai-sdk/evals/shared.ts +++ b/apps/demo-ai-sdk/evals/shared.ts @@ -111,6 +111,13 @@ const refundTools = { export const refundHarness = aiSdkHarness({ tools: refundTools, + prompt: (input, options) => + generateText({ + model: anthropic("claude-sonnet-4-5"), + system: options?.system, + prompt: input, + temperature: 0, + }).then((result) => result.text), task: async ({ input, runtime }) => generateText({ model: anthropic("claude-sonnet-4-5"), diff --git a/apps/demo-pi/evals/refund.eval.ts b/apps/demo-pi/evals/refund.eval.ts index 139b68b..0f97ba6 100644 --- a/apps/demo-pi/evals/refund.eval.ts +++ b/apps/demo-pi/evals/refund.eval.ts @@ -6,7 +6,11 @@ import { ToolCallJudge, toolCalls, } from "vitest-evals"; -import { createRefundAgent, type RefundCase } from "../src/refundAgent"; +import { + createRefundAgent, + promptRefundModel, + type RefundCase, +} from "../src/refundAgent"; const outputJudge = StructuredOutputJudge(); @@ -16,6 +20,7 @@ describeEval( skipIf: () => !process.env.ANTHROPIC_API_KEY, harness: piAiHarness({ createAgent: () => createRefundAgent(), + prompt: promptRefundModel, }), judges: [ToolCallJudge()], }, diff --git a/apps/demo-pi/evals/refund.fail.eval.ts b/apps/demo-pi/evals/refund.fail.eval.ts index 294dcbe..a326949 100644 --- a/apps/demo-pi/evals/refund.fail.eval.ts +++ b/apps/demo-pi/evals/refund.fail.eval.ts @@ -1,7 +1,11 @@ import { expect } from "vitest"; import { piAiHarness } from "@vitest-evals/harness-pi-ai"; import { describeEval, StructuredOutputJudge } from "vitest-evals"; -import { createRefundAgent, type RefundCase } from "../src/refundAgent"; +import { + createRefundAgent, + promptRefundModel, + type RefundCase, +} from "../src/refundAgent"; type AssertionRefundCase = RefundCase; type ScoredRefundCase = RefundCase & { @@ -10,6 +14,7 @@ type ScoredRefundCase = RefundCase & { const harness = piAiHarness({ createAgent: () => createRefundAgent(), + prompt: promptRefundModel, }); describeEval( diff --git a/apps/demo-pi/src/refundAgent.ts b/apps/demo-pi/src/refundAgent.ts index cf4709f..1814c5a 100644 --- a/apps/demo-pi/src/refundAgent.ts +++ b/apps/demo-pi/src/refundAgent.ts @@ -6,6 +6,7 @@ import { type Static, } from "@mariozechner/pi-ai"; import type { PiAiRuntime, PiAiToolset } from "@vitest-evals/harness-pi-ai"; +import type { HarnessPromptOptions } from "vitest-evals"; export type InvoiceRecord = { invoiceId: string; @@ -215,6 +216,31 @@ export function createRefundAgent(options?: { model?: RefundAgentModel }) { return new RefundAgent(options?.model ?? DEFAULT_REFUND_MODEL); } +export async function promptRefundModel( + input: string, + options?: HarnessPromptOptions, +) { + const agent = new Agent({ + initialState: { + systemPrompt: options?.system ?? "", + model: getModel("anthropic", DEFAULT_REFUND_MODEL), + thinkingLevel: "off", + tools: [], + }, + toolExecution: "sequential", + }); + + await agent.prompt(input); + + const assistant = getFinalAssistantMessage(agent.state.messages); + const outputText = assistant ? getAssistantText(assistant) : ""; + if (!outputText) { + throw new Error("Prompt model returned an empty response."); + } + + return outputText; +} + function createAgentTools( runtimeTools: RefundAgentRuntimeTools = fallbackRuntimeTools, ): Array> { diff --git a/docs/architecture.md b/docs/architecture.md index 093d51c..e91fad8 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -74,6 +74,10 @@ These are judge-shaped adapters over the legacy comparison logic so new suites can stay on the harness-first surface while older matching behavior remains available. +All judges receive `JudgeContext`, which carries normalized run/session data +plus the configured `harness` and its required `prompt(...)` method. That keeps +rubric and factuality judges on the same API as deterministic judges. + ### `packages/vitest-evals/src/legacy/*` Contains the compatibility layer for scorer-first suites: @@ -112,6 +116,15 @@ For each eval test in a harness-backed suite: 8. The eval test asserts on the same returned result and session. 9. The reporter renders the recorded metadata without re-executing the harness. +Explicit `expect(result).toSatisfyJudge(...)` calls use the run's canonical +text output and reuse registered input, metadata, and harness prompt +when `result` came from the fixture-backed `run(...)`. Inside an eval test, +calls on registered raw output or session objects reuse that exact run context; +raw output values are serialized as the judge `output`, and other raw values +fall back to the current test's most recent `run(...)` context. Calls outside +that context, or on manually-created runs, must pass the context required by +the judge in matcher options. + ## First-Party Harness Packages ### `@vitest-evals/harness-ai-sdk` @@ -149,6 +162,8 @@ New runtime integrations should be implemented as thin adapter packages that: - execute the target runtime through its normal seam - capture messages, tool calls, usage, timings, and errors - normalize them into `HarnessRun` +- expose `prompt` so the same provider/model configuration can be reused by + LLM-backed judges - avoid inventing harness-specific assertion or reporter behavior in userland ### New Judges @@ -157,12 +172,11 @@ Root-level custom evaluation logic should generally be written as judges over normalized run/session data: ```ts -import type { JudgeFn } from "vitest-evals"; +import type { JudgeFn, JudgeOptions } from "vitest-evals"; -export const RefundToolJudge: JudgeFn<{ expectedTools: string[] }> = async ({ - expectedTools, - toolCalls, -}) => ({ +export const RefundToolJudge: JudgeFn< + JudgeOptions<{ expectedTools: string[] }> +> = async ({ expectedTools, toolCalls }) => ({ score: expectedTools.every( (name, index) => toolCalls[index]?.name === name, ) diff --git a/docs/custom-scorers.md b/docs/custom-scorers.md index 5ca6111..c6618f4 100644 --- a/docs/custom-scorers.md +++ b/docs/custom-scorers.md @@ -35,6 +35,7 @@ describeEval( { harness: piAiHarness({ createAgent: () => createRefundAgent(), + prompt: judgePrompt, }), judges: [FactualityJudge], }, @@ -52,9 +53,22 @@ Or run it explicitly inside a test: await expect(result).toSatisfyJudge(FactualityJudge); ``` -For simple response-level checks, a judge can just score `output`. When a -judge needs richer context, type it with `JudgeContext` and read `metadata`, -`toolCalls`, or `session` from there. +For simple response-level checks, a judge can just score `output`. When a judge +needs normalized run context, type it with `JudgeContext` and read `metadata`, +`toolCalls`, `session`, or `harness` from there. `harness.prompt(...)` gives +LLM-backed rubric judges a shared provider/model seam without duplicating +app-level model setup. Calling `harness.run(...)` +inside a judge executes the app again, so reserve that for judges that +intentionally need a second run. + +Explicit matcher calls on the branded result returned by fixture `run(...)` +use the run's canonical text output and reuse registered input, metadata, +harness, and harness prompt. Inside an eval test, matcher calls on registered +raw output or session objects reuse that exact run context; raw output values +are serialized as the judge `output`, and other raw values fall back to the +current test's most recent `run(...)` context. Matcher calls outside that +context, or on manually-created runs, should pass the context required by the +judge in `toSatisfyJudge(...)` options. ## Built-In Root Judges diff --git a/docs/development-guide.md b/docs/development-guide.md index 4d27ba0..c94fc65 100644 --- a/docs/development-guide.md +++ b/docs/development-guide.md @@ -84,12 +84,11 @@ product story, not just smoke tests. `packages/` is for real package surfaces. Root-level evaluation logic should usually be implemented as a `JudgeFn`: ```ts -import type { JudgeFn } from "vitest-evals"; +import type { JudgeFn, JudgeOptions } from "vitest-evals"; -export const DomainJudge: JudgeFn<{ expectedTool: string }> = async ({ - toolCalls, - expectedTool, -}) => ({ +export const DomainJudge: JudgeFn< + JudgeOptions<{ expectedTool: string }> +> = async ({ toolCalls, expectedTool }) => ({ score: toolCalls.some((call) => call.name === expectedTool) ? 1 : 0, metadata: { rationale: `Expected tool ${expectedTool}`, diff --git a/docs/harness-first-rfc.md b/docs/harness-first-rfc.md index 7f651cb..63e4d9a 100644 --- a/docs/harness-first-rfc.md +++ b/docs/harness-first-rfc.md @@ -135,6 +135,7 @@ describeEval( { harness: piAiHarness({ createAgent: () => createRefundAgent(), + prompt: judgePrompt, run: ({ agent, input, runtime }) => agent.run(input, runtime), }), }, @@ -169,6 +170,7 @@ The default path should be close to zero glue for standard apps: describeEval("refund agent", { harness: piAiHarness({ createAgent: () => createRefundAgent(), + prompt: judgePrompt, }), }, (it) => { it("approves a refundable invoice", async ({ run }) => { @@ -187,6 +189,7 @@ entrypoint or custom result shape: describeEval("refund agent", { harness: piAiHarness({ createAgent: () => createRefundAgent(), + prompt: judgePrompt, run: ({ agent, input, runtime }) => agent.execute(input, runtime), normalize: { output: ({ result }) => result.decision, diff --git a/docs/testing.md b/docs/testing.md index 6df0df2..708494a 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -80,6 +80,7 @@ describeEval( { harness: piAiHarness({ createAgent: () => createRefundAgent(), + prompt: judgePrompt, }), judges: [ToolCallJudge()], }, diff --git a/packages/harness-ai-sdk/README.md b/packages/harness-ai-sdk/README.md index 2e3f109..e2461a4 100644 --- a/packages/harness-ai-sdk/README.md +++ b/packages/harness-ai-sdk/README.md @@ -25,6 +25,12 @@ const tools = { const harness = aiSdkHarness({ tools, + prompt: (input, options) => + generateText({ + model: openai("gpt-4o-mini"), + system: options?.system, + prompt: input, + }).then((result) => result.text), task: ({ input, runtime }) => generateText({ model: openai("gpt-4o-mini"), @@ -40,10 +46,15 @@ If your existing AI SDK app exposes its own entrypoint, wire that in directly: ```ts const harness = aiSdkHarness({ tools, + prompt: sharedJudgePrompt, task: ({ input, runtime }) => createRefundAgent().run(input, runtime), }); ``` +The required `prompt` callback is passed to harness-backed judges as +`JudgeContext.harness.prompt`, which lets rubric or factuality judges share the +same provider/model configuration as the suite harness. + The adapter infers: - normalized session and tool-call traces from AI SDK `steps` diff --git a/packages/harness-ai-sdk/src/index.test.ts b/packages/harness-ai-sdk/src/index.test.ts index 78a0a55..5fcdeea 100644 --- a/packages/harness-ai-sdk/src/index.test.ts +++ b/packages/harness-ai-sdk/src/index.test.ts @@ -12,6 +12,8 @@ type DemoMetadata = { let replayDir: string | undefined; +const judgePrompt = async (input: string) => input; + afterEach(() => { vi.unstubAllEnvs(); if (replayDir) { @@ -163,6 +165,7 @@ describeEval( "ai-sdk harness adapter", { harness: aiSdkHarness({ + prompt: judgePrompt, task: async () => ({ ...generateTextLikeResult, object: { @@ -226,6 +229,7 @@ describeEval( "ai-sdk harness adapter custom entrypoint", { harness: aiSdkHarness({ + prompt: judgePrompt, agent: () => { const generate = vi.fn( async ( @@ -393,6 +397,7 @@ test("default agent run receives wrapped runtime tools", async () => { ); const harness = aiSdkHarness({ + prompt: judgePrompt, agent: () => ({ run, }), @@ -437,6 +442,7 @@ test("attaches partial runtime tool calls when a task errors", async () => { refundable: true, })); const harness = aiSdkHarness({ + prompt: judgePrompt, tools: { lookupInvoice: { inputSchema: z.object({ @@ -512,6 +518,7 @@ test("attaches partial runtime tool calls when a task errors", async () => { test("omits empty runtime tool error content when a task errors", async () => { const harness = aiSdkHarness({ + prompt: judgePrompt, tools: { lookupInvoice: { inputSchema: z.object({ @@ -582,6 +589,7 @@ test("omits empty runtime tool error content when a task errors", async () => { test("preserves explicit null runtime tool results", async () => { const harness = aiSdkHarness({ + prompt: judgePrompt, tools: { lookupInvoice: { inputSchema: z.object({ @@ -656,6 +664,7 @@ test("preserves explicit null runtime tool results", async () => { test("marks step-derived tool messages as errors when the tool call failed", async () => { const harness = aiSdkHarness({ + prompt: judgePrompt, task: async () => ({ text: "done", steps: [ @@ -767,6 +776,7 @@ test("keeps runtime-only tool calls when SDK steps are also present", async () = refundable: true, })); const harness = aiSdkHarness({ + prompt: judgePrompt, tools: { lookupInvoice: { inputSchema: z.object({ @@ -884,6 +894,7 @@ test("creates a fresh agent for each explicit run", async () => { run, })); const harness = aiSdkHarness({ + prompt: judgePrompt, agent: createAgent, }); const context = createHarnessContext({}); @@ -929,6 +940,7 @@ test("normalizes domain results that resemble harness runs", async () => { }), ); const harness = aiSdkHarness({ + prompt: judgePrompt, task: async () => ({ session: { messages: [], @@ -972,6 +984,7 @@ test("normalizes domain results that resemble harness runs", async () => { test("aggregates per-step usage when total usage is missing", async () => { const harness = aiSdkHarness({ + prompt: judgePrompt, task: async () => ({ text: "approved", steps: [ @@ -1055,6 +1068,7 @@ test("aggregates per-step usage when total usage is missing", async () => { test("normalizes arrays and empty objects without dropping positions", async () => { const harness = aiSdkHarness({ + prompt: judgePrompt, task: async () => ({ object: { values: [1, undefined, { skipped: undefined }, 3], @@ -1143,6 +1157,7 @@ test("normalizes arrays and empty objects without dropping positions", async () test("preserves empty root tool arguments and omits zero tool usage", async () => { const harness = aiSdkHarness({ + prompt: judgePrompt, task: async () => ({ steps: [ { @@ -1209,6 +1224,7 @@ test("preserves empty root tool arguments and omits zero tool usage", async () = expect(toolCalls(run.session)[0].arguments).toEqual({}); const noToolHarness = aiSdkHarness({ + prompt: judgePrompt, task: async () => ({ steps: [ { @@ -1242,6 +1258,7 @@ test("preserves empty root tool arguments and omits zero tool usage", async () = test("uses invalid tool call details as the normalized error", async () => { const harness = aiSdkHarness({ + prompt: judgePrompt, task: async () => ({ steps: [ { @@ -1301,6 +1318,7 @@ test("uses invalid tool call details as the normalized error", async () => { test("omits undefined step-normalized arguments and results", async () => { const harness = aiSdkHarness({ + prompt: judgePrompt, task: async () => ({ steps: [ { @@ -1368,6 +1386,7 @@ test("records and replays opt-in tools in auto mode", async () => { })); const replayHarness = aiSdkHarness({ + prompt: judgePrompt, tools: { lookupInvoice: { replay: true, @@ -1512,6 +1531,7 @@ test("rejects async iterable replay outputs after awaiting execute", async () => } const replayHarness = aiSdkHarness({ + prompt: judgePrompt, tools: { streamRefund: { replay: true, @@ -1557,6 +1577,7 @@ test("errors when strict mode is missing a recording", async () => { })); const replayHarness = aiSdkHarness({ + prompt: judgePrompt, tools: { lookupInvoice: { replay: true, diff --git a/packages/harness-ai-sdk/src/index.ts b/packages/harness-ai-sdk/src/index.ts index ebcbc9f..1d812cb 100644 --- a/packages/harness-ai-sdk/src/index.ts +++ b/packages/harness-ai-sdk/src/index.ts @@ -214,7 +214,7 @@ interface AiSdkHarnessBaseOptions< errors?: ( args: AiSdkHarnessResultArgs, ) => MaybePromise>>; - prompt?: HarnessPrompt; + prompt: HarnessPrompt; name?: string; } diff --git a/packages/harness-pi-ai/README.md b/packages/harness-pi-ai/README.md index 0a7abaf..622fe6e 100644 --- a/packages/harness-pi-ai/README.md +++ b/packages/harness-pi-ai/README.md @@ -15,9 +15,13 @@ import { piAiHarness } from "@vitest-evals/harness-pi-ai"; const harness = piAiHarness({ createAgent: () => createRefundAgent(), + prompt: sharedJudgePrompt, }); ``` +`prompt` gives rubric or factuality judges the same provider/model setup +through `JudgeContext.harness.prompt`. + If the agent already exposes its own tools, the adapter will infer them from the agent by default. If your existing Pi Mono agent already exposes its own entrypoint, wire that up directly and let the harness provide the runtime @@ -26,6 +30,7 @@ seam: ```ts const harness = piAiHarness({ createAgent: () => createRefundAgent(), + prompt: sharedJudgePrompt, run: ({ agent, input, runtime }) => agent.execute(input, runtime), }); ``` @@ -44,6 +49,7 @@ override: ```ts const harness = piAiHarness({ createAgent: () => createRefundAgent(), + prompt: sharedJudgePrompt, tools: hiddenAgentTools, }); ``` @@ -54,6 +60,7 @@ normalization hooks still exist under `normalize`: ```ts const harness = piAiHarness({ createAgent: () => createWrappedRefundAgent(), + prompt: sharedJudgePrompt, run: ({ agent, input, runtime }) => agent.run(input, runtime), normalize: { output: ({ result }) => result.customDecision, @@ -64,6 +71,7 @@ const harness = piAiHarness({ The adapter provides: - a runtime/tool injection seam for an existing agent +- a required prompt seam for LLM-backed judges - normalized session capture from emitted events and wrapped tool calls - usage/output inference for common `pi-ai`-style result objects - opt-in tool replay/recording when the tool definition sets `replay: true` diff --git a/packages/harness-pi-ai/src/index.test.ts b/packages/harness-pi-ai/src/index.test.ts index 391296e..70ee5c7 100644 --- a/packages/harness-pi-ai/src/index.test.ts +++ b/packages/harness-pi-ai/src/index.test.ts @@ -23,6 +23,8 @@ type DemoRuntime = PiAiRuntime; let replayDir: string | undefined; +const judgePrompt = async (input: string) => input; + afterEach(() => { vi.unstubAllEnvs(); if (replayDir) { @@ -62,10 +64,28 @@ const runAgent = vi.fn( }, ); +test("exposes the configured prompt on the harness", async () => { + const prompt = vi.fn(async (input: string) => `judge: ${input}`); + const harness = piAiHarness({ + agent: { + id: "refund-agent", + }, + prompt, + run: runAgent, + tools, + }); + + await expect(harness.prompt("score refund")).resolves.toBe( + "judge: score refund", + ); + expect(prompt).toHaveBeenCalledWith("score refund"); +}); + describeEval( "pi-ai harness adapter", { harness: piAiHarness({ + prompt: judgePrompt, createAgent, run: runAgent, tools, @@ -105,6 +125,7 @@ describeEval( "pi-ai harness wraps native agent tools", { harness: piAiHarness({ + prompt: judgePrompt, createAgent: () => { const nativeTools = [ { @@ -211,6 +232,7 @@ describeEval( "pi-ai harness wraps native tools even with an explicit tool override", { harness: piAiHarness({ + prompt: judgePrompt, createAgent: () => { const nativeTools = [ { @@ -292,6 +314,7 @@ describeEval( "pi-ai harness reapplies native tool instrumentation after reset", { harness: piAiHarness({ + prompt: judgePrompt, createAgent: () => { const createNativeTool = () => ({ name: "lookupInvoice", @@ -384,6 +407,7 @@ describeEval( "pi-ai harness infers runtime toolsets and native tools together", { harness: piAiHarness({ + prompt: judgePrompt, createAgent: () => { const toolset = { lookupInvoice: { @@ -474,6 +498,7 @@ describeEval( "pi-ai harness infers runtime toolsets from existing agents", { harness: piAiHarness({ + prompt: judgePrompt, createAgent: () => { const toolset = { lookupInvoice: { @@ -529,6 +554,7 @@ test("prefers inferred non-empty runtime toolsets over empty placeholders", asyn refundable: true, })); const harness = piAiHarness({ + prompt: judgePrompt, createAgent: () => { const toolset = { lookupInvoice: { @@ -582,6 +608,7 @@ test("prefers inferred non-empty runtime toolsets over empty placeholders", asyn test("supports normalize.output as a low-level escape hatch", async () => { const normalizedHarness = piAiHarness({ + prompt: judgePrompt, createAgent: () => ({ id: "refund-agent" }), run: async () => ({ customDecision: { @@ -616,6 +643,7 @@ test("supports normalize.output as a low-level escape hatch", async () => { test("applies normalize overrides to HarnessRun-shaped results", async () => { const normalizedHarness = piAiHarness({ + prompt: judgePrompt, createAgent: () => ({ id: "refund-agent" }), run: async () => ({ session: { @@ -678,6 +706,7 @@ test("applies normalize overrides to HarnessRun-shaped results", async () => { test("attaches a partial run when the harness errors", async () => { const erroringHarness = piAiHarness({ + prompt: judgePrompt, createAgent: () => ({ id: "refund-agent" }), tools: { lookupInvoice: { @@ -760,6 +789,7 @@ test("replays native agent tools without breaking the agent-facing result", asyn ); const replayHarness = piAiHarness({ + prompt: judgePrompt, createAgent: () => { const nativeTools = [ { @@ -913,6 +943,7 @@ test("records and replays opt-in tools in auto mode", async () => { })); const replayHarness = piAiHarness({ + prompt: judgePrompt, createAgent: () => ({ id: "refund-agent" }), tools: { lookupInvoice: { @@ -996,6 +1027,7 @@ test("errors when strict mode is missing a recording", async () => { })); const replayHarness = piAiHarness({ + prompt: judgePrompt, createAgent: () => ({ id: "refund-agent" }), tools: { lookupInvoice: { diff --git a/packages/harness-pi-ai/src/index.ts b/packages/harness-pi-ai/src/index.ts index 66eb68f..4e4c374 100644 --- a/packages/harness-pi-ai/src/index.ts +++ b/packages/harness-pi-ai/src/index.ts @@ -2,6 +2,7 @@ import type { Harness, HarnessContext, HarnessMetadata, + HarnessPrompt, HarnessRun, JsonValue, NormalizedMessage, @@ -196,6 +197,7 @@ interface PiAiHarnessBaseOptions< TResult, TTools >; + prompt: HarnessPrompt; name?: string; } @@ -348,6 +350,7 @@ export function piAiHarness< ): Harness { return { name: options.name ?? "pi-ai", + prompt: options.prompt, run: async (input, context) => { const agent = await resolveAgent(options); const messages: NormalizedMessage[] = [ diff --git a/packages/vitest-evals/README.md b/packages/vitest-evals/README.md index 286570d..aecb626 100644 --- a/packages/vitest-evals/README.md +++ b/packages/vitest-evals/README.md @@ -28,6 +28,8 @@ npm install -D @vitest-evals/harness-ai-sdk - per-run judge inputs should usually live under `metadata` - suite-level `judges` are optional and run automatically after each `run(...)` - suite-level `judgeThreshold` controls fail-on-score for those automatic judges +- every judge receives `JudgeContext`, including the configured `harness` with + its required `prompt` function - explicit judge assertions use `await expect(result).toSatisfyJudge(judge, context)` @@ -42,7 +44,7 @@ import { toolCalls, type JudgeContext, } from "vitest-evals"; -import { createRefundAgent } from "../src/refundAgent"; +import { createRefundAgent, judgePrompt } from "../src/refundAgent"; type RefundEvalMetadata = { expectedStatus: "approved" | "denied"; @@ -76,6 +78,7 @@ describeEval( { harness: piAiHarness({ createAgent: () => createRefundAgent(), + prompt: judgePrompt, }), judges: [FactualityJudge], }, @@ -186,12 +189,24 @@ const FactualityJudge = namedJudge( ); ``` -For a `HarnessRun`, `toSatisfyJudge(...)` passes `result.output` as `output`. -For raw values or normalized sessions, the matcher infers the best available -output from the received value. Structured or programmatic result checks should -usually assert on `result.output` directly. When a judge needs richer context, -type it with `JudgeContext` and read `inputValue`, `metadata`, `toolCalls`, or -`session` from there. +LLM-backed judges can reuse the suite harness prompt by calling +`harness.prompt(...)`. `vitest-evals` does not prescribe a rubric schema, +scoring scale, model provider, or parser; those stay in the judge. Calling +`harness.run(...)` from a judge executes the application again, so use that +only when a second run is intentional. + +For an `EvalHarnessRun` returned by fixture `run(...)`, +`toSatisfyJudge(...)` uses the run's canonical text output and reuses the +registered input, metadata, and harness prompt. Inside an eval test, +matcher calls on registered raw output or session objects reuse that exact run +context; raw output values are serialized as the judge `output`, so +`expect(result.output).toSatisfyJudge(judge)` stays concise. Other raw values +fall back to the current test's most recent `run(...)` context. For +manually-created runs or values outside an eval context, pass any required +`inputValue`, `metadata`, or `harness` in matcher options. Structured or +programmatic result checks should usually assert on `result.output` directly. +When a judge needs richer normalized context or the configured suite harness, +type it with `JudgeContext`. When you only need deterministic contract checks, built-ins such as `StructuredOutputJudge()` and `ToolCallJudge()` are still available. The primary diff --git a/packages/vitest-evals/src/harness.test.ts b/packages/vitest-evals/src/harness.test.ts index 957138f..1941be3 100644 --- a/packages/vitest-evals/src/harness.test.ts +++ b/packages/vitest-evals/src/harness.test.ts @@ -67,11 +67,20 @@ const runSpy = vi.fn( }, ); +const promptSpy = vi.fn(async (input: string) => `judge prompt: ${input}`); +const customJudgePromptSpy = vi.fn(async (_input: string) => ({ score: 1 })); + const harness: Harness = { name: "pi-ai", + prompt: promptSpy, run: runSpy, }; +const customHarness = { + ...harness, + judgePrompt: customJudgePromptSpy, +}; + const judgeSpy = vi.fn( async (opts: JudgeContext) => ({ score: opts.metadata.expectedStatus === "approved" ? 1 : 0, @@ -86,6 +95,8 @@ const thresholdJudgeSpy = vi.fn( beforeEach(() => { runSpy.mockClear(); + promptSpy.mockClear(); + customJudgePromptSpy.mockClear(); judgeSpy.mockClear(); thresholdJudgeSpy.mockClear(); }); @@ -180,6 +191,78 @@ describeEval( }, ); +describeEval( + "harness mode with automatic judge prompt", + { + harness, + judges: [ + async ({ + harness: configuredHarness, + metadata, + }: JudgeContext) => { + const promptOutput = await configuredHarness.prompt("score refund", { + system: "grade the refund decision", + }); + + return { + score: + configuredHarness === harness && + metadata.expectedStatus === "approved" && + promptOutput === "judge prompt: score refund" + ? 1 + : 0, + }; + }, + ], + }, + (it) => { + it("passes the configured harness prompt to automatic judges", async ({ + run, + }) => { + await run("Refund invoice inv_123", { + metadata: { + name: "refund request with prompt judge", + expectedStatus: "approved", + }, + }); + + expect(promptSpy).toHaveBeenCalledWith("score refund", { + system: "grade the refund decision", + }); + }); + }, +); + +describeEval( + "harness mode with custom harness judge helpers", + { + harness: customHarness, + judges: [ + async ({ harness: configuredHarness }) => { + const verdict = await configuredHarness.judgePrompt("score refund"); + + return { + score: verdict.score, + }; + }, + ], + }, + (it) => { + it("preserves the configured harness subtype for judges", async ({ + run, + }) => { + await run("Refund invoice inv_123", { + metadata: { + name: "refund request with typed harness helper", + expectedStatus: "approved", + }, + }); + + expect(customJudgePromptSpy).toHaveBeenCalledWith("score refund"); + }); + }, +); + describeEval( "harness mode with explicit suite judge threshold", { @@ -270,6 +353,156 @@ describeEval("harness mode with explicit judge matcher", { harness }, (it) => { ], }); }); + + it("reuses the suite harness and metadata for explicit judges", async ({ + run, + }) => { + const result = await run("Refund invoice inv_123", { + metadata: { + name: "refund request with explicit prompt judge", + expectedStatus: "approved", + }, + }); + const explicitJudge = vi.fn( + async ({ + harness: configuredHarness, + metadata, + }: JudgeContext) => { + const promptOutput = await configuredHarness.prompt( + "score explicit refund", + ); + + return { + score: + configuredHarness === harness && + metadata.expectedStatus === "approved" && + promptOutput === "judge prompt: score explicit refund" + ? 1 + : 0, + }; + }, + ); + + await expect(result).toSatisfyJudge(explicitJudge); + + expect(explicitJudge).toHaveBeenCalledWith( + expect.objectContaining({ + harness, + metadata: { + expectedStatus: "approved", + name: "refund request with explicit prompt judge", + }, + }), + ); + expect(promptSpy).toHaveBeenCalledWith("score explicit refund"); + }); + + it("uses the current test run context for raw explicit judge values", async ({ + run, + }) => { + const result = await run("Refund invoice inv_123", { + metadata: { + name: "refund request with contextual raw judge", + expectedStatus: "approved", + }, + }); + const explicitJudge = vi.fn( + async ({ + harness: configuredHarness, + inputValue, + metadata, + output, + run: judgeRun, + session, + toolCalls: judgeToolCalls, + }: JudgeContext) => { + const promptOutput = await configuredHarness.prompt(inputValue); + + return { + score: + configuredHarness === harness && + output === JSON.stringify(result.output) && + judgeRun === result && + session === result.session && + judgeToolCalls[0]?.name === "lookupInvoice" && + metadata.expectedStatus === "approved" && + promptOutput === "judge prompt: Refund invoice inv_123" + ? 1 + : 0, + }; + }, + ); + + await expect(result.output).toSatisfyJudge(explicitJudge); + + expect(explicitJudge).toHaveBeenCalledWith( + expect.objectContaining({ + harness, + inputValue: "Refund invoice inv_123", + output: JSON.stringify(result.output), + run: result, + session: result.session, + toolCalls: [ + { + name: "lookupInvoice", + arguments: { + invoiceId: "inv_123", + }, + }, + ], + metadata: { + expectedStatus: "approved", + name: "refund request with contextual raw judge", + }, + }), + ); + }); + + it("prefers exact output object context over the latest run fallback", async ({ + run, + }) => { + const first = await run("Refund invoice inv_123", { + metadata: { + name: "first raw judge context", + expectedStatus: "approved", + }, + }); + + await run("Refund invoice inv_456", { + metadata: { + name: "second raw judge context", + expectedStatus: "rejected", + }, + }); + + const explicitJudge = vi.fn( + async ({ + inputValue, + metadata, + run: judgeRun, + }: JudgeContext) => ({ + score: + inputValue === "Refund invoice inv_123" && + metadata.name === "first raw judge context" && + judgeRun === first + ? 1 + : 0, + }), + ); + + await expect(first.output).toSatisfyJudge(explicitJudge); + + expect(explicitJudge).toHaveBeenCalledWith( + expect.objectContaining({ + inputValue: "Refund invoice inv_123", + metadata: { + expectedStatus: "approved", + name: "first raw judge context", + }, + run: first, + }), + ); + }); }); describeEval( @@ -277,6 +510,7 @@ describeEval( { harness: { name: "flaky-harness", + prompt: promptSpy, run: vi .fn<(input: string, context: HarnessContext) => Promise>() .mockResolvedValueOnce({ @@ -327,7 +561,13 @@ test("toSatisfyJudge reuses normalized harness run data", async () => { }), ); - await expect(run).toSatisfyJudge(explicitJudge); + await expect(run).toSatisfyJudge(explicitJudge, { + inputValue: "Refund invoice inv_123", + metadata: { + expectedStatus: "approved", + name: "explicit judge", + }, + }); expect(explicitJudge).toHaveBeenCalledWith( expect.objectContaining({ @@ -348,7 +588,10 @@ test("toSatisfyJudge reuses normalized harness run data", async () => { }, }, ], - metadata: {}, + metadata: { + expectedStatus: "approved", + name: "explicit judge", + }, }), ); }); @@ -373,6 +616,7 @@ test("automatic judges read per-run params from metadata", async () => { }); await expect(run).toSatisfyJudge(metadataJudge, { + inputValue: "Refund invoice inv_123", metadata: { expectedStatus: "approved", name: "compatibility judge", @@ -389,6 +633,49 @@ test("automatic judges read per-run params from metadata", async () => { ); }); +test("toSatisfyJudge accepts explicit harness context for raw values", async () => { + const explicitJudge = vi.fn( + async ({ + harness: configuredHarness, + inputValue, + metadata, + }: JudgeContext) => { + const promptOutput = await configuredHarness.prompt(inputValue); + + return { + score: + configuredHarness === harness && + metadata.expectedStatus === "approved" && + promptOutput === "judge prompt: Refund invoice inv_123" + ? 1 + : 0, + }; + }, + ); + + await expect({ + status: "approved", + }).toSatisfyJudge(explicitJudge, { + inputValue: "Refund invoice inv_123", + metadata: { + expectedStatus: "approved", + name: "raw value with explicit harness context", + }, + harness, + }); + + expect(explicitJudge).toHaveBeenCalledWith( + expect.objectContaining({ + harness, + inputValue: "Refund invoice inv_123", + metadata: { + expectedStatus: "approved", + name: "raw value with explicit harness context", + }, + }), + ); +}); + test("toSatisfyJudge uses plain input to seed synthetic sessions", async () => { const sessionJudge = vi.fn(async (opts: JudgeContext) => ({ score: @@ -585,6 +872,7 @@ test("ToolCallJudge accepts string expected tools", async () => { const result = await judge({ input: "Refund invoice inv_123", + inputValue: "Refund invoice inv_123", output: '{"status":"approved"}', expectedTools: ["lookupInvoice", "createRefund"], toolCalls: [ @@ -595,6 +883,18 @@ test("ToolCallJudge accepts string expected tools", async () => { name: "createRefund", }, ], + metadata: {}, + run: { + session: { + messages: [], + }, + usage: {}, + errors: [], + }, + session: { + messages: [], + }, + harness: undefined, }); expect(result.score).toBe(1); @@ -605,7 +905,9 @@ test("StructuredOutputJudge reads expected fields from metadata", async () => { const result = await judge({ input: "Refund invoice inv_123", + inputValue: "Refund invoice inv_123", output: '{"status":"approved","reason":"invoice refunded"}', + toolCalls: [], run: { session: { messages: [], @@ -617,11 +919,15 @@ test("StructuredOutputJudge reads expected fields from metadata", async () => { usage: {}, errors: [], }, + session: { + messages: [], + }, metadata: { expected: { status: "approved", }, }, + harness: undefined, }); expect(result.score).toBe(1); diff --git a/packages/vitest-evals/src/harness.ts b/packages/vitest-evals/src/harness.ts index 028247c..3bfc377 100644 --- a/packages/vitest-evals/src/harness.ts +++ b/packages/vitest-evals/src/harness.ts @@ -63,20 +63,18 @@ export type HarnessRun = { errors: Array>; }; +/** Optional provider-facing hints for harness prompt calls. */ export type HarnessPromptOptions = { system?: string; metadata?: Record; }; +/** Provider-agnostic prompt seam that judges can reuse from a harness. */ export type HarnessPrompt = ( input: string, options?: HarnessPromptOptions, ) => Promise; -export type HarnessRuntime = { - prompt: HarnessPrompt; -}; - export type HarnessRunError = Error & { vitestEvalsRun: HarnessRun; }; @@ -100,7 +98,8 @@ export type Harness< TMetadata extends HarnessMetadata = HarnessMetadata, > = { name: string; - prompt?: HarnessPrompt; + /** Prompt seam reused by LLM-backed judges. */ + prompt: HarnessPrompt; run: ( input: TInput, context: HarnessContext, diff --git a/packages/vitest-evals/src/index.ts b/packages/vitest-evals/src/index.ts index 38d7f38..4fbce18 100644 --- a/packages/vitest-evals/src/index.ts +++ b/packages/vitest-evals/src/index.ts @@ -19,9 +19,9 @@ import { userMessages, } from "./harness"; import type { - BaseJudgeOptions, JudgeContext, JudgeFn, + JudgeOptions, JudgeResult, } from "./judges/types"; import { wrapText } from "./wrapText"; @@ -44,13 +44,41 @@ type EvalTaskLike = { meta: EvalTaskMeta; }; +type RegisteredJudgeRunContext = { + harness: Harness; + inputValue: unknown; + metadata: HarnessMetadata; + run: HarnessRun; +}; + type InternalEvalFixtures = { harness: Harness; - automaticJudges: Array>>; + automaticJudges: Array>>; judgeThreshold: number | null | undefined; run: EvalRun; }; +type HarnessInput> = + THarness extends Harness ? TInput : unknown; + +type HarnessMetadataFor> = + THarness extends Harness ? TMetadata : HarnessMetadata; + +declare const evalHarnessRunBrand: unique symbol; + +/** Harness run returned by the fixture-backed `run(...)` API. */ +export type EvalHarnessRun< + TInput = unknown, + TMetadata extends HarnessMetadata = HarnessMetadata, + THarness extends Harness = Harness, +> = HarnessRun & { + readonly [evalHarnessRunBrand]: { + readonly input: TInput; + readonly metadata: TMetadata; + readonly harness: THarness; + }; +}; + /** Per-run metadata forwarded to the harness alongside the test input. */ export interface EvalRunOptions< TMetadata extends HarnessMetadata = HarnessMetadata, @@ -62,59 +90,74 @@ export interface EvalRunOptions< export type EvalRun< TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, -> = (input: TInput, options?: EvalRunOptions) => Promise; + THarness extends Harness = Harness, +> = ( + input: TInput, + options?: EvalRunOptions, +) => Promise>; /** Fixture-backed Vitest context exposed inside `describeEval(...)` tests. */ export interface EvalTestContext< TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, + THarness extends Harness = Harness, > { - run: EvalRun; + run: EvalRun; } export type EvalTestAPI< TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, -> = TestAPI>; - -/** - * Compatibility alias for harness-backed judge inputs. - * - * New custom judges should prefer `JudgeContext` directly. This alias remains - * for older imports that were already using the harness-backed judge shape. - */ -export type HarnessJudgeOptions< - TInput = unknown, - TMetadata extends HarnessMetadata = HarnessMetadata, -> = JudgeContext; + THarness extends Harness = Harness, +> = TestAPI>; /** Suite-level configuration for a harness-backed eval block. */ export interface DescribeEvalOptions< TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, + THarness extends Harness = Harness, > { /** Harness used for every explicit `run(...)` call in the suite. */ - harness: Harness; + harness: THarness; /** Automatic judges applied after each successful `run(...)`. */ - judges?: Array>>; + judges?: Array>>; /** Passing threshold for automatic suite-level judges. `null` disables fail-on-score. */ judgeThreshold?: number | null; skipIf?: () => boolean; } -type JudgeAssertionInputValue = - TJudgeOptions extends { inputValue: infer TInput } ? TInput : unknown; +type JudgeAssertionInputValue< + TJudgeOptions extends JudgeContext, +> = TJudgeOptions extends { inputValue: infer TInput } ? TInput : unknown; -type JudgeAssertionMetadata = +type JudgeAssertionMetadata> = TJudgeOptions extends { metadata: infer TMetadata } ? TMetadata : HarnessMetadata; +type JudgeAssertionHarness> = + TJudgeOptions extends { harness: infer THarness } + ? Exclude + : Harness< + JudgeAssertionInputValue, + JudgeAssertionMetadata + >; + /** Optional overrides passed to `expect(...).toSatisfyJudge(...)`. */ export type JudgeAssertionOptions< - TJudgeOptions extends BaseJudgeOptions = BaseJudgeOptions, + TJudgeOptions extends JudgeContext = JudgeContext, > = Partial< - Omit + Omit< + TJudgeOptions, + | "input" + | "output" + | "inputValue" + | "metadata" + | "toolCalls" + | "run" + | "session" + | "harness" + > > & { input?: string; inputValue?: JudgeAssertionInputValue; @@ -122,16 +165,17 @@ export type JudgeAssertionOptions< toolCalls?: ToolCallRecord[]; run?: HarnessRun; session?: HarnessRun["session"]; + harness?: JudgeAssertionHarness; /** Passing threshold for the explicit matcher. `null` records the score without failing. */ threshold?: number | null; }; -export type ToSatisfyJudge = < - TJudgeOptions extends BaseJudgeOptions = BaseJudgeOptions, +export type ToSatisfyJudge = < + TJudgeOptions extends JudgeContext = JudgeContext, >( judge: JudgeFn, options?: JudgeAssertionOptions, -) => Promise; +) => Promise; export interface EvalMatchers { toSatisfyJudge: ToSatisfyJudge; @@ -144,16 +188,18 @@ declare module "vitest" { interface TaskMeta extends EvalTaskMeta {} } +const judgeRunContextByObject = new WeakMap< + object, + RegisteredJudgeRunContext +>(); + const evalTest = test .extend("harness", async (): Promise => { throw new Error( "describeEval must override the harness fixture before running tests.", ); }) - .extend( - "automaticJudges", - [] as Array>>, - ) + .extend("automaticJudges", [] as Array>>) .extend("judgeThreshold", undefined as number | null | undefined) .extend( "run", @@ -187,6 +233,7 @@ const evalTest = test } setHarnessMeta(task, resolvedHarness.name, partialRun); + recordJudgeRunContext(partialRun, resolvedHarness, input, metadata); } throw error; @@ -197,38 +244,45 @@ const evalTest = test } setHarnessMeta(task, resolvedHarness.name, run); + recordJudgeRunContext(run, resolvedHarness, input, metadata); if (automaticJudges.length > 0) { await applyAutomaticJudges( task, automaticJudges, judgeThreshold, + resolvedHarness, input, metadata, run, ); } - return run; + return run as EvalHarnessRun< + unknown, + HarnessMetadata, + typeof resolvedHarness + >; }; }, ) as TestAPI; expect.extend({ toSatisfyJudge: async function toSatisfyJudge< - TJudgeOptions extends BaseJudgeOptions = BaseJudgeOptions, + TJudgeOptions extends JudgeContext = JudgeContext, >( received: unknown, judge: JudgeFn, options: JudgeAssertionOptions = {}, ) { const { threshold = 1.0, ...context } = options; - const judgeOptions = buildJudgeAssertionOptions(received, context); + const judgeOptions = buildJudgeAssertionOptions( + received, + context, + isEvalTaskLike(this.task) ? this.task : undefined, + ); - let result = judge(judgeOptions); - if (result instanceof Promise) { - result = await result; - } + const result = await judge(judgeOptions); const score = result.score ?? 0; const pass = threshold === null ? true : score >= threshold; @@ -268,6 +322,7 @@ expect.extend({ * describeEval("refund agent", { * harness: piAiHarness({ * createAgent: () => createRefundAgent(), + * prompt: judgePrompt, * }), * judges: [ToolCallJudge()], * }, (it) => { @@ -281,22 +336,35 @@ expect.extend({ * }); * ``` */ -export function describeEval< - TInput = unknown, - TMetadata extends HarnessMetadata = HarnessMetadata, ->( +export function describeEval>( name: string, - options: DescribeEvalOptions, - define: (it: EvalTestAPI) => void, + options: DescribeEvalOptions< + HarnessInput, + HarnessMetadataFor, + THarness + >, + define: ( + it: EvalTestAPI< + HarnessInput, + HarnessMetadataFor, + THarness + >, + ) => void, ) { const suite = options.skipIf ? describe.skipIf(options.skipIf()) : describe; return suite(name, () => { const it = evalTest.override({ harness: options.harness, - automaticJudges: options.judges ?? [], + automaticJudges: (options.judges ?? []) as Array< + JudgeFn> + >, judgeThreshold: options.judgeThreshold, - }) as EvalTestAPI; + }) as unknown as EvalTestAPI< + HarnessInput, + HarnessMetadataFor, + THarness + >; define(it); }); @@ -308,32 +376,35 @@ function createMetadata( return { ...(metadata ?? {}) } as TMetadata; } -async function applyAutomaticJudges( +async function applyAutomaticJudges< + TInput, + TMetadata extends HarnessMetadata, + THarness extends Harness, +>( task: EvalTaskLike, - judges: Array>>, + judges: Array>>, threshold: number | null | undefined, + harness: THarness, input: TInput, metadata: TMetadata, run: HarnessRun, ) { + const output = formatJudgeTextOutput(run); + const runToolCalls = toolCalls(run.session); const scores = await Promise.all( judges.map((judge) => { const judgeOptions = { input: formatJudgeInput(input), inputValue: input, - output: formatJudgeTextOutput(run), - toolCalls: toolCalls(run.session), + output, + toolCalls: runToolCalls, metadata, run, session: run.session, - } as HarnessJudgeOptions; - const result = judge(judgeOptions); - - if (result instanceof Promise) { - return result; - } + harness, + } as JudgeContext; - return new Promise((resolve) => resolve(result)); + return Promise.resolve(judge(judgeOptions)); }), ); @@ -349,8 +420,8 @@ async function applyAutomaticJudges( task.meta.eval = { scores: scoresWithName, avgScore, - output: formatJudgeTextOutput(run), - toolCalls: toolCalls(run.session), + output, + toolCalls: runToolCalls, thresholdFailed, }; @@ -359,7 +430,7 @@ async function applyAutomaticJudges( avgScore >= thresholdValue, [ `Score: ${avgScore.toFixed(2)} below threshold: ${thresholdValue.toFixed(2)}`, - `Output: ${wrapText(formatJudgeTextOutput(run))}`, + `Output: ${wrapText(output)}`, formatScores(scoresWithName), ].join("\n\n"), ); @@ -378,6 +449,33 @@ function setHarnessMeta(task: EvalTaskLike, name: string, run: HarnessRun) { }; } +function recordJudgeRunContext( + run: HarnessRun, + harness: Harness, + inputValue: TInput, + metadata: TMetadata, +) { + const context = { + harness, + inputValue, + metadata, + run, + }; + + recordJudgeRunContextObject(run, context); + recordJudgeRunContextObject(run.session, context); + recordJudgeRunContextObject(run.output, context); +} + +function recordJudgeRunContextObject( + value: unknown, + context: RegisteredJudgeRunContext, +) { + if (isWeakMapKey(value)) { + judgeRunContextByObject.set(value, context); + } +} + function appendJudgeScore( task: EvalTaskLike, { @@ -460,41 +558,100 @@ function formatJudgeTextOutput(run: HarnessRun) { } function buildJudgeAssertionOptions< - TJudgeOptions extends BaseJudgeOptions = BaseJudgeOptions, + TJudgeOptions extends JudgeContext = JudgeContext, >( received: unknown, options: Omit, "threshold">, + task?: EvalTaskLike, ): TJudgeOptions { - const run = resolveJudgeRun(received, options); + const registeredContext = resolveRegisteredJudgeRunContext( + received, + options, + task, + ); + const harness = options.harness ?? registeredContext?.harness; const metadata = (options.metadata ?? + registeredContext?.metadata ?? {}) as JudgeAssertionMetadata; const inputValue = options.inputValue ?? + (registeredContext?.inputValue as + | JudgeAssertionInputValue + | undefined) ?? + undefined; + const contextualOptions = { + ...options, + ...(inputValue !== undefined ? { inputValue } : {}), + }; + const run = resolveJudgeRun( + received, + contextualOptions, + registeredContext?.run, + ); + const resolvedInputValue = + inputValue ?? (userMessages(run.session)[0]?.content as | JudgeAssertionInputValue | undefined) ?? undefined; const input = options.input ?? - (inputValue !== undefined ? formatJudgeInput(inputValue) : ""); + (resolvedInputValue !== undefined + ? formatJudgeInput(resolvedInputValue) + : ""); return { ...(options as Record), input, - inputValue, - output: formatJudgeTextOutput(run), + inputValue: resolvedInputValue, + output: formatJudgeAssertionOutput(received, run), metadata, run, session: options.session ?? run.session, toolCalls: options.toolCalls ?? toolCalls(run.session), + harness, } as unknown as TJudgeOptions; } +function resolveRegisteredJudgeRunContext< + TJudgeOptions extends JudgeContext = JudgeContext, +>( + received: unknown, + options: Omit, "threshold">, + task?: EvalTaskLike, +) { + if (options.run) { + return getRegisteredJudgeRunContext(options.run); + } + + const receivedContext = getRegisteredJudgeRunContext(received); + if (receivedContext) { + return receivedContext; + } + + if (task?.meta.harness?.run) { + return getRegisteredJudgeRunContext(task.meta.harness.run); + } + + return undefined; +} + +function getRegisteredJudgeRunContext(value: unknown) { + return isWeakMapKey(value) ? judgeRunContextByObject.get(value) : undefined; +} + +function isWeakMapKey(value: unknown): value is object { + return ( + value !== null && (typeof value === "object" || typeof value === "function") + ); +} + function resolveJudgeRun< - TJudgeOptions extends BaseJudgeOptions = BaseJudgeOptions, + TJudgeOptions extends JudgeContext = JudgeContext, >( received: unknown, options: Omit, "threshold">, + contextualRun?: HarnessRun, ): HarnessRun { if (options.run) { return options.session @@ -514,6 +671,15 @@ function resolveJudgeRun< : received; } + if (contextualRun) { + return options.session + ? { + ...contextualRun, + session: options.session, + } + : contextualRun; + } + const session = options.session ?? (isNormalizedSession(received) @@ -528,8 +694,32 @@ function resolveJudgeRun< }; } +function formatJudgeAssertionOutput(received: unknown, run: HarnessRun) { + if (isHarnessRun(received) || isNormalizedSession(received)) { + return formatJudgeTextOutput(run); + } + + return formatReceivedJudgeOutput(received); +} + +function formatReceivedJudgeOutput(received: unknown) { + if (typeof received === "string") { + return received; + } + + if (received !== undefined) { + try { + return JSON.stringify(received) ?? String(received); + } catch { + return String(received); + } + } + + return ""; +} + function createSyntheticJudgeSession< - TJudgeOptions extends BaseJudgeOptions = BaseJudgeOptions, + TJudgeOptions extends JudgeContext = JudgeContext, >( received: unknown, options: Omit, "threshold">, @@ -630,7 +820,7 @@ export function formatScores(scores: (JudgeResult & { name: string })[]) { } /** Applies a stable display name to a custom judge function. */ -export function namedJudge( +export function namedJudge>( name: string, judge: JudgeFn, ): JudgeFn { @@ -654,6 +844,8 @@ export { type Harness, type HarnessContext, type HarnessMetadata, + type HarnessPrompt, + type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type JsonPrimitive, @@ -674,8 +866,8 @@ export { type ToolCallJudgeOptions, } from "./judges"; export type { - BaseJudgeOptions, JudgeContext, JudgeFn, + JudgeOptions, JudgeResult, } from "./judges/types"; diff --git a/packages/vitest-evals/src/judges/index.ts b/packages/vitest-evals/src/judges/index.ts index 007b654..915ee7e 100644 --- a/packages/vitest-evals/src/judges/index.ts +++ b/packages/vitest-evals/src/judges/index.ts @@ -10,4 +10,9 @@ export { type ToolCallJudgeOptions, } from "./toolCallJudge"; -export type { BaseJudgeOptions, JudgeFn, JudgeResult } from "./types"; +export type { + JudgeContext, + JudgeFn, + JudgeOptions, + JudgeResult, +} from "./types"; diff --git a/packages/vitest-evals/src/judges/structuredOutputJudge.ts b/packages/vitest-evals/src/judges/structuredOutputJudge.ts index e15522b..9f008e8 100644 --- a/packages/vitest-evals/src/judges/structuredOutputJudge.ts +++ b/packages/vitest-evals/src/judges/structuredOutputJudge.ts @@ -1,5 +1,4 @@ -import type { HarnessRun } from "../harness"; -import type { JudgeFn } from "./types"; +import type { JudgeContext, JudgeFn } from "./types"; import { StructuredOutputScorer, type StructuredOutputScorerConfig, @@ -14,10 +13,9 @@ type StructuredOutputJudgeMetadata = HarnessMetadata & { }; export interface StructuredOutputJudgeOptions - extends Omit { - output: string; - run: HarnessRun; - metadata?: StructuredOutputJudgeMetadata; + extends JudgeContext, + Omit { + expected?: StructuredOutputJudgeExpected; } export interface StructuredOutputJudgeConfig @@ -27,12 +25,15 @@ export function StructuredOutputJudge( config: StructuredOutputJudgeConfig = {}, ): JudgeFn { const scorer = StructuredOutputScorer(config); - const judge = ((opts: StructuredOutputJudgeOptions) => - scorer({ + const judge = ((opts: StructuredOutputJudgeOptions) => { + const metadata = opts.metadata as StructuredOutputJudgeMetadata; + + return scorer({ ...opts, - expected: opts.expected ?? opts.metadata?.expected, + expected: opts.expected ?? metadata.expected, output: formatStructuredOutput(opts.run.output), - })) as JudgeFn; + }); + }) as JudgeFn; Object.defineProperty(judge, "name", { value: "StructuredOutputJudge", @@ -41,7 +42,9 @@ export function StructuredOutputJudge( return judge; } -function formatStructuredOutput(output: HarnessRun["output"]) { +function formatStructuredOutput( + output: StructuredOutputJudgeOptions["run"]["output"], +) { if (typeof output === "string") { return output; } diff --git a/packages/vitest-evals/src/judges/toolCallJudge.ts b/packages/vitest-evals/src/judges/toolCallJudge.ts index dd828ef..81af5ba 100644 --- a/packages/vitest-evals/src/judges/toolCallJudge.ts +++ b/packages/vitest-evals/src/judges/toolCallJudge.ts @@ -1,4 +1,4 @@ -import type { JudgeFn } from "./types"; +import type { JudgeContext, JudgeFn } from "./types"; import { ToolCallScorer, type ToolCallScorerConfig, @@ -20,22 +20,28 @@ type ToolCallJudgeMetadata = HarnessMetadata & { }; export interface ToolCallJudgeOptions - extends Omit { + extends JudgeContext, + Omit< + ToolCallScorerOptions, + "input" | "output" | "toolCalls" | "expectedTools" + > { expectedTools?: ExpectedTool[]; - metadata?: ToolCallJudgeMetadata; } export function ToolCallJudge( config: ToolCallJudgeConfig = {}, ): JudgeFn { const scorer = ToolCallScorer(config); - const judge = ((opts: ToolCallJudgeOptions) => - scorer({ + const judge = ((opts: ToolCallJudgeOptions) => { + const metadata = opts.metadata as ToolCallJudgeMetadata; + + return scorer({ ...opts, expectedTools: normalizeExpectedTools( - opts.expectedTools ?? opts.metadata?.expectedTools, + opts.expectedTools ?? metadata.expectedTools, ), - })) as JudgeFn; + }); + }) as JudgeFn; Object.defineProperty(judge, "name", { value: "ToolCallJudge", diff --git a/packages/vitest-evals/src/judges/types.ts b/packages/vitest-evals/src/judges/types.ts index 2d8c3aa..9b326e7 100644 --- a/packages/vitest-evals/src/judges/types.ts +++ b/packages/vitest-evals/src/judges/types.ts @@ -1,4 +1,9 @@ -import type { HarnessMetadata, HarnessRun, ToolCallRecord } from "../harness"; +import type { + Harness, + HarnessMetadata, + HarnessRun, + ToolCallRecord, +} from "../harness"; /** Score payload returned by a judge. */ export type JudgeResult = { @@ -10,36 +15,42 @@ export type JudgeResult = { }; /** - * Common string views passed to every judge. - * - * Use `JudgeContext` when you need structured access to the normalized run or - * the original input value. - */ -export interface BaseJudgeOptions { - /** Canonical text input passed to judges for plain prompt evaluation. */ - input: string; - /** Canonical text response passed to judges for plain output evaluation. */ - output: string; - toolCalls?: ToolCallRecord[]; -} - -/** - * Full normalized context passed to harness-backed judges. + * Full normalized context passed to every judge. * * Per-run judge parameters should generally live under `metadata`. */ export interface JudgeContext< TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, -> extends BaseJudgeOptions { + THarness extends Harness | undefined = + | Harness + | undefined, +> { + /** Canonical text input passed to judges for plain prompt evaluation. */ + input: string; + /** Canonical text response passed to judges for plain output evaluation. */ + output: string; /** Original non-string input value when the judge needs more than `input`. */ inputValue: TInput; + toolCalls: ToolCallRecord[]; metadata: Readonly; run: HarnessRun; session: HarnessRun["session"]; + /** Harness associated with this judge context. */ + harness: THarness; } -/** Judge function over either string views alone or a richer normalized context. */ -export type JudgeFn = ( - opts: TOptions, -) => Promise | JudgeResult; +/** Convenience helper for judges that accept explicit per-call params. */ +export type JudgeOptions< + TParams extends Record = Record, + TInput = unknown, + TMetadata extends HarnessMetadata = HarnessMetadata, + THarness extends Harness | undefined = + | Harness + | undefined, +> = JudgeContext & TParams; + +/** Judge function over the normalized judge context. */ +export type JudgeFn< + TOptions extends JudgeContext = JudgeContext, +> = (opts: TOptions) => Promise | JudgeResult; diff --git a/policies/README.md b/policies/README.md index 530ab5c..464f56d 100644 --- a/policies/README.md +++ b/policies/README.md @@ -7,6 +7,7 @@ without turning it into a full architecture document or feature spec. Good policy topics: +- API design and migration shape - code comments and docstrings - testing expectations - naming conventions @@ -18,4 +19,3 @@ Keep policy docs small: - explain the intent briefly - state the default rule clearly - call out only the meaningful exceptions - diff --git a/policies/api-design.md b/policies/api-design.md new file mode 100644 index 0000000..b719367 --- /dev/null +++ b/policies/api-design.md @@ -0,0 +1,29 @@ +# API Design + +## Intent + +Public APIs should make the common path hard to misuse while preserving escape +hatches for advanced cases. + +## Policy + +- Prefer one shared contextual API over parallel specialized APIs when callers + are doing the same kind of work. +- Keep context objects stable and capability methods mandatory when the surface + owns the configuration. Tighten the upstream config instead of exposing + optional methods that every caller has to branch around. +- Put capabilities on the object that owns their configuration. Avoid parallel + context objects with overlapping lifecycle names such as `harness` and + `runtime`. +- Infer context from fixtures, registered runs, or the current test when that + removes repetitive parameters and avoids caller mistakes. +- Keep explicit overrides for values that cannot be inferred reliably. +- Use clean breaks when the root API shape is wrong; do not preserve confusing + compatibility aliases on new harness-first surfaces. + +## Exceptions + +- Split an API only when the behavior, lifecycle, or ownership boundary is + genuinely different. +- Require explicit parameters when implicit context would be ambiguous or + likely to attach the wrong run.