getsentry · dcramer · May 3, 2026 · May 3, 2026
diff --git a/README.md b/README.md
@@ -76,7 +76,7 @@ import {
   toolCalls,
   type JudgeContext,
 } from "vitest-evals";
-import { createRefundAgent } from "../src/refundAgent";
+import { createRefundAgent, judgePrompt } from "../src/refundAgent";
 
 type RefundEvalMetadata = {
   expectedStatus: "approved" | "denied";
@@ -110,6 +110,7 @@ describeEval(
   {
     harness: piAiHarness({
       createAgent: () => createRefundAgent(),
+      prompt: judgePrompt,
     }),
     judges: [FactualityJudge],
   },
@@ -143,13 +144,16 @@ Harness-backed suites stay close to plain Vitest:
 - tests call `run(...)` explicitly
 - ordinary `expect(...)` assertions stay first-class
 - judges layer in through `expect(...).toSatisfyJudge(...)`
+- every judge receives `JudgeContext`, including the configured harness with its
+  required `prompt` function
 - per-run judge parameters should usually live under `metadata`
 - reporter output, replay, usage, and tool traces come from the normalized run
 
 Built-in judges like `StructuredOutputJudge()` are still available for
 deterministic contract checks, but the more realistic explicit-judge path is a
 custom factuality or rubric judge over `output`, with `JudgeContext` available
-when the judge needs richer run/session data.
+when the judge needs richer run/session data or the suite's configured model
+via `harness.prompt(...)`.
 
 Tool replay is available for opt-in tools in the first-party harnesses.
 Configure it globally in Vitest and then mark individual tools with

diff --git a/apps/demo-ai-sdk/evals/shared.ts b/apps/demo-ai-sdk/evals/shared.ts
@@ -111,6 +111,13 @@ const refundTools = {
 
 export const refundHarness = aiSdkHarness({
   tools: refundTools,
+  prompt: (input, options) =>
+    generateText({
+      model: anthropic("claude-sonnet-4-5"),
+      system: options?.system,
+      prompt: input,
+      temperature: 0,
+    }).then((result) => result.text),
   task: async ({ input, runtime }) =>
     generateText({
       model: anthropic("claude-sonnet-4-5"),

diff --git a/apps/demo-pi/evals/refund.eval.ts b/apps/demo-pi/evals/refund.eval.ts
@@ -6,7 +6,11 @@ import {
   ToolCallJudge,
   toolCalls,
 } from "vitest-evals";
-import { createRefundAgent, type RefundCase } from "../src/refundAgent";
+import {
+  createRefundAgent,
+  promptRefundModel,
+  type RefundCase,
+} from "../src/refundAgent";
 
 const outputJudge = StructuredOutputJudge();
 
@@ -16,6 +20,7 @@ describeEval(
     skipIf: () => !process.env.ANTHROPIC_API_KEY,
     harness: piAiHarness({
       createAgent: () => createRefundAgent(),
+      prompt: promptRefundModel,
     }),
     judges: [ToolCallJudge()],
   },

diff --git a/apps/demo-pi/evals/refund.fail.eval.ts b/apps/demo-pi/evals/refund.fail.eval.ts
@@ -1,7 +1,11 @@
 import { expect } from "vitest";
 import { piAiHarness } from "@vitest-evals/harness-pi-ai";
 import { describeEval, StructuredOutputJudge } from "vitest-evals";
-import { createRefundAgent, type RefundCase } from "../src/refundAgent";
+import {
+  createRefundAgent,
+  promptRefundModel,
+  type RefundCase,
+} from "../src/refundAgent";
 
 type AssertionRefundCase = RefundCase;
 type ScoredRefundCase = RefundCase & {
@@ -10,6 +14,7 @@ type ScoredRefundCase = RefundCase & {
 
 const harness = piAiHarness({
   createAgent: () => createRefundAgent(),
+  prompt: promptRefundModel,
 });
 
 describeEval(

diff --git a/apps/demo-pi/src/refundAgent.ts b/apps/demo-pi/src/refundAgent.ts
@@ -6,6 +6,7 @@ import {
   type Static,
 } from "@mariozechner/pi-ai";
 import type { PiAiRuntime, PiAiToolset } from "@vitest-evals/harness-pi-ai";
+import type { HarnessPromptOptions } from "vitest-evals";
 
 export type InvoiceRecord = {
   invoiceId: string;
@@ -215,6 +216,31 @@ export function createRefundAgent(options?: { model?: RefundAgentModel }) {
   return new RefundAgent(options?.model ?? DEFAULT_REFUND_MODEL);
 }
 
+export async function promptRefundModel( // one-shot prompt on a fresh Agent; resolves with the final assistant message's text
+  input: string, // forwarded unchanged to agent.prompt(...)
+  options?: HarnessPromptOptions, // only `options.system` is read in this function
+) {
+  const agent = new Agent({
+    initialState: {
+      systemPrompt: options?.system ?? "", // falls back to an empty system prompt when none is supplied
+      model: getModel("anthropic", DEFAULT_REFUND_MODEL),
+      thinkingLevel: "off",
+      tools: [], // no tools are registered for this agent
+    },
+    toolExecution: "sequential",
+  });
+
+  await agent.prompt(input);
+
+  const assistant = getFinalAssistantMessage(agent.state.messages);
+  const outputText = assistant ? getAssistantText(assistant) : ""; // a missing assistant message is treated like empty text
+  if (!outputText) { // rejects instead of resolving with an empty string
+    throw new Error("Prompt model returned an empty response.");
+  }
+
+  return outputText;
+}
+
 function createAgentTools(
   runtimeTools: RefundAgentRuntimeTools = fallbackRuntimeTools,
 ): Array<AgentTool<any, any>> {

diff --git a/docs/architecture.md b/docs/architecture.md
@@ -74,6 +74,10 @@ These are judge-shaped adapters over the legacy comparison logic so new suites
 can stay on the harness-first surface while older matching behavior remains
 available.
 
+All judges receive `JudgeContext`, which carries normalized run/session data
+plus the configured `harness` and its required `prompt(...)` method. That keeps
+rubric and factuality judges on the same API as deterministic judges.
+
 ### `packages/vitest-evals/src/legacy/*`
 
 Contains the compatibility layer for scorer-first suites:
@@ -112,6 +116,15 @@ For each eval test in a harness-backed suite:
 8. The eval test asserts on the same returned result and session.
 9. The reporter renders the recorded metadata without re-executing the harness.
 
+Explicit `expect(result).toSatisfyJudge(...)` calls use the run's canonical
+text output and reuse registered input, metadata, harness, and harness prompt
+when `result` came from the fixture-backed `run(...)`. Inside an eval test,
+calls on registered raw output or session objects reuse that exact run context;
+raw output values are serialized as the judge `output`, and other raw values
+fall back to the current test's most recent `run(...)` context. Calls outside
+that context, or on manually-created runs, must pass the context required by
+the judge in matcher options.
+
 ## First-Party Harness Packages
 
 ### `@vitest-evals/harness-ai-sdk`
@@ -149,6 +162,8 @@ New runtime integrations should be implemented as thin adapter packages that:
 - execute the target runtime through its normal seam
 - capture messages, tool calls, usage, timings, and errors
 - normalize them into `HarnessRun`
+- expose `prompt` so the same provider/model configuration can be reused by
+  LLM-backed judges
 - avoid inventing harness-specific assertion or reporter behavior in userland
 
 ### New Judges
@@ -157,12 +172,11 @@ Root-level custom evaluation logic should generally be written as judges over
 normalized run/session data:
 
 ```ts
-import type { JudgeFn } from "vitest-evals";
+import type { JudgeFn, JudgeOptions } from "vitest-evals";
 
-export const RefundToolJudge: JudgeFn<{ expectedTools: string[] }> = async ({
-  expectedTools,
-  toolCalls,
-}) => ({
+export const RefundToolJudge: JudgeFn<
+  JudgeOptions<{ expectedTools: string[] }>
+> = async ({ expectedTools, toolCalls }) => ({
   score: expectedTools.every(
     (name, index) => toolCalls[index]?.name === name,
   )

diff --git a/docs/custom-scorers.md b/docs/custom-scorers.md
@@ -35,6 +35,7 @@ describeEval(
   {
     harness: piAiHarness({
       createAgent: () => createRefundAgent(),
+      prompt: judgePrompt,
     }),
     judges: [FactualityJudge],
   },
@@ -52,9 +53,22 @@ Or run it explicitly inside a test:
 await expect(result).toSatisfyJudge(FactualityJudge);
 ```
 
-For simple response-level checks, a judge can just score `output`. When a
-judge needs richer context, type it with `JudgeContext` and read `metadata`,
-`toolCalls`, or `session` from there.
+For simple response-level checks, a judge can just score `output`. When a judge
+needs normalized run context, type it with `JudgeContext` and read `metadata`,
+`toolCalls`, `session`, or `harness` from there. `harness.prompt(...)` gives
+LLM-backed rubric judges a shared provider/model seam without duplicating
+app-level model setup. Calling `harness.run(...)`
+inside a judge executes the app again, so reserve that for judges that
+intentionally need a second run.
+
+Explicit matcher calls on the branded result returned by fixture `run(...)`
+use the run's canonical text output and reuse registered input, metadata,
+harness, and harness prompt. Inside an eval test, matcher calls on registered
+raw output or session objects reuse that exact run context; raw output values
+are serialized as the judge `output`, and other raw values fall back to the
+current test's most recent `run(...)` context. Matcher calls outside that
+context, or on manually-created runs, should pass the context required by the
+judge in `toSatisfyJudge(...)` options.
 
 ## Built-In Root Judges
 

diff --git a/docs/development-guide.md b/docs/development-guide.md
@@ -84,12 +84,11 @@ product story, not just smoke tests. `packages/` is for real package surfaces.
 Root-level evaluation logic should usually be implemented as a `JudgeFn`:
 
 ```ts
-import type { JudgeFn } from "vitest-evals";
+import type { JudgeFn, JudgeOptions } from "vitest-evals";
 
-export const DomainJudge: JudgeFn<{ expectedTool: string }> = async ({
-  toolCalls,
-  expectedTool,
-}) => ({
+export const DomainJudge: JudgeFn<
+  JudgeOptions<{ expectedTool: string }>
+> = async ({ toolCalls, expectedTool }) => ({
   score: toolCalls.some((call) => call.name === expectedTool) ? 1 : 0,
   metadata: {
     rationale: `Expected tool ${expectedTool}`,

diff --git a/docs/harness-first-rfc.md b/docs/harness-first-rfc.md
@@ -135,6 +135,7 @@ describeEval(
   {
     harness: piAiHarness({
       createAgent: () => createRefundAgent(),
+      prompt: judgePrompt,
       run: ({ agent, input, runtime }) => agent.run(input, runtime),
     }),
   },
@@ -169,6 +170,7 @@ The default path should be close to zero glue for standard apps:
 describeEval("refund agent", {
   harness: piAiHarness({
     createAgent: () => createRefundAgent(),
+    prompt: judgePrompt,
   }),
 }, (it) => {
   it("approves a refundable invoice", async ({ run }) => {
@@ -187,6 +189,7 @@ entrypoint or custom result shape:
 describeEval("refund agent", {
   harness: piAiHarness({
     createAgent: () => createRefundAgent(),
+    prompt: judgePrompt,
     run: ({ agent, input, runtime }) => agent.execute(input, runtime),
     normalize: {
       output: ({ result }) => result.decision,

diff --git a/docs/testing.md b/docs/testing.md
@@ -80,6 +80,7 @@ describeEval(
   {
     harness: piAiHarness({
       createAgent: () => createRefundAgent(),
+      prompt: judgePrompt,
     }),
     judges: [ToolCallJudge()],
   },

diff --git a/packages/harness-ai-sdk/README.md b/packages/harness-ai-sdk/README.md
@@ -25,6 +25,12 @@ const tools = {
 
 const harness = aiSdkHarness({
   tools,
+  prompt: (input, options) =>
+    generateText({
+      model: openai("gpt-4o-mini"),
+      system: options?.system,
+      prompt: input,
+    }).then((result) => result.text),
   task: ({ input, runtime }) =>
     generateText({
       model: openai("gpt-4o-mini"),
@@ -40,10 +46,15 @@ If your existing AI SDK app exposes its own entrypoint, wire that in directly:
 ```ts
 const harness = aiSdkHarness({
   tools,
+  prompt: sharedJudgePrompt,
   task: ({ input, runtime }) => createRefundAgent().run(input, runtime),
 });
 ```
 
+The required `prompt` callback is passed to harness-backed judges as
+`JudgeContext.harness.prompt`, which lets rubric or factuality judges share the
+same provider/model configuration as the suite harness.
+
 The adapter infers:
 
 - normalized session and tool-call traces from AI SDK `steps`