diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 57fe405..1be766a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -49,9 +49,15 @@ jobs: - name: Run linter run: pnpm lint + - name: Run typecheck + run: pnpm typecheck + - name: Run tests run: pnpm test:ci + - name: Build + run: pnpm build + - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v4 env: diff --git a/CLAUDE.md b/CLAUDE.md index e610190..32858fd 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -40,8 +40,8 @@ packages/ legacy/ harness-ai-sdk/ harness-pi-ai/ - foobar/ apps/ + demo-ai-sdk/ demo-pi/ docs/ ``` @@ -66,10 +66,11 @@ Owns the AI SDK adapter into `HarnessRun`. Owns the `pi-ai` adapter, wrapped tool runtime, and tool replay behavior. -### `packages/foobar` and `apps/demo-pi` +### Demo apps -Own the example runtime seam and live demos. Keep them realistic and aligned -with the public story. +Own their app-local demo fixtures and live demos. Keep them realistic and +aligned with the public story. `packages/` is reserved for real package +surfaces. ## Core Rules diff --git a/README.md b/README.md index 3a1817c..b2b041b 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,8 @@ Monorepo for the explicit-run `vitest-evals` shape: types, reporter, and legacy compatibility exports - `packages/harness-ai-sdk`: `ai-sdk`-focused harness adapter - `packages/harness-pi-ai`: `pi-ai`-focused harness adapter with tool replay -- `packages/foobar`: example package with a small refund agent -- `apps/demo-pi`: end-to-end Pi Mono demo evals wired through the workspace - packages -- `apps/demo-ai-sdk`: end-to-end AI SDK demo evals wired through the workspace - packages +- `apps/demo-pi`: end-to-end Pi Mono demo evals with an app-local refund agent +- `apps/demo-ai-sdk`: end-to-end AI SDK demo evals with app-local refund tools ## Workspace Layout @@ -19,7 +16,6 @@ packages/ vitest-evals/ harness-ai-sdk/ harness-pi-ai/ - foobar/ apps/ demo-ai-sdk/ demo-pi/ @@ -29,15 +25,16 @@ apps/ ```sh pnpm install +pnpm lint pnpm typecheck pnpm test +pnpm build pnpm evals pnpm evals -- -v pnpm evals -- -vv pnpm evals -- -vvv pnpm evals -- -vvvv pnpm evals:verbose -pnpm build ``` Verbosity tiers for eval output: @@ -51,25 +48,41 @@ from the workspace `tsconfig` paths via `vite-tsconfig-paths`, and package boundaries are expressed in package manifests rather than hard-coded alias tables. +Pull request CI runs the same core safety checks: release config validation, +lint, typecheck, the CI test suite, and the workspace build. + ## Example The `apps/demo-pi` app shows the intended explicit-run flow: ```ts -import { createRefundAgent } from "@demo/foobar"; +import { expect } from "vitest"; import { piAiHarness } from "@vitest-evals/harness-pi-ai"; import { describeEval, - ToolCallJudge, namedJudge, toolCalls, + type JudgeContext, } from "vitest-evals"; +import { createRefundAgent } from "../src/refundAgent"; + +type RefundEvalMetadata = { + expectedStatus: "approved" | "denied"; + expectedTools: string[]; +}; const FactualityJudge = namedJudge( "FactualityJudge", - async ({ output }) => { - const answer = output; - const verdict = await judgeFactuality(answer); + async ({ + input, + output, + metadata, + }: JudgeContext) => { + const verdict = await judgeFactuality({ + question: input, + answer: output, + expectedStatus: metadata.expectedStatus, + }); return { score: verdict.score, @@ -86,7 +99,7 @@ describeEval( harness: piAiHarness({ createAgent: () => createRefundAgent(), }), - judges: [ToolCallJudge()], + judges: [FactualityJudge], }, (it) => { it.for([ @@ -104,7 +117,6 @@ describeEval( expect(result.output).toMatchObject({ status: metadata.expectedStatus, }); - await expect(result).toSatisfyJudge(FactualityJudge); expect(toolCalls(result.session).map((call) => call.name)).toEqual( metadata.expectedTools, ); diff --git a/apps/demo-ai-sdk/README.md b/apps/demo-ai-sdk/README.md index 20f0034..094d2c5 100644 --- a/apps/demo-ai-sdk/README.md +++ b/apps/demo-ai-sdk/README.md @@ -5,11 +5,10 @@ through the workspace packages: - `vitest-evals` - `@vitest-evals/harness-ai-sdk` -- `@demo/foobar` The passing live eval lives in `evals/refund.eval.ts`. -It demonstrates an automatic harness-backed tool judge plus explicit Vitest -assertions on `run.output` and the normalized session trace. +It demonstrates app-local refund tools and explicit Vitest assertions on +`run.output` and the normalized session trace. The intentionally failing examples live in `evals/refund.fail.eval.ts`. One fails an automatic harness-backed judge, and one fails explicit assertions diff --git a/apps/demo-ai-sdk/evals/refund.eval.ts b/apps/demo-ai-sdk/evals/refund.eval.ts index ebdf362..e8af022 100644 --- a/apps/demo-ai-sdk/evals/refund.eval.ts +++ b/apps/demo-ai-sdk/evals/refund.eval.ts @@ -1,6 +1,5 @@ -import { assertRefundCase } from "@demo/foobar/testing"; import { describeEval } from "vitest-evals"; -import { refundHarness } from "./shared"; +import { assertRefundCase, refundHarness } from "./shared"; describeEval( "demo ai-sdk refund agent", diff --git a/apps/demo-ai-sdk/evals/shared.ts b/apps/demo-ai-sdk/evals/shared.ts index af602e1..c5f936e 100644 --- a/apps/demo-ai-sdk/evals/shared.ts +++ b/apps/demo-ai-sdk/evals/shared.ts @@ -1,20 +1,96 @@ import { anthropic } from "@ai-sdk/anthropic"; -import { - CREATE_REFUND_DESCRIPTION, - LOOKUP_INVOICE_DESCRIPTION, - REFUND_SYSTEM_PROMPT, - createRefund, - lookupInvoice, - parseRefundDecision, - type RefundCase, -} from "@demo/foobar"; import { aiSdkHarness, type AiSdkToolset } from "@vitest-evals/harness-ai-sdk"; import { generateText, stepCountIs } from "ai"; +import { expect } from "vitest"; +import { type HarnessRun, toolCalls } from "vitest-evals"; import { z } from "zod"; +type InvoiceRecord = { + invoiceId: string; + amount: number; + refundable: boolean; + customer: string; +}; + +type RefundDecision = + | { + status: "approved"; + invoiceId: string; + refundId: string; + amount: number; + } + | { + status: "denied"; + invoiceId: string; + reason: string; + }; + +export type RefundCase = { + input: string; + expectedStatus: RefundDecision["status"]; + expectedTools: string[]; +}; + +const REFUND_SYSTEM_PROMPT = [ + "You are the demo refund operations agent.", + "You must decide whether a refund should be approved for the invoice in the user's request.", + "Always call lookupInvoice before making a decision.", + "If the invoice is refundable, call createRefund with the full invoice amount.", + "If the invoice is not refundable, do not call createRefund.", + "Return JSON only and do not wrap it in markdown.", + 'Approved shape: {"status":"approved","invoiceId":"...","refundId":"...","amount":4200}', + 'Denied shape: {"status":"denied","invoiceId":"...","reason":"..."}', +].join("\n"); + +const INVOICES: Record = { + inv_123: { + invoiceId: "inv_123", + amount: 4200, + refundable: true, + customer: "Acme Co", + }, + inv_404: { + invoiceId: "inv_404", + amount: 1700, + refundable: false, + customer: "Globex", + }, +}; + +async function lookupInvoice({ + invoiceId, +}: { + invoiceId: string; +}): Promise { + const invoice = INVOICES[invoiceId]; + if (!invoice) { + throw new Error(`Invoice ${invoiceId} not found`); + } + + return invoice; +} + +async function createRefund({ + invoiceId, + amount, +}: { + invoiceId: string; + amount: number; +}): Promise<{ + refundId: string; + amount: number; + status: "submitted"; +}> { + return { + refundId: `rf_${invoiceId}`, + amount, + status: "submitted", + }; +} + const refundTools = { lookupInvoice: { - description: LOOKUP_INVOICE_DESCRIPTION, + description: "Look up invoice details inside demo billing.", replay: true, inputSchema: z.object({ invoiceId: z @@ -24,7 +100,7 @@ const refundTools = { execute: lookupInvoice, }, createRefund: { - description: CREATE_REFUND_DESCRIPTION, + description: "Create a refund for a refundable invoice.", inputSchema: z.object({ invoiceId: z.string().describe("The invoice id that should be refunded."), amount: z.number().describe("The amount to refund in cents."), @@ -46,3 +122,124 @@ export const refundHarness = aiSdkHarness({ }), output: ({ result }) => parseRefundDecision(result.text), }); + +export async function assertRefundCase( + run: HarnessRun, + expected: Pick, +) { + expect(run.output).toMatchObject({ + status: expected.expectedStatus, + }); + expect(toolCalls(run.session).map((call) => call.name)).toEqual( + expected.expectedTools, + ); + expect(run.usage.provider).toContain("anthropic"); + expect(run.usage.model).toContain("claude"); + expect(run.usage.totalTokens).toBeGreaterThan(0); +} + +function parseRefundDecision(text: string): RefundDecision { + const cleaned = stripMarkdownFence(text); + const jsonText = extractJsonObjectText(cleaned); + const parsed = JSON.parse(jsonText) as Record; + + if ( + parsed.status === "approved" && + typeof parsed.invoiceId === "string" && + typeof parsed.refundId === "string" && + typeof parsed.amount === "number" + ) { + return { + status: "approved", + invoiceId: parsed.invoiceId, + refundId: parsed.refundId, + amount: parsed.amount, + }; + } + + if ( + parsed.status === "denied" && + typeof parsed.invoiceId === "string" && + typeof parsed.reason === "string" + ) { + return { + status: "denied", + invoiceId: parsed.invoiceId, + reason: parsed.reason, + }; + } + + throw new Error(`Refund agent returned an invalid decision payload: ${text}`); +} + +function stripMarkdownFence(text: string) { + const trimmed = text.trim(); + if (!trimmed.startsWith("```") || !trimmed.endsWith("```")) { + return trimmed; + } + + const firstNewline = trimmed.indexOf("\n"); + if (firstNewline === -1) { + return trimmed; + } + + const fenceHeader = trimmed.slice(3, firstNewline).trim().toLowerCase(); + if (fenceHeader !== "" && fenceHeader !== "json") { + return trimmed; + } + + return trimmed.slice(firstNewline + 1, -3).trim(); +} + +function extractJsonObjectText(text: string) { + const start = text.indexOf("{"); + if (start === -1) { + return text; + } + + let depth = 0; + let inString = false; + let isEscaped = false; + + for (let index = start; index < text.length; index += 1) { + const char = text[index]; + + if (inString) { + if (isEscaped) { + isEscaped = false; + continue; + } + + if (char === "\\") { + isEscaped = true; + continue; + } + + if (char === '"') { + inString = false; + } + continue; + } + + if (char === '"') { + inString = true; + continue; + } + + if (char === "{") { + depth += 1; + continue; + } + + if (char !== "}") { + continue; + } + + depth -= 1; + if (depth === 0) { + return text.slice(start, index + 1); + } + } + + return text; +} diff --git a/apps/demo-ai-sdk/package.json b/apps/demo-ai-sdk/package.json index f3e629b..f55abf1 100644 --- a/apps/demo-ai-sdk/package.json +++ b/apps/demo-ai-sdk/package.json @@ -9,7 +9,6 @@ }, "dependencies": { "@ai-sdk/anthropic": "^3.0.71", - "@demo/foobar": "workspace:*", "@vitest-evals/harness-ai-sdk": "workspace:*", "ai": "^6.0.141", "vitest-evals": "workspace:*", diff --git a/apps/demo-pi/README.md b/apps/demo-pi/README.md index 3086dcb..36a582b 100644 --- a/apps/demo-pi/README.md +++ b/apps/demo-pi/README.md @@ -5,11 +5,11 @@ through the workspace packages: - `vitest-evals` - `@vitest-evals/harness-pi-ai` -- `@demo/foobar` The passing live eval lives in `evals/refund.eval.ts`. -It demonstrates an automatic harness-backed tool judge plus explicit Vitest -assertions on `run.output` and the normalized session trace. +It demonstrates an app-local refund agent, an automatic harness-backed tool +judge, and explicit Vitest assertions on `run.output` and the normalized +session trace. The intentionally failing examples live in `evals/refund.fail.eval.ts`. One fails an automatic harness-backed judge, and one fails explicit assertions diff --git a/apps/demo-pi/evals/refund.eval.ts b/apps/demo-pi/evals/refund.eval.ts index 6d0a350..139b68b 100644 --- a/apps/demo-pi/evals/refund.eval.ts +++ b/apps/demo-pi/evals/refund.eval.ts @@ -1,5 +1,4 @@ import { expect } from "vitest"; -import { createRefundAgent, type RefundCase } from "@demo/foobar"; import { piAiHarness } from "@vitest-evals/harness-pi-ai"; import { describeEval, @@ -7,6 +6,7 @@ import { ToolCallJudge, toolCalls, } from "vitest-evals"; +import { createRefundAgent, type RefundCase } from "../src/refundAgent"; const outputJudge = StructuredOutputJudge(); diff --git a/apps/demo-pi/evals/refund.fail.eval.ts b/apps/demo-pi/evals/refund.fail.eval.ts index 35b9732..294dcbe 100644 --- a/apps/demo-pi/evals/refund.fail.eval.ts +++ b/apps/demo-pi/evals/refund.fail.eval.ts @@ -1,7 +1,7 @@ import { expect } from "vitest"; -import { createRefundAgent, type RefundCase } from "@demo/foobar"; import { piAiHarness } from "@vitest-evals/harness-pi-ai"; import { describeEval, StructuredOutputJudge } from "vitest-evals"; +import { createRefundAgent, type RefundCase } from "../src/refundAgent"; type AssertionRefundCase = RefundCase; type ScoredRefundCase = RefundCase & { diff --git a/apps/demo-pi/package.json b/apps/demo-pi/package.json index 1ac85dd..1ce4c30 100644 --- a/apps/demo-pi/package.json +++ b/apps/demo-pi/package.json @@ -8,7 +8,8 @@ "evals:fail": "node ./scripts/run-evals.mjs --fail" }, "dependencies": { - "@demo/foobar": "workspace:*", + "@mariozechner/pi-agent-core": "0.67.68", + "@mariozechner/pi-ai": "0.67.68", "@vitest-evals/harness-pi-ai": "workspace:*", "vitest-evals": "workspace:*" } diff --git a/packages/foobar/src/index.test.ts b/apps/demo-pi/src/refundAgent.test.ts similarity index 99% rename from packages/foobar/src/index.test.ts rename to apps/demo-pi/src/refundAgent.test.ts index beafba7..b0ec385 100644 --- a/packages/foobar/src/index.test.ts +++ b/apps/demo-pi/src/refundAgent.test.ts @@ -4,7 +4,7 @@ import { createRefundAgent, lookupInvoice, parseRefundDecision, -} from "./index"; +} from "./refundAgent"; describe("parseRefundDecision", () => { test("parses plain approved JSON", () => { diff --git a/packages/foobar/src/index.ts b/apps/demo-pi/src/refundAgent.ts similarity index 91% rename from packages/foobar/src/index.ts rename to apps/demo-pi/src/refundAgent.ts index a2c0864..cf4709f 100644 --- a/packages/foobar/src/index.ts +++ b/apps/demo-pi/src/refundAgent.ts @@ -47,13 +47,13 @@ export type CreateRefundInput = { }; export const LOOKUP_INVOICE_DESCRIPTION = - "Look up invoice details inside Foobar billing."; + "Look up invoice details inside demo billing."; export const CREATE_REFUND_DESCRIPTION = "Create a refund for a refundable invoice."; -type FoobarRefundModel = "claude-sonnet-4-5"; -const DEFAULT_REFUND_MODEL: FoobarRefundModel = "claude-sonnet-4-5"; +type RefundAgentModel = "claude-sonnet-4-5"; +const DEFAULT_REFUND_MODEL: RefundAgentModel = "claude-sonnet-4-5"; export const REFUND_SYSTEM_PROMPT = [ - "You are Foobar's refund operations agent.", + "You are the demo refund operations agent.", "You must decide whether a refund should be approved for the invoice in the user's request.", "Always call lookupInvoice before making a decision.", "If the invoice is refundable, call createRefund with the full invoice amount.", @@ -104,7 +104,7 @@ export async function createRefund({ }; } -const foobarTools = { +const refundAgentTools = { lookupInvoice: { description: LOOKUP_INVOICE_DESCRIPTION, replay: true, @@ -116,14 +116,14 @@ const foobarTools = { }, } satisfies PiAiToolset; -type FoobarRuntime = PiAiRuntime< - typeof foobarTools, +type RefundAgentRuntime = PiAiRuntime< + typeof refundAgentTools, string, RefundEvalMetadata >; -type FoobarRuntimeTools = FoobarRuntime["tools"]; +type RefundAgentRuntimeTools = RefundAgentRuntime["tools"]; -const fallbackRuntimeTools: FoobarRuntimeTools = { +const fallbackRuntimeTools: RefundAgentRuntimeTools = { lookupInvoice, createRefund, }; @@ -146,13 +146,11 @@ const createRefundParameters = Type.Object({ type LookupInvoiceArgs = Static; type CreateRefundArgs = Static; -export class FoobarRefundAgent { +export class RefundAgent { private readonly agent: Agent; - readonly toolset = foobarTools; + readonly toolset = refundAgentTools; - constructor( - private readonly model: FoobarRefundModel = DEFAULT_REFUND_MODEL, - ) { + constructor(private readonly model: RefundAgentModel = DEFAULT_REFUND_MODEL) { this.agent = new Agent({ initialState: { systemPrompt: REFUND_SYSTEM_PROMPT, @@ -164,7 +162,7 @@ export class FoobarRefundAgent { }); } - async run(input: string, runtime: FoobarRuntime) { + async run(input: string, runtime: RefundAgentRuntime) { await this.agent.reset(); this.agent.state.systemPrompt = REFUND_SYSTEM_PROMPT; this.agent.state.model = getModel("anthropic", this.model); @@ -213,12 +211,12 @@ export class FoobarRefundAgent { } /** Creates a fresh demo refund agent for one eval run. */ -export function createRefundAgent(options?: { model?: FoobarRefundModel }) { - return new FoobarRefundAgent(options?.model ?? DEFAULT_REFUND_MODEL); +export function createRefundAgent(options?: { model?: RefundAgentModel }) { + return new RefundAgent(options?.model ?? DEFAULT_REFUND_MODEL); } function createAgentTools( - runtimeTools: FoobarRuntimeTools = fallbackRuntimeTools, + runtimeTools: RefundAgentRuntimeTools = fallbackRuntimeTools, ): Array> { const lookupInvoiceTool: AgentTool< typeof lookupInvoiceParameters, diff --git a/docs/architecture.md b/docs/architecture.md index 5ef7bad..093d51c 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -28,8 +28,8 @@ packages/ legacy/ harness-ai-sdk/ harness-pi-ai/ - foobar/ apps/ + demo-ai-sdk/ demo-pi/ ``` @@ -105,7 +105,7 @@ For each eval test in a harness-backed suite: 2. The suite callback registers named eval tests. 3. The eval test calls `run(input)` at the point execution should happen. 4. The configured harness runs the system under test exactly once. -5. The harness returns a `HarnessRun` with `run.output`, `run.session`, +5. The harness returns a `HarnessRun` with `result.output`, `result.session`, `usage`, `timings`, `artifacts`, and `errors`. 6. Core stores that run on `task.meta.harness` for the reporter. 7. Automatic suite-level judges run against the normalized run/session pair. @@ -134,6 +134,12 @@ standard tool replay/VCR behavior for opt-in tools, including: Replay metadata becomes part of the normalized tool record so the reporter can surface it. +## Demo Apps + +`apps/demo-pi` and `apps/demo-ai-sdk` own their demo fixtures locally. They stay +under `apps/` because they are product demos, while `packages/` is reserved for +real package surfaces that can be published or consumed independently. + ## Extension Points ### New Harnesses diff --git a/docs/custom-scorers.md b/docs/custom-scorers.md index 95d000c..5ca6111 100644 --- a/docs/custom-scorers.md +++ b/docs/custom-scorers.md @@ -58,13 +58,10 @@ judge needs richer context, type it with `JudgeContext` and read `metadata`, ## Built-In Root Judges -The root package ships judge-shaped helpers for common cases: - -```ts -import { StructuredOutputJudge, ToolCallJudge } from "vitest-evals"; -``` - -These operate on normalized harness data instead of raw scorer inputs. +The root package still ships deterministic judge-shaped helpers such as +`StructuredOutputJudge()` and `ToolCallJudge()`. They operate on normalized +harness data instead of raw scorer inputs, but new docs should keep factuality +or rubric judges as the primary examples. ## Legacy Scorer Example diff --git a/docs/development-guide.md b/docs/development-guide.md index e0ccffc..4d27ba0 100644 --- a/docs/development-guide.md +++ b/docs/development-guide.md @@ -73,10 +73,11 @@ Owns: - wrapped tool runtime injection - tool replay/VCR behavior -### `packages/foobar` and `apps/demo-pi` +## Demo Apps -Own the example runtime seam and live demo eval coverage. Keep them realistic. -They are part of the product story, not just smoke tests. +`apps/demo-pi` and `apps/demo-ai-sdk` own live demo eval coverage and any +app-local refund fixtures they need. Keep them realistic; they are part of the +product story, not just smoke tests. `packages/` is for real package surfaces. ## Adding a New Judge @@ -144,7 +145,7 @@ For targeted work, prefer narrow verification: - reporter changes: run reporter tests - harness changes: run the relevant harness package tests -- demo/runtime changes: run `pnpm evals` or a filtered app/package eval command +- demo app changes: run `pnpm evals` or a filtered app eval command - legacy changes: run the moved tests under `packages/vitest-evals/src/legacy` ## Documentation Expectations diff --git a/docs/scorer-examples.md b/docs/scorer-examples.md index e225711..f86b6c8 100644 --- a/docs/scorer-examples.md +++ b/docs/scorer-examples.md @@ -51,38 +51,11 @@ export const LookupThenRefundJudge: JudgeFn = async ({ toolCalls }) => { await expect(result).toSatisfyJudge(FactualityJudge); ``` -## Built-In Judge Helpers +## Deterministic Helper Note -```ts -import { StructuredOutputJudge, ToolCallJudge } from "vitest-evals"; - -describeEval( - "refund agent", - { - harness: piAiHarness({ - createAgent: () => createRefundAgent(), - }), - judges: [ToolCallJudge()], - }, - (it) => { - it("approves a refund", async ({ run }) => { - const result = await run("Refund invoice inv_123", { - metadata: { - expected: { status: "approved" }, - expectedTools: [ - { name: "lookupInvoice" }, - { name: "createRefund" }, - ], - }, - }); - - await expect(result).toSatisfyJudge(StructuredOutputJudge(), { - expected: { status: "approved" }, - }); - }); - }, -); -``` +Built-ins such as `StructuredOutputJudge()` and `ToolCallJudge()` still exist +for deterministic contract checks. New docs should use factuality or rubric +judges as the primary examples. ## Legacy Scorer Example diff --git a/docs/testing.md b/docs/testing.md index 6591bac..6df0df2 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -25,6 +25,10 @@ packages/harness-ai-sdk/src/ index.test.ts packages/harness-pi-ai/src/ index.test.ts +apps/demo-pi/src/ + refundAgent.test.ts +apps/demo-ai-sdk/evals/ + *.eval.ts apps/demo-pi/evals/ *.eval.ts ``` diff --git a/package.json b/package.json index 0b8438f..dd7b740 100644 --- a/package.json +++ b/package.json @@ -13,9 +13,9 @@ "prepare": "simple-git-hooks", "release:check": "node ./scripts/check-release-config.mjs", "typecheck": "tsc --noEmit", - "test": "dotenv -e .env -e .env.local -- vitest run packages --config=./vitest.config.ts --reporter=./packages/vitest-evals/src/reporter.ts", - "test:watch": "dotenv -e .env -e .env.local -- vitest packages --config=./vitest.config.ts --reporter=./packages/vitest-evals/src/reporter.ts", - "test:ci": "dotenv -e .env -e .env.local -- vitest run packages --config=./vitest.config.ts --coverage --reporter=./packages/vitest-evals/src/reporter.ts --reporter=junit --outputFile=tests.junit.xml" + "test": "dotenv -e .env -e .env.local -- vitest run packages apps --config=./vitest.config.ts --reporter=./packages/vitest-evals/src/reporter.ts", + "test:watch": "dotenv -e .env -e .env.local -- vitest packages apps --config=./vitest.config.ts --reporter=./packages/vitest-evals/src/reporter.ts", + "test:ci": "dotenv -e .env -e .env.local -- vitest run packages apps --config=./vitest.config.ts --coverage --reporter=./packages/vitest-evals/src/reporter.ts --reporter=junit --outputFile=tests.junit.xml" }, "repository": { "type": "git", diff --git a/packages/foobar/package.json b/packages/foobar/package.json deleted file mode 100644 index a2f8ecd..0000000 --- a/packages/foobar/package.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "name": "@demo/foobar", - "private": true, - "version": "0.1.0", - "types": "./dist/index.d.ts", - "main": "./dist/index.js", - "module": "./dist/index.mjs", - "files": ["dist"], - "exports": { - ".": { - "source": "./src/index.ts", - "types": "./dist/index.d.ts", - "require": "./dist/index.js", - "import": "./dist/index.mjs" - }, - "./testing": { - "source": "./src/testing.ts", - "types": "./dist/testing.d.ts", - "require": "./dist/testing.js", - "import": "./dist/testing.mjs" - } - }, - "dependencies": { - "@mariozechner/pi-agent-core": "0.67.68", - "@mariozechner/pi-ai": "0.67.68" - }, - "devDependencies": { - "vitest": "^4.1.2", - "vitest-evals": "workspace:*" - }, - "scripts": { - "build": "tsup --config ./tsup.config.ts" - } -} diff --git a/packages/foobar/src/testing.ts b/packages/foobar/src/testing.ts deleted file mode 100644 index 6277f7c..0000000 --- a/packages/foobar/src/testing.ts +++ /dev/null @@ -1,18 +0,0 @@ -import { expect } from "vitest"; -import { type HarnessRun, toolCalls } from "vitest-evals"; -import type { RefundCase } from "./index"; - -export async function assertRefundCase( - run: HarnessRun, - expected: Pick, -) { - expect(run.output).toMatchObject({ - status: expected.expectedStatus, - }); - expect(toolCalls(run.session).map((call) => call.name)).toEqual( - expected.expectedTools, - ); - expect(run.usage.provider).toContain("anthropic"); - expect(run.usage.model).toContain("claude"); - expect(run.usage.totalTokens).toBeGreaterThan(0); -} diff --git a/packages/foobar/tsconfig.json b/packages/foobar/tsconfig.json deleted file mode 100644 index 9e25e6e..0000000 --- a/packages/foobar/tsconfig.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "extends": "../../tsconfig.base.json", - "include": ["src/**/*.ts"] -} diff --git a/packages/foobar/tsup.config.ts b/packages/foobar/tsup.config.ts deleted file mode 100644 index a57f24f..0000000 --- a/packages/foobar/tsup.config.ts +++ /dev/null @@ -1,11 +0,0 @@ -import { defineConfig } from "tsup"; - -export default defineConfig({ - entry: ["src/**/*.ts", "!src/**/*.test.ts", "!src/**/*.test.*.ts"], - format: ["cjs", "esm"], - dts: true, - splitting: false, - sourcemap: true, - clean: true, - external: ["vitest", "vitest-evals"], -}); diff --git a/packages/harness-ai-sdk/src/index.test.ts b/packages/harness-ai-sdk/src/index.test.ts index 50b5a93..78a0a55 100644 --- a/packages/harness-ai-sdk/src/index.test.ts +++ b/packages/harness-ai-sdk/src/index.test.ts @@ -517,7 +517,10 @@ test("omits empty runtime tool error content when a task errors", async () => { inputSchema: z.object({ invoiceId: z.string(), }), - execute: async () => { + execute: async ( + _input: { invoiceId: string }, + _execution: ToolExecutionOptions, + ) => { throw new Error(""); }, }, diff --git a/packages/harness-ai-sdk/src/index.ts b/packages/harness-ai-sdk/src/index.ts index f402243..ebcbc9f 100644 --- a/packages/harness-ai-sdk/src/index.ts +++ b/packages/harness-ai-sdk/src/index.ts @@ -861,7 +861,7 @@ function resolveSession( } const unmatchedRuntimeToolCalls = runtimeToolCalls.filter( - (call) => !stepToolCallIds.has(call.id), + (call) => call.id === undefined || !stepToolCallIds.has(call.id), ); if (unmatchedRuntimeToolCalls.length > 0) { diff --git a/packages/harness-pi-ai/README.md b/packages/harness-pi-ai/README.md index bb45d45..0a7abaf 100644 --- a/packages/harness-pi-ai/README.md +++ b/packages/harness-pi-ai/README.md @@ -53,7 +53,8 @@ normalization hooks still exist under `normalize`: ```ts const harness = piAiHarness({ - createAgent: () => createRefundAgent(), + createAgent: () => createWrappedRefundAgent(), + run: ({ agent, input, runtime }) => agent.run(input, runtime), normalize: { output: ({ result }) => result.customDecision, }, diff --git a/packages/harness-pi-ai/src/index.ts b/packages/harness-pi-ai/src/index.ts index 9cde050..66eb68f 100644 --- a/packages/harness-pi-ai/src/index.ts +++ b/packages/harness-pi-ai/src/index.ts @@ -61,6 +61,13 @@ type PiAgentToolLike< const ORIGINAL_NATIVE_EXECUTE = Symbol("vitest-evals.originalNativeExecute"); +type NativeToolExecute< + TInput, + TMetadata extends HarnessMetadata, +> = PiAgentToolLike["execute"] & { + [ORIGINAL_NATIVE_EXECUTE]?: PiAgentToolLike["execute"]; +}; + export type PiAiReplayMode = ReplayMode; export interface PiAiEventSink { @@ -426,12 +433,13 @@ async function executePiHarnessRun< return result; } + const normalizeResult = result as TResult; const resultArgs = { agent, input, context, runtime, - result, + result: normalizeResult, } satisfies PiAiHarnessResultArgs< TAgent, TInput, @@ -442,13 +450,13 @@ async function executePiHarnessRun< const output = options.normalize?.output ? await options.normalize.output(resultArgs) - : resolveOutput(result); + : resolveOutput(normalizeResult); const usage = options.normalize?.usage ? await options.normalize.usage(resultArgs) - : resolveUsage(result, runtime.toolCalls.length); + : resolveUsage(normalizeResult, runtime.toolCalls.length); const session = options.normalize?.session ? await options.normalize.session(resultArgs) - : resolveSession(result, messages, output, usage); + : resolveSession(normalizeResult, messages, output, usage); return { session, @@ -463,7 +471,7 @@ async function executePiHarnessRun< : undefined, errors: options.normalize?.errors ? await options.normalize.errors(resultArgs) - : resolveErrors(result), + : resolveErrors(normalizeResult), }; } catch (error) { const usage = resolveUsage(undefined, runtime.toolCalls.length); @@ -734,7 +742,10 @@ async function withInstrumentedAgentTools< const originalExecute = getNativeToolExecuteOrigin(tool.execute); originalExecutions.set(tool, originalExecute); - const instrumentedExecute = async (toolCallId, rawArgs) => { + const instrumentedExecute: NativeToolExecute = async ( + toolCallId: string, + rawArgs: Record, + ) => { const startedAt = new Date(); const toolContext = { input: args.input, @@ -780,7 +791,7 @@ async function withInstrumentedAgentTools< const call = { name: tool.name, arguments: rawArgs, - error: serializeError(error), + error: serializeToolCallError(error), startedAt: startedAt.toISOString(), finishedAt: finishedAt.toISOString(), durationMs: finishedAt.getTime() - startedAt.getTime(), @@ -878,10 +889,24 @@ function isPromiseLike(value: unknown): value is Promise { ); } +function serializeToolCallError( + error: unknown, +): NonNullable { + const serialized = serializeError(error); + const { message, type, ...details } = serialized; + + return { + ...details, + message: typeof message === "string" ? message : String(message), + ...(typeof type === "string" ? { type } : {}), + }; +} + function getNativeToolExecuteOrigin( execute: PiAgentToolLike["execute"], ) { - return execute[ORIGINAL_NATIVE_EXECUTE] ?? execute; + const nativeExecute = execute as NativeToolExecute; + return nativeExecute[ORIGINAL_NATIVE_EXECUTE] ?? nativeExecute; } async function executeNativeToolWithReplay< @@ -1033,7 +1058,7 @@ function createRuntime< const call = { name: toolName, arguments: args, - error: serializeError(error), + error: serializeToolCallError(error), startedAt: startedAt.toISOString(), finishedAt: finishedAt.toISOString(), durationMs: finishedAt.getTime() - startedAt.getTime(), diff --git a/packages/vitest-evals/README.md b/packages/vitest-evals/README.md index 6d3dd37..286570d 100644 --- a/packages/vitest-evals/README.md +++ b/packages/vitest-evals/README.md @@ -12,6 +12,8 @@ Install a first-party harness package for the runtime you want to test: ```sh npm install -D @vitest-evals/harness-pi-ai +# or +npm install -D @vitest-evals/harness-ai-sdk ``` ## Core Model @@ -20,9 +22,9 @@ npm install -D @vitest-evals/harness-pi-ai - the suite callback receives a fixture-backed Vitest `it` - `run(input, { metadata? })` executes the harness explicitly and returns a normalized `HarnessRun` -- `run.output` is the app-facing value you assert on directly -- `run.session` is the canonical JSON-serializable trace for reporting, replay, - tool assertions, and judges +- the returned `result.output` is the app-facing value you assert on directly +- the returned `result.session` is the canonical JSON-serializable trace for + reporting, replay, tool assertions, and judges - per-run judge inputs should usually live under `metadata` - suite-level `judges` are optional and run automatically after each `run(...)` - suite-level `judgeThreshold` controls fail-on-score for those automatic judges @@ -32,20 +34,33 @@ npm install -D @vitest-evals/harness-pi-ai ## Explicit Run Example ```ts -import { createRefundAgent } from "@demo/foobar"; +import { expect } from "vitest"; import { piAiHarness } from "@vitest-evals/harness-pi-ai"; import { describeEval, - ToolCallJudge, namedJudge, toolCalls, + type JudgeContext, } from "vitest-evals"; +import { createRefundAgent } from "../src/refundAgent"; + +type RefundEvalMetadata = { + expectedStatus: "approved" | "denied"; + expectedTools: string[]; +}; const FactualityJudge = namedJudge( "FactualityJudge", - async ({ output }) => { - const answer = output; - const verdict = await judgeFactuality(answer); + async ({ + input, + output, + metadata, + }: JudgeContext) => { + const verdict = await judgeFactuality({ + question: input, + answer: output, + expectedStatus: metadata.expectedStatus, + }); return { score: verdict.score, @@ -62,12 +77,13 @@ describeEval( harness: piAiHarness({ createAgent: () => createRefundAgent(), }), - judges: [ToolCallJudge()], + judges: [FactualityJudge], }, (it) => { it("approves a refundable invoice", async ({ run }) => { const result = await run("Refund invoice inv_123", { metadata: { + expectedStatus: "approved", expectedTools: ["lookupInvoice", "createRefund"], }, }); @@ -77,7 +93,6 @@ describeEval( "lookupInvoice", "createRefund", ]); - await expect(result).toSatisfyJudge(FactualityJudge); }); }, ); @@ -171,22 +186,17 @@ const FactualityJudge = namedJudge( ); ``` -A simple factuality judge can just score `output`, which is the normalized -response text that `toSatisfyJudge(...)` passes automatically. Structured or -programmatic result checks should usually read `run.output` instead. When a -judge needs richer context, type it with `JudgeContext` and read `inputValue`, -`metadata`, `toolCalls`, or `session` from there. - -When you only need deterministic contract checks, the built-ins are still -useful: - -```ts -import { StructuredOutputJudge, ToolCallJudge } from "vitest-evals"; - -await expect(result).toSatisfyJudge(StructuredOutputJudge(), { - expected: { status: "approved" }, -}); -``` +For a `HarnessRun`, `toSatisfyJudge(...)` passes `result.output` as `output`. +For raw values or normalized sessions, the matcher infers the best available +output from the received value. Structured or programmatic result checks should +usually assert on `result.output` directly. When a judge needs richer context, +type it with `JudgeContext` and read `inputValue`, `metadata`, `toolCalls`, or +`session` from there. + +When you only need deterministic contract checks, built-ins such as +`StructuredOutputJudge()` and `ToolCallJudge()` are still available. The primary +documentation examples intentionally use factuality/rubric judges because those +match the product's LLM-as-a-judge direction. ## Legacy Compatibility diff --git a/packages/vitest-evals/src/harness.test.ts b/packages/vitest-evals/src/harness.test.ts index 16d1b8b..957138f 100644 --- a/packages/vitest-evals/src/harness.test.ts +++ b/packages/vitest-evals/src/harness.test.ts @@ -12,6 +12,7 @@ import { type Harness, type HarnessContext, type HarnessRun, + type NormalizedSession, } from "./index"; type RefundEvalMetadata = { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 4ba1089..031cd0d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -62,9 +62,6 @@ importers: '@ai-sdk/anthropic': specifier: ^3.0.71 version: 3.0.71(zod@4.3.6) - '@demo/foobar': - specifier: workspace:* - version: link:../../packages/foobar '@vitest-evals/harness-ai-sdk': specifier: workspace:* version: link:../../packages/harness-ai-sdk @@ -79,18 +76,6 @@ importers: version: 4.3.6 apps/demo-pi: - dependencies: - '@demo/foobar': - specifier: workspace:* - version: link:../../packages/foobar - '@vitest-evals/harness-pi-ai': - specifier: workspace:* - version: link:../../packages/harness-pi-ai - vitest-evals: - specifier: workspace:* - version: link:../../packages/vitest-evals - - packages/foobar: dependencies: '@mariozechner/pi-agent-core': specifier: 0.67.68 @@ -98,13 +83,12 @@ importers: '@mariozechner/pi-ai': specifier: 0.67.68 version: 0.67.68(ws@8.20.0)(zod@4.3.6) - devDependencies: - vitest: - specifier: ^4.1.2 - version: 4.1.2(@opentelemetry/api@1.9.0)(@types/node@25.5.0)(vite@7.0.6(@types/node@25.5.0)(yaml@2.8.0)) + '@vitest-evals/harness-pi-ai': + specifier: workspace:* + version: link:../../packages/harness-pi-ai vitest-evals: specifier: workspace:* - version: link:../vitest-evals + version: link:../../packages/vitest-evals packages/harness-ai-sdk: devDependencies: diff --git a/tsconfig.base.json b/tsconfig.base.json index f0d7025..31e6d3f 100644 --- a/tsconfig.base.json +++ b/tsconfig.base.json @@ -11,9 +11,7 @@ "vitest-evals": ["packages/vitest-evals/src/index.ts"], "vitest-evals/*": ["packages/vitest-evals/src/*"], "@vitest-evals/harness-ai-sdk": ["packages/harness-ai-sdk/src/index.ts"], - "@vitest-evals/harness-pi-ai": ["packages/harness-pi-ai/src/index.ts"], - "@demo/foobar": ["packages/foobar/src/index.ts"], - "@demo/foobar/testing": ["packages/foobar/src/testing.ts"] + "@vitest-evals/harness-pi-ai": ["packages/harness-pi-ai/src/index.ts"] } } }