diff --git a/.craft.yml b/.craft.yml index 85a4021..54d78cc 100644 --- a/.craft.yml +++ b/.craft.yml @@ -9,6 +9,10 @@ targets: id: "@vitest-evals/harness-ai-sdk" access: public includeNames: /^vitest-evals-harness-ai-sdk-\d.*\.tgz$/ + - name: npm + id: "@vitest-evals/harness-openai-agents" + access: public + includeNames: /^vitest-evals-harness-openai-agents-\d.*\.tgz$/ - name: npm id: "@vitest-evals/harness-pi-ai" access: public diff --git a/.github/workflows/merge-jobs.yml b/.github/workflows/merge-jobs.yml index 939df3a..d5932ea 100644 --- a/.github/workflows/merge-jobs.yml +++ b/.github/workflows/merge-jobs.yml @@ -66,6 +66,7 @@ jobs: mkdir -p artifacts pnpm --filter vitest-evals pack --pack-destination artifacts pnpm --filter @vitest-evals/harness-ai-sdk pack --pack-destination artifacts + pnpm --filter @vitest-evals/harness-openai-agents pack --pack-destination artifacts pnpm --filter @vitest-evals/harness-pi-ai pack --pack-destination artifacts ls -la artifacts diff --git a/.gitignore b/.gitignore index ef70c63..2d3b640 100644 --- a/.gitignore +++ b/.gitignore @@ -88,6 +88,9 @@ dist # Build files /dist +# vitest-evals replay recordings +.vitest-evals/ + # Gatsby files .cache/ # Comment in the public line in if your project uses Gatsby and not Next.js diff --git a/README.md b/README.md index 6085fa2..cd1eb27 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,12 @@ Monorepo for the explicit-run `vitest-evals` shape: - `packages/vitest-evals`: core suite API, judges, normalized harness/session types, reporter, and legacy compatibility exports - `packages/harness-ai-sdk`: `ai-sdk`-focused harness adapter +- `packages/harness-openai-agents`: `@openai/agents`-focused harness adapter - `packages/harness-pi-ai`: `pi-ai`-focused harness adapter with tool replay - `apps/demo-pi`: end-to-end Pi Mono demo evals with an app-local refund agent - `apps/demo-ai-sdk`: end-to-end AI SDK demo evals with app-local refund tools +- `apps/demo-openai-agents`: end-to-end OpenAI 
Agents demo evals with + app-local refund tools ## Workspace Layout @@ -15,9 +18,11 @@ Monorepo for the explicit-run `vitest-evals` shape: packages/ vitest-evals/ harness-ai-sdk/ + harness-openai-agents/ harness-pi-ai/ apps/ demo-ai-sdk/ + demo-openai-agents/ demo-pi/ ``` @@ -158,8 +163,8 @@ when the judge needs richer run/session data or the suite's configured model prompt seam. Tool replay is available for opt-in tools in the first-party harnesses. -Configure it globally in Vitest and then mark individual tools with -`replay: true`: +Configure the replay mode and directory globally in Vitest, then opt individual +tools in from the harness with `toolReplay: { toolName: true }`. ```ts import tsconfigPaths from "vite-tsconfig-paths"; @@ -187,5 +192,7 @@ errors on missing recordings. Recordings are stored under `.vitest-evals/recordings//`. `pnpm evals` fans out to each workspace package or app that exposes an `evals` -script. The demo apps expect provider keys in `.env` or `.env.local`. The +script. The shared eval CLI defaults replay to `auto` and writes recordings +under `.vitest-evals/recordings`, unless those environment variables are +already set. Demo apps expect provider keys in `.env` or `.env.local`. The intentional failing examples remain under the `evals:fail` scripts. 
diff --git a/apps/demo-ai-sdk/evals/shared.ts b/apps/demo-ai-sdk/evals/shared.ts index 9dde534..353bc28 100644 --- a/apps/demo-ai-sdk/evals/shared.ts +++ b/apps/demo-ai-sdk/evals/shared.ts @@ -91,7 +91,6 @@ async function createRefund({ const refundTools = { lookupInvoice: { description: "Look up invoice details inside demo billing.", - replay: true, inputSchema: z.object({ invoiceId: z .string() @@ -111,6 +110,9 @@ const refundTools = { export const refundHarness = aiSdkHarness({ tools: refundTools, + toolReplay: { + lookupInvoice: true, + }, prompt: (input, options) => generateText({ model: anthropic("claude-sonnet-4-5"), diff --git a/apps/demo-openai-agents/README.md b/apps/demo-openai-agents/README.md new file mode 100644 index 0000000..16b30b5 --- /dev/null +++ b/apps/demo-openai-agents/README.md @@ -0,0 +1,34 @@ +# Demo OpenAI Agents App + +This app demonstrates an `@openai/agents` harness wired into `vitest-evals` +through the workspace packages: + +- `vitest-evals` +- `@vitest-evals/harness-openai-agents` + +The passing live eval lives in `evals/refund.eval.ts`. +It demonstrates a real OpenAI Agents `Agent`, `Runner`, local function tools, +tool replay configured from the harness, and explicit Vitest assertions on +`run.output` and the normalized session trace. + +The intentionally failing examples live in `evals/refund.fail.eval.ts`. +One fails an automatic harness-backed judge, and one fails explicit assertions +after the harness completes. + +Run them with: + +```sh +pnpm --filter @demo/demo-openai-agents run evals +pnpm --filter @demo/demo-openai-agents run evals -- -v +pnpm --filter @demo/demo-openai-agents run evals -- -vv +pnpm --filter @demo/demo-openai-agents run evals -- -vvv +pnpm --filter @demo/demo-openai-agents run evals -- -vvvv +pnpm --filter @demo/demo-openai-agents run evals:verbose +pnpm --filter @demo/demo-openai-agents run evals:fail +``` + +`pnpm --filter @demo/demo-openai-agents run evals` runs only the passing eval. 
+Use `pnpm --filter @demo/demo-openai-agents run evals:fail` to run just the +intentional failures. + +Both scripts expect `OPENAI_API_KEY` to be present in `.env` or `.env.local`. diff --git a/apps/demo-openai-agents/evals/refund.eval.ts b/apps/demo-openai-agents/evals/refund.eval.ts new file mode 100644 index 0000000..de16be7 --- /dev/null +++ b/apps/demo-openai-agents/evals/refund.eval.ts @@ -0,0 +1,47 @@ +import { + describeEval, + StructuredOutputJudge, + ToolCallJudge, +} from "vitest-evals"; +import { expect } from "vitest"; +import { assertRefundCase, refundHarness } from "./shared"; +import type { RefundCase } from "../src/refundAgent"; + +const outputJudge = StructuredOutputJudge(); + +describeEval( + "demo openai agents refund agent", + { + skipIf: () => !process.env.OPENAI_API_KEY, + harness: refundHarness, + judges: [ToolCallJudge()], + }, + (it) => { + it.for([ + { + name: "approves refundable invoice", + input: "Refund invoice inv_123", + expectedStatus: "approved", + expectedTools: ["lookupInvoice", "createRefund"], + }, + { + name: "denies non-refundable invoice", + input: "Refund invoice inv_404", + expectedStatus: "denied", + expectedTools: ["lookupInvoice"], + }, + ])("$name", async ({ input, ...metadata }, { run }) => { + const result = await run(input, { + metadata, + }); + + await assertRefundCase(result, metadata); + await expect(result).toSatisfyJudge(outputJudge, { + metadata, + expected: { + status: metadata.expectedStatus, + }, + }); + }); + }, +); diff --git a/apps/demo-openai-agents/evals/refund.fail.eval.ts b/apps/demo-openai-agents/evals/refund.fail.eval.ts new file mode 100644 index 0000000..b7031fe --- /dev/null +++ b/apps/demo-openai-agents/evals/refund.fail.eval.ts @@ -0,0 +1,63 @@ +import { expect } from "vitest"; +import { describeEval, StructuredOutputJudge } from "vitest-evals"; +import { refundHarness } from "./shared"; +import type { RefundCase } from "../src/refundAgent"; + +type AssertionRefundCase = RefundCase; +type 
ScoredRefundCase = RefundCase & { + expected: Record; +}; + +describeEval( + "demo openai agents refund scorer failing example", + { + skipIf: () => !process.env.OPENAI_API_KEY, + harness: refundHarness, + judges: [StructuredOutputJudge()], + }, + (it) => { + it.for([ + { + name: "judge expects approval for a denied invoice", + input: "Refund invoice inv_404", + expectedStatus: "denied", + expectedTools: ["lookupInvoice"], + expected: { + status: "approved", + }, + }, + ])("$name", async ({ input, ...metadata }, { run }) => { + await run(input, { + metadata, + }); + }); + }, +); + +describeEval( + "demo openai agents refund assertion failing example", + { + skipIf: () => !process.env.OPENAI_API_KEY, + harness: refundHarness, + }, + (it) => { + it.for([ + { + name: "asserts the wrong refund id after approval", + input: "Refund invoice inv_123", + expectedStatus: "approved", + expectedTools: ["lookupInvoice", "createRefund"], + }, + ])("$name", async ({ input, ...metadata }, { run }) => { + const result = await run(input, { + metadata, + }); + + expect(result.output).toMatchObject({ + status: "approved", + invoiceId: "inv_123", + refundId: "rf_wrong", + }); + }); + }, +); diff --git a/apps/demo-openai-agents/evals/shared.ts b/apps/demo-openai-agents/evals/shared.ts new file mode 100644 index 0000000..3286125 --- /dev/null +++ b/apps/demo-openai-agents/evals/shared.ts @@ -0,0 +1,40 @@ +import { openaiAgentsHarness } from "@vitest-evals/harness-openai-agents"; +import { expect } from "vitest"; +import { type HarnessRun, toolCalls } from "vitest-evals"; +import { + createRefundAgent, + createRefundRunner, + parseRefundDecision, + promptRefundModel, + resolveResultText, + type RefundCase, +} from "../src/refundAgent"; + +export const refundHarness = openaiAgentsHarness({ + createAgent: () => createRefundAgent(), + createRunner: () => createRefundRunner(), + prompt: promptRefundModel, + runOptions: { + maxTurns: 5, + }, + toolReplay: { + lookupInvoice: true, + }, + 
normalize: { + output: ({ result }) => parseRefundDecision(resolveResultText(result)), + }, +}); + +export async function assertRefundCase( + run: HarnessRun, + expected: Pick, +) { + expect(run.output).toMatchObject({ + status: expected.expectedStatus, + }); + expect(toolCalls(run.session).map((call) => call.name)).toEqual( + expected.expectedTools, + ); + expect(run.usage.model).toContain("gpt"); + expect(run.usage.totalTokens).toBeGreaterThan(0); +} diff --git a/apps/demo-openai-agents/package.json b/apps/demo-openai-agents/package.json new file mode 100644 index 0000000..5cf014b --- /dev/null +++ b/apps/demo-openai-agents/package.json @@ -0,0 +1,16 @@ +{ + "name": "@demo/demo-openai-agents", + "private": true, + "version": "0.1.0", + "scripts": { + "evals": "node ./scripts/run-evals.mjs", + "evals:verbose": "node ./scripts/run-evals.mjs -v", + "evals:fail": "node ./scripts/run-evals.mjs --fail" + }, + "dependencies": { + "@openai/agents": "^0.8.5", + "@vitest-evals/harness-openai-agents": "workspace:*", + "vitest-evals": "workspace:*", + "zod": "^4.3.6" + } +} diff --git a/apps/demo-openai-agents/scripts/run-evals.mjs b/apps/demo-openai-agents/scripts/run-evals.mjs new file mode 100644 index 0000000..9ab807c --- /dev/null +++ b/apps/demo-openai-agents/scripts/run-evals.mjs @@ -0,0 +1,50 @@ +import { spawnSync } from "node:child_process"; +import { dirname, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; +import { createEvalEnv, parseEvalCliArgs } from "../../../scripts/eval-cli.mjs"; + +const WORKSPACE_ROOT = resolve( + dirname(fileURLToPath(import.meta.url)), + "../../..", +); + +const { failMode, forwardedArgs, toolDetailLevel } = parseEvalCliArgs( + process.argv.slice(2), +); +const env = createEvalEnv(process.env, toolDetailLevel); + +const explicitTargetIndex = forwardedArgs.findIndex( + (arg) => !arg.startsWith("-"), +); +const target = + explicitTargetIndex >= 0 + ? forwardedArgs.splice(explicitTargetIndex, 1)[0] + : failMode + ? 
"apps/demo-openai-agents/evals/refund.fail.eval.ts" + : "apps/demo-openai-agents/evals/refund.eval.ts"; + +const command = [ + "exec", + "dotenv", + "-e", + ".env", + "-e", + ".env.local", + "--", + "vitest", + "run", + target, + "--config", + "vitest.config.ts", + "--reporter", + "packages/vitest-evals/src/reporter.ts", + ...forwardedArgs, +]; + +const result = spawnSync("pnpm", command, { + cwd: WORKSPACE_ROOT, + env, + stdio: "inherit", +}); + +process.exit(result.status ?? 1); diff --git a/apps/demo-openai-agents/src/refundAgent.test.ts b/apps/demo-openai-agents/src/refundAgent.test.ts new file mode 100644 index 0000000..b7842f5 --- /dev/null +++ b/apps/demo-openai-agents/src/refundAgent.test.ts @@ -0,0 +1,109 @@ +import { describe, expect, test } from "vitest"; +import { + createRefund, + createRefundAgent, + createRefundRunner, + lookupInvoice, + parseRefundDecision, + resolveResultText, +} from "./refundAgent"; + +describe("parseRefundDecision", () => { + test("parses plain approved JSON", () => { + expect( + parseRefundDecision( + '{"status":"approved","invoiceId":"inv_123","refundId":"rf_inv_123","amount":4200}', + ), + ).toEqual({ + status: "approved", + invoiceId: "inv_123", + refundId: "rf_inv_123", + amount: 4200, + }); + }); + + test("parses fenced denied JSON", () => { + expect( + parseRefundDecision( + [ + "```json", + '{"status":"denied","invoiceId":"inv_404","reason":"not refundable"}', + "```", + ].join("\n"), + ), + ).toEqual({ + status: "denied", + invoiceId: "inv_404", + reason: "not refundable", + }); + }); + + test("parses JSON embedded in surrounding text", () => { + expect( + parseRefundDecision( + [ + "Here is the decision:", + '{"status":"denied","invoiceId":"inv_404","reason":"not refundable"}', + ].join("\n"), + ), + ).toEqual({ + status: "denied", + invoiceId: "inv_404", + reason: "not refundable", + }); + }); + + test("parses embedded JSON with braces inside string values", () => { + expect( + parseRefundDecision( + [ + "Decision 
payload:", + '{"status":"denied","invoiceId":"inv_404","reason":"saw literal {brace} text"}', + "Thanks.", + ].join("\n"), + ), + ).toEqual({ + status: "denied", + invoiceId: "inv_404", + reason: "saw literal {brace} text", + }); + }); +}); + +test("demo billing tools are deterministic", async () => { + await expect(lookupInvoice({ invoiceId: "inv_123" })).resolves.toEqual({ + invoiceId: "inv_123", + amount: 4200, + refundable: true, + customer: "Acme Co", + }); + await expect( + createRefund({ invoiceId: "inv_123", amount: 4200 }), + ).resolves.toEqual({ + refundId: "rf_inv_123", + amount: 4200, + status: "submitted", + }); +}); + +test("createRefundAgent wires OpenAI Agents tools", () => { + const agent = createRefundAgent(); + + expect(agent.name).toBe("demo_refund_agent"); + expect(agent.tools.map((tool) => tool.name)).toEqual([ + "lookupInvoice", + "createRefund", + ]); +}); + +test("createRefundRunner disables tracing for demo eval runs", () => { + expect(createRefundRunner().config.tracingDisabled).toBe(true); +}); + +test("resolveResultText reads OpenAI Agents final output", () => { + expect( + resolveResultText({ + finalOutput: '{"status":"denied","invoiceId":"inv_404","reason":"no"}', + }), + ).toBe('{"status":"denied","invoiceId":"inv_404","reason":"no"}'); +}); diff --git a/apps/demo-openai-agents/src/refundAgent.ts b/apps/demo-openai-agents/src/refundAgent.ts new file mode 100644 index 0000000..fa36976 --- /dev/null +++ b/apps/demo-openai-agents/src/refundAgent.ts @@ -0,0 +1,301 @@ +import { Agent, Runner, tool } from "@openai/agents"; +import type { HarnessPromptOptions } from "vitest-evals"; +import { z } from "zod"; + +export type InvoiceRecord = { + invoiceId: string; + amount: number; + refundable: boolean; + customer: string; +}; + +export type RefundDecision = + | { + status: "approved"; + invoiceId: string; + refundId: string; + amount: number; + } + | { + status: "denied"; + invoiceId: string; + reason: string; + }; + +export type 
RefundEvalMetadata = { + name?: string; + expectedStatus: RefundDecision["status"]; + expectedTools: string[]; +}; + +export type RefundCase = RefundEvalMetadata & { + input: string; +}; + +export type LookupInvoiceInput = { + invoiceId: string; +}; + +export type CreateRefundInput = { + invoiceId: string; + amount: number; +}; + +export const LOOKUP_INVOICE_DESCRIPTION = + "Look up invoice details inside demo billing."; +export const CREATE_REFUND_DESCRIPTION = + "Create a refund for a refundable invoice."; +export const DEFAULT_REFUND_MODEL = "gpt-4.1-mini"; +export const REFUND_SYSTEM_PROMPT = [ + "You are the demo refund operations agent.", + "You must decide whether a refund should be approved for the invoice in the user's request.", + "Always call lookupInvoice before making a decision.", + "If the invoice is refundable, call createRefund with the full invoice amount.", + "If the invoice is not refundable, do not call createRefund.", + "Return JSON only and do not wrap it in markdown.", + 'Approved shape: {"status":"approved","invoiceId":"...","refundId":"...","amount":4200}', + 'Denied shape: {"status":"denied","invoiceId":"...","reason":"..."}', +].join("\n"); + +const INVOICES: Record<string, InvoiceRecord> = { + inv_123: { + invoiceId: "inv_123", + amount: 4200, + refundable: true, + customer: "Acme Co", + }, + inv_404: { + invoiceId: "inv_404", + amount: 1700, + refundable: false, + customer: "Globex", + }, +}; + +/** Looks up a demo invoice record for the OpenAI Agents local function tool. */ +export async function lookupInvoice({ + invoiceId, +}: LookupInvoiceInput): Promise<InvoiceRecord> { + const invoice = INVOICES[invoiceId]; + if (!invoice) { + throw new Error(`Invoice ${invoiceId} not found`); + } + + return invoice; +} + +/** Creates a deterministic demo refund record. 
*/ +export async function createRefund({ + invoiceId, + amount, +}: CreateRefundInput): Promise<{ + refundId: string; + amount: number; + status: "submitted"; +}> { + return { + refundId: `rf_${invoiceId}`, + amount, + status: "submitted", + }; +} + +function createRefundTools() { + const lookupInvoiceTool = tool({ + name: "lookupInvoice", + description: LOOKUP_INVOICE_DESCRIPTION, + parameters: z.object({ + invoiceId: z + .string() + .describe("The invoice id to inspect, such as inv_123."), + }), + execute: lookupInvoice, + }); + + const createRefundTool = tool({ + name: "createRefund", + description: CREATE_REFUND_DESCRIPTION, + parameters: z.object({ + invoiceId: z.string().describe("The invoice id that should be refunded."), + amount: z.number().describe("The amount to refund in cents."), + }), + execute: createRefund, + }); + + return [lookupInvoiceTool, createRefundTool]; +} + +/** Creates a fresh OpenAI Agents refund agent for one eval run. */ +export function createRefundAgent(options?: { model?: string }) { + return new Agent({ + name: "demo_refund_agent", + instructions: REFUND_SYSTEM_PROMPT, + model: options?.model ?? DEFAULT_REFUND_MODEL, + modelSettings: { + temperature: 0, + }, + tools: createRefundTools(), + }); +} + +/** Creates the OpenAI Agents runner used by the demo harness. */ +export function createRefundRunner() { + return new Runner({ + tracingDisabled: true, + modelSettings: { + temperature: 0, + }, + }); +} + +/** Uses the same OpenAI Agents stack as a provider-agnostic judge prompt seam. */ +export async function promptRefundModel( + input: string, + options?: HarnessPromptOptions, +) { + const runner = createRefundRunner(); + const agent = new Agent({ + name: "demo_refund_prompt", + instructions: options?.system ?? 
"Return a concise answer.", + model: DEFAULT_REFUND_MODEL, + modelSettings: { + temperature: 0, + }, + }); + const result = await runner.run(agent, input, { + maxTurns: 2, + }); + const outputText = resolveResultText(result); + + if (!outputText) { + throw new Error("Prompt model returned an empty response."); + } + + return outputText; +} + +/** Parses the demo agent's final JSON payload into a typed refund decision. */ +export function parseRefundDecision(text: string): RefundDecision { + const cleaned = stripMarkdownFence(text); + const jsonText = extractJsonObjectText(cleaned); + const parsed = JSON.parse(jsonText) as Record<string, unknown>; + + if ( + parsed.status === "approved" && + typeof parsed.invoiceId === "string" && + typeof parsed.refundId === "string" && + typeof parsed.amount === "number" + ) { + return { + status: "approved", + invoiceId: parsed.invoiceId, + refundId: parsed.refundId, + amount: parsed.amount, + }; + } + + if ( + parsed.status === "denied" && + typeof parsed.invoiceId === "string" && + typeof parsed.reason === "string" + ) { + return { + status: "denied", + invoiceId: parsed.invoiceId, + reason: parsed.reason, + }; + } + + throw new Error(`Refund agent returned an invalid decision payload: ${text}`); +} + +/** Extracts text from an OpenAI Agents run result for app output mapping. */ +export function resolveResultText(result: unknown) { + if (!result || typeof result !== "object") { + return typeof result === "string" ? result : ""; + } + + const finalOutput = (result as { finalOutput?: unknown }).finalOutput; + if (typeof finalOutput === "string") { + return finalOutput.trim(); + } + + const output = (result as { output?: unknown }).output; + if (typeof output === "string") { + return output.trim(); + } + + return finalOutput === undefined ? 
"" : JSON.stringify(finalOutput); +} + +function stripMarkdownFence(text: string) { + const trimmed = text.trim(); + if (!trimmed.startsWith("```") || !trimmed.endsWith("```")) { + return trimmed; + } + + const firstNewline = trimmed.indexOf("\n"); + if (firstNewline === -1) { + return trimmed; + } + + const fenceHeader = trimmed.slice(3, firstNewline).trim().toLowerCase(); + if (fenceHeader !== "" && fenceHeader !== "json") { + return trimmed; + } + + return trimmed.slice(firstNewline + 1, -3).trim(); +} + +function extractJsonObjectText(text: string) { + const start = text.indexOf("{"); + if (start === -1) { + return text; + } + + let depth = 0; + let inString = false; + let isEscaped = false; + + for (let index = start; index < text.length; index += 1) { + const char = text[index]; + + if (inString) { + if (isEscaped) { + isEscaped = false; + continue; + } + + if (char === "\\") { + isEscaped = true; + continue; + } + + if (char === '"') { + inString = false; + } + continue; + } + + if (char === '"') { + inString = true; + continue; + } + + if (char === "{") { + depth += 1; + continue; + } + + if (char !== "}") { + continue; + } + + depth -= 1; + if (depth === 0) { + return text.slice(start, index + 1); + } + } + + return text; +} diff --git a/apps/demo-openai-agents/tsconfig.json b/apps/demo-openai-agents/tsconfig.json new file mode 100644 index 0000000..5f62fc7 --- /dev/null +++ b/apps/demo-openai-agents/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../../tsconfig.base.json", + "include": ["**/*.ts"] +} diff --git a/apps/demo-pi/evals/refund.eval.ts b/apps/demo-pi/evals/refund.eval.ts index 0f97ba6..319e2cb 100644 --- a/apps/demo-pi/evals/refund.eval.ts +++ b/apps/demo-pi/evals/refund.eval.ts @@ -20,6 +20,9 @@ describeEval( skipIf: () => !process.env.ANTHROPIC_API_KEY, harness: piAiHarness({ createAgent: () => createRefundAgent(), + toolReplay: { + lookupInvoice: true, + }, prompt: promptRefundModel, }), judges: [ToolCallJudge()], diff --git 
a/apps/demo-pi/evals/refund.fail.eval.ts b/apps/demo-pi/evals/refund.fail.eval.ts index a326949..01f986b 100644 --- a/apps/demo-pi/evals/refund.fail.eval.ts +++ b/apps/demo-pi/evals/refund.fail.eval.ts @@ -14,6 +14,9 @@ type ScoredRefundCase = RefundCase & { const harness = piAiHarness({ createAgent: () => createRefundAgent(), + toolReplay: { + lookupInvoice: true, + }, prompt: promptRefundModel, }); diff --git a/apps/demo-pi/src/refundAgent.ts b/apps/demo-pi/src/refundAgent.ts index 1814c5a..29be95d 100644 --- a/apps/demo-pi/src/refundAgent.ts +++ b/apps/demo-pi/src/refundAgent.ts @@ -108,7 +108,6 @@ export async function createRefund({ const refundAgentTools = { lookupInvoice: { description: LOOKUP_INVOICE_DESCRIPTION, - replay: true, execute: lookupInvoice, }, createRefund: { diff --git a/docs/architecture.md b/docs/architecture.md index e91fad8..ba2aa07 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -27,9 +27,11 @@ packages/ judges/ legacy/ harness-ai-sdk/ + harness-openai-agents/ harness-pi-ai/ apps/ demo-ai-sdk/ + demo-openai-agents/ demo-pi/ ``` @@ -127,6 +129,10 @@ the judge in matcher options. ## First-Party Harness Packages +Replay/VCR policy is configured at the harness boundary with `toolReplay` and +global Vitest environment settings. Tool definitions should describe tool +behavior only. + ### `@vitest-evals/harness-ai-sdk` Adapts `ai-sdk`-style results into the normalized run/session shape. It can @@ -134,6 +140,14 @@ derive output, usage, messages, tool calls, and errors from common AI SDK result objects, while still allowing custom `run`, `session`, `output`, and `usage` overrides. +### `@vitest-evals/harness-openai-agents` + +Adapts `@openai/agents` `Runner.run(agent, input, options)` workflows into the +normalized run/session shape. 
It accepts an existing agent or `createAgent()` +factory, supports custom app entrypoints, normalizes `RunResult` output, +messages, usage, tool calls, tool results, errors, trace metadata, and records +replay metadata for opt-in local function tools. + ### `@vitest-evals/harness-pi-ai` Adapts `pi-ai` style agents into the same normalized shape. It also owns the @@ -149,9 +163,10 @@ surface it. ## Demo Apps -`apps/demo-pi` and `apps/demo-ai-sdk` own their demo fixtures locally. They stay -under `apps/` because they are product demos, while `packages/` is reserved for -real package surfaces that can be published or consumed independently. +`apps/demo-pi`, `apps/demo-ai-sdk`, and `apps/demo-openai-agents` own their demo +fixtures locally. They stay under `apps/` because they are product demos, while +`packages/` is reserved for real package surfaces that can be published or +consumed independently. ## Extension Points diff --git a/docs/development-guide.md b/docs/development-guide.md index c94fc65..c5e4770 100644 --- a/docs/development-guide.md +++ b/docs/development-guide.md @@ -65,6 +65,14 @@ Owns: - adapting AI SDK results into `HarnessRun` - AI SDK specific usage/session normalization +### `packages/harness-openai-agents` + +Owns: + +- adapting OpenAI Agents SDK `Runner.run(...)` results into `HarnessRun` +- OpenAI Agents specific `RunResult` and function-tool normalization +- replay metadata for opt-in local function tools + ### `packages/harness-pi-ai` Owns: @@ -75,9 +83,10 @@ Owns: ## Demo Apps -`apps/demo-pi` and `apps/demo-ai-sdk` own live demo eval coverage and any -app-local refund fixtures they need. Keep them realistic; they are part of the -product story, not just smoke tests. `packages/` is for real package surfaces. +`apps/demo-pi`, `apps/demo-ai-sdk`, and `apps/demo-openai-agents` own live demo +eval coverage and any app-local refund fixtures they need. Keep them realistic; +they are part of the product story, not just smoke tests. 
`packages/` is for +real package surfaces. ## Adding a New Judge diff --git a/docs/testing.md b/docs/testing.md index 708494a..e47bc5b 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -23,12 +23,18 @@ packages/vitest-evals/src/ scorers/*.test.ts packages/harness-ai-sdk/src/ index.test.ts +packages/harness-openai-agents/src/ + index.test.ts packages/harness-pi-ai/src/ index.test.ts apps/demo-pi/src/ refundAgent.test.ts +apps/demo-openai-agents/src/ + refundAgent.test.ts apps/demo-ai-sdk/evals/ *.eval.ts +apps/demo-openai-agents/evals/ + *.eval.ts apps/demo-pi/evals/ *.eval.ts ``` diff --git a/package.json b/package.json index dd7b740..a58c884 100644 --- a/package.json +++ b/package.json @@ -13,9 +13,9 @@ "prepare": "simple-git-hooks", "release:check": "node ./scripts/check-release-config.mjs", "typecheck": "tsc --noEmit", - "test": "dotenv -e .env -e .env.local -- vitest run packages apps --config=./vitest.config.ts --reporter=./packages/vitest-evals/src/reporter.ts", - "test:watch": "dotenv -e .env -e .env.local -- vitest packages apps --config=./vitest.config.ts --reporter=./packages/vitest-evals/src/reporter.ts", - "test:ci": "dotenv -e .env -e .env.local -- vitest run packages apps --config=./vitest.config.ts --coverage --reporter=./packages/vitest-evals/src/reporter.ts --reporter=junit --outputFile=tests.junit.xml" + "test": "dotenv -e .env -e .env.local -- vitest run packages apps scripts --config=./vitest.config.ts --reporter=./packages/vitest-evals/src/reporter.ts", + "test:watch": "dotenv -e .env -e .env.local -- vitest packages apps scripts --config=./vitest.config.ts --reporter=./packages/vitest-evals/src/reporter.ts", + "test:ci": "dotenv -e .env -e .env.local -- vitest run packages apps scripts --config=./vitest.config.ts --coverage --reporter=./packages/vitest-evals/src/reporter.ts --reporter=junit --outputFile=tests.junit.xml" }, "repository": { "type": "git", diff --git a/packages/harness-ai-sdk/README.md b/packages/harness-ai-sdk/README.md 
index e2461a4..0c79b08 100644 --- a/packages/harness-ai-sdk/README.md +++ b/packages/harness-ai-sdk/README.md @@ -11,13 +11,14 @@ npm install -D ai vitest-evals @vitest-evals/harness-ai-sdk ## Usage ```ts +import { expect } from "vitest"; import { generateText, stepCountIs } from "ai"; import { openai } from "@ai-sdk/openai"; import { aiSdkHarness } from "@vitest-evals/harness-ai-sdk"; +import { describeEval, toolCalls } from "vitest-evals"; const tools = { lookupInvoice: { - replay: true, inputSchema: lookupInvoiceSchema, execute: lookupInvoice, }, @@ -25,6 +26,9 @@ const tools = { const harness = aiSdkHarness({ tools, + toolReplay: { + lookupInvoice: true, + }, prompt: (input, options) => generateText({ model: openai("gpt-4o-mini"), @@ -39,6 +43,19 @@ const harness = aiSdkHarness({ stopWhen: stepCountIs(5), }), }); + +describeEval("refund agent", { harness }, (it) => { + it("approves a refundable invoice", async ({ run }) => { + const result = await run("Refund invoice inv_123"); + + expect(result.output).toMatchObject({ + status: "approved", + }); + expect(toolCalls(result.session).map((call) => call.name)).toContain( + "lookupInvoice", + ); + }); +}); ``` If your existing AI SDK app exposes its own entrypoint, wire that in directly: @@ -61,7 +78,7 @@ The adapter infers: - usage diagnostics from `totalUsage` / `usage` - `run.output` from common AI SDK result fields such as `output`, `object`, and `text` -- replay/cassette metadata for opt-in tools when they set `replay: true` +- replay/cassette metadata for local tools configured with `toolReplay` See the workspace demo app in `apps/demo-ai-sdk` and the RFC notes in `docs/harness-first-rfc.md`. 
diff --git a/packages/harness-ai-sdk/src/index.test.ts b/packages/harness-ai-sdk/src/index.test.ts index 5fcdeea..aaba2d4 100644 --- a/packages/harness-ai-sdk/src/index.test.ts +++ b/packages/harness-ai-sdk/src/index.test.ts @@ -403,7 +403,6 @@ test("default agent run receives wrapped runtime tools", async () => { }), tools: { lookupInvoice: { - replay: true, inputSchema: z.object({ invoiceId: z.string(), }), @@ -1387,9 +1386,11 @@ test("records and replays opt-in tools in auto mode", async () => { const replayHarness = aiSdkHarness({ prompt: judgePrompt, + toolReplay: { + lookupInvoice: true, + }, tools: { lookupInvoice: { - replay: true, inputSchema: z.object({ invoiceId: z.string(), }), @@ -1521,6 +1522,53 @@ test("records and replays opt-in tools in auto mode", async () => { }); }); +test("does not opt into replay from tool definitions", async () => { + replayDir = mkdtempSync(join(process.cwd(), ".tmp-ai-sdk-replay-")); + vi.stubEnv("VITEST_EVALS_REPLAY_MODE", "auto"); + vi.stubEnv("VITEST_EVALS_REPLAY_DIR", replayDir); + + const execute = vi.fn(async ({ invoiceId }: { invoiceId: string }) => ({ + invoiceId, + refundable: true, + })); + + const harness = aiSdkHarness({ + prompt: judgePrompt, + tools: { + lookupInvoice: { + replay: true, + inputSchema: z.object({ + invoiceId: z.string(), + }), + execute, + }, + } as unknown as AiSdkToolset, + task: async ({ runtime }) => { + await runtime.tools.lookupInvoice.execute?.( + { + invoiceId: "inv_123", + }, + { + toolCallId: "call_lookup", + messages: [], + } satisfies ToolExecutionOptions, + ); + + return { + text: '{"status":"approved"}', + }; + }, + }); + + const run = await harness.run( + "Refund invoice inv_123", + createHarnessContext({}), + ); + + expect(execute).toHaveBeenCalledTimes(1); + expect(toolCalls(run.session)[0].metadata?.replay).toBeUndefined(); +}); + test("rejects async iterable replay outputs after awaiting execute", async () => { replayDir = mkdtempSync(join(process.cwd(), 
".tmp-ai-sdk-replay-")); vi.stubEnv("VITEST_EVALS_REPLAY_MODE", "auto"); @@ -1532,9 +1580,11 @@ test("rejects async iterable replay outputs after awaiting execute", async () => const replayHarness = aiSdkHarness({ prompt: judgePrompt, + toolReplay: { + streamRefund: true, + }, tools: { streamRefund: { - replay: true, inputSchema: z.object({ invoiceId: z.string(), }), @@ -1578,9 +1628,11 @@ test("errors when strict mode is missing a recording", async () => { const replayHarness = aiSdkHarness({ prompt: judgePrompt, + toolReplay: { + lookupInvoice: true, + }, tools: { lookupInvoice: { - replay: true, inputSchema: z.object({ invoiceId: z.string(), }), diff --git a/packages/harness-ai-sdk/src/index.ts b/packages/harness-ai-sdk/src/index.ts index 1d812cb..16f8a6d 100644 --- a/packages/harness-ai-sdk/src/index.ts +++ b/packages/harness-ai-sdk/src/index.ts @@ -109,11 +109,19 @@ export type AiSdkToolReplayConfig< export type AiSdkToolDefinition< TArgs extends JsonValue = JsonValue, TResult extends JsonValue = JsonValue, + _TInput = string, + _TMetadata extends HarnessMetadata = HarnessMetadata, +> = Tool; + +export type AiSdkToolReplayPolicy< TInput = string, TMetadata extends HarnessMetadata = HarnessMetadata, -> = Tool & { - replay?: boolean | AiSdkToolReplayConfig; -}; +> = boolean | AiSdkToolReplayConfig; + +export type AiSdkToolReplayPolicies< + TInput = string, + TMetadata extends HarnessMetadata = HarnessMetadata, +> = Record>; export type AiSdkToolset< TInput = string, @@ -199,6 +207,7 @@ interface AiSdkHarnessBaseOptions< >, > { tools?: TTools; + toolReplay?: AiSdkToolReplayPolicies; session?: ( args: AiSdkHarnessResultArgs, ) => MaybePromise; @@ -295,6 +304,7 @@ async function runAiSdkHarness< input, context, tools: options.tools, + toolReplay: options.toolReplay, replayMetadataByToolCallId, runtimeToolCalls, }); @@ -534,18 +544,22 @@ function createToolset< input, context, tools, + toolReplay, replayMetadataByToolCallId, runtimeToolCalls, }: { input: TInput; 
context: HarnessContext; tools: TTools | undefined; + toolReplay: AiSdkToolReplayPolicies | undefined; replayMetadataByToolCallId: Map; runtimeToolCalls: ToolCallRecord[]; }) { return Object.fromEntries( Object.entries(tools ?? {}).map(([toolName, tool]) => { - if (tool.replay && !tool.execute) { + const replay = toolReplay?.[toolName]; + + if (replay && !tool.execute) { throw new Error( `Tool replay requires execute() for ${toolName}. Provider-executed tools cannot be recorded automatically.`, ); @@ -573,14 +587,14 @@ function createToolset< } satisfies AiSdkToolContext; try { - const executionResult = tool.replay + const executionResult = replay ? await executeToolWithReplay({ toolName, toolInput, execute, execution, context: replayContext, - replay: tool.replay, + replay, }) : { result: await execute(toolInput, execution), @@ -659,14 +673,15 @@ async function executeToolWithReplay< execute: NonNullable; execution: ToolExecutionOptions; context: AiSdkToolContext; - replay: NonNullable; + replay: AiSdkToolReplayPolicy; }) { - const replayInput = toReplayJsonValue( - toolInput, - `${toolName} tool input`, - ) as InferToolInput & JsonValue; + const replayInput = toReplayJsonValue(toolInput, `${toolName} tool input`); - return executeWithReplay({ + return executeWithReplay< + JsonValue, + JsonValue, + AiSdkToolContext + >({ toolName, args: replayInput, context, @@ -682,10 +697,7 @@ async function executeToolWithReplay< ); } - return toReplayJsonValue( - output, - `${toolName} tool output`, - ) as InferToolOutput & JsonValue; + return toReplayJsonValue(output, `${toolName} tool output`); }, replay, }); diff --git a/packages/harness-openai-agents/README.md b/packages/harness-openai-agents/README.md new file mode 100644 index 0000000..051dfc3 --- /dev/null +++ b/packages/harness-openai-agents/README.md @@ -0,0 +1,126 @@ +# @vitest-evals/harness-openai-agents + +`@openai/agents`-focused harness adapter for `vitest-evals`. 
+ +## Install + +```sh +npm install -D @openai/agents vitest-evals @vitest-evals/harness-openai-agents +``` + +## Usage + +```ts +import { expect } from "vitest"; +import { Runner } from "@openai/agents"; +import { openaiAgentsHarness } from "@vitest-evals/harness-openai-agents"; +import { describeEval, toolCalls } from "vitest-evals"; + +const harness = openaiAgentsHarness({ + createAgent: () => createClassifierAgent(), + createRunner: () => + new Runner({ + modelProvider, + tracingDisabled: true, + }), + prompt: sharedJudgePrompt, +}); + +describeEval("classifier agent", { harness }, (it) => { + it("classifies a bottle", async ({ run }) => { + const result = await run("Classify bottle bt_123"); + + expect(result.output).toMatchObject({ + label: "bourbon", + }); + expect(toolCalls(result.session).map((call) => call.name)).toContain( + "lookup_bottle", + ); + }); +}); +``` + +The adapter calls `runner.run(agent, input, options)` by default. It forwards +the eval metadata, artifact helpers, and abort signal through the run options, +then normalizes the `RunResult` into the standard `HarnessRun` shape. + +If your application has a custom entrypoint, wire it directly: + +```ts +const harness = openaiAgentsHarness({ + createAgent: () => createClassifierAgent(), + createRunner: () => new Runner({ modelProvider, tracingDisabled: true }), + prompt: sharedJudgePrompt, + run: ({ agent, input, runner, runOptions }) => + runBottleClassifier({ agent, runner, input, runOptions }), + normalize: { + output: ({ result }) => result.classification, + outputText: ({ output }) => JSON.stringify(output), + }, +}); +``` + +The required `prompt` callback is passed to harness-backed judges as +`JudgeContext.harness.prompt`, so rubric or factuality judges can share the +same provider/model setup as the suite harness. 
+ +The adapter provides: + +- native `Runner.run(agent, input, options)` execution +- support for existing agents or per-test `createAgent()` factories +- a `run` escape hatch for app-specific entrypoints +- normalized assistant output, messages, tool calls, tool results, usage, + timings, errors, and replay-friendly metadata +- app-facing `run.output` plus a deliberate `session.outputText` for judges +- opt-in replay metadata for local function tools configured with `toolReplay` + +## Tool Replay + +Replay is configured globally in Vitest via environment variables: + +```ts +import { defineConfig } from "vitest/config"; + +export default defineConfig({ + test: { + env: { + VITEST_EVALS_REPLAY_MODE: "auto", + VITEST_EVALS_REPLAY_DIR: ".vitest-evals/recordings", + }, + }, +}); +``` + +Then opt local function tools into replay by name: + +```ts +import { Agent, Runner, tool } from "@openai/agents"; +import { openaiAgentsHarness } from "@vitest-evals/harness-openai-agents"; + +const lookupBottle = tool({ + name: "lookup_bottle", + description: "Look up bottle facts.", + parameters: lookupBottleSchema, + async execute({ bottleId }) { + return fetchBottleFacts(bottleId); + }, +}); + +const harness = openaiAgentsHarness({ + createAgent: () => new Agent({ name: "classifier", tools: [lookupBottle] }), + createRunner: () => new Runner({ modelProvider, tracingDisabled: true }), + prompt: sharedJudgePrompt, + toolReplay: { + lookup_bottle: true, + }, +}); +``` + +`toolReplay` is keyed by the OpenAI tool name. Values can be `true` or the +standard replay config object with `key`, `sanitize`, and `version` callbacks. + +Hosted OpenAI tools are still normalized from the SDK run items when they are +present in `newItems`, but replay recording is only automatic for local +function tools that execute in the application process. + +See the workspace demo app in `apps/demo-openai-agents`. 
diff --git a/packages/harness-openai-agents/package.json b/packages/harness-openai-agents/package.json new file mode 100644 index 0000000..8d13d23 --- /dev/null +++ b/packages/harness-openai-agents/package.json @@ -0,0 +1,31 @@ +{ + "name": "@vitest-evals/harness-openai-agents", + "version": "0.9.0-beta.1", + "sideEffects": false, + "types": "./dist/index.d.ts", + "main": "./dist/index.js", + "module": "./dist/index.mjs", + "files": ["dist"], + "publishConfig": { + "access": "public" + }, + "exports": { + ".": { + "source": "./src/index.ts", + "types": "./dist/index.d.ts", + "require": "./dist/index.js", + "import": "./dist/index.mjs" + } + }, + "scripts": { + "build": "tsup --config ./tsup.config.ts" + }, + "peerDependencies": { + "@openai/agents": ">=0.8 <1", + "vitest-evals": "*" + }, + "devDependencies": { + "@openai/agents": "^0.8.5", + "vitest-evals": "workspace:*" + } +} diff --git a/packages/harness-openai-agents/src/index.test.ts b/packages/harness-openai-agents/src/index.test.ts new file mode 100644 index 0000000..c26ac07 --- /dev/null +++ b/packages/harness-openai-agents/src/index.test.ts @@ -0,0 +1,920 @@ +import { mkdtempSync, readFileSync, rmSync } from "node:fs"; +import { join } from "node:path"; +import { Agent, tool } from "@openai/agents"; +import { afterEach, expect, test, vi } from "vitest"; +import { describeEval, getHarnessRunFromError, toolCalls } from "vitest-evals"; +import type { JsonValue } from "vitest-evals/harness"; +import { openaiAgentsHarness, type OpenAiAgentsTool } from "./index"; + +type DemoMetadata = { + scenario?: string; +}; + +type DemoAgent = { + name: string; + model: string; + tools?: OpenAiAgentsTool[]; +}; + +let replayDir: string | undefined; + +const judgePrompt = async (input: string) => input; + +afterEach(() => { + vi.unstubAllEnvs(); + if (replayDir) { + rmSync(replayDir, { recursive: true, force: true }); + replayDir = undefined; + } +}); + +function createHarnessContext>( + metadata: TMetadata, +) { + const 
context = { + metadata, + task: { + meta: {}, + }, + artifacts: {} as Record, + setArtifact: vi.fn((name: string, value: JsonValue) => { + context.artifacts[name] = value; + }), + }; + + return context; +} + +const runResult = { + finalOutput: { + status: "classified", + category: "bourbon", + }, + state: { + usage: { + requests: 1, + inputTokens: 13, + outputTokens: 8, + totalTokens: 21, + }, + }, + lastAgent: { + name: "classifier", + model: "gpt-4.1-mini", + }, + rawResponses: [ + { + id: "resp_123", + }, + ], + newItems: [ + { + type: "message_output_item", + rawItem: { + type: "message", + role: "assistant", + content: [ + { + type: "output_text", + text: '{"status":"classified","category":"bourbon"}', + }, + ], + status: "completed", + }, + agent: { + name: "classifier", + }, + }, + { + type: "tool_call_item", + rawItem: { + type: "function_call", + callId: "call_lookup", + name: "lookupBottle", + arguments: JSON.stringify({ + bottleId: "bt_123", + }), + status: "completed", + }, + }, + { + type: "tool_call_output_item", + output: { + bottleId: "bt_123", + family: "bourbon", + }, + rawItem: { + type: "function_call_result", + callId: "call_lookup", + name: "lookupBottle", + output: { + bottleId: "bt_123", + family: "bourbon", + }, + status: "completed", + }, + }, + ], +} as const; + +describeEval( + "openai agents harness adapter", + { + harness: openaiAgentsHarness({ + prompt: judgePrompt, + agent: { + name: "classifier", + model: "gpt-4.1-mini", + }, + runner: { + run: vi.fn(async (_agent: DemoAgent, _input: string, options) => { + expect(options?.context).toMatchObject({ + metadata: { + scenario: "peated", + }, + }); + expect(options?.stream).toBe(false); + return { + ...runResult, + output: runResult.newItems, + }; + }), + }, + }), + }, + (it) => { + it("normalizes native run results", async ({ run }) => { + const result = await run("Classify bottle bt_123", { + metadata: { + scenario: "peated", + }, + }); + + expect(result.output).toEqual({ + status: 
"classified", + category: "bourbon", + }); + expect(result.session.outputText).toBe( + '{"status":"classified","category":"bourbon"}', + ); + expect(result.usage).toMatchObject({ + model: "gpt-4.1-mini", + inputTokens: 13, + outputTokens: 8, + totalTokens: 21, + toolCalls: 1, + }); + expect(result.session.model).toBe("gpt-4.1-mini"); + expect(result.session.messages).toMatchObject([ + { + role: "user", + content: "Classify bottle bt_123", + }, + { + role: "assistant", + content: '{"status":"classified","category":"bourbon"}', + }, + { + role: "assistant", + toolCalls: [ + { + id: "call_lookup", + name: "lookupBottle", + arguments: { + bottleId: "bt_123", + }, + result: { + bottleId: "bt_123", + family: "bourbon", + }, + }, + ], + }, + { + role: "tool", + content: { + bottleId: "bt_123", + family: "bourbon", + }, + metadata: { + name: "lookupBottle", + toolCallId: "call_lookup", + isError: false, + }, + }, + ]); + }); + }, +); + +test("exposes prompt and supports custom app output mapping", async () => { + const prompt = vi.fn(async (input: string) => `judge: ${input}`); + const harness = openaiAgentsHarness({ + prompt, + createAgent: () => ({ + name: "classifier", + model: "gpt-4.1-mini", + }), + run: async ({ context, runOptions }) => { + context.setArtifact("entrypoint", "custom"); + expect(runOptions.context).toMatchObject({ + metadata: { + scenario: "domain", + }, + }); + + return { + classification: { + label: "bourbon", + confidence: 0.92, + }, + }; + }, + normalize: { + output: ({ result }) => + (result as { classification: { label: string; confidence: number } }) + .classification, + outputText: ({ output }) => JSON.stringify(output), + }, + }); + + await expect(harness.prompt("score this")).resolves.toBe("judge: score this"); + + const result = await harness.run( + "Classify bottle bt_123", + createHarnessContext({ + scenario: "domain", + }), + ); + + expect(prompt).toHaveBeenCalledWith("score this"); + expect(result.output).toEqual({ + label: "bourbon", + 
confidence: 0.92, + }); + expect(result.session.outputText).toBe( + '{"label":"bourbon","confidence":0.92}', + ); + expect(result.artifacts).toEqual({ + entrypoint: "custom", + }); +}); + +test("wraps OpenAI Agents function tools with replay metadata", async () => { + replayDir = mkdtempSync(join(process.cwd(), ".tmp-openai-agents-replay-")); + vi.stubEnv("VITEST_EVALS_REPLAY_MODE", "auto"); + vi.stubEnv("VITEST_EVALS_REPLAY_DIR", replayDir); + + const invoke = vi.fn(async (...args: unknown[]) => { + const rawInput = args[1]; + if (typeof rawInput !== "string") { + throw new Error("Expected JSON tool input"); + } + + const input = JSON.parse(rawInput) as { bottleId: string }; + return { + bottleId: input.bottleId, + family: "bourbon", + }; + }); + const lookupBottle = { + type: "function", + name: "lookupBottle", + invoke, + } satisfies OpenAiAgentsTool; + const originalInvoke = lookupBottle.invoke; + const agent = { + name: "classifier", + model: "gpt-4.1-mini", + tools: [lookupBottle], + } satisfies DemoAgent; + const runner = { + run: vi.fn(async (runAgent: DemoAgent, _input: string, runOptions) => { + expect(runAgent).not.toBe(agent); + expect(runAgent.tools).not.toBe(agent.tools); + expect(runAgent.tools?.[0]).not.toBe(lookupBottle); + const evidence = await runAgent.tools?.[0].invoke?.( + runOptions?.context, + JSON.stringify({ + bottleId: "bt_123", + }), + { + toolCallId: "call_lookup", + }, + ); + + return { + finalOutput: { + label: "bourbon", + evidence, + }, + }; + }), + }; + const harness = openaiAgentsHarness({ + prompt: judgePrompt, + agent, + runner, + toolReplay: { + lookupBottle: true, + }, + }); + + const firstRun = await harness.run( + "Classify bottle bt_123", + createHarnessContext({}), + ); + + expect(invoke).toHaveBeenCalledTimes(1); + expect(agent.tools?.[0]).toBe(lookupBottle); + expect(agent.tools?.[0].invoke).toBe(originalInvoke); + expect(toolCalls(firstRun.session)).toMatchObject([ + { + id: "call_lookup", + name: "lookupBottle", + 
arguments: { + bottleId: "bt_123", + }, + result: { + bottleId: "bt_123", + family: "bourbon", + }, + metadata: { + replay: { + status: "recorded", + }, + }, + }, + ]); + + const recordingPath = ( + toolCalls(firstRun.session)[0].metadata?.replay as { recordingPath: string } + ).recordingPath; + const recording = JSON.parse( + readFileSync(join(process.cwd(), recordingPath), "utf8"), + ) as { + input: { bottleId: string }; + output: { bottleId: string; family: string }; + }; + expect(recording.input).toEqual({ + bottleId: "bt_123", + }); + expect(recording.output).toEqual({ + bottleId: "bt_123", + family: "bourbon", + }); + + invoke.mockImplementation(async () => { + throw new Error("tool should not execute after recording exists"); + }); + + const secondRun = await harness.run( + "Classify bottle bt_123", + createHarnessContext({}), + ); + + expect(invoke).toHaveBeenCalledTimes(1); + expect(agent.tools?.[0]).toBe(lookupBottle); + expect(agent.tools?.[0].invoke).toBe(originalInvoke); + expect(toolCalls(secondRun.session)).toMatchObject([ + { + id: "call_lookup", + name: "lookupBottle", + result: { + bottleId: "bt_123", + family: "bourbon", + }, + metadata: { + replay: { + status: "replayed", + }, + }, + }, + ]); +}); + +test("prefers captured local tool results over model-visible output wrappers", async () => { + const lookupBottle = { + type: "function", + name: "lookupBottle", + invoke: vi.fn(async () => ({ + bottleId: "bt_123", + family: "bourbon", + })), + } satisfies OpenAiAgentsTool; + const harness = openaiAgentsHarness({ + prompt: judgePrompt, + agent: { + name: "classifier", + model: "gpt-4.1-mini", + tools: [lookupBottle], + } satisfies DemoAgent, + runner: { + run: async (agent: DemoAgent, _input: string, runOptions) => { + const evidence = await agent.tools?.[0].invoke?.( + runOptions?.context, + JSON.stringify({ + bottleId: "bt_123", + }), + { + toolCallId: "call_lookup", + }, + ); + + return { + finalOutput: "classified", + newItems: [ + { + type: 
"tool_call_item", + rawItem: { + type: "function_call", + callId: "call_lookup", + name: "lookupBottle", + arguments: JSON.stringify({ + bottleId: "bt_123", + }), + status: "completed", + }, + }, + { + type: "tool_call_output_item", + rawItem: { + type: "function_call_result", + callId: "call_lookup", + name: "lookupBottle", + status: "completed", + output: { + type: "text", + text: JSON.stringify(evidence), + }, + }, + }, + ], + }; + }, + }, + }); + + const result = await harness.run( + "Classify bottle bt_123", + createHarnessContext({}), + ); + + expect(toolCalls(result.session)).toMatchObject([ + { + id: "call_lookup", + name: "lookupBottle", + result: { + bottleId: "bt_123", + family: "bourbon", + }, + }, + ]); + expect(result.session.messages).toContainEqual( + expect.objectContaining({ + role: "tool", + content: { + type: "text", + text: '{"bottleId":"bt_123","family":"bourbon"}', + }, + }), + ); +}); + +test("preserves explicit null captured local tool results", async () => { + const lookupBottle = { + type: "function", + name: "lookupBottle", + invoke: vi.fn(async () => null), + } satisfies OpenAiAgentsTool; + const harness = openaiAgentsHarness({ + prompt: judgePrompt, + agent: { + name: "classifier", + model: "gpt-4.1-mini", + tools: [lookupBottle], + } satisfies DemoAgent, + runner: { + run: async (agent: DemoAgent, _input: string, runOptions) => { + await agent.tools?.[0].invoke?.( + runOptions?.context, + JSON.stringify({ + bottleId: "bt_unknown", + }), + { + toolCallId: "call_lookup", + }, + ); + + return { + finalOutput: "classified", + newItems: [ + { + type: "tool_call_item", + rawItem: { + type: "function_call", + callId: "call_lookup", + name: "lookupBottle", + arguments: JSON.stringify({ + bottleId: "bt_unknown", + }), + status: "completed", + }, + }, + { + type: "tool_call_output_item", + rawItem: { + type: "function_call_result", + callId: "call_lookup", + name: "lookupBottle", + status: "completed", + output: { + type: "text", + text: 
"null", + }, + }, + }, + ], + }; + }, + }, + }); + + const result = await harness.run( + "Classify bottle bt_unknown", + createHarnessContext({}), + ); + const [call] = toolCalls(result.session); + + expect(call).toHaveProperty("result", null); + expect(call.error).toBeUndefined(); +}); + +test("errors when replay is configured for unknown OpenAI Agents tools", async () => { + const lookupBottle = { + type: "function", + name: "lookupBottle", + invoke: vi.fn(), + } satisfies OpenAiAgentsTool; + const runner = { + run: vi.fn(), + }; + const harness = openaiAgentsHarness({ + prompt: judgePrompt, + agent: { + name: "classifier", + model: "gpt-4.1-mini", + tools: [lookupBottle], + }, + runner, + toolReplay: { + misspelledLookup: true, + }, + }); + + await expect( + harness.run("Classify bottle bt_123", createHarnessContext({})), + ).rejects.toThrow( + "Tool replay configured for unknown OpenAI Agents tool(s): misspelledLookup.", + ); + expect(runner.run).not.toHaveBeenCalled(); + expect(lookupBottle.invoke).not.toHaveBeenCalled(); +}); + +test("errors when replay is configured for OpenAI Agents tools without invoke", async () => { + const hostedTool = { + type: "web_search_preview", + name: "web_search_preview", + } satisfies OpenAiAgentsTool; + const runner = { + run: vi.fn(), + }; + const harness = openaiAgentsHarness({ + prompt: judgePrompt, + agent: { + name: "classifier", + model: "gpt-4.1-mini", + tools: [hostedTool], + }, + runner, + toolReplay: { + web_search_preview: true, + }, + }); + + await expect( + harness.run("Search for bottle facts", createHarnessContext({})), + ).rejects.toThrow( + "Tool replay requires invoke() for web_search_preview. 
Hosted or provider-executed OpenAI Agents tools cannot be recorded automatically.", + ); + expect(runner.run).not.toHaveBeenCalled(); +}); + +test("instruments real OpenAI Agent tools without mutating the caller's agent", async () => { + const lookupBottle = tool({ + name: "lookupBottle", + description: "Look up bottle facts.", + parameters: { + type: "object", + properties: { + bottleId: { + type: "string", + }, + }, + required: ["bottleId"], + additionalProperties: false, + } as const, + execute: async (input: unknown) => { + const { bottleId } = input as { bottleId: string }; + + return { + bottleId, + family: "bourbon", + }; + }, + }); + const agent = new Agent({ + name: "classifier", + model: "gpt-4.1-mini", + tools: [lookupBottle], + }); + const originalTool = agent.tools[0]; + const harness = openaiAgentsHarness({ + prompt: judgePrompt, + agent, + runner: { + run: async (runAgent, _input, runOptions) => { + expect(runAgent).not.toBe(agent); + expect(runAgent.tools[0]).not.toBe(originalTool); + + const runtimeTool = runAgent.tools[0] as OpenAiAgentsTool< + string, + DemoMetadata + >; + const evidence = await runtimeTool.invoke?.( + runOptions?.context, + JSON.stringify({ + bottleId: "bt_123", + }), + { + toolCallId: "call_lookup", + }, + ); + + return { + finalOutput: evidence, + }; + }, + }, + }); + + const result = await harness.run( + "Classify bottle bt_123", + createHarnessContext({}), + ); + + expect(agent.tools[0]).toBe(originalTool); + expect(toolCalls(result.session)).toMatchObject([ + { + id: "call_lookup", + name: "lookupBottle", + arguments: { + bottleId: "bt_123", + }, + result: { + bottleId: "bt_123", + family: "bourbon", + }, + }, + ]); +}); + +test("rejects implicit agent and runner factories", () => { + expect(() => + openaiAgentsHarness({ + prompt: judgePrompt, + agent: (() => ({ + name: "classifier", + model: "gpt-4.1-mini", + })) as unknown as DemoAgent, + runner: { + run: async () => ({}), + }, + }), + ).toThrow("Use createAgent() for 
agent factories"); + + expect(() => + openaiAgentsHarness({ + prompt: judgePrompt, + agent: { + name: "classifier", + model: "gpt-4.1-mini", + }, + runner: (() => ({ + run: async () => ({}), + })) as unknown as { run: () => Promise }, + }), + ).toThrow("Use createRunner() for runner factories"); +}); + +test("keeps tool capture isolated across overlapping runs", async () => { + const invoke = vi.fn(async (_runContext: unknown, rawInput: unknown) => { + if (typeof rawInput !== "string") { + throw new Error("Expected JSON tool input"); + } + + const input = JSON.parse(rawInput) as { bottleId: string }; + return { + bottleId: input.bottleId, + }; + }); + const lookupBottle = { + type: "function", + name: "lookupBottle", + invoke, + } satisfies OpenAiAgentsTool; + const originalInvoke = lookupBottle.invoke; + const agent = { + name: "classifier", + model: "gpt-4.1-mini", + tools: [lookupBottle], + } satisfies DemoAgent; + const harness = openaiAgentsHarness({ + prompt: judgePrompt, + agent, + runner: { + run: async (runAgent: DemoAgent, _input: string, runOptions) => { + const runtimeContext = runOptions?.context as + | { metadata: DemoMetadata } + | undefined; + const scenario = runtimeContext?.metadata.scenario ?? 
"unknown"; + await new Promise((resolve) => setTimeout(resolve, 1)); + const evidence = await runAgent.tools?.[0].invoke?.( + runOptions?.context, + JSON.stringify({ + bottleId: `bt_${scenario}`, + }), + { + toolCallId: `call_${scenario}`, + }, + ); + + return { + finalOutput: evidence, + }; + }, + }, + }); + + const [firstRun, secondRun] = await Promise.all([ + harness.run( + "Classify first bottle", + createHarnessContext({ scenario: "first" }), + ), + harness.run( + "Classify second bottle", + createHarnessContext({ scenario: "second" }), + ), + ]); + + expect(agent.tools?.[0]).toBe(lookupBottle); + expect(agent.tools?.[0].invoke).toBe(originalInvoke); + expect(toolCalls(firstRun.session)).toMatchObject([ + { + id: "call_first", + arguments: { + bottleId: "bt_first", + }, + result: { + bottleId: "bt_first", + }, + }, + ]); + expect(toolCalls(secondRun.session)).toMatchObject([ + { + id: "call_second", + arguments: { + bottleId: "bt_second", + }, + result: { + bottleId: "bt_second", + }, + }, + ]); +}); + +test("marks failed tool output items as tool call errors", async () => { + const harness = openaiAgentsHarness({ + prompt: judgePrompt, + agent: { + name: "editor", + model: "gpt-4.1-mini", + }, + runner: { + run: async () => ({ + finalOutput: "patch failed", + newItems: [ + { + type: "tool_call_item", + rawItem: { + type: "apply_patch_call", + callId: "call_patch", + status: "completed", + operation: { + type: "update_file", + path: "README.md", + diff: "...", + }, + }, + }, + { + type: "tool_call_output_item", + output: "patch rejected", + rawItem: { + type: "apply_patch_call_output", + callId: "call_patch", + status: "failed", + output: "patch rejected", + }, + }, + ], + }), + }, + }); + + const result = await harness.run("Patch README", createHarnessContext({})); + const [call] = toolCalls(result.session); + + expect(call).toMatchObject({ + id: "call_patch", + name: "apply_patch_call", + error: { + message: "patch rejected", + }, + metadata: { + 
outputStatus: "failed", + }, + }); + expect(call.result).toBeUndefined(); +}); + +test("attaches partial tool calls when Runner.run errors", async () => { + const lookupBottle = { + type: "function", + name: "lookupBottle", + invoke: async () => ({ + bottleId: "bt_missing", + family: "unknown", + }), + } satisfies OpenAiAgentsTool; + const harness = openaiAgentsHarness({ + prompt: judgePrompt, + agent: { + name: "classifier", + model: "gpt-4.1-mini", + tools: [lookupBottle], + } satisfies DemoAgent, + runner: { + run: async (agent: DemoAgent, _input: string, runOptions) => { + await agent.tools?.[0].invoke?.( + runOptions?.context, + JSON.stringify({ + bottleId: "bt_missing", + }), + { + toolCallId: "call_lookup", + }, + ); + + throw new Error("classifier failed after lookup"); + }, + }, + }); + + const error = await harness + .run("Classify bottle bt_missing", createHarnessContext({})) + .catch((caughtError) => caughtError); + const run = getHarnessRunFromError(error); + + expect(run).toBeDefined(); + expect(run?.usage.toolCalls).toBe(1); + expect(run?.errors).toEqual([ + { + type: "Error", + message: "classifier failed after lookup", + }, + ]); + expect(toolCalls(run!.session)).toMatchObject([ + { + id: "call_lookup", + name: "lookupBottle", + arguments: { + bottleId: "bt_missing", + }, + result: { + bottleId: "bt_missing", + family: "unknown", + }, + }, + ]); +}); diff --git a/packages/harness-openai-agents/src/index.ts b/packages/harness-openai-agents/src/index.ts new file mode 100644 index 0000000..60e68b2 --- /dev/null +++ b/packages/harness-openai-agents/src/index.ts @@ -0,0 +1,1805 @@ +import type { + Harness, + HarnessContext, + HarnessMetadata, + HarnessPrompt, + HarnessRun, + JsonValue, + NormalizedMessage, + NormalizedSession, + TimingSummary, + ToolCallRecord, + UsageSummary, +} from "vitest-evals/harness"; +import { + attachHarnessRunToError, + hasCallableMethod, + isHarnessRun, + isNormalizedSession, + normalizeContent, + normalizeMetadata, + 
normalizeRecord, + resolveHarnessRunErrors, + serializeError, + toJsonValue, +} from "vitest-evals/harness"; +import { + executeWithReplay, + getReplayMetadataFromError, + normalizeReplayMetadata, +} from "vitest-evals/replay"; +import type { + ReplayMode, + ToolRecording, + ToolReplayConfig, +} from "vitest-evals/replay"; + +type MaybePromise = T | Promise; + +export type OpenAiAgentsReplayMode = ReplayMode; + +export interface OpenAiAgentsRuntimeContext< + TMetadata extends HarnessMetadata = HarnessMetadata, +> { + metadata: Readonly; + artifacts: HarnessContext["artifacts"]; + setArtifact: HarnessContext["setArtifact"]; +} + +export type OpenAiAgentsRunOptions = Record< + string, + unknown +> & { + context?: TContext; + signal?: AbortSignal; + stream?: boolean; +}; + +export interface OpenAiAgentsRunner { + run: ( + agent: TAgent, + input: TInput, + options?: OpenAiAgentsRunOptions, + ) => MaybePromise; +} + +export interface OpenAiAgentsRuntime< + TInput = string, + TMetadata extends HarnessMetadata = HarnessMetadata, + TContext = OpenAiAgentsRuntimeContext, +> { + context: TContext; + runOptions: OpenAiAgentsRunOptions; + signal?: AbortSignal; + tools: OpenAiAgentsTool[]; +} + +export interface OpenAiAgentsHarnessRunArgs< + TAgent, + TInput, + TMetadata extends HarnessMetadata, + TRunner, + TResult, + TContext, +> { + agent: TAgent; + input: TInput; + context: HarnessContext; + runtime: OpenAiAgentsRuntime; + runner: TRunner | undefined; + runOptions: OpenAiAgentsRunOptions; +} + +export interface OpenAiAgentsHarnessResultArgs< + TAgent, + TInput, + TMetadata extends HarnessMetadata, + TRunner, + TResult, + TContext, +> extends OpenAiAgentsHarnessRunArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + > { + result: TResult; + output: JsonValue | undefined; +} + +export interface OpenAiAgentsToolContext< + TInput = string, + TMetadata extends HarnessMetadata = HarnessMetadata, +> { + input: TInput; + metadata: HarnessContext["metadata"]; + 
signal?: AbortSignal; + setArtifact: HarnessContext["setArtifact"]; + runContext: unknown; + details: unknown; + tool: OpenAiAgentsTool; +} + +export type OpenAiAgentsToolRecording< + TArgs extends JsonValue = JsonValue, + TResult extends JsonValue = JsonValue, +> = ToolRecording; + +export type OpenAiAgentsToolReplayConfig< + TArgs extends JsonValue = JsonValue, + TResult extends JsonValue = JsonValue, + TInput = string, + TMetadata extends HarnessMetadata = HarnessMetadata, +> = ToolReplayConfig< + TArgs, + TResult, + OpenAiAgentsToolContext +>; + +export type OpenAiAgentsToolReplayPolicy< + TInput = string, + TMetadata extends HarnessMetadata = HarnessMetadata, +> = + | boolean + | OpenAiAgentsToolReplayConfig; + +export type OpenAiAgentsToolReplayPolicies< + TInput = string, + TMetadata extends HarnessMetadata = HarnessMetadata, +> = Record>; + +type OpenAiAgentsInvoke = (...args: unknown[]) => unknown; + +export type OpenAiAgentsTool< + TInput = string, + TMetadata extends HarnessMetadata = HarnessMetadata, +> = Record & { + name?: string; + toolName?: string; + type?: string; + invoke?: OpenAiAgentsInvoke; +}; + +export interface OpenAiAgentsHarnessNormalizeOptions< + TAgent, + TInput = string, + TMetadata extends HarnessMetadata = HarnessMetadata, + TRunner = unknown, + TResult = unknown, + TContext = OpenAiAgentsRuntimeContext, +> { + session?: ( + args: OpenAiAgentsHarnessResultArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + ) => MaybePromise; + output?: ( + args: Omit< + OpenAiAgentsHarnessResultArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + "output" + >, + ) => MaybePromise; + outputText?: ( + args: OpenAiAgentsHarnessResultArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + ) => MaybePromise; + usage?: ( + args: OpenAiAgentsHarnessResultArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + ) => MaybePromise; + timings?: ( + args: 
OpenAiAgentsHarnessResultArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + ) => MaybePromise; + errors?: ( + args: OpenAiAgentsHarnessResultArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + ) => MaybePromise>>; +} + +export interface OpenAiAgentsHarnessOptions< + TAgent, + TInput = string, + TMetadata extends HarnessMetadata = HarnessMetadata, + TRunner = OpenAiAgentsRunner< + TAgent, + TInput, + OpenAiAgentsRuntimeContext, + unknown + >, + TResult = unknown, + TContext = OpenAiAgentsRuntimeContext, +> { + agent?: TAgent; + createAgent?: () => MaybePromise; + runner?: TRunner; + createRunner?: ( + args: Omit< + OpenAiAgentsHarnessRunArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + "runner" + >, + ) => MaybePromise; + run?: ( + args: OpenAiAgentsHarnessRunArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + ) => MaybePromise; + runOptions?: + | OpenAiAgentsRunOptions + | (( + args: Omit< + OpenAiAgentsHarnessRunArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + "runner" | "runtime" | "runOptions" + >, + ) => MaybePromise | undefined>); + toolReplay?: OpenAiAgentsToolReplayPolicies; + normalize?: OpenAiAgentsHarnessNormalizeOptions< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >; + prompt: HarnessPrompt; + name?: string; +} + +type RuntimeToolCapture = { + calls: ToolCallRecord[]; +}; + +/** Adapts an `@openai/agents` Runner workflow into a normalized harness. 
*/ +export function openaiAgentsHarness< + TAgent, + TInput = string, + TMetadata extends HarnessMetadata = HarnessMetadata, + TRunner = OpenAiAgentsRunner< + TAgent, + TInput, + OpenAiAgentsRuntimeContext, + unknown + >, + TResult = unknown, + TContext = OpenAiAgentsRuntimeContext, +>( + options: OpenAiAgentsHarnessOptions< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, +): Harness { + validateOptions(options); + + return { + name: options.name ?? "openai-agents", + prompt: options.prompt, + run: async (input, context) => { + const agent = await resolveAgent(options); + return executeOpenAiAgentsHarness(options, agent, input, context); + }, + }; +} + +async function executeOpenAiAgentsHarness< + TAgent, + TInput, + TMetadata extends HarnessMetadata, + TRunner, + TResult, + TContext, +>( + options: OpenAiAgentsHarnessOptions< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + agent: TAgent, + input: TInput, + context: HarnessContext, +): Promise { + const startedAt = Date.now(); + const capture: RuntimeToolCapture = { + calls: [], + }; + + return withInstrumentedAgentTools( + agent, + { + input, + context, + capture, + toolReplay: options.toolReplay, + }, + async (instrumentedAgent, runtimeTools) => { + const defaultRuntimeContext = { + metadata: context.metadata, + artifacts: context.artifacts, + setArtifact: context.setArtifact, + } satisfies OpenAiAgentsRuntimeContext; + const runOptions = await resolveRunOptions< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >( + options, + instrumentedAgent, + input, + context, + defaultRuntimeContext as TContext, + ); + const runtime = { + context: runOptions.context as TContext, + runOptions, + signal: runOptions.signal, + tools: runtimeTools, + } satisfies OpenAiAgentsRuntime; + const runner = await resolveRunner(options, { + agent: instrumentedAgent, + input, + context, + runtime, + runOptions, + }); + + try { + const result = await runAgent(options, { + 
agent: instrumentedAgent, + input, + context, + runtime, + runner, + runOptions, + }); + const settledResult = await settleRunResult(result); + + if (isHarnessRun(settledResult) && !hasResultOverrides(options)) { + if ( + Object.keys(context.artifacts).length > 0 && + !settledResult.artifacts + ) { + settledResult.artifacts = context.artifacts; + } + return settledResult; + } + + const normalizeResult = settledResult as TResult; + const baseResultArgs = { + agent: instrumentedAgent, + input, + context, + runtime, + runner, + runOptions, + result: normalizeResult, + }; + const output = options.normalize?.output + ? await options.normalize.output(baseResultArgs) + : resolveOutput(normalizeResult); + const resultArgs = { + ...baseResultArgs, + output, + } satisfies OpenAiAgentsHarnessResultArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >; + const usage = options.normalize?.usage + ? await options.normalize.usage(resultArgs) + : resolveUsage(normalizeResult, capture.calls.length); + const outputText = options.normalize?.outputText + ? await options.normalize.outputText(resultArgs) + : resolveOutputText(normalizeResult, output); + const session = options.normalize?.session + ? await options.normalize.session(resultArgs) + : resolveSession(input, normalizeResult, output, outputText, usage, { + runtimeToolCalls: capture.calls, + }); + + return { + session, + output, + usage, + timings: options.normalize?.timings + ? await options.normalize.timings(resultArgs) + : { totalMs: Date.now() - startedAt }, + artifacts: + Object.keys(context.artifacts).length > 0 + ? context.artifacts + : undefined, + errors: options.normalize?.errors + ? await options.normalize.errors(resultArgs) + : resolveHarnessRunErrors(normalizeResult), + }; + } catch (error) { + const usage = + capture.calls.length > 0 ? 
{ toolCalls: capture.calls.length } : {}; + const run = { + session: resolveSession( + input, + undefined, + undefined, + undefined, + usage, + { + runtimeToolCalls: capture.calls, + }, + ), + output: undefined, + usage, + timings: { totalMs: Date.now() - startedAt }, + artifacts: + Object.keys(context.artifacts).length > 0 + ? context.artifacts + : undefined, + errors: [serializeError(error)], + } satisfies HarnessRun; + + throw attachHarnessRunToError(error, run); + } + }, + ); +} + +function validateOptions< + TAgent, + TInput, + TMetadata extends HarnessMetadata, + TRunner, + TResult, + TContext, +>( + options: OpenAiAgentsHarnessOptions< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, +) { + const hasAgent = options.agent !== undefined; + const hasCreateAgent = typeof options.createAgent === "function"; + + if (hasAgent && hasCreateAgent) { + throw new Error( + "openaiAgentsHarness accepts either agent or createAgent(), not both.", + ); + } + + if (!hasAgent && !hasCreateAgent) { + throw new Error( + "openaiAgentsHarness requires either an agent instance or createAgent().", + ); + } + + if (options.runner && options.createRunner) { + throw new Error( + "openaiAgentsHarness accepts either runner or createRunner(), not both.", + ); + } + + if (typeof options.agent === "function") { + throw new Error( + "openaiAgentsHarness agent must be an Agent instance. Use createAgent() for agent factories.", + ); + } + + if ( + typeof options.runner === "function" && + !hasCallableMethod(options.runner, "run") + ) { + throw new Error( + "openaiAgentsHarness runner must be a Runner instance. 
Use createRunner() for runner factories.", + ); + } + + if (!options.run && !options.runner && !options.createRunner) { + throw new Error( + "openaiAgentsHarness requires runner/createRunner for Runner.run(agent, input, options), or run() for a custom entrypoint.", + ); + } +} + +async function resolveAgent< + TAgent, + TInput, + TMetadata extends HarnessMetadata, + TRunner, + TResult, + TContext, +>( + options: OpenAiAgentsHarnessOptions< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, +) { + if (options.createAgent) { + return options.createAgent(); + } + + if (options.agent !== undefined) { + return options.agent; + } + + throw new Error( + "openaiAgentsHarness requires either an agent instance or createAgent().", + ); +} + +async function resolveRunner< + TAgent, + TInput, + TMetadata extends HarnessMetadata, + TRunner, + TResult, + TContext, +>( + options: OpenAiAgentsHarnessOptions< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + args: Omit< + OpenAiAgentsHarnessRunArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + "runner" + >, +) { + if (options.createRunner) { + return options.createRunner(args); + } + + if (options.runner !== undefined) { + return options.runner; + } + + return undefined; +} + +async function resolveRunOptions< + TAgent, + TInput, + TMetadata extends HarnessMetadata, + TRunner, + TResult, + TContext, +>( + options: OpenAiAgentsHarnessOptions< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + agent: TAgent, + input: TInput, + context: HarnessContext, + defaultRuntimeContext: TContext, +): Promise> { + const userOptions = + typeof options.runOptions === "function" + ? await options.runOptions({ + agent, + input, + context, + }) + : options.runOptions; + const baseOptions = userOptions ?? {}; + + return { + ...baseOptions, + context: + "context" in baseOptions + ? 
(baseOptions.context as TContext) + : defaultRuntimeContext, + signal: + "signal" in baseOptions + ? (baseOptions.signal as AbortSignal | undefined) + : context.signal, + stream: "stream" in baseOptions ? Boolean(baseOptions.stream) : false, + }; +} + +async function runAgent< + TAgent, + TInput, + TMetadata extends HarnessMetadata, + TRunner, + TResult, + TContext, +>( + options: OpenAiAgentsHarnessOptions< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, + args: OpenAiAgentsHarnessRunArgs< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, +): Promise { + if (options.run) { + return options.run(args); + } + + if (hasRunnerRunMethod(args.runner)) { + return args.runner.run(args.agent, args.input, args.runOptions); + } + + throw new Error( + "openaiAgentsHarness requires runner/createRunner for the default Runner.run path, or run() for a custom entrypoint.", + ); +} + +function hasRunnerRunMethod( + runner: unknown, +): runner is OpenAiAgentsRunner { + return hasCallableMethod(runner, "run"); +} + +async function settleRunResult(result: unknown) { + if ( + result && + typeof result === "object" && + "completed" in result && + isPromiseLike((result as { completed?: unknown }).completed) + ) { + await (result as { completed: Promise }).completed; + } + + return result; +} + +function hasResultOverrides< + TAgent, + TInput, + TMetadata extends HarnessMetadata, + TRunner, + TResult, + TContext, +>( + options: OpenAiAgentsHarnessOptions< + TAgent, + TInput, + TMetadata, + TRunner, + TResult, + TContext + >, +) { + return Boolean( + options.normalize?.output ?? + options.normalize?.outputText ?? + options.normalize?.session ?? + options.normalize?.usage ?? + options.normalize?.timings ?? 
+ options.normalize?.errors, + ); +} + +async function withInstrumentedAgentTools< + TAgent, + TInput, + TMetadata extends HarnessMetadata, + TResult, +>( + agent: TAgent, + args: { + input: TInput; + context: HarnessContext; + capture: RuntimeToolCapture; + toolReplay: OpenAiAgentsToolReplayPolicies | undefined; + }, + callback: ( + agent: TAgent, + runtimeTools: OpenAiAgentsTool[], + ) => Promise, +) { + const agentTools = getAgentTools(agent) ?? []; + validateToolReplayPolicies(agentTools, args.toolReplay); + + if (agentTools.length === 0) { + return callback(agent, []); + } + + const runtimeTools = agentTools.map((tool) => instrumentTool(tool, args)); + const instrumentedAgent = cloneAgentWithTools(agent, runtimeTools); + return callback(instrumentedAgent, runtimeTools); +} + +function getAgentTools( + agent: unknown, +): OpenAiAgentsTool[] | undefined { + const tools = getObjectProperty(agent, "tools"); + return Array.isArray(tools) + ? (tools as OpenAiAgentsTool[]) + : undefined; +} + +function instrumentTool( + tool: OpenAiAgentsTool, + args: { + input: TInput; + context: HarnessContext; + capture: RuntimeToolCapture; + toolReplay: OpenAiAgentsToolReplayPolicies | undefined; + }, +): OpenAiAgentsTool { + const toolName = resolveToolName(tool); + const replay = args.toolReplay?.[toolName]; + + if (typeof tool.invoke !== "function") { + if (replay) { + throw new Error( + `Tool replay requires invoke() for ${toolName}. 
Hosted or provider-executed OpenAI Agents tools cannot be recorded automatically.`, + ); + } + + return tool; + } + + const originalInvoke = tool.invoke; + const instrumentedInvoke = (async (runContext, rawInput, details) => + executeInstrumentedTool({ + tool, + toolName, + replay, + rawInput, + runContext, + details, + input: args.input, + context: args.context, + capture: args.capture, + /* Call through the original tool so an invoke() that reads `this` keeps its receiver. */ + execute: () => originalInvoke.call(tool, runContext, rawInput, details), + })) as OpenAiAgentsInvoke; + + return { + ...tool, + invoke: instrumentedInvoke, + }; +} + +/** Throws when `toolReplay` enables replay for a tool name that is not among `tools`; falsy policies are ignored. */ +function validateToolReplayPolicies( + tools: OpenAiAgentsTool[], + toolReplay: OpenAiAgentsToolReplayPolicies | undefined, +) { + const replayToolNames = Object.entries(toolReplay ?? {}) + .filter(([, replay]) => Boolean(replay)) + .map(([toolName]) => toolName); + if (replayToolNames.length === 0) { + return; + } + + const knownToolNames = new Set(tools.map(resolveToolName)); + const unknownToolNames = replayToolNames.filter( + (toolName) => !knownToolNames.has(toolName), + ); + if (unknownToolNames.length > 0) { + throw new Error( + `Tool replay configured for unknown OpenAI Agents tool(s): ${unknownToolNames.join(", ")}.`, + ); + } +} + +/** Returns a copy of `agent` whose `tools` are replaced: uses the agent's own clone({ tools }) when it has one, returns non-object agents unchanged, and otherwise makes a shallow copy that keeps the agent's prototype. */ +function cloneAgentWithTools( + agent: TAgent, + tools: OpenAiAgentsTool[], +): TAgent { + if (hasCallableMethod(agent, "clone")) { + return ( + agent as { + clone: (config: { + tools: OpenAiAgentsTool[]; + }) => TAgent; + } + ).clone({ tools }); + } + + if (!agent || typeof agent !== "object") { + return agent; + } + + /* Keep the prototype so class-based agents without clone() retain their methods and instanceof identity. */ + const copy = Object.create( + Object.getPrototypeOf(agent), + Object.getOwnPropertyDescriptors(agent), + ); + /* Define `tools` as an own data property so a prototype accessor cannot intercept the override. */ + Object.defineProperty(copy, "tools", { + value: tools, + writable: true, + enumerable: true, + configurable: true, + }); + + return copy as TAgent; +} + +async function executeInstrumentedTool< + TInput, + TMetadata extends HarnessMetadata, +>({ + tool, + toolName, + replay, + rawInput, + runContext, + details, + input, + context, + capture, + execute, +}: { + tool: OpenAiAgentsTool; + toolName: string; + replay: OpenAiAgentsToolReplayPolicy | undefined; + rawInput: unknown; + runContext: unknown; + details: unknown; + input: TInput; + context: HarnessContext; + 
capture: RuntimeToolCapture; + execute: () => MaybePromise; +}) { + const startedAt = new Date(); + const toolCallId = resolveToolCallId(runContext, rawInput, details); + const normalizedArgs = normalizeArguments(rawInput); + const replayContext = { + input, + metadata: context.metadata, + signal: context.signal, + setArtifact: context.setArtifact, + runContext, + details, + tool, + } satisfies OpenAiAgentsToolContext; + + try { + const execution = replay + ? await executeWithReplay({ + toolName, + args: normalizeReplayToolInput(rawInput), + context: replayContext, + execute: async () => + toReplayJsonValue(await execute(), `${toolName} tool output`), + replay, + }) + : { + result: await execute(), + replay: undefined, + }; + const finishedAt = new Date(); + const normalizedResult = normalizeToolResult(execution.result); + const call = { + ...(toolCallId ? { id: toolCallId } : {}), + name: toolName, + ...(normalizedArgs !== undefined ? { arguments: normalizedArgs } : {}), + ...(normalizedResult !== undefined ? { result: normalizedResult } : {}), + startedAt: startedAt.toISOString(), + finishedAt: finishedAt.toISOString(), + durationMs: finishedAt.getTime() - startedAt.getTime(), + metadata: normalizeReplayMetadata(execution.replay), + } satisfies ToolCallRecord; + + capture.calls.push(call); + return execution.result; + } catch (error) { + const finishedAt = new Date(); + const replay = getReplayMetadataFromError(error); + const call = { + ...(toolCallId ? { id: toolCallId } : {}), + name: toolName, + ...(normalizedArgs !== undefined ? { arguments: normalizedArgs } : {}), + error: normalizeError(error), + startedAt: startedAt.toISOString(), + finishedAt: finishedAt.toISOString(), + durationMs: finishedAt.getTime() - startedAt.getTime(), + metadata: normalizeReplayMetadata(replay), + } satisfies ToolCallRecord; + + capture.calls.push(call); + throw error; + } +} + +function resolveToolName(tool: unknown) { + return ( + stringProperty(tool, "name") ?? 
+ stringProperty(tool, "toolName") ?? + stringProperty(getObjectProperty(tool, "function"), "name") ?? + "unknown" + ); +} + +function resolveToolCallId( + runContext: unknown, + rawInput: unknown, + details: unknown, +) { + return ( + findStringAtPath(details, ["toolCallId"]) ?? + findStringAtPath(details, ["tool_call_id"]) ?? + findStringAtPath(details, ["callId"]) ?? + findStringAtPath(details, ["call_id"]) ?? + findStringAtPath(details, ["toolCall", "callId"]) ?? + findStringAtPath(details, ["toolCall", "call_id"]) ?? + findStringAtPath(details, ["rawItem", "callId"]) ?? + findStringAtPath(details, ["rawItem", "call_id"]) ?? + findStringAtPath(runContext, ["toolCallId"]) ?? + findStringAtPath(runContext, ["tool_call_id"]) ?? + findStringAtPath(runContext, ["toolCall", "callId"]) ?? + findStringAtPath(rawInput, ["toolCallId"]) ?? + findStringAtPath(rawInput, ["tool_call_id"]) + ); +} + +function resolveOutput(result: unknown): JsonValue | undefined { + if (!result || typeof result !== "object") { + return toJsonValue(result); + } + + const candidates = [ + "finalOutput", + "final_output", + "object", + "result", + "decision", + "text", + ] satisfies string[]; + + for (const key of candidates) { + const normalized = toJsonValue((result as Record)[key]); + if (normalized !== undefined) { + return normalized; + } + } + + const output = (result as { output?: unknown }).output; + if (typeof output === "string") { + return output; + } + + return undefined; +} + +function resolveOutputText( + result: unknown, + output: JsonValue | undefined, +): string | undefined { + if (!result || typeof result !== "object") { + return typeof output === "string" ? output : stringifyJson(output); + } + + const directText = + stringProperty(result, "finalOutput") ?? + stringProperty(result, "final_output") ?? 
+ stringProperty(result, "text"); + if (directText !== undefined) { + return directText; + } + + /* Prefer run items, but fall back to `output` when `newItems` is missing or empty, matching resolveSession and countToolCallsFromResult. */ + const newItems = arrayProperty(result, "newItems"); + const itemText = resolveAssistantTextFromItems( + newItems && newItems.length > 0 + ? newItems + : (arrayProperty(result, "output") ?? []), + ); + if (itemText) { + return itemText; + } + + return typeof output === "string" ? output : stringifyJson(output); +} + +/** Builds a usage summary from the first non-nullish of `result.state.usage`, `result.runContext.usage`, or `result.usage`; the tool-call count prefers calls counted from the result items, then `runtimeToolCallCount`. Without a usage object only `toolCalls` (when non-zero) is returned. */ +function resolveUsage(result: unknown, runtimeToolCallCount: number) { + const usage = + getObjectProperty(getObjectProperty(result, "state"), "usage") ?? + getObjectProperty(getObjectProperty(result, "runContext"), "usage") ?? + getObjectProperty(result, "usage"); + const usageRecord = + usage && typeof usage === "object" + ? (usage as Record) + : undefined; + const toolCallCount = + countToolCallsFromResult(result) || runtimeToolCallCount || undefined; + + if (!usageRecord) { + return toolCallCount ? { toolCalls: toolCallCount } : {}; + } + + return { + provider: resolveProvider(result), + model: resolveModel(result), + inputTokens: numberProperty(usageRecord, "inputTokens"), + outputTokens: numberProperty(usageRecord, "outputTokens"), + reasoningTokens: numberProperty(usageRecord, "reasoningTokens"), + totalTokens: numberProperty(usageRecord, "totalTokens"), + toolCalls: toolCallCount, + retries: numberProperty(usageRecord, "retries"), + metadata: normalizeMetadata({ + requests: usageRecord.requests, + requestUsageEntries: usageRecord.requestUsageEntries, + raw: usageRecord.raw, + }), + } satisfies UsageSummary; +} + +/** Returns an already-normalized `result.session` or `result.trace` when present; otherwise assembles the session from the input or history, the run items, and the runtime-captured tool calls. */ +function resolveSession( + input: unknown, + result: unknown, + output: JsonValue | undefined, + outputText: string | undefined, + usage: UsageSummary, + options: { + runtimeToolCalls: ToolCallRecord[]; + }, +): NormalizedSession { + if ( + isNormalizedSession( + (result as Record | undefined)?.session, + ) + ) { + return (result as { session: NormalizedSession }).session; + } + + if ( + isNormalizedSession((result as Record | undefined)?.trace) + ) { + return (result as { trace: NormalizedSession }).trace; 
+ } + + const newItems = arrayProperty(result, "newItems"); + const outputItems = arrayProperty(result, "output"); + const messages = + newItems && newItems.length > 0 + ? normalizeInputMessages(getObjectProperty(result, "input") ?? input) + : normalizeHistoryMessages(result, input); + + if (newItems && newItems.length > 0) { + messages.push(...normalizeRunItems(newItems, options.runtimeToolCalls)); + } else if (outputItems && outputItems.length > 0) { + messages.push(...normalizeRunItems(outputItems, options.runtimeToolCalls)); + } + + appendUnmatchedRuntimeToolCalls(messages, options.runtimeToolCalls); + + if ( + output !== undefined && + !messages.some( + (message) => + message.role === "assistant" && message.content !== undefined, + ) + ) { + messages.push({ + role: "assistant", + content: output, + }); + } + + return { + messages, + outputText, + provider: resolveProvider(result) ?? usage.provider, + model: resolveModel(result) ?? usage.model, + metadata: normalizeMetadata({ + lastResponseId: getObjectProperty(result, "lastResponseId"), + interruptions: getObjectProperty(result, "interruptions"), + rawResponses: getObjectProperty(result, "rawResponses"), + inputGuardrailResults: getObjectProperty(result, "inputGuardrailResults"), + outputGuardrailResults: getObjectProperty( + result, + "outputGuardrailResults", + ), + toolInputGuardrailResults: getObjectProperty( + result, + "toolInputGuardrailResults", + ), + toolOutputGuardrailResults: getObjectProperty( + result, + "toolOutputGuardrailResults", + ), + activeAgent: normalizeAgentMetadata( + getObjectProperty(result, "activeAgent"), + ), + lastAgent: normalizeAgentMetadata(getObjectProperty(result, "lastAgent")), + }), + }; +} + +function normalizeHistoryMessages( + result: unknown, + fallbackInput: unknown, +): NormalizedMessage[] { + const history = arrayProperty(result, "history"); + if (!history || history.length === 0) { + return normalizeInputMessages( + getObjectProperty(result, "input") ?? 
fallbackInput, + ); + } + + const messages: NormalizedMessage[] = []; + for (const item of history) { + const normalized = normalizeModelMessage(item); + if (normalized) { + messages.push(normalized); + } + } + + return messages.length > 0 + ? messages + : normalizeInputMessages( + getObjectProperty(result, "input") ?? fallbackInput, + ); +} + +function normalizeInputMessages(input: unknown): NormalizedMessage[] { + if (Array.isArray(input)) { + const messages = input + .map((item) => normalizeModelMessage(item)) + .filter((message): message is NormalizedMessage => Boolean(message)); + + return messages.length > 0 + ? messages + : [ + { + role: "user", + content: normalizeContent(input), + }, + ]; + } + + return [ + { + role: "user", + content: normalizeContent(input), + }, + ]; +} + +function normalizeRunItems( + items: unknown[], + runtimeToolCalls: ToolCallRecord[], +): NormalizedMessage[] { + const messages: NormalizedMessage[] = []; + const outputItemsByCallId = new Map(); + const runtimeCallsById = new Map( + runtimeToolCalls + .filter((call): call is ToolCallRecord & { id: string } => + Boolean(call.id), + ) + .map((call) => [call.id, call]), + ); + + for (const item of items) { + const rawItem = getRunItemRawItem(item); + const callId = resolveRawToolCallId(rawItem); + if (callId && isToolCallOutputItem(item, rawItem)) { + outputItemsByCallId.set(callId, item); + } + } + + for (const item of items) { + const rawItem = getRunItemRawItem(item); + + if (isAssistantMessageItem(item, rawItem)) { + messages.push({ + role: "assistant", + content: normalizeMessageContent(rawItem, item), + metadata: normalizeRunItemMetadata(item, rawItem), + }); + continue; + } + + if (isToolCallItem(item, rawItem)) { + const callId = resolveRawToolCallId(rawItem); + const runtimeCall = callId ? runtimeCallsById.get(callId) : undefined; + const call = normalizeToolCallItem( + item, + rawItem, + outputItemsByCallId.get(callId ?? 
""), + runtimeCall, + ); + messages.push({ + role: "assistant", + toolCalls: [call], + metadata: normalizeRunItemMetadata(item, rawItem), + }); + continue; + } + + if (isToolCallOutputItem(item, rawItem)) { + messages.push(normalizeToolResultMessage(item, rawItem)); + continue; + } + + const metadata = normalizeRunItemMetadata(item, rawItem); + if (metadata) { + messages.push({ + role: "assistant", + metadata, + }); + } + } + + return messages; +} + +function appendUnmatchedRuntimeToolCalls( + messages: NormalizedMessage[], + runtimeToolCalls: ToolCallRecord[], +) { + const seenIds = new Set( + messages.flatMap((message) => + (message.toolCalls ?? []) + .map((call) => call.id) + .filter((id): id is string => Boolean(id)), + ), + ); + const unmatched = runtimeToolCalls.filter( + (call) => !call.id || !seenIds.has(call.id), + ); + + for (const call of unmatched) { + messages.push({ + role: "assistant", + toolCalls: [call], + }); + + if (call.result !== undefined || call.error) { + messages.push({ + role: "tool", + ...(call.result !== undefined + ? { content: call.result } + : call.error && call.error.message.length > 0 + ? { content: call.error.message } + : {}), + metadata: normalizeMetadata({ + name: call.name, + toolCallId: call.id, + isError: Boolean(call.error), + }), + }); + } + } +} + +function normalizeModelMessage(item: unknown): NormalizedMessage | undefined { + if (!item || typeof item !== "object") { + return undefined; + } + + const rawItem = getRunItemRawItem(item); + const role = stringProperty(rawItem, "role"); + if ( + role !== "system" && + role !== "user" && + role !== "assistant" && + role !== "tool" + ) { + return undefined; + } + + const content = normalizeMessageContent(rawItem, item); + return { + role, + ...(content !== undefined ? 
{ content } : {}), + metadata: normalizeRunItemMetadata(item, rawItem), + }; +} + +function normalizeToolCallItem( + item: unknown, + rawItem: unknown, + outputItem: unknown, + runtimeCall: ToolCallRecord | undefined, +): ToolCallRecord { + const rawOutputItem = getRunItemRawItem(outputItem); + const output = + getObjectProperty(outputItem, "output") ?? + getObjectProperty(rawOutputItem, "output"); + const outputStatus = stringProperty(rawOutputItem, "status"); + const outputError = + outputStatus === "failed" ? normalizeToolOutputError(output) : undefined; + const normalizedResult = + output !== undefined ? normalizeToolResult(output) : undefined; + const call = { + id: resolveRawToolCallId(rawItem), + name: resolveRawToolName(rawItem), + arguments: normalizeArguments(getObjectProperty(rawItem, "arguments")), + ...(outputError + ? { error: outputError } + : normalizedResult !== undefined + ? { result: normalizedResult } + : {}), + metadata: normalizeMetadata({ + status: getObjectProperty(rawItem, "status"), + outputStatus, + namespace: getObjectProperty(rawItem, "namespace"), + providerData: getObjectProperty(rawItem, "providerData"), + itemType: getObjectProperty(item, "type"), + rawType: getObjectProperty(rawItem, "type"), + }), + } satisfies ToolCallRecord; + + return mergeToolCalls(call, runtimeCall); +} + +function normalizeToolResultMessage( + item: unknown, + rawItem: unknown, +): NormalizedMessage { + const output = + getObjectProperty(item, "output") ?? getObjectProperty(rawItem, "output"); + const status = stringProperty(rawItem, "status"); + const isError = status === "failed"; + + return { + role: "tool", + ...(output !== undefined ? 
{ content: normalizeContent(output) } : {}), + metadata: normalizeMetadata({ + name: resolveRawToolName(rawItem), + toolCallId: resolveRawToolCallId(rawItem), + isError, + status, + namespace: getObjectProperty(rawItem, "namespace"), + providerData: getObjectProperty(rawItem, "providerData"), + itemType: getObjectProperty(item, "type"), + rawType: getObjectProperty(rawItem, "type"), + }), + }; +} + +function mergeToolCalls( + call: ToolCallRecord, + runtimeCall: ToolCallRecord | undefined, +): ToolCallRecord { + if (!runtimeCall) { + return call; + } + + const error = runtimeCall.error ?? call.error; + const hasRuntimeResult = hasOwnObjectProperty(runtimeCall, "result"); + const hasCallResult = hasOwnObjectProperty(call, "result"); + const result = hasRuntimeResult ? runtimeCall.result : call.result; + + const merged = { + ...runtimeCall, + ...call, + id: call.id ?? runtimeCall.id, + name: call.name ?? runtimeCall.name, + arguments: call.arguments ?? runtimeCall.arguments, + metadata: normalizeMetadata({ + ...(runtimeCall.metadata ?? {}), + ...(call.metadata ?? {}), + }), + }; + + if (error) { + const { result: _result, ...withoutResult } = merged; + return { + ...withoutResult, + error, + }; + } + + const { error: _error, result: _result, ...withoutOutcome } = merged; + if (hasRuntimeResult || hasCallResult) { + return { + ...withoutOutcome, + result, + }; + } + + return withoutOutcome; +} + +function normalizeMessageContent( + rawItem: unknown, + item: unknown, +): JsonValue | undefined { + const contentAccessor = getObjectProperty(item, "content"); + if (typeof contentAccessor === "string" && contentAccessor.length > 0) { + return contentAccessor; + } + + const content = getObjectProperty(rawItem, "content"); + const text = extractText(content); + if (text) { + return text; + } + + return content === undefined ? 
undefined : normalizeContent(content); +} + +function resolveAssistantTextFromItems(items: unknown[]) { + const texts: string[] = []; + + for (const item of items) { + const rawItem = getRunItemRawItem(item); + if (!isAssistantMessageItem(item, rawItem)) { + continue; + } + + const text = extractText(getObjectProperty(rawItem, "content")); + if (text) { + texts.push(text); + } + } + + return texts.join("\n\n"); +} + +function isAssistantMessageItem(item: unknown, rawItem: unknown) { + return ( + getObjectProperty(item, "type") === "message_output_item" || + stringProperty(rawItem, "role") === "assistant" + ); +} + +function isToolCallItem(item: unknown, rawItem: unknown) { + const itemType = getObjectProperty(item, "type"); + const rawType = getObjectProperty(rawItem, "type"); + + return ( + itemType === "tool_call_item" || + rawType === "function_call" || + rawType === "hosted_tool_call" || + rawType === "tool_search_call" || + rawType === "shell_call" || + rawType === "computer_call" || + rawType === "apply_patch_call" + ); +} + +function isToolCallOutputItem(item: unknown, rawItem: unknown) { + const itemType = getObjectProperty(item, "type"); + const rawType = getObjectProperty(rawItem, "type"); + + return ( + itemType === "tool_call_output_item" || + rawType === "function_call_result" || + rawType === "tool_search_output" || + rawType === "shell_call_output" || + rawType === "computer_call_result" || + rawType === "apply_patch_call_output" + ); +} + +function getRunItemRawItem(item: unknown) { + return getObjectProperty(item, "rawItem") ?? 
item; +} + +function normalizeRunItemMetadata(item: unknown, rawItem: unknown) { + return normalizeMetadata({ + id: getObjectProperty(rawItem, "id"), + status: getObjectProperty(rawItem, "status"), + providerData: getObjectProperty(rawItem, "providerData"), + agent: normalizeAgentMetadata(getObjectProperty(item, "agent")), + itemType: getObjectProperty(item, "type"), + rawType: getObjectProperty(rawItem, "type"), + }); +} + +function resolveRawToolCallId(rawItem: unknown) { + return ( + stringProperty(rawItem, "callId") ?? + stringProperty(rawItem, "call_id") ?? + stringProperty(rawItem, "id") + ); +} + +function resolveRawToolName(rawItem: unknown) { + const rawType = stringProperty(rawItem, "type"); + if (rawType === "tool_search_call" || rawType === "tool_search_output") { + return "tool_search"; + } + + return ( + stringProperty(rawItem, "name") ?? + stringProperty(rawItem, "toolName") ?? + stringProperty(rawItem, "namespace") ?? + rawType ?? + "unknown" + ); +} + +function countToolCallsFromResult(result: unknown): number { + const newItems = arrayProperty(result, "newItems"); + const items = + newItems && newItems.length > 0 + ? newItems + : (arrayProperty(result, "output") ?? []); + const seenCallIds = new Set(); + + return items.reduce((count, item) => { + const rawItem = getRunItemRawItem(item); + if (!isToolCallItem(item, rawItem)) { + return count; + } + + const callId = resolveRawToolCallId(rawItem); + if (callId) { + if (seenCallIds.has(callId)) { + return count; + } + + seenCallIds.add(callId); + } + + return count + 1; + }, 0); +} + +function normalizeArguments( + value: unknown, +): Record | undefined { + const parsed = parseMaybeJson(value); + if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { + return parsed === undefined + ? 
undefined + : { input: normalizeContent(parsed) }; + } + + return normalizeRecord(parsed as Record); +} + +function normalizeReplayToolInput(value: unknown): JsonValue { + const parsed = parseMaybeJson(value); + return toReplayJsonValue(parsed, "OpenAI Agents tool input"); +} + +function normalizeToolResult(value: unknown): JsonValue | undefined { + const normalized = toJsonValue(value); + if (normalized !== undefined) { + return normalized; + } + + return value === undefined ? undefined : String(value); +} + +function normalizeToolOutputError( + output: unknown, +): NonNullable { + return { + message: resolveToolOutputErrorMessage(output), + }; +} + +function resolveToolOutputErrorMessage(output: unknown) { + if (typeof output === "string") { + return output.length > 0 ? output : "Tool call failed"; + } + + const message = + stringProperty(output, "message") ?? + stringProperty(output, "error") ?? + stringProperty(output, "text") ?? + extractText(output); + if (message && message.length > 0) { + return message; + } + + const normalized = toJsonValue(output); + return normalized === undefined + ? "Tool call failed" + : JSON.stringify(normalized); +} + +function parseMaybeJson(value: unknown) { + if (typeof value !== "string") { + return value; + } + + try { + return JSON.parse(value) as unknown; + } catch { + return value; + } +} + +function normalizeError(error: unknown): NonNullable { + const serialized = serializeError(error); + const { message, type, ...details } = serialized; + + return { + ...details, + message: typeof message === "string" ? message : String(message), + ...(typeof type === "string" ? { type } : {}), + }; +} + +function toReplayJsonValue(value: unknown, label: string): JsonValue { + const normalized = toJsonValue(value); + if (normalized === undefined) { + throw new Error( + `Tool replay only supports JSON-serializable values. 
${label} could not be normalized.`, + ); + } + + return normalized; +} + +function extractText(value: unknown): string | undefined { + if (typeof value === "string") { + return value; + } + + if (!Array.isArray(value)) { + return undefined; + } + + const parts = value + .map((entry) => { + if (!entry || typeof entry !== "object") { + return undefined; + } + + return ( + stringProperty(entry, "text") ?? + stringProperty(entry, "refusal") ?? + stringProperty(entry, "transcript") + ); + }) + .filter((entry): entry is string => Boolean(entry)); + + return parts.length > 0 ? parts.join("") : undefined; +} + +function resolveProvider(result: unknown) { + return ( + stringProperty(result, "provider") ?? + stringProperty(getObjectProperty(result, "model"), "provider") ?? + stringProperty(getObjectProperty(result, "lastAgent"), "provider") ?? + stringProperty( + getObjectProperty(getObjectProperty(result, "lastAgent"), "model"), + "provider", + ) + ); +} + +function resolveModel(result: unknown) { + const directModel = getObjectProperty(result, "model"); + const lastAgentModel = getObjectProperty( + getObjectProperty(result, "lastAgent"), + "model", + ); + + return ( + stringProperty(result, "model") ?? + stringProperty(directModel, "modelId") ?? + stringProperty(directModel, "id") ?? + (typeof lastAgentModel === "string" ? lastAgentModel : undefined) ?? + stringProperty(lastAgentModel, "modelId") ?? + stringProperty(lastAgentModel, "id") + ); +} + +function normalizeAgentMetadata(agent: unknown) { + if (!agent || typeof agent !== "object") { + return undefined; + } + + return normalizeMetadata({ + name: getObjectProperty(agent, "name"), + model: resolveModel({ lastAgent: agent }), + }); +} + +function getObjectProperty(value: unknown, key: string): unknown { + return value && typeof value === "object" + ? 
(value as Record)[key] + : undefined; +} + +function hasOwnObjectProperty(value: object, key: keyof ToolCallRecord) { + return Object.prototype.hasOwnProperty.call(value, key); +} + +function stringProperty(value: unknown, key: string): string | undefined { + const property = getObjectProperty(value, key); + return typeof property === "string" ? property : undefined; +} + +function numberProperty(value: unknown, key: string): number | undefined { + const property = getObjectProperty(value, key); + return typeof property === "number" ? property : undefined; +} + +function arrayProperty(value: unknown, key: string): unknown[] | undefined { + const property = getObjectProperty(value, key); + return Array.isArray(property) ? property : undefined; +} + +function findStringAtPath(value: unknown, path: string[]) { + let current = value; + for (const key of path) { + current = getObjectProperty(current, key); + } + + return typeof current === "string" ? current : undefined; +} + +function stringifyJson(value: JsonValue | undefined) { + return value === undefined ? 
undefined : JSON.stringify(value); +} + +function isPromiseLike(value: unknown): value is Promise { + return Boolean( + value && typeof (value as { then?: unknown }).then === "function", + ); +} diff --git a/packages/harness-openai-agents/tsconfig.json b/packages/harness-openai-agents/tsconfig.json new file mode 100644 index 0000000..9e25e6e --- /dev/null +++ b/packages/harness-openai-agents/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../../tsconfig.base.json", + "include": ["src/**/*.ts"] +} diff --git a/packages/harness-openai-agents/tsup.config.ts b/packages/harness-openai-agents/tsup.config.ts new file mode 100644 index 0000000..3d247e5 --- /dev/null +++ b/packages/harness-openai-agents/tsup.config.ts @@ -0,0 +1,11 @@ +import { defineConfig } from "tsup"; + +export default defineConfig({ + entry: ["src/**/*.ts", "!src/**/*.test.ts", "!src/**/*.test.*.ts"], + format: ["cjs", "esm"], + dts: true, + splitting: false, + sourcemap: true, + clean: true, + external: ["@openai/agents", "vitest-evals"], +}); diff --git a/packages/harness-pi-ai/README.md b/packages/harness-pi-ai/README.md index 622fe6e..3f1e24b 100644 --- a/packages/harness-pi-ai/README.md +++ b/packages/harness-pi-ai/README.md @@ -11,12 +11,31 @@ npm install -D vitest-evals @vitest-evals/harness-pi-ai ## Usage ```ts +import { expect } from "vitest"; import { piAiHarness } from "@vitest-evals/harness-pi-ai"; +import { describeEval, toolCalls } from "vitest-evals"; const harness = piAiHarness({ createAgent: () => createRefundAgent(), + toolReplay: { + lookupInvoice: true, + }, prompt: sharedJudgePrompt, }); + +describeEval("refund agent", { harness }, (it) => { + it("approves a refundable invoice", async ({ run }) => { + const result = await run("Refund invoice inv_123"); + + expect(result.output).toMatchObject({ + status: "approved", + }); + expect(toolCalls(result.session).map((call) => call.name)).toEqual([ + "lookupInvoice", + "createRefund", + ]); + }); +}); ``` `prompt` gives rubric or factuality 
judges the same provider/model setup @@ -74,7 +93,7 @@ The adapter provides: - a required prompt seam for LLM-backed judges - normalized session capture from emitted events and wrapped tool calls - usage/output inference for common `pi-ai`-style result objects -- opt-in tool replay/recording when the tool definition sets `replay: true` +- opt-in tool replay/recording from harness-level `toolReplay` See the workspace demo in `apps/demo-pi`. @@ -95,17 +114,24 @@ export default defineConfig({ }); ``` -Then opt individual tools into recording/replay: +Then opt individual tools into recording/replay from the harness: ```ts -const tools = { - lookupInvoice: { - replay: true, - execute: async ({ invoiceId }) => fetchInvoice(invoiceId), +const harness = piAiHarness({ + createAgent: () => createRefundAgent(), + toolReplay: { + lookupInvoice: true, }, -}; + prompt: sharedJudgePrompt, +}); ``` +When an agent exposes both a native Pi tool and a runtime tool with the same +name, a native tool call records in its own cassette namespace. Runtime calls of +that same name are treated as implementation details while the native tool is +executing, so delegated runtime calls do not create duplicate trace entries or +overwrite the native recording. 
+ Supported modes: - `off`: never read or write recordings diff --git a/packages/harness-pi-ai/src/index.test.ts b/packages/harness-pi-ai/src/index.test.ts index 70ee5c7..c9e0dc1 100644 --- a/packages/harness-pi-ai/src/index.test.ts +++ b/packages/harness-pi-ai/src/index.test.ts @@ -494,6 +494,151 @@ describeEval( }, ); +test("lets native Pi tools own replay when they delegate to a runtime tool of the same name", async () => { + replayDir = mkdtempSync(join(process.cwd(), ".tmp-pi-overlap-replay-")); + vi.stubEnv("VITEST_EVALS_REPLAY_MODE", "auto"); + vi.stubEnv("VITEST_EVALS_REPLAY_DIR", replayDir); + + const lookupInvoice = vi.fn(async ({ invoiceId }: { invoiceId: string }) => ({ + invoiceId, + refundable: true, + })); + let activeRuntime: DemoRuntime | undefined; + const nativeExecute = vi.fn( + async (_toolCallId: string, args: { invoiceId: string }) => { + if (!activeRuntime) { + throw new Error("Expected runtime before native tool execution"); + } + + const invoice = await activeRuntime.tools.lookupInvoice({ + invoiceId: args.invoiceId, + }); + + return { + content: [{ type: "text", text: JSON.stringify(invoice) }], + details: invoice, + }; + }, + ); + + const replayHarness = piAiHarness({ + prompt: judgePrompt, + toolReplay: { + lookupInvoice: true, + }, + createAgent: () => { + const nativeTools = [ + { + name: "lookupInvoice", + execute: nativeExecute, + }, + ]; + + return { + toolset: { + lookupInvoice: { + execute: lookupInvoice, + }, + } satisfies PiAiToolset, + agent: { + state: { + tools: nativeTools, + }, + }, + async run(_input: string, runtime: DemoRuntime) { + activeRuntime = runtime; + const toolResult = await nativeTools[0].execute("lookupInvoice", { + invoiceId: "inv_123", + }); + + runtime.events.assistant(toolResult.content[0].text); + + return { + decision: toolResult.details.refundable + ? 
{ status: "approved" as const } + : { status: "denied" as const, reason: "not refundable" }, + }; + }, + }; + }, + }); + + const firstRun = await replayHarness.run("Refund invoice inv_123", { + metadata: {}, + task: { + meta: {}, + }, + artifacts: {}, + setArtifact: vi.fn(), + }); + + expect(nativeExecute).toHaveBeenCalledTimes(1); + expect(lookupInvoice).toHaveBeenCalledTimes(1); + const firstCalls = toolCalls(firstRun.session); + expect(firstCalls).toHaveLength(1); + expect(firstCalls[0]).toMatchObject({ + name: "lookupInvoice", + result: { + invoiceId: "inv_123", + refundable: true, + }, + metadata: { + replay: { + status: "recorded", + }, + }, + }); + const recordingPath = ( + firstCalls[0].metadata?.replay as { recordingPath: string } + ).recordingPath; + expect(recordingPath).toContain("lookupInvoice.native"); + const recording = JSON.parse( + readFileSync(join(process.cwd(), recordingPath), "utf8"), + ) as { + output: { + __vitestEvals: { kind: string }; + normalizedResult: { invoiceId: string; refundable: boolean }; + }; + }; + expect(recording.output).toMatchObject({ + __vitestEvals: { + kind: "pi-ai-native-tool-result", + }, + normalizedResult: { + invoiceId: "inv_123", + refundable: true, + }, + }); + + nativeExecute.mockImplementation(async () => { + throw new Error("native tool should not execute after recording exists"); + }); + lookupInvoice.mockImplementation(async () => { + throw new Error("runtime tool should not execute after recording exists"); + }); + + const secondRun = await replayHarness.run("Refund invoice inv_123", { + metadata: {}, + task: { + meta: {}, + }, + artifacts: {}, + setArtifact: vi.fn(), + }); + + expect(nativeExecute).toHaveBeenCalledTimes(1); + expect(lookupInvoice).toHaveBeenCalledTimes(1); + expect(toolCalls(secondRun.session)).toHaveLength(1); + expect(toolCalls(secondRun.session)[0]).toMatchObject({ + name: "lookupInvoice", + metadata: { + replay: { + status: "replayed", + }, + }, + }); +}); + describeEval( "pi-ai harness 
infers runtime toolsets from existing agents", { @@ -790,11 +935,13 @@ test("replays native agent tools without breaking the agent-facing result", asyn const replayHarness = piAiHarness({ prompt: judgePrompt, + toolReplay: { + lookupInvoice: true, + }, createAgent: () => { const nativeTools = [ { name: "lookupInvoice", - replay: true, execute, }, ]; @@ -932,6 +1079,76 @@ test("replays native agent tools without breaking the agent-facing result", asyn ]); }); +test("does not opt native agent tools into replay from tool objects", async () => { + replayDir = mkdtempSync(join(process.cwd(), ".tmp-pi-native-replay-")); + vi.stubEnv("VITEST_EVALS_REPLAY_MODE", "auto"); + vi.stubEnv("VITEST_EVALS_REPLAY_DIR", replayDir); + + const execute = vi.fn( + async (_toolCallId: string, args: { invoiceId: string }) => ({ + content: [ + { + type: "text", + text: JSON.stringify({ + invoiceId: args.invoiceId, + refundable: true, + }), + }, + ], + details: { + invoiceId: args.invoiceId, + refundable: true, + }, + }), + ); + + const harness = piAiHarness({ + prompt: judgePrompt, + createAgent: () => { + const nativeTools = [ + { + name: "lookupInvoice", + replay: true, + execute, + }, + ]; + + return { + agent: { + state: { + tools: nativeTools, + }, + }, + async run(_input: string, runtime: { events: DemoRuntime["events"] }) { + const toolResult = await nativeTools[0].execute("lookupInvoice", { + invoiceId: "inv_123", + }); + + runtime.events.assistant(toolResult.content[0].text); + + return { + decision: { + status: "approved" as const, + }, + }; + }, + }; + }, + }); + + const run = await harness.run("Refund invoice inv_123", { + metadata: {}, + task: { + meta: {}, + }, + artifacts: {}, + setArtifact: vi.fn(), + }); + + expect(execute).toHaveBeenCalledTimes(1); + expect(toolCalls(run.session)[0].metadata?.replay).toBeUndefined(); +}); + test("records and replays opt-in tools in auto mode", async () => { replayDir = mkdtempSync(join(process.cwd(), ".tmp-pi-replay-")); 
vi.stubEnv("VITEST_EVALS_REPLAY_MODE", "auto"); @@ -944,10 +1161,12 @@ test("records and replays opt-in tools in auto mode", async () => { const replayHarness = piAiHarness({ prompt: judgePrompt, + toolReplay: { + lookupInvoice: true, + }, createAgent: () => ({ id: "refund-agent" }), tools: { lookupInvoice: { - replay: true, execute, }, } satisfies PiAiToolset, @@ -1016,6 +1235,51 @@ test("records and replays opt-in tools in auto mode", async () => { }); }); +test("does not opt runtime tools into replay from tool definitions", async () => { + replayDir = mkdtempSync(join(process.cwd(), ".tmp-pi-replay-")); + vi.stubEnv("VITEST_EVALS_REPLAY_MODE", "auto"); + vi.stubEnv("VITEST_EVALS_REPLAY_DIR", replayDir); + + const execute = vi.fn(async ({ invoiceId }: { invoiceId: string }) => ({ + invoiceId, + refundable: true, + })); + + const harness = piAiHarness<{ id: string }, string, DemoMetadata>({ + prompt: judgePrompt, + createAgent: () => ({ id: "refund-agent" }), + tools: { + lookupInvoice: { + replay: true, + execute, + }, + } as unknown as PiAiToolset, + run: async ({ runtime }) => { + await runtime.tools.lookupInvoice({ + invoiceId: "inv_123", + }); + + return { + decision: { + status: "approved", + }, + }; + }, + }); + + const run = await harness.run("Refund invoice inv_123", { + metadata: {}, + task: { + meta: {}, + }, + artifacts: {}, + setArtifact: vi.fn(), + }); + + expect(execute).toHaveBeenCalledTimes(1); + expect(toolCalls(run.session)[0].metadata?.replay).toBeUndefined(); +}); + test("errors when strict mode is missing a recording", async () => { replayDir = mkdtempSync(join(process.cwd(), ".tmp-pi-replay-")); vi.stubEnv("VITEST_EVALS_REPLAY_MODE", "strict"); @@ -1028,10 +1292,12 @@ test("errors when strict mode is missing a recording", async () => { const replayHarness = piAiHarness({ prompt: judgePrompt, + toolReplay: { + lookupInvoice: true, + }, createAgent: () => ({ id: "refund-agent" }), tools: { lookupInvoice: { - replay: true, execute, }, } 
satisfies PiAiToolset, diff --git a/packages/harness-pi-ai/src/index.ts b/packages/harness-pi-ai/src/index.ts index 4e4c374..d93d738 100644 --- a/packages/harness-pi-ai/src/index.ts +++ b/packages/harness-pi-ai/src/index.ts @@ -49,14 +49,6 @@ type PiAgentToolLike< TMetadata extends HarnessMetadata = HarnessMetadata, > = { name: string; - replay?: - | boolean - | PiAiToolReplayConfig< - Record, - JsonValue, - TInput, - TMetadata - >; execute: (toolCallId: string, args: Record) => unknown; }; @@ -105,6 +97,23 @@ export type PiAiToolReplayConfig< TMetadata extends HarnessMetadata = HarnessMetadata, > = ToolReplayConfig>; +export type PiAiToolReplayPolicy< + TInput = string, + TMetadata extends HarnessMetadata = HarnessMetadata, +> = + | boolean + | PiAiToolReplayConfig< + Record, + JsonValue, + TInput, + TMetadata + >; + +export type PiAiToolReplayPolicies< + TInput = string, + TMetadata extends HarnessMetadata = HarnessMetadata, +> = Record>; + export interface PiAiToolDefinition< TArgs extends Record = Record, TResult extends JsonValue = JsonValue, @@ -112,7 +121,6 @@ export interface PiAiToolDefinition< TMetadata extends HarnessMetadata = HarnessMetadata, > { description?: string; - replay?: boolean | PiAiToolReplayConfig; execute: ( args: TArgs, context: PiAiToolContext, @@ -190,6 +198,7 @@ interface PiAiHarnessBaseOptions< > { agent?: TAgent; createAgent?: () => MaybePromise; + toolReplay?: PiAiToolReplayPolicies; normalize?: PiAiHarnessNormalizeOptions< TAgent, TInput, @@ -309,6 +318,10 @@ type InferredToolSurfaces = { nativeToolsets?: Array[]>; }; +type PiToolExecutionState = { + activeNativeToolNames: Map; +}; + /** Adapts a Pi agent runtime into a normalized vitest-evals harness. 
*/ export function piAiHarness< TAgent, @@ -403,10 +416,13 @@ async function executePiHarnessRun< runtimeTools: TTools | undefined, nativeToolsets?: Array[]>, ): Promise { + const executionState = createPiToolExecutionState(); const runtime = createRuntime({ input, context, tools: runtimeTools, + toolReplay: options.toolReplay, + executionState, messages, }); @@ -419,6 +435,8 @@ async function executePiHarnessRun< context, messages, toolCalls: runtime.toolCalls, + toolReplay: options.toolReplay, + executionState, }, () => runAgent(options, { @@ -725,6 +743,8 @@ async function withInstrumentedAgentTools< context: HarnessContext; messages: NormalizedMessage[]; toolCalls: ToolCallRecord[]; + toolReplay: PiAiToolReplayPolicies | undefined; + executionState: PiToolExecutionState; }, callback: () => Promise, ) { @@ -756,13 +776,17 @@ async function withInstrumentedAgentTools< signal: args.context.signal, setArtifact: args.context.setArtifact, } satisfies PiAiToolContext; + const leaveNativeTool = enterNativeToolExecution( + args.executionState, + tool.name, + ); try { const execution = await executeNativeToolWithReplay({ toolName: tool.name, toolCallId, execute: originalExecute, - replay: tool.replay, + replay: args.toolReplay?.[tool.name], args: rawArgs, context: toolContext, }); @@ -806,6 +830,8 @@ async function withInstrumentedAgentTools< toolCalls: [call], }); throw error; + } finally { + leaveNativeTool(); } }; instrumentedExecute[ORIGINAL_NATIVE_EXECUTE] = originalExecute; @@ -912,6 +938,39 @@ function getNativeToolExecuteOrigin( return nativeExecute[ORIGINAL_NATIVE_EXECUTE] ?? nativeExecute; } +function createPiToolExecutionState(): PiToolExecutionState { + return { + activeNativeToolNames: new Map(), + }; +} + +function enterNativeToolExecution( + state: PiToolExecutionState, + toolName: string, +) { + state.activeNativeToolNames.set( + toolName, + (state.activeNativeToolNames.get(toolName) ?? 
0) + 1, + ); + + return () => { + const nextCount = (state.activeNativeToolNames.get(toolName) ?? 1) - 1; + if (nextCount <= 0) { + state.activeNativeToolNames.delete(toolName); + return; + } + + state.activeNativeToolNames.set(toolName, nextCount); + }; +} + +function hasActiveNativeToolExecution( + state: PiToolExecutionState, + toolName: string, +) { + return (state.activeNativeToolNames.get(toolName) ?? 0) > 0; +} + async function executeNativeToolWithReplay< TInput, TMetadata extends HarnessMetadata, @@ -926,7 +985,7 @@ async function executeNativeToolWithReplay< toolName: string; toolCallId: string; execute: PiAgentToolLike["execute"]; - replay: PiAgentToolLike["replay"]; + replay: PiAiToolReplayPolicy | undefined; args: Record; context: PiAiToolContext; }) { @@ -934,7 +993,7 @@ async function executeNativeToolWithReplay< let liveResult: unknown; const execution = await executeWithReplay({ - toolName, + toolName: createNativeReplayToolName(toolName), args, context, execute: async (toolArgs) => { @@ -959,6 +1018,10 @@ async function executeNativeToolWithReplay< }; } +function createNativeReplayToolName(toolName: string) { + return `${toolName}.native`; +} + function createRuntime< TInput, TMetadata extends HarnessMetadata, @@ -967,11 +1030,15 @@ function createRuntime< input, context, tools, + toolReplay, + executionState, messages, }: { input: TInput; context: HarnessContext; tools: TTools | undefined; + toolReplay: PiAiToolReplayPolicies | undefined; + executionState: PiToolExecutionState; messages: NormalizedMessage[]; }): PiAiRuntime & { toolCalls: ToolCallRecord[]; @@ -1019,6 +1086,10 @@ function createRuntime< toolName, async (args: Record) => { const startedAt = new Date(); + const isNativeImplementationCall = hasActiveNativeToolExecution( + executionState, + toolName, + ); const toolContext = { input, metadata: context.metadata, @@ -1030,10 +1101,18 @@ function createRuntime< const execution = await executeToolWithReplay({ toolName, tool, + replay: 
isNativeImplementationCall + ? undefined + : toolReplay?.[toolName], args, context: toolContext, }); const finishedAt = new Date(); + + if (isNativeImplementationCall) { + return execution.result; + } + const call = { name: toolName, arguments: args, @@ -1058,6 +1137,10 @@ function createRuntime< return execution.result; } catch (error) { const finishedAt = new Date(); + if (isNativeImplementationCall) { + throw error; + } + const call = { name: toolName, arguments: args, @@ -1319,19 +1402,26 @@ async function executeToolWithReplay< >({ toolName, tool, + replay, args, context, }: { toolName: string; tool: PiAiToolDefinition; + replay: PiAiToolReplayPolicy | undefined; args: TArgs; context: PiAiToolContext; }) { - return executeWithReplay({ + return executeWithReplay< + Record, + JsonValue, + PiAiToolContext + >({ toolName, args, context, - execute: tool.execute, - replay: tool.replay, + execute: (toolArgs, toolContext) => + tool.execute(toolArgs as TArgs, toolContext), + replay, }); } diff --git a/packages/vitest-evals/README.md b/packages/vitest-evals/README.md index ab06535..7c5ef7d 100644 --- a/packages/vitest-evals/README.md +++ b/packages/vitest-evals/README.md @@ -14,6 +14,8 @@ Install a first-party harness package for the runtime you want to test: npm install -D @vitest-evals/harness-pi-ai # or npm install -D @vitest-evals/harness-ai-sdk +# or +npm install -D @vitest-evals/harness-openai-agents ``` ## Core Model @@ -146,10 +148,21 @@ The harness owns normalization, diagnostics, tool capture, replay plumbing, and reporter-facing artifacts. Your app just needs one runtime seam where those wrapped pieces can be injected. +Replay opt-in belongs on the harness, via `toolReplay`, while replay mode and +recording directory can live in Vitest environment config. Tool definitions +should stay free of VCR policy. + For the Pi-specific harness, output/session/usage normalization should usually be inferred automatically. 
Treat low-level normalization callbacks as an escape hatch, not part of the primary authoring path. +For OpenAI Agents SDK apps, use +`@vitest-evals/harness-openai-agents` with an existing `Agent` or +`createAgent()` factory and a `Runner` / `createRunner()` callback. The harness +calls `Runner.run(agent, input, options)` by default and exposes the same +normalization and replay hooks when the app needs a custom entrypoint or +structured domain output mapping. + ## Custom App Harnesses First-party harness packages are conveniences, not the only supported path. If diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 031cd0d..32da930 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -75,14 +75,29 @@ importers: specifier: ^4.3.6 version: 4.3.6 + apps/demo-openai-agents: + dependencies: + '@openai/agents': + specifier: ^0.8.5 + version: 0.8.5(ws@8.20.0)(zod@4.3.6) + '@vitest-evals/harness-openai-agents': + specifier: workspace:* + version: link:../../packages/harness-openai-agents + vitest-evals: + specifier: workspace:* + version: link:../../packages/vitest-evals + zod: + specifier: ^4.3.6 + version: 4.3.6 + apps/demo-pi: dependencies: '@mariozechner/pi-agent-core': specifier: 0.67.68 - version: 0.67.68(ws@8.20.0)(zod@4.3.6) + version: 0.67.68(@modelcontextprotocol/sdk@1.29.0(zod@4.3.6))(ws@8.20.0)(zod@4.3.6) '@mariozechner/pi-ai': specifier: 0.67.68 - version: 0.67.68(ws@8.20.0)(zod@4.3.6) + version: 0.67.68(@modelcontextprotocol/sdk@1.29.0(zod@4.3.6))(ws@8.20.0)(zod@4.3.6) '@vitest-evals/harness-pi-ai': specifier: workspace:* version: link:../../packages/harness-pi-ai @@ -99,14 +114,23 @@ importers: specifier: workspace:* version: link:../vitest-evals + packages/harness-openai-agents: + devDependencies: + '@openai/agents': + specifier: ^0.8.5 + version: 0.8.5(ws@8.20.0)(zod@4.3.6) + vitest-evals: + specifier: workspace:* + version: link:../vitest-evals + packages/harness-pi-ai: devDependencies: '@mariozechner/pi-agent-core': specifier: ^0.67.68 - version: 
0.67.68(ws@8.20.0)(zod@4.3.6) + version: 0.67.68(@modelcontextprotocol/sdk@1.29.0(zod@4.3.6))(ws@8.20.0)(zod@4.3.6) '@mariozechner/pi-ai': specifier: ^0.67.68 - version: 0.67.68(ws@8.20.0)(zod@4.3.6) + version: 0.67.68(@modelcontextprotocol/sdk@1.29.0(zod@4.3.6))(ws@8.20.0)(zod@4.3.6) vitest-evals: specifier: workspace:* version: link:../vitest-evals @@ -351,24 +375,28 @@ packages: engines: {node: '>=14.21.3'} cpu: [arm64] os: [linux] + libc: [musl] '@biomejs/cli-linux-arm64@1.9.4': resolution: {integrity: sha512-fJIW0+LYujdjUgJJuwesP4EjIBl/N/TcOX3IvIHJQNsAqvV2CHIogsmA94BPG6jZATS4Hi+xv4SkBBQSt1N4/g==} engines: {node: '>=14.21.3'} cpu: [arm64] os: [linux] + libc: [glibc] '@biomejs/cli-linux-x64-musl@1.9.4': resolution: {integrity: sha512-gEhi/jSBhZ2m6wjV530Yy8+fNqG8PAinM3oV7CyO+6c3CEh16Eizm21uHVsyVBEB6RIM8JHIl6AGYCv6Q6Q9Tg==} engines: {node: '>=14.21.3'} cpu: [x64] os: [linux] + libc: [musl] '@biomejs/cli-linux-x64@1.9.4': resolution: {integrity: sha512-lRCJv/Vi3Vlwmbd6K+oQ0KhLHMAysN8lXoCI7XeHlxaajk06u7G+UsFSO01NAs5iYuWKmVZjmiOzJ0OJmGsMwg==} engines: {node: '>=14.21.3'} cpu: [x64] os: [linux] + libc: [glibc] '@biomejs/cli-win32-arm64@1.9.4': resolution: {integrity: sha512-tlbhLk+WXZmgwoIKwHIHEBZUwxml7bRJgk0X2sPyNR3S93cdRq6XulAZRQJ17FYGGzWne0fgrXBKpl7l4M87Hg==} @@ -547,6 +575,12 @@ packages: '@modelcontextprotocol/sdk': optional: true + '@hono/node-server@1.19.14': + resolution: {integrity: sha512-GwtvgtXxnWsucXvbQXkRgqksiH2Qed37H9xHZocE5sA3N8O8O8/8FA3uclQXxXVzc9XBZuEOMK7+r02FmSpHtw==} + engines: {node: '>=18.14.1'} + peerDependencies: + hono: ^4 + '@isaacs/cliui@8.0.2': resolution: {integrity: sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==} engines: {node: '>=12'} @@ -582,6 +616,39 @@ packages: '@mistralai/mistralai@2.2.0': resolution: {integrity: sha512-JQUGIXjFWnw/J9LpTSf/ZXwVW3Sh8FBAcfTo5QvAHqkl4CfSiIwnjRJhMoAFcP6ncCe84YPU1ncDGX+p3OXnfg==} + '@modelcontextprotocol/sdk@1.29.0': + resolution: {integrity: 
sha512-zo37mZA9hJWpULgkRpowewez1y6ML5GsXJPY8FI0tBBCd77HEvza4jDqRKOXgHNn867PVGCyTdzqpz0izu5ZjQ==} + engines: {node: '>=18'} + peerDependencies: + '@cfworker/json-schema': ^4.1.1 + zod: ^3.25 || ^4.0 + peerDependenciesMeta: + '@cfworker/json-schema': + optional: true + + '@openai/agents-core@0.8.5': + resolution: {integrity: sha512-qs9mmN+D+UmqEZo3qrvhhIIXIOgSvJPic0v4a+ruq+eYgcQMk3PY8lLcsdQwJit6zf2Wyfv1q2cX5m3jzWZpKw==} + peerDependencies: + zod: ^4.0.0 + peerDependenciesMeta: + zod: + optional: true + + '@openai/agents-openai@0.8.5': + resolution: {integrity: sha512-cGYmyiVy8ecgf2Vch0L/ekeNo3xuZsuWnRsxyv+w9ai9dgxUifdEQ6G3dtsjMLtmXVHRVGoO7mVBr+tKcilntw==} + peerDependencies: + zod: ^4.0.0 + + '@openai/agents-realtime@0.8.5': + resolution: {integrity: sha512-JqKVsR33OvKtTxRp5Ylhw8WfNvJ49ZIhlhMZlSVKqwR2Ks6JuxqFJ0zM9p7JIbTQDSlAZnmnZJv1qlItaildiQ==} + peerDependencies: + zod: ^4.0.0 + + '@openai/agents@0.8.5': + resolution: {integrity: sha512-OFA7XVV1qXE8lzatvQj080KdSArt8utBExFXRfD5B/R7KT0D+AVaKwg6nLoW3Gxb30vRkIUQf+MaW/Wz+gO3Yg==} + peerDependencies: + zod: ^4.0.0 + '@opentelemetry/api@1.9.0': resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==} engines: {node: '>=8.0.0'} @@ -654,56 +721,67 @@ packages: resolution: {integrity: sha512-EtP8aquZ0xQg0ETFcxUbU71MZlHaw9MChwrQzatiE8U/bvi5uv/oChExXC4mWhjiqK7azGJBqU0tt5H123SzVA==} cpu: [arm] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.46.2': resolution: {integrity: sha512-qO7F7U3u1nfxYRPM8HqFtLd+raev2K137dsV08q/LRKRLEc7RsiDWihUnrINdsWQxPR9jqZ8DIIZ1zJJAm5PjQ==} cpu: [arm] os: [linux] + libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.46.2': resolution: {integrity: sha512-3dRaqLfcOXYsfvw5xMrxAk9Lb1f395gkoBYzSFcc/scgRFptRXL9DOaDpMiehf9CO8ZDRJW2z45b6fpU5nwjng==} cpu: [arm64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.46.2': resolution: {integrity: 
sha512-fhHFTutA7SM+IrR6lIfiHskxmpmPTJUXpWIsBXpeEwNgZzZZSg/q4i6FU4J8qOGyJ0TR+wXBwx/L7Ho9z0+uDg==} cpu: [arm64] os: [linux] + libc: [musl] '@rollup/rollup-linux-loongarch64-gnu@4.46.2': resolution: {integrity: sha512-i7wfGFXu8x4+FRqPymzjD+Hyav8l95UIZ773j7J7zRYc3Xsxy2wIn4x+llpunexXe6laaO72iEjeeGyUFmjKeA==} cpu: [loong64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-ppc64-gnu@4.46.2': resolution: {integrity: sha512-B/l0dFcHVUnqcGZWKcWBSV2PF01YUt0Rvlurci5P+neqY/yMKchGU8ullZvIv5e8Y1C6wOn+U03mrDylP5q9Yw==} cpu: [ppc64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.46.2': resolution: {integrity: sha512-32k4ENb5ygtkMwPMucAb8MtV8olkPT03oiTxJbgkJa7lJ7dZMr0GCFJlyvy+K8iq7F/iuOr41ZdUHaOiqyR3iQ==} cpu: [riscv64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-riscv64-musl@4.46.2': resolution: {integrity: sha512-t5B2loThlFEauloaQkZg9gxV05BYeITLvLkWOkRXogP4qHXLkWSbSHKM9S6H1schf/0YGP/qNKtiISlxvfmmZw==} cpu: [riscv64] os: [linux] + libc: [musl] '@rollup/rollup-linux-s390x-gnu@4.46.2': resolution: {integrity: sha512-YKjekwTEKgbB7n17gmODSmJVUIvj8CX7q5442/CK80L8nqOUbMtf8b01QkG3jOqyr1rotrAnW6B/qiHwfcuWQA==} cpu: [s390x] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.46.2': resolution: {integrity: sha512-Jj5a9RUoe5ra+MEyERkDKLwTXVu6s3aACP51nkfnK9wJTraCC8IMe3snOfALkrjTYd2G1ViE1hICj0fZ7ALBPA==} cpu: [x64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-musl@4.46.2': resolution: {integrity: sha512-7kX69DIrBeD7yNp4A5b81izs8BqoZkCIaxQaOpumcJ1S/kmqNFjPhDu1LHeVXv0SexfHQv5cqHsxLOjETuqDuA==} cpu: [x64] os: [linux] + libc: [musl] '@rollup/rollup-win32-arm64-msvc@4.46.2': resolution: {integrity: sha512-wiJWMIpeaak/jsbaq2HMh/rzZxHVW1rU6coyeNNpMwk5isiPjSTx0a4YLSlYDwBH/WBvLz+EtsNqQScZTLJy3g==} @@ -932,6 +1010,9 @@ packages: '@types/retry@0.12.0': resolution: {integrity: sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==} + '@types/ws@8.18.1': + resolution: {integrity: 
sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==} + '@vercel/oidc@3.1.0': resolution: {integrity: sha512-Fw28YZpRnA3cAHHDlkt7xQHiJ0fcL+NRcIqsocZQUSmbzeIKRpwttJjik5ZGanXP+vlA4SbTg+AbA3bP363l+w==} engines: {node: '>= 20'} @@ -974,6 +1055,10 @@ packages: '@vitest/utils@4.1.2': resolution: {integrity: sha512-xw2/TiX82lQHA06cgbqRKFb5lCAy3axQ4H4SoUFhUsg+wztiet+co86IAMDtF6Vm1hc7J6j09oh/rgDn+JdKIQ==} + accepts@2.0.0: + resolution: {integrity: sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==} + engines: {node: '>= 0.6'} + acorn@8.15.0: resolution: {integrity: sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==} engines: {node: '>=0.4.0'} @@ -1052,6 +1137,10 @@ packages: binary-search@1.3.6: resolution: {integrity: sha512-nbE1WxOTTrUWIfsfZ4aHGYu5DOuNkbxGokjV6Z2kxfJK3uaAb8zNK1muzOeipoLHZjInT4Br88BHpzevc681xA==} + body-parser@2.2.2: + resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==} + engines: {node: '>=18'} + bowser@2.14.1: resolution: {integrity: sha512-tzPjzCxygAKWFOJP011oxFHs57HzIhOEracIgAePE4pqB3LikALKnSzUyU4MGs9/iCEUuHlAJTjTc5M+u7YEGg==} @@ -1071,10 +1160,22 @@ packages: peerDependencies: esbuild: '>=0.18' + bytes@3.1.2: + resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==} + engines: {node: '>= 0.8'} + cac@6.7.14: resolution: {integrity: sha512-b6Ilus+c3RrdDk+JhLKUAQfzzgLEPy6wcXqS7f/xe1EETvsDP6GORG7SFuOs6cID5YkqchW/LXZbX5bc8j7ZcQ==} engines: {node: '>=8'} + call-bind-apply-helpers@1.0.2: + resolution: {integrity: sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==} + engines: {node: '>= 0.4'} + + call-bound@1.0.4: + resolution: {integrity: sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==} + engines: {node: 
'>= 0.4'} + chai@6.2.2: resolution: {integrity: sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg==} engines: {node: '>=18'} @@ -1136,9 +1237,29 @@ packages: resolution: {integrity: sha512-5IKcdX0nnYavi6G7TtOhwkYzyjfJlatbjMjuLSfE2kYT5pMDOilZ4OvMhi637CcDICTmz3wARPoyhqyX1Y+XvA==} engines: {node: ^14.18.0 || >=16.10.0} + content-disposition@1.1.0: + resolution: {integrity: sha512-5jRCH9Z/+DRP7rkvY83B+yGIGX96OYdJmzngqnw2SBSxqCFPd0w2km3s5iawpGX8krnwSGmF0FW5Nhr0Hfai3g==} + engines: {node: '>=18'} + + content-type@1.0.5: + resolution: {integrity: sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==} + engines: {node: '>= 0.6'} + convert-source-map@2.0.0: resolution: {integrity: sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==} + cookie-signature@1.2.2: + resolution: {integrity: sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg==} + engines: {node: '>=6.6.0'} + + cookie@0.7.2: + resolution: {integrity: sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==} + engines: {node: '>= 0.6'} + + cors@2.8.6: + resolution: {integrity: sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw==} + engines: {node: '>= 0.10'} + cross-spawn@7.0.6: resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} engines: {node: '>= 8'} @@ -1160,10 +1281,23 @@ packages: supports-color: optional: true + debug@4.4.3: + resolution: {integrity: sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==} + engines: {node: '>=6.0'} + peerDependencies: + supports-color: '*' + peerDependenciesMeta: + supports-color: + optional: true + degenerator@5.0.1: resolution: {integrity: 
sha512-TllpMR/t0M5sqCXfj85i4XaAzxmS5tVA16dqvdkMwGmzI+dXLXnw3J+3Vdv7VKw+ThlTMboK6i9rnZ6Nntj5CQ==} engines: {node: '>= 14'} + depd@2.0.0: + resolution: {integrity: sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==} + engines: {node: '>= 0.8'} + dotenv-cli@8.0.0: resolution: {integrity: sha512-aLqYbK7xKOiTMIRf1lDPbI+Y+Ip/wo5k3eyp6ePysVaSqbyxjyK3dK35BTxG+rmd7djf5q2UPs4noPNH+cj0Qw==} hasBin: true @@ -1176,12 +1310,19 @@ packages: resolution: {integrity: sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==} engines: {node: '>=12'} + dunder-proto@1.0.1: + resolution: {integrity: sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==} + engines: {node: '>= 0.4'} + eastasianwidth@0.2.0: resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==} ecdsa-sig-formatter@1.0.11: resolution: {integrity: sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==} + ee-first@1.1.1: + resolution: {integrity: sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==} + emoji-regex@10.4.0: resolution: {integrity: sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==} @@ -1191,18 +1332,37 @@ packages: emoji-regex@9.2.2: resolution: {integrity: sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==} + encodeurl@2.0.0: + resolution: {integrity: sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==} + engines: {node: '>= 0.8'} + environment@1.1.0: resolution: {integrity: sha512-xUtoPkMggbz0MPyPiIWr1Kp4aeWJjDZ6SMvURhimjdZgsRuDplF5/s9hcgGhyXMhs+6vpnuoiZ2kFiu3FMnS8Q==} engines: {node: '>=18'} + es-define-property@1.0.1: + resolution: {integrity: 
sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==} + engines: {node: '>= 0.4'} + + es-errors@1.3.0: + resolution: {integrity: sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==} + engines: {node: '>= 0.4'} + es-module-lexer@2.0.0: resolution: {integrity: sha512-5POEcUuZybH7IdmGsD8wlf0AI55wMecM9rVBTI/qEAy2c1kTOm3DjFYjrBdI2K3BaJjJYfYFeRtM0t9ssnRuxw==} + es-object-atoms@1.1.1: + resolution: {integrity: sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==} + engines: {node: '>= 0.4'} + esbuild@0.25.8: resolution: {integrity: sha512-vVC0USHGtMi8+R4Kz8rt6JhEWLxsv9Rnu/lGYbPR8u47B+DCBksq9JarW0zOO7bs37hyOK1l2/oqtbciutL5+Q==} engines: {node: '>=18'} hasBin: true + escape-html@1.0.3: + resolution: {integrity: sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow==} + escodegen@2.1.0: resolution: {integrity: sha512-2NlIDTwUWJN0mRPQOdtQBzbUHvdGY2P1VXSyU83Q3xKxM7WHX2Ql8dKq782Q9TgQUNOLEzEYu9bzLNj1q88I5w==} engines: {node: '>=6.0'} @@ -1224,6 +1384,10 @@ packages: resolution: {integrity: sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==} engines: {node: '>=0.10.0'} + etag@1.8.1: + resolution: {integrity: sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==} + engines: {node: '>= 0.6'} + eventemitter3@5.0.1: resolution: {integrity: sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==} @@ -1231,6 +1395,10 @@ packages: resolution: {integrity: sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==} engines: {node: '>=18.0.0'} + eventsource@3.0.7: + resolution: {integrity: sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA==} + engines: {node: '>=18.0.0'} + execa@8.0.1: resolution: {integrity: 
sha512-VyhnebXciFV2DESc+p6B+y0LjSm0krU4OgJN44qFAhBY0TJ+1V61tYD2+wHusZ6F9n5K+vl8k0sTy7PEfV4qpg==} engines: {node: '>=16.17'} @@ -1239,6 +1407,16 @@ packages: resolution: {integrity: sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==} engines: {node: '>=12.0.0'} + express-rate-limit@8.4.1: + resolution: {integrity: sha512-NGVYwQSAyEQgzxX1iCM978PP9AdO/hW93gMcF6ZwQCm+rFvLsBH6w4xcXWTcliS8La5EPRN3p9wzItqBwJrfNw==} + engines: {node: '>= 16'} + peerDependencies: + express: '>= 4.11' + + express@5.2.1: + resolution: {integrity: sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==} + engines: {node: '>= 18'} + extend@3.0.2: resolution: {integrity: sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==} @@ -1283,6 +1461,10 @@ packages: resolution: {integrity: sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==} engines: {node: '>=8'} + finalhandler@2.1.1: + resolution: {integrity: sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA==} + engines: {node: '>= 18.0.0'} + fix-dts-default-cjs-exports@1.0.1: resolution: {integrity: sha512-pVIECanWFC61Hzl2+oOCtoJ3F17kglZC/6N94eRWycFgBH35hHx0Li604ZIzhseh97mf2p0cv7vVrOZGoqhlEg==} @@ -1294,11 +1476,22 @@ packages: resolution: {integrity: sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==} engines: {node: '>=12.20.0'} + forwarded@0.2.0: + resolution: {integrity: sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==} + engines: {node: '>= 0.6'} + + fresh@2.0.0: + resolution: {integrity: sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==} + engines: {node: '>= 0.8'} + fsevents@2.3.3: resolution: {integrity: 
sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==} engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} os: [darwin] + function-bind@1.1.2: + resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} + gaxios@7.1.4: resolution: {integrity: sha512-bTIgTsM2bWn3XklZISBTQX7ZSddGW+IO3bMdGaemHZ3tbqExMENHLx6kKZ/KlejgrMtj8q7wBItt51yegqalrA==} engines: {node: '>=18'} @@ -1311,6 +1504,14 @@ packages: resolution: {integrity: sha512-vpeMIQKxczTD/0s2CdEWHcb0eeJe6TFjxb+J5xgX7hScxqrGuyjmv4c1D4A/gelKfyox0gJJwIHF+fLjeaM8kQ==} engines: {node: '>=18'} + get-intrinsic@1.3.0: + resolution: {integrity: sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==} + engines: {node: '>= 0.4'} + + get-proto@1.0.1: + resolution: {integrity: sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==} + engines: {node: '>= 0.4'} + get-stream@8.0.1: resolution: {integrity: sha512-VaUJspBffn/LMCJVoMvSAdmscJyS1auj5Zulnn5UoYcY531UWmdwhRWkcGKnGU93m5HSXP9LP2usOryrBtQowA==} engines: {node: '>=16'} @@ -1335,13 +1536,33 @@ packages: resolution: {integrity: sha512-eAmLkjDjAFCVXg7A1unxHsLf961m6y17QFqXqAXGj/gVkKFrEICfStRfwUlGNfeCEjNRa32JEWOUTlYXPyyKvA==} engines: {node: '>=14'} + gopd@1.2.0: + resolution: {integrity: sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==} + engines: {node: '>= 0.4'} + has-flag@4.0.0: resolution: {integrity: sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==} engines: {node: '>=8'} + has-symbols@1.1.0: + resolution: {integrity: sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==} + engines: {node: '>= 0.4'} + + hasown@2.0.3: + resolution: {integrity: sha512-ej4AhfhfL2Q2zpMmLo7U1Uv9+PyhIZpgQLGT1F9miIGmiCJIoCgSmczFdrc97mWT4kVY72KA+WnnhJ5pghSvSg==} + engines: {node: '>= 
0.4'} + + hono@4.12.16: + resolution: {integrity: sha512-jN0ZewiNAWSe5khM3EyCmBb250+b40wWbwNILNfEvq84VREWwOIkuUsFONk/3i3nqkz7Oe1PcpM2mwQEK2L9Kg==} + engines: {node: '>=16.9.0'} + html-escaper@2.0.2: resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==} + http-errors@2.0.1: + resolution: {integrity: sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==} + engines: {node: '>= 0.8'} + http-proxy-agent@7.0.2: resolution: {integrity: sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==} engines: {node: '>= 14'} @@ -1354,6 +1575,13 @@ packages: resolution: {integrity: sha512-AXcZb6vzzrFAUE61HnN4mpLqd/cSIwNQjtNWR0euPm6y0iqx3G4gOXaIDdtdDwZmhwe82LA6+zinmW4UBWVePQ==} engines: {node: '>=16.17.0'} + iconv-lite@0.7.2: + resolution: {integrity: sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==} + engines: {node: '>=0.10.0'} + + inherits@2.0.4: + resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==} + install@0.13.0: resolution: {integrity: sha512-zDml/jzr2PKU9I8J/xyZBQn8rPCAY//UOYNmR01XwNwyfhEWObo2SWfSl1+0tm1u6PhxLwDnfsT/6jB7OUxqFA==} engines: {node: '>= 0.10'} @@ -1362,6 +1590,10 @@ packages: resolution: {integrity: sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==} engines: {node: '>= 12'} + ipaddr.js@1.9.1: + resolution: {integrity: sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==} + engines: {node: '>= 0.10'} + is-any-array@2.0.1: resolution: {integrity: sha512-UtilS7hLRu++wb/WBAw9bNuP1Eg04Ivn1vERJck8zJthEvXCBEBpGR/33u/xLKWEQf95803oalHrVDptcAvFdQ==} @@ -1381,6 +1613,9 @@ packages: resolution: {integrity: sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==} engines: {node: 
'>=0.12.0'} + is-promise@4.0.0: + resolution: {integrity: sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==} + is-stream@3.0.0: resolution: {integrity: sha512-LnQR4bZ9IADDRSkvpqMGvt/tEJWclzklNgSw48V5EAaAeDd6qGvN8ei6k5p0tvxSR171VmGyHuTiAOfxAbr8kA==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} @@ -1403,6 +1638,9 @@ packages: jackspeak@3.4.3: resolution: {integrity: sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw==} + jose@6.2.3: + resolution: {integrity: sha512-YYVDInQKFJfR/xa3ojUTl8c2KoTwiL1R5Wg9YCydwH0x0B9grbzlg5HC7mMjCtUJjbQ/YnGEZIhI5tCgfTb4Hw==} + joycon@3.1.1: resolution: {integrity: sha512-34wB/Y7MW7bzjKRjUKTa46I2Z7eV62Rkhva+KkopW7Qvv/OSWBqvkSY7vusOPrNuZcUG3tApvdVgNB8POj3SPw==} engines: {node: '>=10'} @@ -1428,6 +1666,9 @@ packages: json-schema-traverse@1.0.0: resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==} + json-schema-typed@8.0.2: + resolution: {integrity: sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA==} + json-schema@0.4.0: resolution: {integrity: sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==} @@ -1490,6 +1731,18 @@ packages: resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==} engines: {node: '>=10'} + math-intrinsics@1.1.0: + resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==} + engines: {node: '>= 0.4'} + + media-typer@1.1.0: + resolution: {integrity: sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw==} + engines: {node: '>= 0.8'} + + merge-descriptors@2.0.0: + resolution: {integrity: sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g==} + engines: {node: '>=18'} + 
merge-stream@2.0.0: resolution: {integrity: sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==} @@ -1497,6 +1750,14 @@ packages: resolution: {integrity: sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==} engines: {node: '>=8.6'} + mime-db@1.54.0: + resolution: {integrity: sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ==} + engines: {node: '>= 0.6'} + + mime-types@3.0.2: + resolution: {integrity: sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A==} + engines: {node: '>=18'} + mimic-fn@4.0.0: resolution: {integrity: sha512-vqiC06CuhBTUdZH+RYl8sFrL096vA45Ok5ISO6sE/Mr1jRbGH4Csnhi8f3wKVl7x8mO4Au7Ir9D3Oyv1VYMFJw==} engines: {node: '>=12'} @@ -1552,6 +1813,10 @@ packages: engines: {node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1} hasBin: true + negotiator@1.0.0: + resolution: {integrity: sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg==} + engines: {node: '>= 0.6'} + netmask@2.1.1: resolution: {integrity: sha512-eonl3sLUha+S1GzTPxychyhnUzKyeQkZ7jLjKrBagJgPla13F+uQ71HgpFefyHgqrjEbCPkDArxYsjY8/+gLKA==} engines: {node: '>= 0.4.0'} @@ -1573,9 +1838,20 @@ packages: resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==} engines: {node: '>=0.10.0'} + object-inspect@1.13.4: + resolution: {integrity: sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==} + engines: {node: '>= 0.4'} + obug@2.1.1: resolution: {integrity: sha512-uTqF9MuPraAQ+IsnPf366RG4cP9RtUi7MLO1N3KEc+wb0a6yKpeL0lmk2IB1jY5KHPAlTc6T/JRdC/YqxHNwkQ==} + on-finished@2.4.1: + resolution: {integrity: sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg==} + engines: {node: '>= 0.8'} + + once@1.4.0: + resolution: {integrity: 
sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==} + onetime@6.0.0: resolution: {integrity: sha512-1FlR+gjXK7X+AsAHso35MnyN5KqGwJRi/31ft6x0M194ht7S+rWAvd7PHss9xSKMzE0asv1pyIHaJYq+BbacAQ==} engines: {node: '>=12'} @@ -1623,6 +1899,10 @@ packages: package-json-from-dist@1.0.1: resolution: {integrity: sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==} + parseurl@1.3.3: + resolution: {integrity: sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==} + engines: {node: '>= 0.8'} + partial-json@0.1.7: resolution: {integrity: sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA==} @@ -1642,6 +1922,9 @@ packages: resolution: {integrity: sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==} engines: {node: '>=16 || 14 >=14.18'} + path-to-regexp@8.4.2: + resolution: {integrity: sha512-qRcuIdP69NPm4qbACK+aDogI5CBDMi1jKe0ry5rSQJz8JVLsC7jV8XpiJjGRLLol3N+R5ihGYcrPLTno6pAdBA==} + pathe@2.0.3: resolution: {integrity: sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w==} @@ -1665,6 +1948,10 @@ packages: resolution: {integrity: sha512-TfySrs/5nm8fQJDcBDuUng3VOUKsd7S+zqvbOTiGXHfxX4wK31ard+hoNuvkicM/2YFzlpDgABOevKSsB4G/FA==} engines: {node: '>= 6'} + pkce-challenge@5.0.1: + resolution: {integrity: sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ==} + engines: {node: '>=16.20.0'} + pkg-types@1.3.1: resolution: {integrity: sha512-/Jm5M4RvtBFVkKWRu2BLUTNP8/M2a+UwuAX+ae4770q1qVGtfjG+WTCupoZixokjmHiry8uI+dlY8KXYV5HVVQ==} @@ -1694,6 +1981,10 @@ packages: resolution: {integrity: sha512-3wY1AxV+VBNW8Yypfd1yQY9pXnqTAN+KwQxL8iYm3/BjKYMNg4i0owhEe26PWDOMaIrzeeF98Lqd5NGz4omiIg==} engines: {node: '>=12.0.0'} + proxy-addr@2.0.7: + resolution: {integrity: 
sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==} + engines: {node: '>= 0.10'} + proxy-agent@6.5.0: resolution: {integrity: sha512-TmatMXdr2KlRiA2CyDu8GqR8EjahTG3aY3nXjdzFyoZbmB8hrBsTyMezhULIXKnC0jpfjlmiZ3+EaCzoInSu/A==} engines: {node: '>= 14'} @@ -1705,6 +1996,18 @@ packages: resolution: {integrity: sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==} engines: {node: '>=6'} + qs@6.15.1: + resolution: {integrity: sha512-6YHEFRL9mfgcAvql/XhwTvf5jKcOiiupt2FiJxHkiX1z4j7WL8J/jRHYLluORvc1XxB5rV20KoeK00gVJamspg==} + engines: {node: '>=0.6'} + + range-parser@1.2.1: + resolution: {integrity: sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==} + engines: {node: '>= 0.6'} + + raw-body@3.0.2: + resolution: {integrity: sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA==} + engines: {node: '>= 0.10'} + readdirp@4.1.2: resolution: {integrity: sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==} engines: {node: '>= 14.18.0'} @@ -1733,14 +2036,32 @@ packages: engines: {node: '>=18.0.0', npm: '>=8.0.0'} hasBin: true + router@2.2.0: + resolution: {integrity: sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ==} + engines: {node: '>= 18'} + safe-buffer@5.2.1: resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} + safer-buffer@2.1.2: + resolution: {integrity: sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==} + semver@7.7.2: resolution: {integrity: sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==} engines: {node: '>=10'} hasBin: true + send@1.2.1: + resolution: {integrity: 
sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ==} + engines: {node: '>= 18'} + + serve-static@2.2.1: + resolution: {integrity: sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw==} + engines: {node: '>= 18'} + + setprototypeof@1.2.0: + resolution: {integrity: sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==} + shebang-command@2.0.0: resolution: {integrity: sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==} engines: {node: '>=8'} @@ -1749,6 +2070,22 @@ packages: resolution: {integrity: sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==} engines: {node: '>=8'} + side-channel-list@1.0.1: + resolution: {integrity: sha512-mjn/0bi/oUURjc5Xl7IaWi/OJJJumuoJFQJfDDyO46+hBWsfaVM65TBHq2eoZBhzl9EchxOijpkbRC8SVBQU0w==} + engines: {node: '>= 0.4'} + + side-channel-map@1.0.1: + resolution: {integrity: sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==} + engines: {node: '>= 0.4'} + + side-channel-weakmap@1.0.2: + resolution: {integrity: sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==} + engines: {node: '>= 0.4'} + + side-channel@1.1.0: + resolution: {integrity: sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==} + engines: {node: '>= 0.4'} + siginfo@2.0.0: resolution: {integrity: sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==} @@ -1796,6 +2133,10 @@ packages: stackback@0.0.2: resolution: {integrity: sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==} + statuses@2.0.2: + resolution: {integrity: sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==} + engines: {node: '>= 0.8'} + 
std-env@4.0.0: resolution: {integrity: sha512-zUMPtQ/HBY3/50VbpkupYHbRroTRZJPRLvreamgErJVys0ceuzMkD44J/QjqhHjOzK42GQ3QZIeFG1OYfOtKqQ==} @@ -1872,6 +2213,10 @@ packages: resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==} engines: {node: '>=8.0'} + toidentifier@1.0.1: + resolution: {integrity: sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==} + engines: {node: '>=0.6'} + tr46@1.0.1: resolution: {integrity: sha512-dTpowEjclQ7Kgx5SdBkqRzVhERQXov8/l9Ft9dVM9fmg0W0KQSVaXX9T4i6twCPNtYiZM53lpSSUAwJbFPOHxA==} @@ -1917,6 +2262,10 @@ packages: typescript: optional: true + type-is@2.0.1: + resolution: {integrity: sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==} + engines: {node: '>= 0.6'} + typescript@5.8.3: resolution: {integrity: sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==} engines: {node: '>=14.17'} @@ -1932,12 +2281,20 @@ packages: resolution: {integrity: sha512-xXnp4kTyor2Zq+J1FfPI6Eq3ew5h6Vl0F/8d9XU5zZQf1tX9s2Su1/3PiMmUANFULpmksxkClamIZcaUqryHsQ==} engines: {node: '>=20.18.1'} + unpipe@1.0.0: + resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==} + engines: {node: '>= 0.8'} + validate.io-array@1.0.6: resolution: {integrity: sha512-DeOy7CnPEziggrOO5CZhVKJw6S3Yi7e9e65R1Nl/RTN1vTQKnzjfvks0/8kQ40FP/dsjRAOd4hxmJ7uLa6vxkg==} validate.io-function@1.0.2: resolution: {integrity: sha512-LlFybRJEriSuBnUhQyG5bwglhh50EpTL2ul23MPIuR1odjO7XaMLFV8vHGwp7AZciFxtYOeiSCT5st+XSPONiQ==} + vary@1.1.2: + resolution: {integrity: sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==} + engines: {node: '>= 0.8'} + vite-tsconfig-paths@6.1.1: resolution: {integrity: sha512-2cihq7zliibCCZ8P9cKJrQBkfgdvcFkOOc3Y02o3GWUDLgqjWsZudaoiuOwO/gzTzy17cS5F7ZPo4bsnS4DGkg==} peerDependencies: 
@@ -2050,6 +2407,9 @@ packages: resolution: {integrity: sha512-G8ura3S+3Z2G+mkgNRq8dqaFZAuxfsxpBB8OCTGRTCtp+l/v9nbFNmCUP1BZMts3G1142MsZfn6eeUKrr4PD1Q==} engines: {node: '>=18'} + wrappy@1.0.2: + resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==} + ws@8.20.0: resolution: {integrity: sha512-sAt8BhgNbzCtgGbt2OxmpuryO63ZoDk/sqaB/znQm94T4fCEsy/yV+7CdC1kJhOU9lboAEU7R3kquuycDoibVA==} engines: {node: '>=10.0.0'} @@ -2634,17 +2994,24 @@ snapshots: '@esbuild/win32-x64@0.25.8': optional: true - '@google/genai@1.50.1': + '@google/genai@1.50.1(@modelcontextprotocol/sdk@1.29.0(zod@4.3.6))': dependencies: google-auth-library: 10.6.2 p-retry: 4.6.2 protobufjs: 7.5.5 ws: 8.20.0 + optionalDependencies: + '@modelcontextprotocol/sdk': 1.29.0(zod@4.3.6) transitivePeerDependencies: - bufferutil - supports-color - utf-8-validate + '@hono/node-server@1.19.14(hono@4.12.16)': + dependencies: + hono: 4.12.16 + optional: true + '@isaacs/cliui@8.0.2': dependencies: string-width: 5.1.2 @@ -2675,9 +3042,9 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 '@jridgewell/sourcemap-codec': 1.5.4 - '@mariozechner/pi-agent-core@0.67.68(ws@8.20.0)(zod@4.3.6)': + '@mariozechner/pi-agent-core@0.67.68(@modelcontextprotocol/sdk@1.29.0(zod@4.3.6))(ws@8.20.0)(zod@4.3.6)': dependencies: - '@mariozechner/pi-ai': 0.67.68(ws@8.20.0)(zod@4.3.6) + '@mariozechner/pi-ai': 0.67.68(@modelcontextprotocol/sdk@1.29.0(zod@4.3.6))(ws@8.20.0)(zod@4.3.6) transitivePeerDependencies: - '@modelcontextprotocol/sdk' - aws-crt @@ -2687,11 +3054,11 @@ snapshots: - ws - zod - '@mariozechner/pi-ai@0.67.68(ws@8.20.0)(zod@4.3.6)': + '@mariozechner/pi-ai@0.67.68(@modelcontextprotocol/sdk@1.29.0(zod@4.3.6))(ws@8.20.0)(zod@4.3.6)': dependencies: '@anthropic-ai/sdk': 0.90.0(zod@4.3.6) '@aws-sdk/client-bedrock-runtime': 3.1032.0 - '@google/genai': 1.50.1 + '@google/genai': 1.50.1(@modelcontextprotocol/sdk@1.29.0(zod@4.3.6)) '@mistralai/mistralai': 2.2.0 '@sinclair/typebox': 
0.34.49 ajv: 8.17.1 @@ -2720,6 +3087,80 @@ snapshots: - bufferutil - utf-8-validate + '@modelcontextprotocol/sdk@1.29.0(zod@4.3.6)': + dependencies: + '@hono/node-server': 1.19.14(hono@4.12.16) + ajv: 8.17.1 + ajv-formats: 3.0.1(ajv@8.17.1) + content-type: 1.0.5 + cors: 2.8.6 + cross-spawn: 7.0.6 + eventsource: 3.0.7 + eventsource-parser: 3.0.6 + express: 5.2.1 + express-rate-limit: 8.4.1(express@5.2.1) + hono: 4.12.16 + jose: 6.2.3 + json-schema-typed: 8.0.2 + pkce-challenge: 5.0.1 + raw-body: 3.0.2 + zod: 4.3.6 + zod-to-json-schema: 3.25.2(zod@4.3.6) + transitivePeerDependencies: + - supports-color + optional: true + + '@openai/agents-core@0.8.5(ws@8.20.0)(zod@4.3.6)': + dependencies: + debug: 4.4.3 + openai: 6.33.0(ws@8.20.0)(zod@4.3.6) + optionalDependencies: + '@modelcontextprotocol/sdk': 1.29.0(zod@4.3.6) + zod: 4.3.6 + transitivePeerDependencies: + - '@cfworker/json-schema' + - supports-color + - ws + + '@openai/agents-openai@0.8.5(ws@8.20.0)(zod@4.3.6)': + dependencies: + '@openai/agents-core': 0.8.5(ws@8.20.0)(zod@4.3.6) + debug: 4.4.3 + openai: 6.33.0(ws@8.20.0)(zod@4.3.6) + zod: 4.3.6 + transitivePeerDependencies: + - '@cfworker/json-schema' + - supports-color + - ws + + '@openai/agents-realtime@0.8.5(zod@4.3.6)': + dependencies: + '@openai/agents-core': 0.8.5(ws@8.20.0)(zod@4.3.6) + '@types/ws': 8.18.1 + debug: 4.4.3 + ws: 8.20.0 + zod: 4.3.6 + transitivePeerDependencies: + - '@cfworker/json-schema' + - bufferutil + - supports-color + - utf-8-validate + + '@openai/agents@0.8.5(ws@8.20.0)(zod@4.3.6)': + dependencies: + '@openai/agents-core': 0.8.5(ws@8.20.0)(zod@4.3.6) + '@openai/agents-openai': 0.8.5(ws@8.20.0)(zod@4.3.6) + '@openai/agents-realtime': 0.8.5(zod@4.3.6) + debug: 4.4.3 + openai: 6.33.0(ws@8.20.0)(zod@4.3.6) + zod: 4.3.6 + transitivePeerDependencies: + - '@cfworker/json-schema' + - bufferutil + - supports-color + - utf-8-validate + - ws + '@opentelemetry/api@1.9.0': {} '@pkgjs/parseargs@0.11.0': @@ -3128,6 +3569,10 @@ snapshots: 
'@types/retry@0.12.0': {} + '@types/ws@8.18.1': + dependencies: + '@types/node': 25.5.0 + '@vercel/oidc@3.1.0': {} '@vitest/coverage-v8@4.1.2(vitest@4.1.2(@opentelemetry/api@1.9.0)(@types/node@25.5.0)(vite@7.0.6(@types/node@25.5.0)(yaml@2.8.0)))': @@ -3185,6 +3630,12 @@ snapshots: convert-source-map: 2.0.0 tinyrainbow: 3.1.0 + accepts@2.0.0: + dependencies: + mime-types: 3.0.2 + negotiator: 1.0.0 + optional: true + acorn@8.15.0: {} agent-base@7.1.4: {} @@ -3260,6 +3711,21 @@ snapshots: binary-search@1.3.6: {} + body-parser@2.2.2: + dependencies: + bytes: 3.1.2 + content-type: 1.0.5 + debug: 4.4.3 + http-errors: 2.0.1 + iconv-lite: 0.7.2 + on-finished: 2.4.1 + qs: 6.15.1 + raw-body: 3.0.2 + type-is: 2.0.1 + transitivePeerDependencies: + - supports-color + optional: true + bowser@2.14.1: {} brace-expansion@2.0.2: @@ -3277,8 +3743,23 @@ snapshots: esbuild: 0.25.8 load-tsconfig: 0.2.5 + bytes@3.1.2: + optional: true + cac@6.7.14: {} + call-bind-apply-helpers@1.0.2: + dependencies: + es-errors: 1.3.0 + function-bind: 1.1.2 + optional: true + + call-bound@1.0.4: + dependencies: + call-bind-apply-helpers: 1.0.2 + get-intrinsic: 1.3.0 + optional: true + chai@6.2.2: {} chalk@5.4.1: {} @@ -3333,8 +3814,26 @@ snapshots: consola@3.4.2: {} + content-disposition@1.1.0: + optional: true + + content-type@1.0.5: + optional: true + convert-source-map@2.0.0: {} + cookie-signature@1.2.2: + optional: true + + cookie@0.7.2: + optional: true + + cors@2.8.6: + dependencies: + object-assign: 4.1.1 + vary: 1.1.2 + optional: true + cross-spawn@7.0.6: dependencies: path-key: 3.1.1 @@ -3349,12 +3848,19 @@ snapshots: dependencies: ms: 2.1.3 + debug@4.4.3: + dependencies: + ms: 2.1.3 + degenerator@5.0.1: dependencies: ast-types: 0.13.4 escodegen: 2.1.0 esprima: 4.0.1 + depd@2.0.0: + optional: true + dotenv-cli@8.0.0: dependencies: cross-spawn: 7.0.6 @@ -3366,22 +3872,46 @@ snapshots: dotenv@16.6.1: {} + dunder-proto@1.0.1: + dependencies: + call-bind-apply-helpers: 1.0.2 + es-errors: 1.3.0 + 
gopd: 1.2.0 + optional: true + eastasianwidth@0.2.0: {} ecdsa-sig-formatter@1.0.11: dependencies: safe-buffer: 5.2.1 + ee-first@1.1.1: + optional: true + emoji-regex@10.4.0: {} emoji-regex@8.0.0: {} emoji-regex@9.2.2: {} + encodeurl@2.0.0: + optional: true + environment@1.1.0: {} + es-define-property@1.0.1: + optional: true + + es-errors@1.3.0: + optional: true + es-module-lexer@2.0.0: {} + es-object-atoms@1.1.1: + dependencies: + es-errors: 1.3.0 + optional: true + esbuild@0.25.8: optionalDependencies: '@esbuild/aix-ppc64': 0.25.8 @@ -3411,6 +3941,9 @@ snapshots: '@esbuild/win32-ia32': 0.25.8 '@esbuild/win32-x64': 0.25.8 + escape-html@1.0.3: + optional: true + escodegen@2.1.0: dependencies: esprima: 4.0.1 @@ -3429,10 +3962,18 @@ snapshots: esutils@2.0.3: {} + etag@1.8.1: + optional: true + eventemitter3@5.0.1: {} eventsource-parser@3.0.6: {} + eventsource@3.0.7: + dependencies: + eventsource-parser: 3.0.6 + optional: true + execa@8.0.1: dependencies: cross-spawn: 7.0.6 @@ -3447,6 +3988,46 @@ snapshots: expect-type@1.3.0: {} + express-rate-limit@8.4.1(express@5.2.1): + dependencies: + express: 5.2.1 + ip-address: 10.1.0 + optional: true + + express@5.2.1: + dependencies: + accepts: 2.0.0 + body-parser: 2.2.2 + content-disposition: 1.1.0 + content-type: 1.0.5 + cookie: 0.7.2 + cookie-signature: 1.2.2 + debug: 4.4.3 + depd: 2.0.0 + encodeurl: 2.0.0 + escape-html: 1.0.3 + etag: 1.8.1 + finalhandler: 2.1.1 + fresh: 2.0.0 + http-errors: 2.0.1 + merge-descriptors: 2.0.0 + mime-types: 3.0.2 + on-finished: 2.4.1 + once: 1.4.0 + parseurl: 1.3.3 + proxy-addr: 2.0.7 + qs: 6.15.1 + range-parser: 1.2.1 + router: 2.2.0 + send: 1.2.1 + serve-static: 2.2.1 + statuses: 2.0.2 + type-is: 2.0.1 + vary: 1.1.2 + transitivePeerDependencies: + - supports-color + optional: true + extend@3.0.2: {} fast-deep-equal@3.1.3: {} @@ -3482,6 +4063,18 @@ snapshots: dependencies: to-regex-range: 5.0.1 + finalhandler@2.1.1: + dependencies: + debug: 4.4.3 + encodeurl: 2.0.0 + escape-html: 1.0.3 + 
on-finished: 2.4.1 + parseurl: 1.3.3 + statuses: 2.0.2 + transitivePeerDependencies: + - supports-color + optional: true + fix-dts-default-cjs-exports@1.0.1: dependencies: magic-string: 0.30.17 @@ -3497,9 +4090,18 @@ snapshots: dependencies: fetch-blob: 3.2.0 + forwarded@0.2.0: + optional: true + + fresh@2.0.0: + optional: true + fsevents@2.3.3: optional: true + function-bind@1.1.2: + optional: true + gaxios@7.1.4: dependencies: extend: 3.0.2 @@ -3518,6 +4120,26 @@ snapshots: get-east-asian-width@1.3.0: {} + get-intrinsic@1.3.0: + dependencies: + call-bind-apply-helpers: 1.0.2 + es-define-property: 1.0.1 + es-errors: 1.3.0 + es-object-atoms: 1.1.1 + function-bind: 1.1.2 + get-proto: 1.0.1 + gopd: 1.2.0 + has-symbols: 1.1.0 + hasown: 2.0.3 + math-intrinsics: 1.1.0 + optional: true + + get-proto@1.0.1: + dependencies: + dunder-proto: 1.0.1 + es-object-atoms: 1.1.1 + optional: true + get-stream@8.0.1: {} get-uri@6.0.5: @@ -3552,10 +4174,33 @@ snapshots: google-logging-utils@1.1.3: {} + gopd@1.2.0: + optional: true + has-flag@4.0.0: {} + has-symbols@1.1.0: + optional: true + + hasown@2.0.3: + dependencies: + function-bind: 1.1.2 + optional: true + + hono@4.12.16: + optional: true + html-escaper@2.0.2: {} + http-errors@2.0.1: + dependencies: + depd: 2.0.0 + inherits: 2.0.4 + setprototypeof: 1.2.0 + statuses: 2.0.2 + toidentifier: 1.0.1 + optional: true + http-proxy-agent@7.0.2: dependencies: agent-base: 7.1.4 @@ -3572,10 +4217,21 @@ snapshots: human-signals@5.0.0: {} + iconv-lite@0.7.2: + dependencies: + safer-buffer: 2.1.2 + optional: true + + inherits@2.0.4: + optional: true + install@0.13.0: {} ip-address@10.1.0: {} + ipaddr.js@1.9.1: + optional: true + is-any-array@2.0.1: {} is-fullwidth-code-point@3.0.0: {} @@ -3588,6 +4244,9 @@ snapshots: is-number@7.0.0: {} + is-promise@4.0.0: + optional: true + is-stream@3.0.0: {} isexe@2.0.0: {} @@ -3611,6 +4270,9 @@ snapshots: optionalDependencies: '@pkgjs/parseargs': 0.11.0 + jose@6.2.3: + optional: true + joycon@3.1.1: {} 
js-levenshtein@1.1.6: {} @@ -3632,6 +4294,9 @@ snapshots: json-schema-traverse@1.0.0: {} + json-schema-typed@8.0.2: + optional: true + json-schema@0.4.0: {} jwa@2.0.1: @@ -3716,6 +4381,15 @@ snapshots: dependencies: semver: 7.7.2 + math-intrinsics@1.1.0: + optional: true + + media-typer@1.1.0: + optional: true + + merge-descriptors@2.0.0: + optional: true + merge-stream@2.0.0: {} micromatch@4.0.8: @@ -3723,6 +4397,14 @@ snapshots: braces: 3.0.3 picomatch: 2.3.1 + mime-db@1.54.0: + optional: true + + mime-types@3.0.2: + dependencies: + mime-db: 1.54.0 + optional: true + mimic-fn@4.0.0: {} mimic-function@5.0.1: {} @@ -3784,6 +4466,9 @@ snapshots: nanoid@3.3.11: {} + negotiator@1.0.0: + optional: true + netmask@2.1.1: {} node-domexception@1.0.0: {} @@ -3800,8 +4485,21 @@ snapshots: object-assign@4.1.1: {} + object-inspect@1.13.4: + optional: true + obug@2.1.1: {} + on-finished@2.4.1: + dependencies: + ee-first: 1.1.1 + optional: true + + once@1.4.0: + dependencies: + wrappy: 1.0.2 + optional: true + onetime@6.0.0: dependencies: mimic-fn: 4.0.0 @@ -3850,6 +4548,9 @@ snapshots: package-json-from-dist@1.0.1: {} + parseurl@1.3.3: + optional: true + partial-json@0.1.7: {} path-expression-matcher@1.5.0: {} @@ -3863,6 +4564,9 @@ snapshots: lru-cache: 10.4.3 minipass: 7.1.2 + path-to-regexp@8.4.2: + optional: true + pathe@2.0.3: {} picocolors@1.1.1: {} @@ -3875,6 +4579,9 @@ snapshots: pirates@4.0.7: {} + pkce-challenge@5.0.1: + optional: true + pkg-types@1.3.1: dependencies: confbox: 0.1.8 @@ -3909,6 +4616,12 @@ snapshots: '@types/node': 25.5.0 long: 5.3.2 + proxy-addr@2.0.7: + dependencies: + forwarded: 0.2.0 + ipaddr.js: 1.9.1 + optional: true + proxy-agent@6.5.0: dependencies: agent-base: 7.1.4 @@ -3926,6 +4639,22 @@ snapshots: punycode@2.3.1: {} + qs@6.15.1: + dependencies: + side-channel: 1.1.0 + optional: true + + range-parser@1.2.1: + optional: true + + raw-body@3.0.2: + dependencies: + bytes: 3.1.2 + http-errors: 2.0.1 + iconv-lite: 0.7.2 + unpipe: 1.0.0 + optional: 
true + readdirp@4.1.2: {} require-from-string@2.0.2: {} @@ -3967,16 +4696,92 @@ snapshots: '@rollup/rollup-win32-x64-msvc': 4.46.2 fsevents: 2.3.3 + router@2.2.0: + dependencies: + debug: 4.4.3 + depd: 2.0.0 + is-promise: 4.0.0 + parseurl: 1.3.3 + path-to-regexp: 8.4.2 + transitivePeerDependencies: + - supports-color + optional: true + safe-buffer@5.2.1: {} + safer-buffer@2.1.2: + optional: true + semver@7.7.2: {} + send@1.2.1: + dependencies: + debug: 4.4.3 + encodeurl: 2.0.0 + escape-html: 1.0.3 + etag: 1.8.1 + fresh: 2.0.0 + http-errors: 2.0.1 + mime-types: 3.0.2 + ms: 2.1.3 + on-finished: 2.4.1 + range-parser: 1.2.1 + statuses: 2.0.2 + transitivePeerDependencies: + - supports-color + optional: true + + serve-static@2.2.1: + dependencies: + encodeurl: 2.0.0 + escape-html: 1.0.3 + parseurl: 1.3.3 + send: 1.2.1 + transitivePeerDependencies: + - supports-color + optional: true + + setprototypeof@1.2.0: + optional: true + shebang-command@2.0.0: dependencies: shebang-regex: 3.0.0 shebang-regex@3.0.0: {} + side-channel-list@1.0.1: + dependencies: + es-errors: 1.3.0 + object-inspect: 1.13.4 + optional: true + + side-channel-map@1.0.1: + dependencies: + call-bound: 1.0.4 + es-errors: 1.3.0 + get-intrinsic: 1.3.0 + object-inspect: 1.13.4 + optional: true + + side-channel-weakmap@1.0.2: + dependencies: + call-bound: 1.0.4 + es-errors: 1.3.0 + get-intrinsic: 1.3.0 + object-inspect: 1.13.4 + side-channel-map: 1.0.1 + optional: true + + side-channel@1.1.0: + dependencies: + es-errors: 1.3.0 + object-inspect: 1.13.4 + side-channel-list: 1.0.1 + side-channel-map: 1.0.1 + side-channel-weakmap: 1.0.2 + optional: true + siginfo@2.0.0: {} signal-exit@4.1.0: {} @@ -4019,6 +4824,9 @@ snapshots: stackback@0.0.2: {} + statuses@2.0.2: + optional: true + std-env@4.0.0: {} string-argv@0.3.2: {} @@ -4097,6 +4905,9 @@ snapshots: dependencies: is-number: 7.0.0 + toidentifier@1.0.1: + optional: true + tr46@1.0.1: dependencies: punycode: 2.3.1 @@ -4141,6 +4952,13 @@ snapshots: - tsx - yaml + 
type-is@2.0.1: + dependencies: + content-type: 1.0.5 + media-typer: 1.1.0 + mime-types: 3.0.2 + optional: true + typescript@5.8.3: {} ufo@1.6.1: {} @@ -4149,10 +4967,16 @@ snapshots: undici@7.25.0: {} + unpipe@1.0.0: + optional: true + validate.io-array@1.0.6: {} validate.io-function@1.0.2: {} + vary@1.1.2: + optional: true + vite-tsconfig-paths@6.1.1(typescript@5.8.3)(vite@7.0.6(@types/node@25.5.0)(yaml@2.8.0)): dependencies: debug: 4.4.1 @@ -4241,6 +5065,9 @@ snapshots: string-width: 7.2.0 strip-ansi: 7.1.0 + wrappy@1.0.2: + optional: true + ws@8.20.0: {} yaml@2.8.0: {} diff --git a/scripts/bump-release-versions.mjs b/scripts/bump-release-versions.mjs index afda7ab..d245d10 100644 --- a/scripts/bump-release-versions.mjs +++ b/scripts/bump-release-versions.mjs @@ -11,6 +11,7 @@ if (!newVersion) { const files = [ "packages/vitest-evals/package.json", "packages/harness-ai-sdk/package.json", + "packages/harness-openai-agents/package.json", "packages/harness-pi-ai/package.json", ]; diff --git a/scripts/eval-cli.mjs b/scripts/eval-cli.mjs index 7616cc7..06da697 100644 --- a/scripts/eval-cli.mjs +++ b/scripts/eval-cli.mjs @@ -31,6 +31,9 @@ export function parseEvalCliArgs(args) { export function createEvalEnv(baseEnv, toolDetailLevel) { return { ...baseEnv, + VITEST_EVALS_REPLAY_MODE: baseEnv.VITEST_EVALS_REPLAY_MODE ?? "auto", + VITEST_EVALS_REPLAY_DIR: + baseEnv.VITEST_EVALS_REPLAY_DIR ?? ".vitest-evals/recordings", ...(toolDetailLevel > 0 ? 
{ VITEST_EVALS_TOOL_DETAILS: "1", diff --git a/scripts/eval-cli.test.mjs b/scripts/eval-cli.test.mjs new file mode 100644 index 0000000..ee7ecd4 --- /dev/null +++ b/scripts/eval-cli.test.mjs @@ -0,0 +1,34 @@ +import { describe, expect, test } from "vitest"; +import { createEvalEnv, parseEvalCliArgs } from "./eval-cli.mjs"; + +describe("eval CLI helpers", () => { + test("defaults demo evals to replay auto mode", () => { + expect(createEvalEnv({}, 0)).toMatchObject({ + VITEST_EVALS_REPLAY_MODE: "auto", + VITEST_EVALS_REPLAY_DIR: ".vitest-evals/recordings", + }); + }); + + test("preserves explicit replay overrides", () => { + expect( + createEvalEnv( + { + VITEST_EVALS_REPLAY_MODE: "strict", + VITEST_EVALS_REPLAY_DIR: "/tmp/replay", + }, + 0, + ), + ).toMatchObject({ + VITEST_EVALS_REPLAY_MODE: "strict", + VITEST_EVALS_REPLAY_DIR: "/tmp/replay", + }); + }); + + test("keeps verbose flags separate from forwarded Vitest args", () => { + expect(parseEvalCliArgs(["--", "-vv", "--pool=forks"])).toEqual({ + failMode: false, + forwardedArgs: ["--pool=forks"], + toolDetailLevel: 2, + }); + }); +}); diff --git a/tsconfig.base.json b/tsconfig.base.json index 31e6d3f..64692ff 100644 --- a/tsconfig.base.json +++ b/tsconfig.base.json @@ -11,6 +11,9 @@ "vitest-evals": ["packages/vitest-evals/src/index.ts"], "vitest-evals/*": ["packages/vitest-evals/src/*"], "@vitest-evals/harness-ai-sdk": ["packages/harness-ai-sdk/src/index.ts"], + "@vitest-evals/harness-openai-agents": [ + "packages/harness-openai-agents/src/index.ts" + ], "@vitest-evals/harness-pi-ai": ["packages/harness-pi-ai/src/index.ts"] } } diff --git a/vitest.config.ts b/vitest.config.ts index 10976f9..c824002 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -9,6 +9,7 @@ export default defineConfig({ "packages/**/*.eval.ts", "apps/**/*.test.ts", "apps/**/*.eval.ts", + "scripts/**/*.test.mjs", ], }, });