4 changes: 4 additions & 0 deletions .craft.yml
Original file line number Diff line number Diff line change
@@ -9,6 +9,10 @@ targets:
id: "@vitest-evals/harness-ai-sdk"
access: public
includeNames: /^vitest-evals-harness-ai-sdk-\d.*\.tgz$/
- name: npm
id: "@vitest-evals/harness-openai-agents"
access: public
includeNames: /^vitest-evals-harness-openai-agents-\d.*\.tgz$/
- name: npm
id: "@vitest-evals/harness-pi-ai"
access: public
1 change: 1 addition & 0 deletions .github/workflows/merge-jobs.yml
@@ -66,6 +66,7 @@ jobs:
mkdir -p artifacts
pnpm --filter vitest-evals pack --pack-destination artifacts
pnpm --filter @vitest-evals/harness-ai-sdk pack --pack-destination artifacts
pnpm --filter @vitest-evals/harness-openai-agents pack --pack-destination artifacts
pnpm --filter @vitest-evals/harness-pi-ai pack --pack-destination artifacts
ls -la artifacts

3 changes: 3 additions & 0 deletions .gitignore
@@ -88,6 +88,9 @@ dist
# Build files
/dist

# vitest-evals replay recordings
.vitest-evals/

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
13 changes: 10 additions & 3 deletions README.md
@@ -5,19 +5,24 @@ Monorepo for the explicit-run `vitest-evals` shape:
- `packages/vitest-evals`: core suite API, judges, normalized harness/session
types, reporter, and legacy compatibility exports
- `packages/harness-ai-sdk`: `ai-sdk`-focused harness adapter
- `packages/harness-openai-agents`: `@openai/agents`-focused harness adapter
- `packages/harness-pi-ai`: `pi-ai`-focused harness adapter with tool replay
- `apps/demo-pi`: end-to-end Pi Mono demo evals with an app-local refund agent
- `apps/demo-ai-sdk`: end-to-end AI SDK demo evals with app-local refund tools
- `apps/demo-openai-agents`: end-to-end OpenAI Agents demo evals with
app-local refund tools

## Workspace Layout

```text
packages/
vitest-evals/
harness-ai-sdk/
harness-openai-agents/
harness-pi-ai/
apps/
demo-ai-sdk/
demo-openai-agents/
demo-pi/
```

@@ -158,8 +163,8 @@ when the judge needs richer run/session data or the suite's configured model
prompt seam.

Tool replay is available for opt-in tools in the first-party harnesses.
Configure it globally in Vitest and then mark individual tools with
`replay: true`:
Configure the replay mode and directory globally in Vitest, then opt individual
tools in from the harness with `toolReplay: { toolName: true }`.

```ts
import tsconfigPaths from "vite-tsconfig-paths";
@@ -187,5 +192,7 @@ errors on missing recordings. Recordings are stored under
`.vitest-evals/recordings/<tool-name>/`.

`pnpm evals` fans out to each workspace package or app that exposes an `evals`
script. The demo apps expect provider keys in `.env` or `.env.local`. The
script. The shared eval CLI defaults replay to `auto` and writes recordings
under `.vitest-evals/recordings`, unless the corresponding environment variables are
already set. Demo apps expect provider keys in `.env` or `.env.local`. The
intentional failing examples remain under the `evals:fail` scripts.
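As an illustrative sketch of the harness-side half of that replay setup (a config fragment, not a runnable test; `refundTools` and `promptModel` stand in for the app's actual tool map and prompt seam, as in the demo apps' `shared.ts` files):

```ts
import { aiSdkHarness } from "@vitest-evals/harness-ai-sdk";

export const harness = aiSdkHarness({
  tools: refundTools,
  toolReplay: {
    // Only tools listed here are recorded and replayed;
    // unlisted tools always execute live.
    lookupInvoice: true,
  },
  prompt: promptModel,
});
```

With replay mode and directory configured globally in Vitest, the first live run records `lookupInvoice` results, and subsequent runs replay them from disk.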
4 changes: 3 additions & 1 deletion apps/demo-ai-sdk/evals/shared.ts
@@ -91,7 +91,6 @@ async function createRefund({
const refundTools = {
lookupInvoice: {
description: "Look up invoice details inside demo billing.",
replay: true,
inputSchema: z.object({
invoiceId: z
.string()
@@ -111,6 +110,9 @@ const refundTools = {

export const refundHarness = aiSdkHarness({
tools: refundTools,
toolReplay: {
lookupInvoice: true,
},
prompt: (input, options) =>
generateText({
model: anthropic("claude-sonnet-4-5"),
34 changes: 34 additions & 0 deletions apps/demo-openai-agents/README.md
@@ -0,0 +1,34 @@
# Demo OpenAI Agents App

This app demonstrates an `@openai/agents` harness wired into `vitest-evals`
through the workspace packages:

- `vitest-evals`
- `@vitest-evals/harness-openai-agents`

The passing live eval lives in `evals/refund.eval.ts`.
It demonstrates a real OpenAI Agents `Agent`, `Runner`, local function tools,
tool replay configured from the harness, and explicit Vitest assertions on
`run.output` and the normalized session trace.

The intentionally failing examples live in `evals/refund.fail.eval.ts`.
One fails an automatic harness-backed judge, and one fails explicit assertions
after the harness completes.

Run them with:

```sh
pnpm --filter @demo/demo-openai-agents run evals
pnpm --filter @demo/demo-openai-agents run evals -- -v
pnpm --filter @demo/demo-openai-agents run evals -- -vv
pnpm --filter @demo/demo-openai-agents run evals -- -vvv
pnpm --filter @demo/demo-openai-agents run evals -- -vvvv
pnpm --filter @demo/demo-openai-agents run evals:verbose
pnpm --filter @demo/demo-openai-agents run evals:fail
```

`pnpm --filter @demo/demo-openai-agents run evals` runs only the passing eval.
Use `pnpm --filter @demo/demo-openai-agents run evals:fail` to run just the
intentional failures.

Both scripts expect `OPENAI_API_KEY` to be present in `.env` or `.env.local`.
47 changes: 47 additions & 0 deletions apps/demo-openai-agents/evals/refund.eval.ts
@@ -0,0 +1,47 @@
import {
describeEval,
StructuredOutputJudge,
ToolCallJudge,
} from "vitest-evals";
import { expect } from "vitest";
import { assertRefundCase, refundHarness } from "./shared";
import type { RefundCase } from "../src/refundAgent";

const outputJudge = StructuredOutputJudge();

describeEval(
"demo openai agents refund agent",
{
skipIf: () => !process.env.OPENAI_API_KEY,
harness: refundHarness,
judges: [ToolCallJudge()],
},
(it) => {
it.for<RefundCase>([
{
name: "approves refundable invoice",
input: "Refund invoice inv_123",
expectedStatus: "approved",
expectedTools: ["lookupInvoice", "createRefund"],
},
{
name: "denies non-refundable invoice",
input: "Refund invoice inv_404",
expectedStatus: "denied",
expectedTools: ["lookupInvoice"],
},
])("$name", async ({ input, ...metadata }, { run }) => {
const result = await run(input, {
metadata,
});

await assertRefundCase(result, metadata);
await expect(result).toSatisfyJudge(outputJudge, {
metadata,
expected: {
status: metadata.expectedStatus,
},
});
});
},
);
63 changes: 63 additions & 0 deletions apps/demo-openai-agents/evals/refund.fail.eval.ts
@@ -0,0 +1,63 @@
import { expect } from "vitest";
import { describeEval, StructuredOutputJudge } from "vitest-evals";
import { refundHarness } from "./shared";
import type { RefundCase } from "../src/refundAgent";

type AssertionRefundCase = RefundCase;
type ScoredRefundCase = RefundCase & {
expected: Record<string, unknown>;
};

describeEval(
"demo openai agents refund scorer failing example",
{
skipIf: () => !process.env.OPENAI_API_KEY,
harness: refundHarness,
judges: [StructuredOutputJudge()],
},
(it) => {
it.for<ScoredRefundCase>([
{
name: "judge expects approval for a denied invoice",
input: "Refund invoice inv_404",
expectedStatus: "denied",
expectedTools: ["lookupInvoice"],
expected: {
status: "approved",
},
},
])("$name", async ({ input, ...metadata }, { run }) => {
await run(input, {
metadata,
});
});
},
);

describeEval(
"demo openai agents refund assertion failing example",
{
skipIf: () => !process.env.OPENAI_API_KEY,
harness: refundHarness,
},
(it) => {
it.for<AssertionRefundCase>([
{
name: "asserts the wrong refund id after approval",
input: "Refund invoice inv_123",
expectedStatus: "approved",
expectedTools: ["lookupInvoice", "createRefund"],
},
])("$name", async ({ input, ...metadata }, { run }) => {
const result = await run(input, {
metadata,
});

expect(result.output).toMatchObject({
status: "approved",
invoiceId: "inv_123",
refundId: "rf_wrong",
});
});
},
);
40 changes: 40 additions & 0 deletions apps/demo-openai-agents/evals/shared.ts
@@ -0,0 +1,40 @@
import { openaiAgentsHarness } from "@vitest-evals/harness-openai-agents";
import { expect } from "vitest";
import { type HarnessRun, toolCalls } from "vitest-evals";
import {
createRefundAgent,
createRefundRunner,
parseRefundDecision,
promptRefundModel,
resolveResultText,
type RefundCase,
} from "../src/refundAgent";

export const refundHarness = openaiAgentsHarness({
createAgent: () => createRefundAgent(),
createRunner: () => createRefundRunner(),
prompt: promptRefundModel,
runOptions: {
maxTurns: 5,
},
toolReplay: {
lookupInvoice: true,
},
normalize: {
output: ({ result }) => parseRefundDecision(resolveResultText(result)),
},
});

export async function assertRefundCase(
run: HarnessRun,
expected: Pick<RefundCase, "expectedStatus" | "expectedTools">,
) {
expect(run.output).toMatchObject({
status: expected.expectedStatus,
});
expect(toolCalls(run.session).map((call) => call.name)).toEqual(
expected.expectedTools,
);
expect(run.usage.model).toContain("gpt");
expect(run.usage.totalTokens).toBeGreaterThan(0);
}
16 changes: 16 additions & 0 deletions apps/demo-openai-agents/package.json
@@ -0,0 +1,16 @@
{
"name": "@demo/demo-openai-agents",
"private": true,
"version": "0.1.0",
"scripts": {
"evals": "node ./scripts/run-evals.mjs",
"evals:verbose": "node ./scripts/run-evals.mjs -v",
"evals:fail": "node ./scripts/run-evals.mjs --fail"
},
"dependencies": {
"@openai/agents": "^0.8.5",
"@vitest-evals/harness-openai-agents": "workspace:*",
"vitest-evals": "workspace:*",
"zod": "^4.3.6"
}
}
50 changes: 50 additions & 0 deletions apps/demo-openai-agents/scripts/run-evals.mjs
@@ -0,0 +1,50 @@
import { spawnSync } from "node:child_process";
import { dirname, resolve } from "node:path";
import { fileURLToPath } from "node:url";
import { createEvalEnv, parseEvalCliArgs } from "../../../scripts/eval-cli.mjs";

const WORKSPACE_ROOT = resolve(
dirname(fileURLToPath(import.meta.url)),
"../../..",
);

const { failMode, forwardedArgs, toolDetailLevel } = parseEvalCliArgs(
process.argv.slice(2),
);
const env = createEvalEnv(process.env, toolDetailLevel);

const explicitTargetIndex = forwardedArgs.findIndex(
(arg) => !arg.startsWith("-"),
);
const target =
explicitTargetIndex >= 0
? forwardedArgs.splice(explicitTargetIndex, 1)[0]
: failMode
? "apps/demo-openai-agents/evals/refund.fail.eval.ts"
: "apps/demo-openai-agents/evals/refund.eval.ts";

const command = [
"exec",
"dotenv",
"-e",
".env",
"-e",
".env.local",
"--",
"vitest",
"run",
target,
"--config",
"vitest.config.ts",
"--reporter",
"packages/vitest-evals/src/reporter.ts",
...forwardedArgs,
];

const result = spawnSync("pnpm", command, {
cwd: WORKSPACE_ROOT,
env,
stdio: "inherit",
});

process.exit(result.status ?? 1);