getsentry · dcramer · May 3, 2026 · May 3, 2026 · May 3, 2026 · May 3, 2026
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -49,9 +49,15 @@ jobs:
       - name: Run linter
         run: pnpm lint
 
+      - name: Run typecheck
+        run: pnpm typecheck
+
       - name: Run tests
         run: pnpm test:ci
 
+      - name: Build
+        run: pnpm build
+
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4
         env:

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -40,8 +40,8 @@ packages/
       legacy/
   harness-ai-sdk/
   harness-pi-ai/
-  foobar/
 apps/
+  demo-ai-sdk/
   demo-pi/
 docs/
 ```
@@ -66,10 +66,11 @@ Owns the AI SDK adapter into `HarnessRun`.
 
 Owns the `pi-ai` adapter, wrapped tool runtime, and tool replay behavior.
 
-### `packages/foobar` and `apps/demo-pi`
+### Demo apps
 
-Own the example runtime seam and live demos. Keep them realistic and aligned
-with the public story.
+Own their app-local demo fixtures and live demos. Keep them realistic and
+aligned with the public story. `packages/` is reserved for real package
+surfaces.
 
 ## Core Rules
 

diff --git a/README.md b/README.md
@@ -6,11 +6,8 @@ Monorepo for the explicit-run `vitest-evals` shape:
   types, reporter, and legacy compatibility exports
 - `packages/harness-ai-sdk`: `ai-sdk`-focused harness adapter
 - `packages/harness-pi-ai`: `pi-ai`-focused harness adapter with tool replay
-- `packages/foobar`: example package with a small refund agent
-- `apps/demo-pi`: end-to-end Pi Mono demo evals wired through the workspace
-  packages
-- `apps/demo-ai-sdk`: end-to-end AI SDK demo evals wired through the workspace
-  packages
+- `apps/demo-pi`: end-to-end Pi Mono demo evals with an app-local refund agent
+- `apps/demo-ai-sdk`: end-to-end AI SDK demo evals with app-local refund tools
 
 ## Workspace Layout
 
@@ -19,7 +16,6 @@ packages/
   vitest-evals/
   harness-ai-sdk/
   harness-pi-ai/
-  foobar/
 apps/
   demo-ai-sdk/
   demo-pi/
@@ -29,15 +25,16 @@ apps/
 
 ```sh
 pnpm install
+pnpm lint
 pnpm typecheck
 pnpm test
+pnpm build
 pnpm evals
 pnpm evals -- -v
 pnpm evals -- -vv
 pnpm evals -- -vvv
 pnpm evals -- -vvvv
 pnpm evals:verbose
-pnpm build
 ```
 
 Verbosity tiers for eval output:
@@ -51,25 +48,41 @@ from the workspace `tsconfig` paths via `vite-tsconfig-paths`, and package
 boundaries are expressed in package manifests rather than hard-coded alias
 tables.
 
+Pull request CI runs the same core safety checks: release config validation,
+lint, typecheck, the CI test suite, and the workspace build.
+
 ## Example
 
 The `apps/demo-pi` app shows the intended explicit-run flow:
 
 ```ts
-import { createRefundAgent } from "@demo/foobar";
+import { expect } from "vitest";
 import { piAiHarness } from "@vitest-evals/harness-pi-ai";
 import {
   describeEval,
-  ToolCallJudge,
   namedJudge,
   toolCalls,
+  type JudgeContext,
 } from "vitest-evals";
+import { createRefundAgent } from "../src/refundAgent";
+
+type RefundEvalMetadata = {
+  expectedStatus: "approved" | "denied";
+  expectedTools: string[];
+};
 
 const FactualityJudge = namedJudge(
   "FactualityJudge",
-  async ({ output }) => {
-    const answer = output;
-    const verdict = await judgeFactuality(answer);
+  async ({
+    input,
+    output,
+    metadata,
+  }: JudgeContext<string, RefundEvalMetadata>) => {
+    const verdict = await judgeFactuality({
+      question: input,
+      answer: output,
+      expectedStatus: metadata.expectedStatus,
+    });
 
     return {
       score: verdict.score,
@@ -86,7 +99,7 @@ describeEval(
     harness: piAiHarness({
       createAgent: () => createRefundAgent(),
     }),
-    judges: [ToolCallJudge()],
+    judges: [FactualityJudge],
   },
   (it) => {
     it.for([
@@ -104,7 +117,6 @@ describeEval(
       expect(result.output).toMatchObject({
         status: metadata.expectedStatus,
       });
-      await expect(result).toSatisfyJudge(FactualityJudge);
       expect(toolCalls(result.session).map((call) => call.name)).toEqual(
         metadata.expectedTools,
       );

diff --git a/apps/demo-ai-sdk/README.md b/apps/demo-ai-sdk/README.md
@@ -5,11 +5,10 @@ through the workspace packages:
 
 - `vitest-evals`
 - `@vitest-evals/harness-ai-sdk`
-- `@demo/foobar`
 
 The passing live eval lives in `evals/refund.eval.ts`.
-It demonstrates an automatic harness-backed tool judge plus explicit Vitest
-assertions on `run.output` and the normalized session trace.
+It demonstrates app-local refund tools and explicit Vitest assertions on
+`run.output` and the normalized session trace.
 
 The intentionally failing examples live in `evals/refund.fail.eval.ts`.
 One fails an automatic harness-backed judge, and one fails explicit assertions

diff --git a/apps/demo-ai-sdk/evals/refund.eval.ts b/apps/demo-ai-sdk/evals/refund.eval.ts
@@ -1,6 +1,5 @@
-import { assertRefundCase } from "@demo/foobar/testing";
 import { describeEval } from "vitest-evals";
-import { refundHarness } from "./shared";
+import { assertRefundCase, refundHarness } from "./shared";
 
 describeEval(
   "demo ai-sdk refund agent",