diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 57fe405..1be766a 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -49,9 +49,15 @@ jobs:
       - name: Run linter
         run: pnpm lint
 
+      - name: Run typecheck
+        run: pnpm typecheck
+
       - name: Run tests
         run: pnpm test:ci
 
+      - name: Build
+        run: pnpm build
+
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4
         env:
diff --git a/CLAUDE.md b/CLAUDE.md
index e610190..32858fd 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -40,8 +40,8 @@ packages/
       legacy/
   harness-ai-sdk/
   harness-pi-ai/
-  foobar/
 apps/
+  demo-ai-sdk/
   demo-pi/
 docs/
 ```
@@ -66,10 +66,11 @@ Owns the AI SDK adapter into `HarnessRun`.
 
 Owns the `pi-ai` adapter, wrapped tool runtime, and tool replay behavior.
 
-### `packages/foobar` and `apps/demo-pi`
+### Demo apps
 
-Own the example runtime seam and live demos. Keep them realistic and aligned
-with the public story.
+Own their app-local demo fixtures and live demos. Keep them realistic and
+aligned with the public story. `packages/` is reserved for real package
+surfaces.
 
 ## Core Rules
 
diff --git a/README.md b/README.md
index 3a1817c..b2b041b 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,8 @@ Monorepo for the explicit-run `vitest-evals` shape:
   types, reporter, and legacy compatibility exports
 - `packages/harness-ai-sdk`: `ai-sdk`-focused harness adapter
 - `packages/harness-pi-ai`: `pi-ai`-focused harness adapter with tool replay
-- `packages/foobar`: example package with a small refund agent
-- `apps/demo-pi`: end-to-end Pi Mono demo evals wired through the workspace
-  packages
-- `apps/demo-ai-sdk`: end-to-end AI SDK demo evals wired through the workspace
-  packages
+- `apps/demo-pi`: end-to-end Pi Mono demo evals with an app-local refund agent
+- `apps/demo-ai-sdk`: end-to-end AI SDK demo evals with app-local refund tools
 
 ## Workspace Layout
 
@@ -19,7 +16,6 @@ packages/
   vitest-evals/
   harness-ai-sdk/
   harness-pi-ai/
-  foobar/
 apps/
   demo-ai-sdk/
   demo-pi/
@@ -29,15 +25,16 @@ apps/
 
 ```sh
 pnpm install
+pnpm lint
 pnpm typecheck
 pnpm test
+pnpm build
 pnpm evals
 pnpm evals -- -v
 pnpm evals -- -vv
 pnpm evals -- -vvv
 pnpm evals -- -vvvv
 pnpm evals:verbose
-pnpm build
 ```
 
 Verbosity tiers for eval output:
@@ -51,25 +48,41 @@ from the workspace `tsconfig` paths via `vite-tsconfig-paths`, and package
 boundaries are expressed in package manifests rather than hard-coded alias
 tables.
 
+Pull request CI runs the same core safety checks: release config validation,
+lint, typecheck, the CI test suite, and the workspace build.
+
 ## Example
 
 The `apps/demo-pi` app shows the intended explicit-run flow:
 
 ```ts
-import { createRefundAgent } from "@demo/foobar";
+import { expect } from "vitest";
 import { piAiHarness } from "@vitest-evals/harness-pi-ai";
 import {
   describeEval,
-  ToolCallJudge,
   namedJudge,
   toolCalls,
+  type JudgeContext,
 } from "vitest-evals";
+import { createRefundAgent } from "../src/refundAgent";
+
+type RefundEvalMetadata = {
+  expectedStatus: "approved" | "denied";
+  expectedTools: string[];
+};
 
 const FactualityJudge = namedJudge(
   "FactualityJudge",
-  async ({ output }) => {
-    const answer = output;
-    const verdict = await judgeFactuality(answer);
+  async ({
+    input,
+    output,
+    metadata,
+  }: JudgeContext<string, RefundEvalMetadata>) => {
+    const verdict = await judgeFactuality({
+      question: input,
+      answer: output,
+      expectedStatus: metadata.expectedStatus,
+    });
 
     return {
       score: verdict.score,
@@ -86,7 +99,7 @@ describeEval(
     harness: piAiHarness({
       createAgent: () => createRefundAgent(),
     }),
-    judges: [ToolCallJudge()],
+    judges: [FactualityJudge],
   },
   (it) => {
     it.for([
@@ -104,7 +117,6 @@ describeEval(
       expect(result.output).toMatchObject({
         status: metadata.expectedStatus,
       });
-      await expect(result).toSatisfyJudge(FactualityJudge);
       expect(toolCalls(result.session).map((call) => call.name)).toEqual(
         metadata.expectedTools,
       );
diff --git a/apps/demo-ai-sdk/README.md b/apps/demo-ai-sdk/README.md
index 20f0034..094d2c5 100644
--- a/apps/demo-ai-sdk/README.md
+++ b/apps/demo-ai-sdk/README.md
@@ -5,11 +5,10 @@ through the workspace packages:
 
 - `vitest-evals`
 - `@vitest-evals/harness-ai-sdk`
-- `@demo/foobar`
 
 The passing live eval lives in `evals/refund.eval.ts`.
-It demonstrates an automatic harness-backed tool judge plus explicit Vitest
-assertions on `run.output` and the normalized session trace.
+It demonstrates app-local refund tools and explicit Vitest assertions on
+`run.output` and the normalized session trace.
 
 The intentionally failing examples live in `evals/refund.fail.eval.ts`.
 One fails an automatic harness-backed judge, and one fails explicit assertions
diff --git a/apps/demo-ai-sdk/evals/refund.eval.ts b/apps/demo-ai-sdk/evals/refund.eval.ts
index ebdf362..e8af022 100644
--- a/apps/demo-ai-sdk/evals/refund.eval.ts
+++ b/apps/demo-ai-sdk/evals/refund.eval.ts
@@ -1,6 +1,5 @@
-import { assertRefundCase } from "@demo/foobar/testing";
 import { describeEval } from "vitest-evals";
-import { refundHarness } from "./shared";
+import { assertRefundCase, refundHarness } from "./shared";
 
 describeEval(
   "demo ai-sdk refund agent",
diff --git a/apps/demo-ai-sdk/evals/shared.ts b/apps/demo-ai-sdk/evals/shared.ts
index af602e1..c5f936e 100644
--- a/apps/demo-ai-sdk/evals/shared.ts
+++ b/apps/demo-ai-sdk/evals/shared.ts
@@ -1,20 +1,96 @@
 import { anthropic } from "@ai-sdk/anthropic";
-import {
-  CREATE_REFUND_DESCRIPTION,
-  LOOKUP_INVOICE_DESCRIPTION,
-  REFUND_SYSTEM_PROMPT,
-  createRefund,
-  lookupInvoice,
-  parseRefundDecision,
-  type RefundCase,
-} from "@demo/foobar";
 import { aiSdkHarness, type AiSdkToolset } from "@vitest-evals/harness-ai-sdk";
 import { generateText, stepCountIs } from "ai";
+import { expect } from "vitest";
+import { type HarnessRun, toolCalls } from "vitest-evals";
 import { z } from "zod";
 
+type InvoiceRecord = {
+  invoiceId: string;
+  amount: number;
+  refundable: boolean;
+  customer: string;
+};
+
+type RefundDecision =
+  | {
+      status: "approved";
+      invoiceId: string;
+      refundId: string;
+      amount: number;
+    }
+  | {
+      status: "denied";
+      invoiceId: string;
+      reason: string;
+    };
+
+export type RefundCase = {
+  input: string;
+  expectedStatus: RefundDecision["status"];
+  expectedTools: string[];
+};
+
+const REFUND_SYSTEM_PROMPT = [
+  "You are the demo refund operations agent.",
+  "You must decide whether a refund should be approved for the invoice in the user's request.",
+  "Always call lookupInvoice before making a decision.",
+  "If the invoice is refundable, call createRefund with the full invoice amount.",
+  "If the invoice is not refundable, do not call createRefund.",
+  "Return JSON only and do not wrap it in markdown.",
+  'Approved shape: {"status":"approved","invoiceId":"...","refundId":"...","amount":4200}',
+  'Denied shape: {"status":"denied","invoiceId":"...","reason":"..."}',
+].join("\n");
+
+const INVOICES: Record<string, InvoiceRecord> = {
+  inv_123: {
+    invoiceId: "inv_123",
+    amount: 4200,
+    refundable: true,
+    customer: "Acme Co",
+  },
+  inv_404: {
+    invoiceId: "inv_404",
+    amount: 1700,
+    refundable: false,
+    customer: "Globex",
+  },
+};
+
+/** Finds a fixture invoice by own key (never inherited ones); throws if unknown. */
+async function lookupInvoice({
+  invoiceId,
+}: {
+  invoiceId: string;
+}): Promise<InvoiceRecord> {
+  const match = Object.entries(INVOICES).find(([id]) => id === invoiceId)?.[1];
+  if (!match) {
+    throw new Error(`Invoice ${invoiceId} not found`);
+  }
+  return match;
+}
+
+/**
+ * Builds the demo refund record: status "submitted", id `rf_<invoiceId>`.
+ */
+async function createRefund({
+  invoiceId,
+  amount,
+}: {
+  invoiceId: string;
+  amount: number;
+}): Promise<{
+  refundId: string;
+  amount: number;
+  status: "submitted";
+}> {
+  const refundId = `rf_${invoiceId}`;
+  return { refundId, amount, status: "submitted" };
+}
+
 const refundTools = {
   lookupInvoice: {
-    description: LOOKUP_INVOICE_DESCRIPTION,
+    description: "Look up invoice details inside demo billing.",
     replay: true,
     inputSchema: z.object({
       invoiceId: z
@@ -24,7 +100,7 @@ const refundTools = {
     execute: lookupInvoice,
   },
   createRefund: {
-    description: CREATE_REFUND_DESCRIPTION,
+    description: "Create a refund for a refundable invoice.",
     inputSchema: z.object({
       invoiceId: z.string().describe("The invoice id that should be refunded."),
       amount: z.number().describe("The amount to refund in cents."),
@@ -46,3 +122,124 @@ export const refundHarness = aiSdkHarness({
     }),
   output: ({ result }) => parseRefundDecision(result.text),
 });
+
+/** Asserts a run's decision status, tool-call order, and reported usage. */
+export async function assertRefundCase(
+  run: HarnessRun,
+  expected: Pick<RefundCase, "expectedStatus" | "expectedTools">,
+) {
+  expect(run.output).toMatchObject({ status: expected.expectedStatus });
+  const calledTools = toolCalls(run.session).map((call) => call.name);
+  expect(calledTools).toEqual(expected.expectedTools);
+  // Usage must report an Anthropic provider, a Claude model, and tokens > 0.
+  const { provider, model, totalTokens } = run.usage;
+  expect(provider).toContain("anthropic");
+  expect(model).toContain("claude");
+  expect(totalTokens).toBeGreaterThan(0);
+}
+
+/**
+ * Parses the model's final text into a validated `RefundDecision`.
+ * The text is unfenced, reduced to its first balanced JSON object, parsed, and
+ * matched against the approved and denied shapes.
+ * @throws SyntaxError when the extracted text is not valid JSON.
+ * @throws Error when the payload matches neither decision shape.
+ */
+function parseRefundDecision(text: string): RefundDecision {
+  const jsonText = extractJsonObjectText(stripMarkdownFence(text));
+  // A bare JSON `null` parses successfully; substitute `{}` so it is rejected
+  // by the shape checks instead of crashing on a property read.
+  const parsed: Record<string, unknown> = JSON.parse(jsonText) ?? {};
+  const { status, invoiceId, refundId, amount, reason } = parsed;
+
+  if (
+    status === "approved" &&
+    typeof invoiceId === "string" &&
+    typeof refundId === "string" &&
+    typeof amount === "number"
+  ) {
+    return { status: "approved", invoiceId, refundId, amount };
+  }
+
+  if (
+    status === "denied" &&
+    typeof invoiceId === "string" &&
+    typeof reason === "string"
+  ) {
+    return { status: "denied", invoiceId, reason };
+  }
+
+  throw new Error(`Refund agent returned an invalid decision payload: ${text}`);
+}
+
+/**
+ * Unwraps a markdown code fence that is unlabelled or labelled `json`
+ * (case-insensitive); any other text is returned trimmed but otherwise intact.
+ */
+function stripMarkdownFence(text: string) {
+  const fence = "```";
+  const body = text.trim();
+  const headerEnd = body.indexOf("\n");
+  const isFenced =
+    body.startsWith(fence) && body.endsWith(fence) && headerEnd !== -1;
+  if (!isFenced) {
+    return body;
+  }
+
+  const label = body.slice(fence.length, headerEnd).trim().toLowerCase();
+  const isJsonFence = label === "" || label === "json";
+  return isJsonFence ? body.slice(headerEnd + 1, -fence.length).trim() : body;
+}
+
+/**
+ * Extracts the first brace-balanced `{...}` span from `text`.
+ * Braces inside double-quoted strings are ignored, and a backslash inside a
+ * string shields the character that follows it. When `text` contains no `{`,
+ * or the opening brace is never balanced, the input is returned unchanged.
+ */
+function extractJsonObjectText(text: string) {
+  const start = text.indexOf("{");
+  if (start === -1) {
+    return text;
+  }
+
+  let depth = 0;
+  let quoted = false;
+  let shielded = false;
+
+  for (let index = start; index < text.length; index += 1) {
+    const char = text[index];
+
+    if (shielded) {
+      // This character follows a backslash inside a string; skip it.
+      shielded = false;
+      continue;
+    }
+
+    if (quoted) {
+      // Inside a string only a backslash or the closing quote matters.
+      shielded = char === "\\";
+      quoted = char !== '"';
+      continue;
+    }
+
+    switch (char) {
+      case '"':
+        quoted = true;
+        break;
+      case "{":
+        depth += 1;
+        break;
+      case "}":
+        depth -= 1;
+        if (depth === 0) {
+          return text.slice(start, index + 1);
+        }
+        break;
+      default:
+        break;
+    }
+  }
+
+  return text;
+}
diff --git a/apps/demo-ai-sdk/package.json b/apps/demo-ai-sdk/package.json
index f3e629b..f55abf1 100644
--- a/apps/demo-ai-sdk/package.json
+++ b/apps/demo-ai-sdk/package.json
@@ -9,7 +9,6 @@
   },
   "dependencies": {
     "@ai-sdk/anthropic": "^3.0.71",
-    "@demo/foobar": "workspace:*",
     "@vitest-evals/harness-ai-sdk": "workspace:*",
     "ai": "^6.0.141",
     "vitest-evals": "workspace:*",
diff --git a/apps/demo-pi/README.md b/apps/demo-pi/README.md
index 3086dcb..36a582b 100644
--- a/apps/demo-pi/README.md
+++ b/apps/demo-pi/README.md
@@ -5,11 +5,11 @@ through the workspace packages:
 
 - `vitest-evals`
 - `@vitest-evals/harness-pi-ai`
-- `@demo/foobar`
 
 The passing live eval lives in `evals/refund.eval.ts`.
-It demonstrates an automatic harness-backed tool judge plus explicit Vitest
-assertions on `run.output` and the normalized session trace.
+It demonstrates an app-local refund agent, an automatic harness-backed tool
+judge, and explicit Vitest assertions on `run.output` and the normalized
+session trace.
 
 The intentionally failing examples live in `evals/refund.fail.eval.ts`.
 One fails an automatic harness-backed judge, and one fails explicit assertions
diff --git a/apps/demo-pi/evals/refund.eval.ts b/apps/demo-pi/evals/refund.eval.ts
index 6d0a350..139b68b 100644
--- a/apps/demo-pi/evals/refund.eval.ts
+++ b/apps/demo-pi/evals/refund.eval.ts
@@ -1,5 +1,4 @@
 import { expect } from "vitest";
-import { createRefundAgent, type RefundCase } from "@demo/foobar";
 import { piAiHarness } from "@vitest-evals/harness-pi-ai";
 import {
   describeEval,
@@ -7,6 +6,7 @@ import {
   ToolCallJudge,
   toolCalls,
 } from "vitest-evals";
+import { createRefundAgent, type RefundCase } from "../src/refundAgent";
 
 const outputJudge = StructuredOutputJudge();
 
diff --git a/apps/demo-pi/evals/refund.fail.eval.ts b/apps/demo-pi/evals/refund.fail.eval.ts
index 35b9732..294dcbe 100644
--- a/apps/demo-pi/evals/refund.fail.eval.ts
+++ b/apps/demo-pi/evals/refund.fail.eval.ts
@@ -1,7 +1,7 @@
 import { expect } from "vitest";
-import { createRefundAgent, type RefundCase } from "@demo/foobar";
 import { piAiHarness } from "@vitest-evals/harness-pi-ai";
 import { describeEval, StructuredOutputJudge } from "vitest-evals";
+import { createRefundAgent, type RefundCase } from "../src/refundAgent";
 
 type AssertionRefundCase = RefundCase;
 type ScoredRefundCase = RefundCase & {
diff --git a/apps/demo-pi/package.json b/apps/demo-pi/package.json
index 1ac85dd..1ce4c30 100644
--- a/apps/demo-pi/package.json
+++ b/apps/demo-pi/package.json
@@ -8,7 +8,8 @@
     "evals:fail": "node ./scripts/run-evals.mjs --fail"
   },
   "dependencies": {
-    "@demo/foobar": "workspace:*",
+    "@mariozechner/pi-agent-core": "0.67.68",
+    "@mariozechner/pi-ai": "0.67.68",
     "@vitest-evals/harness-pi-ai": "workspace:*",
     "vitest-evals": "workspace:*"
   }
diff --git a/packages/foobar/src/index.test.ts b/apps/demo-pi/src/refundAgent.test.ts
similarity index 99%
rename from packages/foobar/src/index.test.ts
rename to apps/demo-pi/src/refundAgent.test.ts
index beafba7..b0ec385 100644
--- a/packages/foobar/src/index.test.ts
+++ b/apps/demo-pi/src/refundAgent.test.ts
@@ -4,7 +4,7 @@ import {
   createRefundAgent,
   lookupInvoice,
   parseRefundDecision,
-} from "./index";
+} from "./refundAgent";
 
 describe("parseRefundDecision", () => {
   test("parses plain approved JSON", () => {
diff --git a/packages/foobar/src/index.ts b/apps/demo-pi/src/refundAgent.ts
similarity index 91%
rename from packages/foobar/src/index.ts
rename to apps/demo-pi/src/refundAgent.ts
index a2c0864..cf4709f 100644
--- a/packages/foobar/src/index.ts
+++ b/apps/demo-pi/src/refundAgent.ts
@@ -47,13 +47,13 @@ export type CreateRefundInput = {
 };
 
 export const LOOKUP_INVOICE_DESCRIPTION =
-  "Look up invoice details inside Foobar billing.";
+  "Look up invoice details inside demo billing.";
 export const CREATE_REFUND_DESCRIPTION =
   "Create a refund for a refundable invoice.";
-type FoobarRefundModel = "claude-sonnet-4-5";
-const DEFAULT_REFUND_MODEL: FoobarRefundModel = "claude-sonnet-4-5";
+type RefundAgentModel = "claude-sonnet-4-5";
+const DEFAULT_REFUND_MODEL: RefundAgentModel = "claude-sonnet-4-5";
 export const REFUND_SYSTEM_PROMPT = [
-  "You are Foobar's refund operations agent.",
+  "You are the demo refund operations agent.",
   "You must decide whether a refund should be approved for the invoice in the user's request.",
   "Always call lookupInvoice before making a decision.",
   "If the invoice is refundable, call createRefund with the full invoice amount.",
@@ -104,7 +104,7 @@ export async function createRefund({
   };
 }
 
-const foobarTools = {
+const refundAgentTools = {
   lookupInvoice: {
     description: LOOKUP_INVOICE_DESCRIPTION,
     replay: true,
@@ -116,14 +116,14 @@ const foobarTools = {
   },
 } satisfies PiAiToolset<string, RefundEvalMetadata>;
 
-type FoobarRuntime = PiAiRuntime<
-  typeof foobarTools,
+type RefundAgentRuntime = PiAiRuntime<
+  typeof refundAgentTools,
   string,
   RefundEvalMetadata
 >;
-type FoobarRuntimeTools = FoobarRuntime["tools"];
+type RefundAgentRuntimeTools = RefundAgentRuntime["tools"];
 
-const fallbackRuntimeTools: FoobarRuntimeTools = {
+const fallbackRuntimeTools: RefundAgentRuntimeTools = {
   lookupInvoice,
   createRefund,
 };
@@ -146,13 +146,11 @@ const createRefundParameters = Type.Object({
 type LookupInvoiceArgs = Static<typeof lookupInvoiceParameters>;
 type CreateRefundArgs = Static<typeof createRefundParameters>;
 
-export class FoobarRefundAgent {
+export class RefundAgent {
   private readonly agent: Agent;
-  readonly toolset = foobarTools;
+  readonly toolset = refundAgentTools;
 
-  constructor(
-    private readonly model: FoobarRefundModel = DEFAULT_REFUND_MODEL,
-  ) {
+  constructor(private readonly model: RefundAgentModel = DEFAULT_REFUND_MODEL) {
     this.agent = new Agent({
       initialState: {
         systemPrompt: REFUND_SYSTEM_PROMPT,
@@ -164,7 +162,7 @@ export class FoobarRefundAgent {
     });
   }
 
-  async run(input: string, runtime: FoobarRuntime) {
+  async run(input: string, runtime: RefundAgentRuntime) {
     await this.agent.reset();
     this.agent.state.systemPrompt = REFUND_SYSTEM_PROMPT;
     this.agent.state.model = getModel("anthropic", this.model);
@@ -213,12 +211,12 @@ export class FoobarRefundAgent {
 }
 
 /** Creates a fresh demo refund agent for one eval run. */
-export function createRefundAgent(options?: { model?: FoobarRefundModel }) {
-  return new FoobarRefundAgent(options?.model ?? DEFAULT_REFUND_MODEL);
+export function createRefundAgent(options?: { model?: RefundAgentModel }) {
+  return new RefundAgent(options?.model ?? DEFAULT_REFUND_MODEL);
 }
 
 function createAgentTools(
-  runtimeTools: FoobarRuntimeTools = fallbackRuntimeTools,
+  runtimeTools: RefundAgentRuntimeTools = fallbackRuntimeTools,
 ): Array<AgentTool<any, any>> {
   const lookupInvoiceTool: AgentTool<
     typeof lookupInvoiceParameters,
diff --git a/docs/architecture.md b/docs/architecture.md
index 5ef7bad..093d51c 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -28,8 +28,8 @@ packages/
       legacy/
   harness-ai-sdk/
   harness-pi-ai/
-  foobar/
 apps/
+  demo-ai-sdk/
   demo-pi/
 ```
 
@@ -105,7 +105,7 @@ For each eval test in a harness-backed suite:
 2. The suite callback registers named eval tests.
 3. The eval test calls `run(input)` at the point execution should happen.
 4. The configured harness runs the system under test exactly once.
-5. The harness returns a `HarnessRun` with `run.output`, `run.session`,
+5. The harness returns a `HarnessRun` with `result.output`, `result.session`,
    `usage`, `timings`, `artifacts`, and `errors`.
 6. Core stores that run on `task.meta.harness` for the reporter.
 7. Automatic suite-level judges run against the normalized run/session pair.
@@ -134,6 +134,12 @@ standard tool replay/VCR behavior for opt-in tools, including:
 Replay metadata becomes part of the normalized tool record so the reporter can
 surface it.
 
+## Demo Apps
+
+`apps/demo-pi` and `apps/demo-ai-sdk` own their demo fixtures locally. They stay
+under `apps/` because they are product demos, while `packages/` is reserved for
+real package surfaces that can be published or consumed independently.
+
 ## Extension Points
 
 ### New Harnesses
diff --git a/docs/custom-scorers.md b/docs/custom-scorers.md
index 95d000c..5ca6111 100644
--- a/docs/custom-scorers.md
+++ b/docs/custom-scorers.md
@@ -58,13 +58,10 @@ judge needs richer context, type it with `JudgeContext` and read `metadata`,
 
 ## Built-In Root Judges
 
-The root package ships judge-shaped helpers for common cases:
-
-```ts
-import { StructuredOutputJudge, ToolCallJudge } from "vitest-evals";
-```
-
-These operate on normalized harness data instead of raw scorer inputs.
+The root package still ships deterministic judge-shaped helpers such as
+`StructuredOutputJudge()` and `ToolCallJudge()`. They operate on normalized
+harness data instead of raw scorer inputs, but new docs should keep factuality
+or rubric judges as the primary examples.
 
 ## Legacy Scorer Example
 
diff --git a/docs/development-guide.md b/docs/development-guide.md
index e0ccffc..4d27ba0 100644
--- a/docs/development-guide.md
+++ b/docs/development-guide.md
@@ -73,10 +73,11 @@ Owns:
 - wrapped tool runtime injection
 - tool replay/VCR behavior
 
-### `packages/foobar` and `apps/demo-pi`
+## Demo Apps
 
-Own the example runtime seam and live demo eval coverage. Keep them realistic.
-They are part of the product story, not just smoke tests.
+`apps/demo-pi` and `apps/demo-ai-sdk` own live demo eval coverage and any
+app-local refund fixtures they need. Keep them realistic; they are part of the
+product story, not just smoke tests. `packages/` is for real package surfaces.
 
 ## Adding a New Judge
 
@@ -144,7 +145,7 @@ For targeted work, prefer narrow verification:
 
 - reporter changes: run reporter tests
 - harness changes: run the relevant harness package tests
-- demo/runtime changes: run `pnpm evals` or a filtered app/package eval command
+- demo app changes: run `pnpm evals` or a filtered app eval command
 - legacy changes: run the moved tests under `packages/vitest-evals/src/legacy`
 
 ## Documentation Expectations
diff --git a/docs/scorer-examples.md b/docs/scorer-examples.md
index e225711..f86b6c8 100644
--- a/docs/scorer-examples.md
+++ b/docs/scorer-examples.md
@@ -51,38 +51,11 @@ export const LookupThenRefundJudge: JudgeFn = async ({ toolCalls }) => {
 await expect(result).toSatisfyJudge(FactualityJudge);
 ```
 
-## Built-In Judge Helpers
+## Deterministic Helper Note
 
-```ts
-import { StructuredOutputJudge, ToolCallJudge } from "vitest-evals";
-
-describeEval(
-  "refund agent",
-  {
-    harness: piAiHarness({
-      createAgent: () => createRefundAgent(),
-    }),
-    judges: [ToolCallJudge()],
-  },
-  (it) => {
-    it("approves a refund", async ({ run }) => {
-      const result = await run("Refund invoice inv_123", {
-        metadata: {
-          expected: { status: "approved" },
-          expectedTools: [
-            { name: "lookupInvoice" },
-            { name: "createRefund" },
-          ],
-        },
-      });
-
-      await expect(result).toSatisfyJudge(StructuredOutputJudge(), {
-        expected: { status: "approved" },
-      });
-    });
-  },
-);
-```
+Built-ins such as `StructuredOutputJudge()` and `ToolCallJudge()` still exist
+for deterministic contract checks. New docs should use factuality or rubric
+judges as the primary examples.
 
 ## Legacy Scorer Example
 
diff --git a/docs/testing.md b/docs/testing.md
index 6591bac..6df0df2 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -25,6 +25,10 @@ packages/harness-ai-sdk/src/
   index.test.ts
 packages/harness-pi-ai/src/
   index.test.ts
+apps/demo-pi/src/
+  refundAgent.test.ts
+apps/demo-ai-sdk/evals/
+  *.eval.ts
 apps/demo-pi/evals/
   *.eval.ts
 ```
diff --git a/package.json b/package.json
index 0b8438f..dd7b740 100644
--- a/package.json
+++ b/package.json
@@ -13,9 +13,9 @@
     "prepare": "simple-git-hooks",
     "release:check": "node ./scripts/check-release-config.mjs",
     "typecheck": "tsc --noEmit",
-    "test": "dotenv -e .env -e .env.local -- vitest run packages --config=./vitest.config.ts --reporter=./packages/vitest-evals/src/reporter.ts",
-    "test:watch": "dotenv -e .env -e .env.local -- vitest packages --config=./vitest.config.ts --reporter=./packages/vitest-evals/src/reporter.ts",
-    "test:ci": "dotenv -e .env -e .env.local -- vitest run packages --config=./vitest.config.ts --coverage --reporter=./packages/vitest-evals/src/reporter.ts --reporter=junit --outputFile=tests.junit.xml"
+    "test": "dotenv -e .env -e .env.local -- vitest run packages apps --config=./vitest.config.ts --reporter=./packages/vitest-evals/src/reporter.ts",
+    "test:watch": "dotenv -e .env -e .env.local -- vitest packages apps --config=./vitest.config.ts --reporter=./packages/vitest-evals/src/reporter.ts",
+    "test:ci": "dotenv -e .env -e .env.local -- vitest run packages apps --config=./vitest.config.ts --coverage --reporter=./packages/vitest-evals/src/reporter.ts --reporter=junit --outputFile=tests.junit.xml"
   },
   "repository": {
     "type": "git",
diff --git a/packages/foobar/package.json b/packages/foobar/package.json
deleted file mode 100644
index a2f8ecd..0000000
--- a/packages/foobar/package.json
+++ /dev/null
@@ -1,34 +0,0 @@
-{
-  "name": "@demo/foobar",
-  "private": true,
-  "version": "0.1.0",
-  "types": "./dist/index.d.ts",
-  "main": "./dist/index.js",
-  "module": "./dist/index.mjs",
-  "files": ["dist"],
-  "exports": {
-    ".": {
-      "source": "./src/index.ts",
-      "types": "./dist/index.d.ts",
-      "require": "./dist/index.js",
-      "import": "./dist/index.mjs"
-    },
-    "./testing": {
-      "source": "./src/testing.ts",
-      "types": "./dist/testing.d.ts",
-      "require": "./dist/testing.js",
-      "import": "./dist/testing.mjs"
-    }
-  },
-  "dependencies": {
-    "@mariozechner/pi-agent-core": "0.67.68",
-    "@mariozechner/pi-ai": "0.67.68"
-  },
-  "devDependencies": {
-    "vitest": "^4.1.2",
-    "vitest-evals": "workspace:*"
-  },
-  "scripts": {
-    "build": "tsup --config ./tsup.config.ts"
-  }
-}
diff --git a/packages/foobar/src/testing.ts b/packages/foobar/src/testing.ts
deleted file mode 100644
index 6277f7c..0000000
--- a/packages/foobar/src/testing.ts
+++ /dev/null
@@ -1,18 +0,0 @@
-import { expect } from "vitest";
-import { type HarnessRun, toolCalls } from "vitest-evals";
-import type { RefundCase } from "./index";
-
-export async function assertRefundCase(
-  run: HarnessRun,
-  expected: Pick<RefundCase, "expectedStatus" | "expectedTools">,
-) {
-  expect(run.output).toMatchObject({
-    status: expected.expectedStatus,
-  });
-  expect(toolCalls(run.session).map((call) => call.name)).toEqual(
-    expected.expectedTools,
-  );
-  expect(run.usage.provider).toContain("anthropic");
-  expect(run.usage.model).toContain("claude");
-  expect(run.usage.totalTokens).toBeGreaterThan(0);
-}
diff --git a/packages/foobar/tsconfig.json b/packages/foobar/tsconfig.json
deleted file mode 100644
index 9e25e6e..0000000
--- a/packages/foobar/tsconfig.json
+++ /dev/null
@@ -1,4 +0,0 @@
-{
-  "extends": "../../tsconfig.base.json",
-  "include": ["src/**/*.ts"]
-}
diff --git a/packages/foobar/tsup.config.ts b/packages/foobar/tsup.config.ts
deleted file mode 100644
index a57f24f..0000000
--- a/packages/foobar/tsup.config.ts
+++ /dev/null
@@ -1,11 +0,0 @@
-import { defineConfig } from "tsup";
-
-export default defineConfig({
-  entry: ["src/**/*.ts", "!src/**/*.test.ts", "!src/**/*.test.*.ts"],
-  format: ["cjs", "esm"],
-  dts: true,
-  splitting: false,
-  sourcemap: true,
-  clean: true,
-  external: ["vitest", "vitest-evals"],
-});
diff --git a/packages/harness-ai-sdk/src/index.test.ts b/packages/harness-ai-sdk/src/index.test.ts
index 50b5a93..78a0a55 100644
--- a/packages/harness-ai-sdk/src/index.test.ts
+++ b/packages/harness-ai-sdk/src/index.test.ts
@@ -517,7 +517,10 @@ test("omits empty runtime tool error content when a task errors", async () => {
         inputSchema: z.object({
           invoiceId: z.string(),
         }),
-        execute: async () => {
+        execute: async (
+          _input: { invoiceId: string },
+          _execution: ToolExecutionOptions,
+        ) => {
           throw new Error("");
         },
       },
diff --git a/packages/harness-ai-sdk/src/index.ts b/packages/harness-ai-sdk/src/index.ts
index f402243..ebcbc9f 100644
--- a/packages/harness-ai-sdk/src/index.ts
+++ b/packages/harness-ai-sdk/src/index.ts
@@ -861,7 +861,7 @@ function resolveSession(
   }
 
   const unmatchedRuntimeToolCalls = runtimeToolCalls.filter(
-    (call) => !stepToolCallIds.has(call.id),
+    (call) => call.id === undefined || !stepToolCallIds.has(call.id),
   );
 
   if (unmatchedRuntimeToolCalls.length > 0) {
diff --git a/packages/harness-pi-ai/README.md b/packages/harness-pi-ai/README.md
index bb45d45..0a7abaf 100644
--- a/packages/harness-pi-ai/README.md
+++ b/packages/harness-pi-ai/README.md
@@ -53,7 +53,8 @@ normalization hooks still exist under `normalize`:
 
 ```ts
 const harness = piAiHarness({
-  createAgent: () => createRefundAgent(),
+  createAgent: () => createWrappedRefundAgent(),
+  run: ({ agent, input, runtime }) => agent.run(input, runtime),
   normalize: {
     output: ({ result }) => result.customDecision,
   },
diff --git a/packages/harness-pi-ai/src/index.ts b/packages/harness-pi-ai/src/index.ts
index 9cde050..66eb68f 100644
--- a/packages/harness-pi-ai/src/index.ts
+++ b/packages/harness-pi-ai/src/index.ts
@@ -61,6 +61,13 @@ type PiAgentToolLike<
 
 const ORIGINAL_NATIVE_EXECUTE = Symbol("vitest-evals.originalNativeExecute");
 
+type NativeToolExecute<
+  TInput,
+  TMetadata extends HarnessMetadata,
+> = PiAgentToolLike<TInput, TMetadata>["execute"] & {
+  [ORIGINAL_NATIVE_EXECUTE]?: PiAgentToolLike<TInput, TMetadata>["execute"];
+};
+
 export type PiAiReplayMode = ReplayMode;
 
 export interface PiAiEventSink {
@@ -426,12 +433,13 @@ async function executePiHarnessRun<
       return result;
     }
 
+    const normalizeResult = result as TResult;
     const resultArgs = {
       agent,
       input,
       context,
       runtime,
-      result,
+      result: normalizeResult,
     } satisfies PiAiHarnessResultArgs<
       TAgent,
       TInput,
@@ -442,13 +450,13 @@ async function executePiHarnessRun<
 
     const output = options.normalize?.output
       ? await options.normalize.output(resultArgs)
-      : resolveOutput(result);
+      : resolveOutput(normalizeResult);
     const usage = options.normalize?.usage
       ? await options.normalize.usage(resultArgs)
-      : resolveUsage(result, runtime.toolCalls.length);
+      : resolveUsage(normalizeResult, runtime.toolCalls.length);
     const session = options.normalize?.session
       ? await options.normalize.session(resultArgs)
-      : resolveSession(result, messages, output, usage);
+      : resolveSession(normalizeResult, messages, output, usage);
 
     return {
       session,
@@ -463,7 +471,7 @@ async function executePiHarnessRun<
           : undefined,
       errors: options.normalize?.errors
         ? await options.normalize.errors(resultArgs)
-        : resolveErrors(result),
+        : resolveErrors(normalizeResult),
     };
   } catch (error) {
     const usage = resolveUsage(undefined, runtime.toolCalls.length);
@@ -734,7 +742,10 @@ async function withInstrumentedAgentTools<
 
     const originalExecute = getNativeToolExecuteOrigin(tool.execute);
     originalExecutions.set(tool, originalExecute);
-    const instrumentedExecute = async (toolCallId, rawArgs) => {
+    const instrumentedExecute: NativeToolExecute<TInput, TMetadata> = async (
+      toolCallId: string,
+      rawArgs: Record<string, JsonValue>,
+    ) => {
       const startedAt = new Date();
       const toolContext = {
         input: args.input,
@@ -780,7 +791,7 @@ async function withInstrumentedAgentTools<
         const call = {
           name: tool.name,
           arguments: rawArgs,
-          error: serializeError(error),
+          error: serializeToolCallError(error),
           startedAt: startedAt.toISOString(),
           finishedAt: finishedAt.toISOString(),
           durationMs: finishedAt.getTime() - startedAt.getTime(),
@@ -878,10 +889,24 @@ function isPromiseLike(value: unknown): value is Promise<unknown> {
   );
 }
 
+function serializeToolCallError( // Adapts serializeError() output to the ToolCallRecord error shape.
+  error: unknown,
+): NonNullable<ToolCallRecord["error"]> {
+  const serialized = serializeError(error);
+  const { message, type, ...details } = serialized; // pull out the two fields that are re-typed below
+
+  return {
+    ...details, // every other serialized field passes through unchanged
+    message: typeof message === "string" ? message : String(message), // non-string messages are coerced with String()
+    ...(typeof type === "string" ? { type } : {}), // `type` is included only when it is a string
+  };
+}
+
 function getNativeToolExecuteOrigin<TInput, TMetadata extends HarnessMetadata>(
   execute: PiAgentToolLike<TInput, TMetadata>["execute"],
 ) {
-  return execute[ORIGINAL_NATIVE_EXECUTE] ?? execute;
+  const nativeExecute = execute as NativeToolExecute<TInput, TMetadata>; // NOTE(review): cast presumably makes the ORIGINAL_NATIVE_EXECUTE key type-check — confirm
+  return nativeExecute[ORIGINAL_NATIVE_EXECUTE] ?? nativeExecute; // value stored under ORIGINAL_NATIVE_EXECUTE when present, else the function itself
 }
 
 async function executeNativeToolWithReplay<
@@ -1033,7 +1058,7 @@ function createRuntime<
           const call = {
             name: toolName,
             arguments: args,
-            error: serializeError(error),
+            error: serializeToolCallError(error),
             startedAt: startedAt.toISOString(),
             finishedAt: finishedAt.toISOString(),
             durationMs: finishedAt.getTime() - startedAt.getTime(),
diff --git a/packages/vitest-evals/README.md b/packages/vitest-evals/README.md
index 6d3dd37..286570d 100644
--- a/packages/vitest-evals/README.md
+++ b/packages/vitest-evals/README.md
@@ -12,6 +12,8 @@ Install a first-party harness package for the runtime you want to test:
 
 ```sh
 npm install -D @vitest-evals/harness-pi-ai
+# or
+npm install -D @vitest-evals/harness-ai-sdk
 ```
 
 ## Core Model
@@ -20,9 +22,9 @@ npm install -D @vitest-evals/harness-pi-ai
 - the suite callback receives a fixture-backed Vitest `it`
 - `run(input, { metadata? })` executes the harness explicitly and returns a
   normalized `HarnessRun`
-- `run.output` is the app-facing value you assert on directly
-- `run.session` is the canonical JSON-serializable trace for reporting, replay,
-  tool assertions, and judges
+- the returned `result.output` is the app-facing value you assert on directly
+- the returned `result.session` is the canonical JSON-serializable trace for
+  reporting, replay, tool assertions, and judges
 - per-run judge inputs should usually live under `metadata`
 - suite-level `judges` are optional and run automatically after each `run(...)`
 - suite-level `judgeThreshold` controls fail-on-score for those automatic judges
@@ -32,20 +34,33 @@ npm install -D @vitest-evals/harness-pi-ai
 ## Explicit Run Example
 
 ```ts
-import { createRefundAgent } from "@demo/foobar";
+import { expect } from "vitest";
 import { piAiHarness } from "@vitest-evals/harness-pi-ai";
 import {
   describeEval,
-  ToolCallJudge,
   namedJudge,
   toolCalls,
+  type JudgeContext,
 } from "vitest-evals";
+import { createRefundAgent } from "../src/refundAgent";
+
+type RefundEvalMetadata = {
+  expectedStatus: "approved" | "denied";
+  expectedTools: string[];
+};
 
 const FactualityJudge = namedJudge(
   "FactualityJudge",
-  async ({ output }) => {
-    const answer = output;
-    const verdict = await judgeFactuality(answer);
+  async ({
+    input,
+    output,
+    metadata,
+  }: JudgeContext<string, RefundEvalMetadata>) => {
+    const verdict = await judgeFactuality({
+      question: input,
+      answer: output,
+      expectedStatus: metadata.expectedStatus,
+    });
 
     return {
       score: verdict.score,
@@ -62,12 +77,13 @@ describeEval(
     harness: piAiHarness({
       createAgent: () => createRefundAgent(),
     }),
-    judges: [ToolCallJudge()],
+    judges: [FactualityJudge],
   },
   (it) => {
     it("approves a refundable invoice", async ({ run }) => {
       const result = await run("Refund invoice inv_123", {
         metadata: {
+          expectedStatus: "approved",
           expectedTools: ["lookupInvoice", "createRefund"],
         },
       });
@@ -77,7 +93,6 @@ describeEval(
         "lookupInvoice",
         "createRefund",
       ]);
-      await expect(result).toSatisfyJudge(FactualityJudge);
     });
   },
 );
@@ -171,22 +186,17 @@ const FactualityJudge = namedJudge(
 );
 ```
 
-A simple factuality judge can just score `output`, which is the normalized
-response text that `toSatisfyJudge(...)` passes automatically. Structured or
-programmatic result checks should usually read `run.output` instead. When a
-judge needs richer context, type it with `JudgeContext` and read `inputValue`,
-`metadata`, `toolCalls`, or `session` from there.
-
-When you only need deterministic contract checks, the built-ins are still
-useful:
-
-```ts
-import { StructuredOutputJudge, ToolCallJudge } from "vitest-evals";
-
-await expect(result).toSatisfyJudge(StructuredOutputJudge(), {
-  expected: { status: "approved" },
-});
-```
+For a `HarnessRun`, `toSatisfyJudge(...)` passes `result.output` as `output`.
+For raw values or normalized sessions, the matcher infers the best available
+output from the received value. Structured or programmatic result checks should
+usually assert on `result.output` directly. When a judge needs richer context,
+type it with `JudgeContext` and read `inputValue`, `metadata`, `toolCalls`, or
+`session` from there.
+
+When you only need deterministic contract checks, built-ins such as
+`StructuredOutputJudge()` and `ToolCallJudge()` are still available. The primary
+documentation examples intentionally use factuality/rubric judges because those
+match the product's LLM-as-a-judge direction.
 
 ## Legacy Compatibility
 
diff --git a/packages/vitest-evals/src/harness.test.ts b/packages/vitest-evals/src/harness.test.ts
index 16d1b8b..957138f 100644
--- a/packages/vitest-evals/src/harness.test.ts
+++ b/packages/vitest-evals/src/harness.test.ts
@@ -12,6 +12,7 @@ import {
   type Harness,
   type HarnessContext,
   type HarnessRun,
+  type NormalizedSession,
 } from "./index";
 
 type RefundEvalMetadata = {
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 4ba1089..031cd0d 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -62,9 +62,6 @@ importers:
       '@ai-sdk/anthropic':
         specifier: ^3.0.71
         version: 3.0.71(zod@4.3.6)
-      '@demo/foobar':
-        specifier: workspace:*
-        version: link:../../packages/foobar
       '@vitest-evals/harness-ai-sdk':
         specifier: workspace:*
         version: link:../../packages/harness-ai-sdk
@@ -79,18 +76,6 @@ importers:
         version: 4.3.6
 
   apps/demo-pi:
-    dependencies:
-      '@demo/foobar':
-        specifier: workspace:*
-        version: link:../../packages/foobar
-      '@vitest-evals/harness-pi-ai':
-        specifier: workspace:*
-        version: link:../../packages/harness-pi-ai
-      vitest-evals:
-        specifier: workspace:*
-        version: link:../../packages/vitest-evals
-
-  packages/foobar:
     dependencies:
       '@mariozechner/pi-agent-core':
         specifier: 0.67.68
@@ -98,13 +83,12 @@ importers:
       '@mariozechner/pi-ai':
         specifier: 0.67.68
         version: 0.67.68(ws@8.20.0)(zod@4.3.6)
-    devDependencies:
-      vitest:
-        specifier: ^4.1.2
-        version: 4.1.2(@opentelemetry/api@1.9.0)(@types/node@25.5.0)(vite@7.0.6(@types/node@25.5.0)(yaml@2.8.0))
+      '@vitest-evals/harness-pi-ai':
+        specifier: workspace:*
+        version: link:../../packages/harness-pi-ai
       vitest-evals:
         specifier: workspace:*
-        version: link:../vitest-evals
+        version: link:../../packages/vitest-evals
 
   packages/harness-ai-sdk:
     devDependencies:
diff --git a/tsconfig.base.json b/tsconfig.base.json
index f0d7025..31e6d3f 100644
--- a/tsconfig.base.json
+++ b/tsconfig.base.json
@@ -11,9 +11,7 @@
       "vitest-evals": ["packages/vitest-evals/src/index.ts"],
       "vitest-evals/*": ["packages/vitest-evals/src/*"],
       "@vitest-evals/harness-ai-sdk": ["packages/harness-ai-sdk/src/index.ts"],
-      "@vitest-evals/harness-pi-ai": ["packages/harness-pi-ai/src/index.ts"],
-      "@demo/foobar": ["packages/foobar/src/index.ts"],
-      "@demo/foobar/testing": ["packages/foobar/src/testing.ts"]
+      "@vitest-evals/harness-pi-ai": ["packages/harness-pi-ai/src/index.ts"]
     }
   }
 }