hookdeck · leggetter · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/docs/agent-evaluation/package.json b/docs/agent-evaluation/package.json
@@ -12,9 +12,10 @@
     "score": "node --import tsx src/score-eval.ts",
     "viz:trajectory": "node --import tsx src/generate-trajectory-html.ts",
     "typecheck": "tsc --noEmit",
-    "test": "npm run test:trajectory && npm run test:redact-secrets",
+    "test": "npm run test:trajectory && npm run test:redact-secrets && npm run test:llm-judge-parse",
     "test:trajectory": "node --import tsx src/trajectory-fixture-smoke.ts",
-    "test:redact-secrets": "node --import tsx src/redact-secrets.test.ts"
+    "test:redact-secrets": "node --import tsx src/redact-secrets.test.ts",
+    "test:llm-judge-parse": "node --import tsx src/llm-judge-parse.test.ts"
   },
   "engines": {
     "node": ">=18"

diff --git a/docs/agent-evaluation/src/llm-judge-parse.test.ts b/docs/agent-evaluation/src/llm-judge-parse.test.ts
@@ -0,0 +1,68 @@
+/**
+ * Unit tests for LLM judge JSON parsing / overall reconciliation.
+ */
+import {
+  parseJudgeBooleanExplicit,
+  reconcileOverallTranscriptPass,
+} from "./llm-judge.js";
+
+/** Throws an Error whose message is prefixed with "Assertion failed: " when `condition` is false. */
+function assert(condition: boolean, message: string): void {
+  if (condition) return;
+  throw new Error(`Assertion failed: ${message}`);
+}
+
+/** Pins `.value` for boolean, string, and undefined inputs, then `.explicit` for three of them. */
+function testParseJudgeBooleanExplicit(): void {
+  const valueCases: ReadonlyArray<readonly [unknown, boolean, string]> = [
+    [true, true, "boolean true"], [false, false, "boolean false"], ["false", false, 'string "false"'],
+    ["FALSE", false, 'string "FALSE"'], ["true", true, 'string "true"'], [undefined, false, "undefined => false"],
+  ];
+  for (const [raw, want, label] of valueCases) assert(parseJudgeBooleanExplicit(raw).value === want, label);
+  assert(!parseJudgeBooleanExplicit(undefined).explicit, "undefined not explicit");
+  assert(parseJudgeBooleanExplicit("false").explicit, '"false" is explicit');
+  assert(parseJudgeBooleanExplicit(true).explicit, "boolean is explicit");
+}
+
+/** Every criterion passes, so the reconciled overall must be true although the model said false. */
+function testReconcileAllCriteriaPassOverridesOverallFalse(): void {
+  const allPassing = [
+    { criterion: "sdk", pass: true, evidence: "ok" },
+    { criterion: "execution", pass: true, evidence: "ok" },
+  ];
+  const reconciled = reconcileOverallTranscriptPass(false, allPassing);
+  const label = "all criteria pass => overall true even when model said false";
+  assert(reconciled === true, label);
+}
+
+/** One failing criterion must force the reconciled overall to false although the model said true. */
+function testReconcileAnyCriterionFailForcesOverallFalse(): void {
+  const oneFailing = [
+    { criterion: "sdk", pass: true, evidence: "ok" },
+    { criterion: "execution", pass: false, evidence: "failed" },
+  ];
+  const reconciled = reconcileOverallTranscriptPass(true, oneFailing);
+  const label = "any criterion fail => overall false even when model said true";
+  assert(reconciled === false, label);
+}
+
+/**
+ * With an empty criteria list there is nothing to combine, so the model's
+ * own overall verdict must come back unchanged for both true and false.
+ */
+function testReconcileEmptyCriteriaUsesModelOverall(): void {
+  const keptTrue = reconcileOverallTranscriptPass(true, []);
+  assert(keptTrue === true, "no criteria => keep model overall true");
+  const keptFalse = reconcileOverallTranscriptPass(false, []);
+  assert(keptFalse === false, "no criteria => keep model overall false");
+}
+
+/** Runs each test in order; a failed assertion throws before the OK line is logged via console.error. */
+function main(): void {
+  [testParseJudgeBooleanExplicit, testReconcileAllCriteriaPassOverridesOverallFalse,
+    testReconcileAnyCriterionFailForcesOverallFalse, testReconcileEmptyCriteriaUsesModelOverall,
+  ].forEach((runTest) => runTest());
+  console.error("llm-judge-parse.test: OK");
+}
+
+main();
diff --git a/docs/agent-evaluation/src/llm-judge.ts b/docs/agent-evaluation/src/llm-judge.ts
@@ -90,6 +90,40 @@ function stripJsonFence(text: string): string {
   return t;
 }
 
+/** Overall verdict: the AND of every criterion's pass, or the model's own value when criteria is empty. */
+export function reconcileOverallTranscriptPass(
+  overall_from_model: boolean,
+  criteria: readonly LlmCriterionJudgment[],
+): boolean {
+  // An empty list carries no per-criterion signal, so it defers to the model's verdict.
+  const hasCriteria = criteria.length > 0;
+  const anyFailed = criteria.some((judgment) => !judgment.pass);
+  return hasCriteria ? !anyFailed : overall_from_model;
+}
+
+/**
+ * Reads a boolean out of judge JSON. Real booleans and the strings "true"/"false"
+ * (trimmed, case-insensitive) are reported as explicit; any other input is
+ * reported as non-explicit with its JavaScript truthiness as the value.
+ */
+export function parseJudgeBooleanExplicit(
+  value: unknown,
+): { readonly explicit: boolean; readonly value: boolean } {
+  switch (typeof value) {
+    case "boolean":
+      return { explicit: true, value };
+    case "string": {
+      const token = value.trim().toLowerCase();
+      if (token === "true" || token === "false") return { explicit: true, value: token === "true" };
+    }
+  }
+  return { explicit: false, value: Boolean(value) };
+}
+
+function parseJudgeBoolean(value: unknown): boolean {
+  return parseJudgeBooleanExplicit(value).value; // value-only wrapper: the `explicit` flag is discarded
+}
+
 function parseJudgeJson(text: string): Omit<LlmJudgeReport, "model" | "runFile" | "scenarioFile" | "version"> & {
   version?: number;
 } {
@@ -101,7 +135,8 @@ function parseJudgeJson(text: string): Omit<LlmJudgeReport, "model" | "runFile"
     const detail = parse_err instanceof Error ? parse_err.message : String(parse_err);
     throw new Error(`JSON.parse failed: ${detail}`);
   }
-  const overall = Boolean(parsed.overall_transcript_pass);
+  const overall_parsed = parseJudgeBooleanExplicit(parsed.overall_transcript_pass);
+  const overall_from_model = overall_parsed.value;
   const criteriaIn = parsed.criteria;
   const criteria: LlmCriterionJudgment[] = [];
   if (Array.isArray(criteriaIn)) {
@@ -110,11 +145,17 @@ function parseJudgeJson(text: string): Omit<LlmJudgeReport, "model" | "runFile"
       const o = c as Record<string, unknown>;
       criteria.push({
         criterion: String(o.criterion ?? o.id ?? "unnamed"),
-        pass: Boolean(o.pass),
+        pass: parseJudgeBoolean(o.pass),
         evidence: String(o.evidence ?? ""),
       });
     }
   }
+  const overall = reconcileOverallTranscriptPass(overall_from_model, criteria);
+  if (criteria.length > 0 && overall_parsed.explicit && overall !== overall_from_model) {
+    console.error(
+      `LLM judge: reconciled overall_transcript_pass ${overall_from_model} -> ${overall} (${criteria.length} criteria)`,
+    );
+  }
   const exec = parsed.execution_in_transcript;
   let execution_in_transcript: LlmJudgeReport["execution_in_transcript"] = {
     pass: null,