diff --git a/docs/agent-evaluation/package.json b/docs/agent-evaluation/package.json
index 6bf563f0..4938cc76 100644
--- a/docs/agent-evaluation/package.json
+++ b/docs/agent-evaluation/package.json
@@ -12,9 +12,10 @@
     "score": "node --import tsx src/score-eval.ts",
     "viz:trajectory": "node --import tsx src/generate-trajectory-html.ts",
     "typecheck": "tsc --noEmit",
-    "test": "npm run test:trajectory && npm run test:redact-secrets",
+    "test": "npm run test:trajectory && npm run test:redact-secrets && npm run test:llm-judge-parse",
     "test:trajectory": "node --import tsx src/trajectory-fixture-smoke.ts",
-    "test:redact-secrets": "node --import tsx src/redact-secrets.test.ts"
+    "test:redact-secrets": "node --import tsx src/redact-secrets.test.ts",
+    "test:llm-judge-parse": "node --import tsx src/llm-judge-parse.test.ts"
   },
   "engines": {
     "node": ">=18"
diff --git a/docs/agent-evaluation/src/llm-judge-parse.test.ts b/docs/agent-evaluation/src/llm-judge-parse.test.ts
new file mode 100644
index 00000000..d8e902cd
--- /dev/null
+++ b/docs/agent-evaluation/src/llm-judge-parse.test.ts
@@ -0,0 +1,72 @@
+/**
+ * Unit tests for LLM judge JSON parsing / overall reconciliation.
+ */
+import {
+  parseJudgeBooleanExplicit,
+  reconcileOverallTranscriptPass,
+} from "./llm-judge.js";
+
+function assert(condition: boolean, message: string): void {
+  if (!condition) {
+    throw new Error(`Assertion failed: ${message}`);
+  }
+}
+
+function testParseJudgeBooleanExplicit(): void {
+  assert(parseJudgeBooleanExplicit(true).value === true, "boolean true");
+  assert(parseJudgeBooleanExplicit(false).value === false, "boolean false");
+  assert(parseJudgeBooleanExplicit("false").value === false, 'string "false"');
+  assert(parseJudgeBooleanExplicit("FALSE").value === false, 'string "FALSE"');
+  assert(parseJudgeBooleanExplicit("true").value === true, 'string "true"');
+  assert(parseJudgeBooleanExplicit(undefined).value === false, "undefined => false");
+  assert(!parseJudgeBooleanExplicit(undefined).explicit, "undefined not explicit");
+  assert(parseJudgeBooleanExplicit("false").explicit, '"false" is explicit');
+  assert(parseJudgeBooleanExplicit(true).explicit, "boolean is explicit");
+  assert(parseJudgeBooleanExplicit("  True ").value === true, "padded mixed-case true");
+  assert(parseJudgeBooleanExplicit("no").value === false, 'unrecognized string "no" => false');
+  assert(!parseJudgeBooleanExplicit("no").explicit, "unrecognized string is not explicit");
+  assert(parseJudgeBooleanExplicit("   ").value === false, "blank string => false");
+}
+
+function testReconcileAllCriteriaPassOverridesOverallFalse(): void {
+  const criteria = [
+    { criterion: "sdk", pass: true, evidence: "ok" },
+    { criterion: "execution", pass: true, evidence: "ok" },
+  ];
+  assert(
+    reconcileOverallTranscriptPass(false, criteria) === true,
+    "all criteria pass => overall true even when model said false",
+  );
+}
+
+function testReconcileAnyCriterionFailForcesOverallFalse(): void {
+  const criteria = [
+    { criterion: "sdk", pass: true, evidence: "ok" },
+    { criterion: "execution", pass: false, evidence: "failed" },
+  ];
+  assert(
+    reconcileOverallTranscriptPass(true, criteria) === false,
+    "any criterion fail => overall false even when model said true",
+  );
+}
+
+function testReconcileEmptyCriteriaUsesModelOverall(): void {
+  assert(
+    reconcileOverallTranscriptPass(true, []) === true,
+    "no criteria => keep model overall true",
+  );
+  assert(
+    reconcileOverallTranscriptPass(false, []) === false,
+    "no criteria => keep model overall false",
+  );
+}
+
+function main(): void {
+  testParseJudgeBooleanExplicit();
+  testReconcileAllCriteriaPassOverridesOverallFalse();
+  testReconcileAnyCriterionFailForcesOverallFalse();
+  testReconcileEmptyCriteriaUsesModelOverall();
+  console.error("llm-judge-parse.test: OK");
+}
+
+main();
diff --git a/docs/agent-evaluation/src/llm-judge.ts b/docs/agent-evaluation/src/llm-judge.ts
index c809dce0..dcd71752 100644
--- a/docs/agent-evaluation/src/llm-judge.ts
+++ b/docs/agent-evaluation/src/llm-judge.ts
@@ -90,6 +90,63 @@ function stripJsonFence(text: string): string {
   return t;
 }

+/**
+ * Reconcile the judge's overall verdict with its per-criterion verdicts.
+ *
+ * Models sometimes report an overall verdict that contradicts their own
+ * criteria, so when criteria are present the result is the logical AND of
+ * every criterion's pass flag.
+ *
+ * @param overall_from_model - Overall verdict parsed from the model output.
+ * @param criteria - Per-criterion judgments; may be empty.
+ * @returns overall_from_model when criteria is empty, otherwise whether every
+ *   criterion passed.
+ */
+export function reconcileOverallTranscriptPass(
+  overall_from_model: boolean,
+  criteria: readonly LlmCriterionJudgment[],
+): boolean {
+  if (criteria.length === 0) {
+    return overall_from_model;
+  }
+  return criteria.every((c) => c.pass);
+}
+
+/**
+ * Parse a boolean-like field from judge JSON.
+ *
+ * Real booleans and the strings "true"/"false" (trimmed, case-insensitive)
+ * are explicit. Any other string is non-explicit and evaluates to false:
+ * plain truthiness would read answers such as "no" or "fail" as a pass.
+ * Non-string values fall back to JavaScript truthiness.
+ *
+ * @param value - Raw value taken from the parsed judge JSON.
+ * @returns explicit: whether the input was a recognised boolean spelling;
+ *   value: the resulting boolean.
+ */
+export function parseJudgeBooleanExplicit(
+  value: unknown,
+): { readonly explicit: boolean; readonly value: boolean } {
+  if (typeof value === "boolean") {
+    return { explicit: true, value };
+  }
+  if (typeof value === "string") {
+    const normalized = value.trim().toLowerCase();
+    if (normalized === "true" || normalized === "false") {
+      return { explicit: true, value: normalized === "true" };
+    }
+    // Fail closed: any non-empty string is truthy, so Boolean("no") would
+    // count as a pass.
+    return { explicit: false, value: false };
+  }
+  return { explicit: false, value: Boolean(value) };
+}
+
+/** Boolean-only view of parseJudgeBooleanExplicit (drops the explicit flag). */
+function parseJudgeBoolean(value: unknown): boolean {
+  return parseJudgeBooleanExplicit(value).value;
+}
+
 function parseJudgeJson(text: string): Omit & {
   version?: number;
 } {
@@ -101,7 +158,8 @@ function parseJudgeJson(text: string): Omit;
       criteria.push({
         criterion: String(o.criterion ?? o.id ?? "unnamed"),
-        pass: Boolean(o.pass),
+        pass: parseJudgeBoolean(o.pass),
         evidence: String(o.evidence ?? ""),
       });
     }
   }
+  const overall = reconcileOverallTranscriptPass(overall_from_model, criteria);
+  if (criteria.length > 0 && overall_parsed.explicit && overall !== overall_from_model) {
+    console.error(
+      `LLM judge: reconciled overall_transcript_pass ${overall_from_model} -> ${overall} (${criteria.length} criteria)`,
+    );
+  }
   const exec = parsed.execution_in_transcript;
   let execution_in_transcript: LlmJudgeReport["execution_in_transcript"] = {
     pass: null,