Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions docs/agent-evaluation/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
"score": "node --import tsx src/score-eval.ts",
"viz:trajectory": "node --import tsx src/generate-trajectory-html.ts",
"typecheck": "tsc --noEmit",
"test": "npm run test:trajectory && npm run test:redact-secrets",
"test": "npm run test:trajectory && npm run test:redact-secrets && npm run test:llm-judge-parse",
"test:trajectory": "node --import tsx src/trajectory-fixture-smoke.ts",
"test:redact-secrets": "node --import tsx src/redact-secrets.test.ts"
"test:redact-secrets": "node --import tsx src/redact-secrets.test.ts",
"test:llm-judge-parse": "node --import tsx src/llm-judge-parse.test.ts"
},
"engines": {
"node": ">=18"
Expand Down
68 changes: 68 additions & 0 deletions docs/agent-evaluation/src/llm-judge-parse.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/**
* Unit tests for LLM judge JSON parsing / overall reconciliation.
*/
import {
parseJudgeBooleanExplicit,
reconcileOverallTranscriptPass,
} from "./llm-judge.js";

/**
 * Minimal assertion helper for this dependency-free test script.
 *
 * @param condition - Result of the check; nothing happens when it is true.
 * @param message - Description of the expectation, appended to the error text.
 * @throws Error with an "Assertion failed: " prefix when `condition` is false.
 */
function assert(condition: boolean, message: string): void {
  if (condition) {
    return;
  }
  throw new Error(`Assertion failed: ${message}`);
}
Comment thread
Copilot marked this conversation as resolved.

/**
 * Checks parseJudgeBooleanExplicit for real booleans, the strings
 * "true"/"false" in different casing, and `undefined`.
 * The `.value` expectations run first, then the `.explicit` expectations.
 */
function testParseJudgeBooleanExplicit(): void {
  // [input, expected .value, assertion label]
  const valueCases: ReadonlyArray<readonly [unknown, boolean, string]> = [
    [true, true, "boolean true"],
    [false, false, "boolean false"],
    ["false", false, 'string "false"'],
    ["FALSE", false, 'string "FALSE"'],
    ["true", true, 'string "true"'],
    [undefined, false, "undefined => false"],
  ];
  for (const [input, expected, label] of valueCases) {
    assert(parseJudgeBooleanExplicit(input).value === expected, label);
  }

  // [input, expected .explicit, assertion label]
  const explicitCases: ReadonlyArray<readonly [unknown, boolean, string]> = [
    [undefined, false, "undefined not explicit"],
    ["false", true, '"false" is explicit'],
    [true, true, "boolean is explicit"],
  ];
  for (const [input, expected, label] of explicitCases) {
    assert(parseJudgeBooleanExplicit(input).explicit === expected, label);
  }
}

/**
 * When every criterion passes, the reconciled overall verdict must be true
 * even though the model-reported overall was false.
 */
function testReconcileAllCriteriaPassOverridesOverallFalse(): void {
  const reconciled = reconcileOverallTranscriptPass(false, [
    { criterion: "sdk", pass: true, evidence: "ok" },
    { criterion: "execution", pass: true, evidence: "ok" },
  ]);
  assert(
    reconciled === true,
    "all criteria pass => overall true even when model said false",
  );
}

/**
 * A single failing criterion must force the reconciled overall verdict to
 * false even though the model-reported overall was true.
 */
function testReconcileAnyCriterionFailForcesOverallFalse(): void {
  const reconciled = reconcileOverallTranscriptPass(true, [
    { criterion: "sdk", pass: true, evidence: "ok" },
    { criterion: "execution", pass: false, evidence: "failed" },
  ]);
  assert(
    reconciled === false,
    "any criterion fail => overall false even when model said true",
  );
}

/**
 * With an empty criteria list there is nothing to reconcile against, so the
 * model-reported overall value must be returned unchanged.
 */
function testReconcileEmptyCriteriaUsesModelOverall(): void {
  // [model-reported overall, assertion label]
  const passthroughCases: ReadonlyArray<readonly [boolean, string]> = [
    [true, "no criteria => keep model overall true"],
    [false, "no criteria => keep model overall false"],
  ];
  for (const [modelOverall, label] of passthroughCases) {
    assert(reconcileOverallTranscriptPass(modelOverall, []) === modelOverall, label);
  }
}

/**
 * Runs every test in declaration order. The first failed assertion throws and
 * stops the run; if all pass, an OK line is written to stderr.
 */
function main(): void {
  const suites: ReadonlyArray<() => void> = [
    testParseJudgeBooleanExplicit,
    testReconcileAllCriteriaPassOverridesOverallFalse,
    testReconcileAnyCriterionFailForcesOverallFalse,
    testReconcileEmptyCriteriaUsesModelOverall,
  ];
  for (const runSuite of suites) {
    runSuite();
  }
  console.error("llm-judge-parse.test: OK");
}

main();
45 changes: 43 additions & 2 deletions docs/agent-evaluation/src/llm-judge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,40 @@ function stripJsonFence(text: string): string {
return t;
}

/**
 * Derive the final overall verdict from the per-criterion judgments.
 *
 * Models sometimes report an overall value that contradicts their own
 * criteria, so a non-empty criteria list takes precedence over it.
 *
 * @param overall_from_model - Overall pass value as reported by the model.
 * @param criteria - Parsed per-criterion judgments; may be empty.
 * @returns `overall_from_model` when `criteria` is empty; otherwise true only
 *   if no criterion has a falsy `pass`.
 */
export function reconcileOverallTranscriptPass(
  overall_from_model: boolean,
  criteria: readonly LlmCriterionJudgment[],
): boolean {
  const hasCriteria = criteria.length > 0;
  // "No criterion fails" is the logical AND of all criterion passes.
  return hasCriteria ? !criteria.some((judgment) => !judgment.pass) : overall_from_model;
}

/**
 * Parse a boolean-like field from judge JSON.
 *
 * - A real boolean is returned as-is and marked explicit.
 * - The strings "true" / "false" (surrounding whitespace ignored,
 *   case-insensitive) are converted and marked explicit.
 * - Any other string (e.g. "no", "fail", "0", "null") is non-explicit and
 *   resolves to false. Plain truthiness would turn every non-empty string
 *   into a pass, so unrecognized text fails closed instead.
 * - Every other type (undefined, null, number, object, ...) is non-explicit
 *   and falls back to JavaScript truthiness.
 *
 * @param value - Raw field taken from the parsed judge JSON.
 * @returns `explicit` tells whether the input unambiguously stated a boolean;
 *   `value` is the resolved boolean.
 */
export function parseJudgeBooleanExplicit(
  value: unknown,
): { readonly explicit: boolean; readonly value: boolean } {
  if (typeof value === "boolean") {
    return { explicit: true, value };
  }
  if (typeof value === "string") {
    const normalized = value.trim().toLowerCase();
    if (normalized === "true" || normalized === "false") {
      return { explicit: true, value: normalized === "true" };
    }
    // Do not use Boolean(value) here: non-empty strings are always truthy,
    // which would silently count answers like "no" or "fail" as a pass.
    return { explicit: false, value: false };
  }
  return { explicit: false, value: Boolean(value) };
}

/**
 * Convenience wrapper around parseJudgeBooleanExplicit for callers that only
 * need the resolved boolean and not whether it was stated explicitly.
 *
 * @param value - Raw field taken from the parsed judge JSON.
 * @returns The `value` component of the explicit parse result.
 */
function parseJudgeBoolean(value: unknown): boolean {
  const { value: resolved } = parseJudgeBooleanExplicit(value);
  return resolved;
}

function parseJudgeJson(text: string): Omit<LlmJudgeReport, "model" | "runFile" | "scenarioFile" | "version"> & {
version?: number;
} {
Expand All @@ -101,7 +135,8 @@ function parseJudgeJson(text: string): Omit<LlmJudgeReport, "model" | "runFile"
const detail = parse_err instanceof Error ? parse_err.message : String(parse_err);
throw new Error(`JSON.parse failed: ${detail}`);
}
const overall = Boolean(parsed.overall_transcript_pass);
const overall_parsed = parseJudgeBooleanExplicit(parsed.overall_transcript_pass);
const overall_from_model = overall_parsed.value;
const criteriaIn = parsed.criteria;
const criteria: LlmCriterionJudgment[] = [];
if (Array.isArray(criteriaIn)) {
Expand All @@ -110,11 +145,17 @@ function parseJudgeJson(text: string): Omit<LlmJudgeReport, "model" | "runFile"
const o = c as Record<string, unknown>;
criteria.push({
criterion: String(o.criterion ?? o.id ?? "unnamed"),
pass: Boolean(o.pass),
pass: parseJudgeBoolean(o.pass),
evidence: String(o.evidence ?? ""),
});
}
}
const overall = reconcileOverallTranscriptPass(overall_from_model, criteria);
if (criteria.length > 0 && overall_parsed.explicit && overall !== overall_from_model) {
console.error(
`LLM judge: reconciled overall_transcript_pass ${overall_from_model} -> ${overall} (${criteria.length} criteria)`,
);
}
const exec = parsed.execution_in_transcript;
let execution_in_transcript: LlmJudgeReport["execution_in_transcript"] = {
pass: null,
Expand Down
Loading