Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions src/ai-sdk-integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,12 @@ describeEval("@ai/sdk ToolCallScorer", {
});

return {
result: text,
transcript: [
{
role: "assistant",
parts: [{ type: "text", text }],
},
],
toolCalls: steps
.flatMap((step) => step.toolCalls)
.map((call) => ({
Expand Down Expand Up @@ -112,10 +117,7 @@ describeEval("@ai/sdk StructuredOutputScorer", {
}),
});

return {
result: JSON.stringify(object),
toolCalls: [],
};
return JSON.stringify(object);
},
scorers: [
StructuredOutputScorer({
Expand Down Expand Up @@ -148,7 +150,12 @@ describeEval("@ai/sdk ToolCallScorer (No stopWhen)", {
});

return {
result: text,
transcript: [
{
role: "assistant",
parts: [{ type: "text", text }],
},
],
toolCalls: steps
.flatMap((step) => step.toolCalls)
.map((call) => ({
Expand Down
91 changes: 89 additions & 2 deletions src/evaluate/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,95 @@ describe("evaluate", () => {
});

const call = mockGenerateObject.mock.calls[0][0];
expect(call.prompt).toContain("the task output");
expect(call.prompt).toContain("must mention specific details");
expect(call.messages[0].content[0].text).toContain("[ASSISTANT]");
expect(call.messages[0].content[0].text).toContain("the task output");
expect(call.messages[1].content).toContain("must mention specific details");
});

test("passes multimodal transcripts to the judge", async () => {
mockGenerateObject.mockResolvedValueOnce({
object: { answer: "A", rationale: "Handled the transcript correctly" },
} as any);

const ctx = makeContext();
await _evaluate(ctx, {
task: async () => ({
transcript: [
{
role: "user",
parts: [
{ type: "text", text: "What is shown here?" },
{
type: "image",
image: "data:image/png;base64,abc123",
mediaType: "image/png",
},
],
},
{
role: "assistant",
parts: [{ type: "text", text: "It is a cat." }],
},
],
}),
criteria: "The answer should identify the subject of the image",
threshold: 1,
});

const call = mockGenerateObject.mock.calls[0][0];
const transcriptText = call.messages[0].content
.filter((part: any) => part.type === "text")
.map((part: any) => part.text)
.join("");
expect(transcriptText).toContain("[USER]\nWhat is shown here?");
expect(transcriptText).toContain("[image image/png]");
expect(transcriptText).toContain("[ASSISTANT]\nIt is a cat.");
expect(call.messages[0].content).toContainEqual({
type: "image",
image: "data:image/png;base64,abc123",
mediaType: "image/png",
});
});

test("does not pass tool metadata to the judge by default", async () => {
mockGenerateObject.mockResolvedValueOnce({
object: { answer: "A", rationale: "Focused on the visible transcript" },
} as any);

const ctx = makeContext();
await _evaluate(ctx, {
task: async () => ({
transcript: [
{
role: "user",
parts: [{ type: "text", text: "What is the weather?" }],
},
{
role: "assistant",
parts: [{ type: "text", text: "It is 72F in Seattle." }],
},
],
toolCalls: [
{
name: "getWeather",
arguments: { location: "Seattle" },
result: { temperature: 72 },
},
],
}),
criteria: "The answer should report the weather to the user",
threshold: 1,
});

const call = mockGenerateObject.mock.calls[0][0];
const transcriptText = call.messages[0].content
.filter((part: any) => part.type === "text")
.map((part: any) => part.text)
.join("");
expect(transcriptText).toContain("[USER]\nWhat is the weather?");
expect(transcriptText).toContain("[ASSISTANT]\nIt is 72F in Seattle.");
expect(transcriptText).not.toContain("tool-call");
expect(transcriptText).not.toContain("tool-result");
});

test("maps all answer choices to correct scores", async () => {
Expand Down
35 changes: 23 additions & 12 deletions src/evaluate/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import { generateObject } from "ai";
import { z } from "zod";
import { assert, test } from "vitest";
import { wrapText } from "../wrapText";
import {
type Transcript,
formatEvalValue,
normalizeEvaluateOutput,
toJudgeUserMessage,
} from "../messages";

type LanguageModel = Parameters<typeof generateObject>[0]["model"];

Expand All @@ -13,15 +18,11 @@ export function configure(opts: { model: LanguageModel }) {

const EVAL_SYSTEM = `You are assessing a submitted output based on a given criterion. Ignore differences in style, grammar, punctuation, or length. Focus only on whether the criterion is met.`;

const EVAL_PROMPT = (output: string, criteria: string) => `<submission>
${output}
</submission>

<criteria>
const EVAL_PROMPT = (criteria: string) => `<criteria>
${criteria}
</criteria>

Does the submission meet the criteria? Select one option:
Does the conversation transcript meet the criteria? Select one option:
(A) The criteria is fully met with no issues
(B) The criteria is mostly met with minor gaps
(C) The criteria is partially met with notable gaps
Expand All @@ -37,7 +38,7 @@ const CHOICE_SCORES: Record<string, number> = {
};

interface EvaluateOptions {
task: () => Promise<string>;
task: () => Promise<string | { transcript: Transcript }>;
criteria: string;
threshold?: number;
}
Expand All @@ -57,9 +58,11 @@ export async function _evaluate(
);
}

let output: string;
let taskOutput: string | { transcript: Transcript };
let evaluationOutput: ReturnType<typeof normalizeEvaluateOutput>;
try {
output = await opts.task();
taskOutput = await opts.task();
evaluationOutput = normalizeEvaluateOutput(taskOutput);
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
ctx.task.meta.eval = {
Expand All @@ -84,7 +87,13 @@ export async function _evaluate(
rationale: z.string(),
}),
system: EVAL_SYSTEM,
prompt: EVAL_PROMPT(output, opts.criteria),
messages: [
toJudgeUserMessage(evaluationOutput.transcript),
{
role: "user",
content: EVAL_PROMPT(opts.criteria),
},
],
}));
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
Expand Down Expand Up @@ -118,7 +127,9 @@ export async function _evaluate(
if (score < threshold) {
assert(
false,
`Score: ${score} (${object.answer}) below threshold: ${threshold}\n\n## Output:\n${wrapText(output)}\n\n## Rationale:\n${wrapText(object.rationale)}`,
`Score: ${score} (${object.answer}) below threshold: ${threshold}\n\n## Output:\n${formatEvalValue(
typeof taskOutput === "string" ? taskOutput : taskOutput.transcript,
)}\n\n## Rationale:\n${formatEvalValue(object.rationale)}`,
Comment thread
cursor[bot] marked this conversation as resolved.
);
}
}
Expand Down
43 changes: 43 additions & 0 deletions src/formatScores.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,47 @@ describe("formatScores", () => {
# Scorer B [0.8]"
`);
});

it("should format transcript outputs", () => {
const scores = [
{
name: "Scorer A",
score: 0.2,
metadata: {
rationale: "Image description was incorrect",
output: [
{
role: "assistant",
parts: [
{ type: "text", text: "A dog on a sofa." },
{
type: "image",
image: "data:image/png;base64,abc",
mediaType: "image/png",
},
],
},
],
},
},
];

const result = formatScores(scores);

expect(result).toMatchInlineSnapshot(`
"# Scorer A [0.2]

## Rationale

Image description was incorrect

## Response

## assistant

A dog on a sofa.

[image image/png]"
`);
});
});
Loading
Loading