getsentry · dcramer · Mar 17, 2026 · Mar 17, 2026
diff --git a/src/ai-sdk-integration.test.ts b/src/ai-sdk-integration.test.ts
@@ -72,7 +72,12 @@ describeEval("@ai/sdk ToolCallScorer", {
     });
 
     return {
-      result: text,
+      transcript: [
+        {
+          role: "assistant",
+          parts: [{ type: "text", text }],
+        },
+      ],
       toolCalls: steps
         .flatMap((step) => step.toolCalls)
         .map((call) => ({
@@ -112,10 +117,7 @@ describeEval("@ai/sdk StructuredOutputScorer", {
       }),
     });
 
-    return {
-      result: JSON.stringify(object),
-      toolCalls: [],
-    };
+    return JSON.stringify(object);
   },
   scorers: [
     StructuredOutputScorer({
@@ -148,7 +150,12 @@ describeEval("@ai/sdk ToolCallScorer (No stopWhen)", {
     });
 
     return {
-      result: text,
+      transcript: [
+        {
+          role: "assistant",
+          parts: [{ type: "text", text }],
+        },
+      ],
       toolCalls: steps
         .flatMap((step) => step.toolCalls)
         .map((call) => ({

diff --git a/src/evaluate/index.test.ts b/src/evaluate/index.test.ts
@@ -152,8 +152,95 @@ describe("evaluate", () => {
     });
 
     const call = mockGenerateObject.mock.calls[0][0];
-    expect(call.prompt).toContain("the task output");
-    expect(call.prompt).toContain("must mention specific details");
+    expect(call.messages[0].content[0].text).toContain("[ASSISTANT]");
+    expect(call.messages[0].content[0].text).toContain("the task output");
+    expect(call.messages[1].content).toContain("must mention specific details");
+  });
+
+  test("passes multimodal transcripts to the judge", async () => { // image parts of a task transcript must reach the judge as image content, not only as text
+    mockGenerateObject.mockResolvedValueOnce({ // queue a canned judge verdict on the mock for the single call made below
+      object: { answer: "A", rationale: "Handled the transcript correctly" },
+    } as any); // cast because the stub supplies only the `object` field
+
+    const ctx = makeContext();
+    await _evaluate(ctx, {
+      task: async () => ({ // task resolves to a transcript object instead of a plain string
+        transcript: [
+          {
+            role: "user",
+            parts: [ // one user message mixing a text part and an image part
+              { type: "text", text: "What is shown here?" },
+              {
+                type: "image",
+                image: "data:image/png;base64,abc123",
+                mediaType: "image/png",
+              },
+            ],
+          },
+          {
+            role: "assistant",
+            parts: [{ type: "text", text: "It is a cat." }],
+          },
+        ],
+      }),
+      criteria: "The answer should identify the subject of the image",
+      threshold: 1, // NOTE(review): presumably answer "A" scores high enough to meet this; confirm against CHOICE_SCORES
+    });
+
+    const call = mockGenerateObject.mock.calls[0][0]; // first argument of the first recorded call on the mock
+    const transcriptText = call.messages[0].content // first message is expected to carry the transcript
+      .filter((part: any) => part.type === "text") // keep only text parts; the image part is asserted separately below
+      .map((part: any) => part.text)
+      .join("");
+    expect(transcriptText).toContain("[USER]\nWhat is shown here?"); // role label, newline, then the message text
+    expect(transcriptText).toContain("[image image/png]"); // the image also gets a textual placeholder
+    expect(transcriptText).toContain("[ASSISTANT]\nIt is a cat.");
+    expect(call.messages[0].content).toContainEqual({ // the image part itself is present in the same message content
+      type: "image",
+      image: "data:image/png;base64,abc123",
+      mediaType: "image/png",
+    });
+  });
+
+  test("does not pass tool metadata to the judge by default", async () => { // tool calls returned next to the transcript must not appear in the text sent to the judge
+    mockGenerateObject.mockResolvedValueOnce({ // queue a canned judge verdict on the mock for the single call made below
+      object: { answer: "A", rationale: "Focused on the visible transcript" },
+    } as any); // cast because the stub supplies only the `object` field
+
+    const ctx = makeContext();
+    await _evaluate(ctx, {
+      task: async () => ({
+        transcript: [ // text-only conversation: one user turn and one assistant turn
+          {
+            role: "user",
+            parts: [{ type: "text", text: "What is the weather?" }],
+          },
+          {
+            role: "assistant",
+            parts: [{ type: "text", text: "It is 72F in Seattle." }],
+          },
+        ],
+        toolCalls: [ // extra tool metadata returned alongside the transcript
+          {
+            name: "getWeather",
+            arguments: { location: "Seattle" },
+            result: { temperature: 72 },
+          },
+        ],
+      }),
+      criteria: "The answer should report the weather to the user",
+      threshold: 1, // NOTE(review): presumably answer "A" scores high enough to meet this; confirm against CHOICE_SCORES
+    });
+
+    const call = mockGenerateObject.mock.calls[0][0]; // first argument of the first recorded call on the mock
+    const transcriptText = call.messages[0].content // first message is expected to carry the transcript
+      .filter((part: any) => part.type === "text") // only text parts are inspected by the assertions below
+      .map((part: any) => part.text)
+      .join("");
+    expect(transcriptText).toContain("[USER]\nWhat is the weather?");
+    expect(transcriptText).toContain("[ASSISTANT]\nIt is 72F in Seattle.");
+    expect(transcriptText).not.toContain("tool-call"); // checks the joined text only; non-text parts are not examined here
+    expect(transcriptText).not.toContain("tool-result");
   });
 
   test("maps all answer choices to correct scores", async () => {

diff --git a/src/evaluate/index.ts b/src/evaluate/index.ts
@@ -1,7 +1,12 @@
 import { generateObject } from "ai";
 import { z } from "zod";
 import { assert, test } from "vitest";
-import { wrapText } from "../wrapText";
+import {
+  type Transcript,
+  formatEvalValue,
+  normalizeEvaluateOutput,
+  toJudgeUserMessage,
+} from "../messages";
 
 type LanguageModel = Parameters<typeof generateObject>[0]["model"];
 
@@ -13,15 +18,11 @@ export function configure(opts: { model: LanguageModel }) {
 
 const EVAL_SYSTEM = `You are assessing a submitted output based on a given criterion. Ignore differences in style, grammar, punctuation, or length. Focus only on whether the criterion is met.`;
 
-const EVAL_PROMPT = (output: string, criteria: string) => `<submission>
-${output}
-</submission>
-
-<criteria>
+const EVAL_PROMPT = (criteria: string) => `<criteria>
 ${criteria}
 </criteria>
 
-Does the submission meet the criteria? Select one option:
+Does the conversation transcript meet the criteria? Select one option:
 (A) The criteria is fully met with no issues
 (B) The criteria is mostly met with minor gaps
 (C) The criteria is partially met with notable gaps
@@ -37,7 +38,7 @@ const CHOICE_SCORES: Record<string, number> = {
 };
 
 interface EvaluateOptions {
-  task: () => Promise<string>;
+  task: () => Promise<string | { transcript: Transcript }>;
   criteria: string;
   threshold?: number;
 }
@@ -57,9 +58,11 @@ export async function _evaluate(
     );
   }
 
-  let output: string;
+  let taskOutput: string | { transcript: Transcript };
+  let evaluationOutput: ReturnType<typeof normalizeEvaluateOutput>;
   try {
-    output = await opts.task();
+    taskOutput = await opts.task();
+    evaluationOutput = normalizeEvaluateOutput(taskOutput);
   } catch (error) {
     const errorMessage = error instanceof Error ? error.message : String(error);
     ctx.task.meta.eval = {
@@ -84,7 +87,13 @@ export async function _evaluate(
         rationale: z.string(),
       }),
       system: EVAL_SYSTEM,
-      prompt: EVAL_PROMPT(output, opts.criteria),
+      messages: [
+        toJudgeUserMessage(evaluationOutput.transcript),
+        {
+          role: "user",
+          content: EVAL_PROMPT(opts.criteria),
+        },
+      ],
     }));
   } catch (error) {
     const errorMessage = error instanceof Error ? error.message : String(error);
@@ -118,7 +127,9 @@ export async function _evaluate(
   if (score < threshold) {
     assert(
       false,
-      `Score: ${score} (${object.answer}) below threshold: ${threshold}\n\n## Output:\n${wrapText(output)}\n\n## Rationale:\n${wrapText(object.rationale)}`,
+      `Score: ${score} (${object.answer}) below threshold: ${threshold}\n\n## Output:\n${formatEvalValue(
+        typeof taskOutput === "string" ? taskOutput : taskOutput.transcript,
+      )}\n\n## Rationale:\n${formatEvalValue(object.rationale)}`,
     );
   }
 }

diff --git a/src/formatScores.test.ts b/src/formatScores.test.ts
@@ -72,4 +72,47 @@ describe("formatScores", () => {
       # Scorer B [0.8]"
     `);
   });
+
+  it("should format transcript outputs", () => { // a transcript array in metadata.output is rendered per role, with images shown as placeholders
+    const scores = [
+      {
+        name: "Scorer A",
+        score: 0.2,
+        metadata: {
+          rationale: "Image description was incorrect",
+          output: [ // transcript-shaped output: one assistant message with a text part and an image part
+            {
+              role: "assistant",
+              parts: [
+                { type: "text", text: "A dog on a sofa." },
+                {
+                  type: "image",
+                  image: "data:image/png;base64,abc", // the data URL does not appear in the expected snapshot below
+                  mediaType: "image/png",
+                },
+              ],
+            },
+          ],
+        },
+      },
+    ];
+
+    const result = formatScores(scores);
+
+    expect(result).toMatchInlineSnapshot(`
+      "# Scorer A [0.2]
+
+      ## Rationale
+
+      Image description was incorrect
+
+      ## Response
+
+      ## assistant
+
+      A dog on a sofa.
+
+      [image image/png]"
+    `);
+  });
 });