diff --git a/src/annotateJUnitWithScoresData.test.ts b/src/annotateJUnitWithScoresData.test.ts
new file mode 100644
index 0000000..50e8261
--- /dev/null
+++ b/src/annotateJUnitWithScoresData.test.ts
@@ -0,0 +1,574 @@
+import { describe, expect, test, vi, beforeEach } from "vitest";
+import {
+  annotateJUnitWithScoresData,
+  type Score,
+  type ToolCall,
+} from "./index";
+
+describe("annotateJUnitWithScoresData", () => {
+  let mockAnnotate: ReturnType<typeof vi.fn>;
+  let mockTestTask: any;
+
+  beforeEach(() => {
+    mockAnnotate = vi.fn();
+    mockTestTask = {
+      context: {
+        annotate: mockAnnotate,
+      },
+      meta: {
+        eval: undefined,
+      },
+    };
+  });
+
+  describe("score annotations", () => {
+    test("should annotate basic score with value", () => {
+      const scores: (Score & { name: string })[] = [
+        { name: "Factuality", score: 0.8 },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores);
+
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "0.8",
+        "evals.scores.Factuality.value",
+      );
+    });
+
+    test("should annotate score type as float for numeric scores", () => {
+      const scores: (Score & { name: string })[] = [
+        { name: "Accuracy", score: 0.75 },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores);
+
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "float",
+        "evals.scores.Accuracy.type",
+      );
+    });
+
+    test("should annotate score type as bool for boolean scores", () => {
+      const scores: (Score & { name: string })[] = [
+        { name: "IsCorrect", score: true as any },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores);
+
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "bool",
+        "evals.scores.IsCorrect.type",
+      );
+    });
+
+    test("should annotate llm_judge from metadata", () => {
+      const scores: (Score & { name: string })[] = [
+        {
+          name: "Factuality",
+          score: 0.9,
+          metadata: {
+            llm_judge: "gemini_2.5pro",
+            rationale: "Good answer",
+          },
+        },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores);
+
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "gemini_2.5pro",
+        "evals.scores.Factuality.llm_judge",
+      );
+    });
+
+    test("should annotate avg score", () => {
+      const scores: (Score & { name: string })[] = [
+        { name: "Completeness", score: 0.85 },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores);
+    });
+
+    test("should handle null scores", () => {
+      const scores: (Score & { name: string })[] = [
+        { name: "Unknown", score: null },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores);
+
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "",
+        "evals.scores.Unknown.value",
+      );
+    });
+
+    test("should use score_IDX when name is not available", () => {
+      const scores: (Score & { name: string })[] = [
+        { name: "", score: 0.7 }, // Empty name
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores);
+
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "0.7",
+        "evals.scores.score_0.value",
+      );
+    });
+
+    test("should replace dots in score names with underscores", () => {
+      const scores: (Score & { name: string })[] = [
+        { name: "my.scorer.name", score: 0.6 },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores);
+
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "0.6",
+        "evals.scores.my_scorer_name.value",
+      );
+    });
+
+    test("should flatten and annotate metadata fields", () => {
+      const scores: (Score & { name: string })[] = [
+        {
+          name: "Detailed",
+          score: 0.8,
+          metadata: {
+            rationale: "Good response",
+            output: "Detailed answer",
+            nested: {
+              field: "value",
+              deep: {
+                nested: "data",
+              },
+            },
+          },
+        },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores);
+
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "Good response",
+        "evals.scores.Detailed.metadata.rationale",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "Detailed answer",
+        "evals.scores.Detailed.metadata.output",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "value",
+        "evals.scores.Detailed.metadata.nested.field",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "data",
+        "evals.scores.Detailed.metadata.nested.deep.nested",
+      );
+    });
+
+    test("should handle multiple scores", () => {
+      const scores: (Score & { name: string })[] = [
+        { name: "Accuracy", score: 0.9 },
+        { name: "Completeness", score: 0.8 },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores);
+
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "0.9",
+        "evals.scores.Accuracy.value",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "0.8",
+        "evals.scores.Completeness.value",
+      );
+    });
+  });
+
+  describe("toolCalls annotations", () => {
+    test("should annotate toolCalls when present", () => {
+      const toolCalls: ToolCall[] = [
+        {
+          name: "getWeather",
+          arguments: { location: "Seattle", units: "celsius" },
+          result: { temperature: 18, condition: "partly cloudy" },
+          status: "completed",
+          type: "function",
+          id: "call_123",
+        },
+      ];
+
+      const scores: (Score & { name: string })[] = [
+        { name: "ToolUsage", score: 1.0 },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores, toolCalls);
+
+      // Check toolCall annotations
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "getWeather",
+        "evals.toolCalls.0.name",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "Seattle",
+        "evals.toolCalls.0.arguments.location",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "celsius",
+        "evals.toolCalls.0.arguments.units",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "18",
+        "evals.toolCalls.0.result.temperature",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "partly cloudy",
+        "evals.toolCalls.0.result.condition",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "completed",
+        "evals.toolCalls.0.status",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "function",
+        "evals.toolCalls.0.type",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "call_123",
+        "evals.toolCalls.0.id",
+      );
+    });
+
+    test("should handle multiple toolCalls", () => {
+      const toolCalls: ToolCall[] = [
+        {
+          name: "search",
+          arguments: { query: "weather" },
+          result: { results: ["result1", "result2"] },
+          status: "completed",
+        },
+        {
+          name: "calculate",
+          arguments: { expression: "2+2" },
+          result: { answer: 4 },
+          status: "completed",
+        },
+      ];
+
+      const scores: (Score & { name: string })[] = [
+        { name: "MultiTool", score: 1.0 },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores, toolCalls);
+
+      // Check first toolCall
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "search",
+        "evals.toolCalls.0.name",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "weather",
+        "evals.toolCalls.0.arguments.query",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "result1",
+        "evals.toolCalls.0.result.results.0",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "result2",
+        "evals.toolCalls.0.result.results.1",
+      );
+
+      // Check second toolCall
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "calculate",
+        "evals.toolCalls.1.name",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "2+2",
+        "evals.toolCalls.1.arguments.expression",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "4",
+        "evals.toolCalls.1.result.answer",
+      );
+    });
+
+    test("should handle toolCalls with dots in field names", () => {
+      const toolCalls: ToolCall[] = [
+        {
+          name: "api.call",
+          arguments: { "user.id": "123", "data.type": "json" },
+          result: { "response.time": 150 },
+          status: "completed",
+        },
+      ];
+
+      const scores: (Score & { name: string })[] = [
+        { name: "DotFields", score: 1.0 },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores, toolCalls);
+
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "api.call",
+        "evals.toolCalls.0.name",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "123",
+        "evals.toolCalls.0.arguments.user_id",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "json",
+        "evals.toolCalls.0.arguments.data_type",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "150",
+        "evals.toolCalls.0.result.response_time",
+      );
+    });
+
+    test("should handle toolCalls with nested objects", () => {
+      const toolCalls: ToolCall[] = [
+        {
+          name: "complexTool",
+          arguments: {
+            config: {
+              timeout: 5000,
+              retries: 3,
+              headers: {
+                "Content-Type": "application/json",
+              },
+            },
+          },
+          result: {
+            data: {
+              items: [
+                { id: 1, name: "item1" },
+                { id: 2, name: "item2" },
+              ],
+            },
+          },
+          status: "completed",
+        },
+      ];
+
+      const scores: (Score & { name: string })[] = [
+        { name: "NestedTool", score: 1.0 },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores, toolCalls);
+
+      // Check nested arguments
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "5000",
+        "evals.toolCalls.0.arguments.config.timeout",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "3",
+        "evals.toolCalls.0.arguments.config.retries",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "application/json",
+        "evals.toolCalls.0.arguments.config.headers.Content-Type",
+      );
+
+      // Check nested results
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "1",
+        "evals.toolCalls.0.result.data.items.0.id",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "item1",
+        "evals.toolCalls.0.result.data.items.0.name",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "2",
+        "evals.toolCalls.0.result.data.items.1.id",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "item2",
+        "evals.toolCalls.0.result.data.items.1.name",
+      );
+    });
+
+    test("should not annotate toolCalls when not present", () => {
+      const scores: (Score & { name: string })[] = [
+        { name: "NoTools", score: 1.0 },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores); // No toolCalls
+
+      // Should not have any toolCalls annotations
+      const toolCallAnnotations = mockAnnotate.mock.calls.filter(
+        (call: any[]) => call[1].startsWith("evals.toolCalls"),
+      );
+      expect(toolCallAnnotations).toHaveLength(0);
+    });
+  });
+
+  describe("edge cases", () => {
+    test("should handle empty scores array", () => {
+      const scores: (Score & { name: string })[] = [];
+
+      annotateJUnitWithScoresData(mockTestTask, scores);
+
+      // Should not have any score annotations
+      const scoreAnnotations = mockAnnotate.mock.calls.filter((call: any[]) =>
+        call[1].startsWith("evals.scores"),
+      );
+      expect(scoreAnnotations).toHaveLength(0);
+    });
+
+    test("should handle empty toolCalls array", () => {
+      const scores: (Score & { name: string })[] = [
+        { name: "Test", score: 1.0 },
+      ];
+      const toolCalls: ToolCall[] = []; // Empty toolCalls array
+
+      annotateJUnitWithScoresData(mockTestTask, scores, toolCalls);
+
+      // Should not have any toolCalls annotations
+      const toolCallAnnotations = mockAnnotate.mock.calls.filter(
+        (call: any[]) => call[1].startsWith("evals.toolCalls"),
+      );
+      expect(toolCallAnnotations).toHaveLength(0);
+    });
+
+    test("should handle undefined values in objects", () => {
+      const toolCalls: ToolCall[] = [
+        {
+          name: "testTool",
+          arguments: { required: "value", optional: undefined },
+          result: { data: null },
+          status: "completed",
+        },
+      ];
+
+      const scores: (Score & { name: string })[] = [
+        {
+          name: "UndefinedTest",
+          score: 0.5,
+          metadata: {
+            rationale: undefined,
+            output: null,
+          },
+        },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores, toolCalls);
+
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "",
+        "evals.toolCalls.0.arguments.optional",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "",
+        "evals.toolCalls.0.result.data",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "",
+        "evals.scores.UndefinedTest.metadata.rationale",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "",
+        "evals.scores.UndefinedTest.metadata.output",
+      );
+    });
+
+    test("should handle complex nested structures", () => {
+      const toolCalls: ToolCall[] = [
+        {
+          name: "complexTool",
+          arguments: {
+            nested: {
+              deep: {
+                deeper: {
+                  value: "final",
+                },
+              },
+            },
+          },
+          result: {
+            mixed: {
+              string: "text",
+              number: 42,
+              boolean: true,
+            },
+          },
+          status: "completed",
+        },
+      ];
+
+      const scores: (Score & { name: string })[] = [
+        {
+          name: "ComplexTest",
+          score: 0.9,
+          metadata: {
+            nested: {
+              data: {
+                value: "test",
+              },
+            },
+          },
+        },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores, toolCalls);
+
+      // Check deeply nested toolCall arguments
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "final",
+        "evals.toolCalls.0.arguments.nested.deep.deeper.value",
+      );
+
+      // Check mixed result types
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "text",
+        "evals.toolCalls.0.result.mixed.string",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "42",
+        "evals.toolCalls.0.result.mixed.number",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "true",
+        "evals.toolCalls.0.result.mixed.boolean",
+      );
+
+      // Check nested metadata
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "test",
+        "evals.scores.ComplexTest.metadata.nested.data.value",
+      );
+    });
+
+    test("should handle multiple scores with different naming patterns", () => {
+      const scores: (Score & { name: string })[] = [
+        { name: "NormalScorer", score: 0.8 },
+        { name: "", score: 0.7 }, // Empty name
+        { name: "scorer.with.dots", score: 0.6 },
+        { name: "AnotherScorer", score: 0.9 },
+      ];
+
+      annotateJUnitWithScoresData(mockTestTask, scores);
+
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "0.8",
+        "evals.scores.NormalScorer.value",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "0.7",
+        "evals.scores.score_1.value",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "0.6",
+        "evals.scores.scorer_with_dots.value",
+      );
+      expect(mockAnnotate).toHaveBeenCalledWith(
+        "0.9",
+        "evals.scores.AnotherScorer.value",
+      );
+    });
+  });
+});
diff --git a/src/index.ts b/src/index.ts
index 1aea708..fc4ad20 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,4 +1,4 @@
-import { assert, describe, expect, test } from "vitest";
+import { assert, describe, expect, type RunnerTestCase, test } from "vitest";
 import "vitest";
 
 /**
@@ -62,7 +62,9 @@ export type Score = {
   score: number | null;
   metadata?: {
     rationale?: string;
-    output?: string;
+    output?: string | null;
+    llm_judge?: string;
+    [key: string]: any; // Allow additional metadata fields
   };
 };
 
@@ -95,7 +97,7 @@ declare module "vitest" {
     eval?: {
       scores: (Score & { name: string })[];
       avgScore: number;
-      toolCalls?: ToolCall[];
+      toolCalls?: ToolCall[] | undefined;
     };
   }
 }
@@ -263,6 +265,10 @@ export function describeEval(
           const avgScore =
             scores.reduce((acc, s) => acc + (s.score ?? 0), 0) / scores.length;
 
+          // Available for JUnit XML reporter
+          annotateJUnitWithScoresData(testTask, scoresWithName, toolCalls);
+
+          // Available for JSON reporter
           testTask.meta.eval = {
             scores: scoresWithName,
             avgScore,
@@ -303,6 +309,133 @@ export function formatScores(scores: (Score & { name: string })[]) {
     .join("\n\n");
 }
 
+/**
+ * Annotates JUnit test results with evaluation scores and tool call data for XML reporting.
+ *
+ * This function adds structured annotations to the test context that can be used by JUnit XML
+ * reporters to include evaluation metrics and tool usage information in test reports.
+ *
+ * The annotations follow a hierarchical schema:
+ * - `evals.scores.{SCORE_NAME}.value` - The numeric score value
+ * - `evals.scores.{SCORE_NAME}.type` - The data type (float/bool)
+ * - `evals.scores.{SCORE_NAME}.llm_judge` - LLM judge reasoning (if available)
+ * - `evals.scores.{SCORE_NAME}.metadata.{FIELD}` - Flattened metadata fields
+ * - `evals.toolCalls.{INDEX}.{FIELD}` - Tool call data (if present)
+ *
+ * @param testTask - The Vitest test case to annotate
+ * @param scoresWithName - Array of evaluation scores with their names
+ * @param toolCalls - Optional array of tool calls made during the test
+ *
+ * @example
+ * ```javascript
+ * // In a test case
+ * const scores = [{ name: "factuality", score: 0.8, metadata: { rationale: "Good answer" } }];
+ * const toolCalls = [{ name: "search", arguments: { query: "weather" } }];
+ *
+ * annotateJUnitWithScoresData(testTask, scores, toolCalls);
+ * // Results in annotations like:
+ * // evals.scores.factuality.value = "0.8"
+ * // evals.scores.factuality.type = "float"
+ * // evals.scores.factuality.metadata.rationale = "Good answer"
+ * // evals.toolCalls.0.name = "search"
+ * // evals.toolCalls.0.arguments.query = "weather"
+ * ```
+ */
+export function annotateJUnitWithScoresData(
+  testTask: RunnerTestCase,
+  scoresWithName: (Score & { name: string })[],
+  toolCalls?: ToolCall[],
+) {
+  /**
+   * Recursively flattens nested objects into dot-notation keys for JUnit annotations.
+   *
+   * Converts nested object structures into flat key-value pairs where nested keys
+   * are joined with dots. Dots in original keys are replaced with underscores to
+   * avoid conflicts with the annotation hierarchy.
+   *
+   * @param obj - The object to flatten
+   * @param prefix - Current key prefix for nested properties
+   * @returns Flattened object with dot-notation keys
+   *
+   * @example
+   * ```javascript
+   * flattenObject({ a: { b: 1, "c.d": 2 } })
+   * // Returns: { "a.b": 1, "a.c_d": 2 }
+   *
+   * flattenObject({ metadata: { rationale: "Good", details: { confidence: 0.9 } } }, "score")
+   * // Returns: { "score.metadata.rationale": "Good", "score.metadata.details.confidence": 0.9 }
+   * ```
+   */
+  function flattenObject(obj: any, prefix = ""): Record<string, any> {
+    const flattened: Record<string, any> = {};
+
+    for (const [key, value] of Object.entries(obj)) {
+      // Replace dots in keys with underscores to avoid conflicts with annotation hierarchy
+      const keyNoDots = key.replace(/\./g, "_");
+      const newKey = prefix ? `${prefix}.${keyNoDots}` : keyNoDots;
+
+      if (value !== null && typeof value === "object") {
+        Object.assign(flattened, flattenObject(value, newKey));
+      } else {
+        flattened[newKey] = value;
+      }
+    }
+
+    return flattened;
+  }
+
+  // Annotate scores following the schema: evals.scores.SCORE_NAME
+  for (let i = 0; i < scoresWithName.length; i++) {
+    const score = scoresWithName[i];
+    // Scored with no name are listed as "score_0", "score_1", etc.
+    const scoreName = score.name.replace(/\./g, "_") || `score_${i}`;
+
+    // Required: value
+    testTask.context.annotate(
+      String(score.score ?? ""),
+      `evals.scores.${scoreName}.value`,
+    );
+
+    // Optional: type (infer from score value)
+    if (score.score !== null && score.score !== undefined) {
+      const scoreType = typeof score.score === "boolean" ? "bool" : "float";
+      testTask.context.annotate(scoreType, `evals.scores.${scoreName}.type`);
+    }
+
+    // Optional: llm_judge (if available in metadata)
+    if (score.metadata?.llm_judge) {
+      testTask.context.annotate(
+        score.metadata.llm_judge,
+        `evals.scores.${scoreName}.llm_judge`,
+      );
+    }
+
+    // Optional: metadata fields (flattened)
+    if (score.metadata) {
+      const flattenedMetadata = flattenObject(score.metadata);
+      for (const [key, value] of Object.entries(flattenedMetadata)) {
+        testTask.context.annotate(
+          String(value ?? ""),
+          `evals.scores.${scoreName}.metadata.${key}`,
+        );
+      }
+    }
+  }
+
+  // Annotate toolCalls if present
+  if (toolCalls && toolCalls.length > 0) {
+    for (let i = 0; i < toolCalls.length; i++) {
+      const toolCall = toolCalls[i];
+      const flattenedToolCall = flattenObject(toolCall);
+
+      for (const [key, value] of Object.entries(flattenedToolCall)) {
+        const annotationKey = `evals.toolCalls.${i}.${key}`;
+        testTask.context.annotate(String(value ?? ""), annotationKey);
+      }
+    }
+  }
+}
+
 /**
  * Wraps text to fit within a specified width, breaking at word boundaries.
  *