Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
2b9e725
Add evaluate script
amrit110 Feb 10, 2026
1febe82
Add temperature settings to config
amrit110 Feb 10, 2026
2f07905
Add ignore
amrit110 Feb 10, 2026
6947837
Remove resume functionality
amrit110 Feb 10, 2026
b346420
Refactor judges and evaluate script
amrit110 Feb 11, 2026
aff5fbb
Merge branch 'main' into ak/add_evaluate_script
amrit110 Feb 11, 2026
39e89c8
Fixes based on code review
amrit110 Feb 11, 2026
4af5ff7
Merge branch 'ak/add_evaluate_script' of github.com:VectorInstitute/e…
amrit110 Feb 11, 2026
62e6097
Merge branch 'main' into ak/add_evaluate_script
amrit110 Feb 11, 2026
45ddeb1
Small fixes to docstring
amrit110 Feb 11, 2026
0ead41c
Merge branch 'ak/add_evaluate_script' of github.com:VectorInstitute/e…
amrit110 Feb 11, 2026
6dfc069
Small fix
amrit110 Feb 11, 2026
a73eeff
Merge branch 'main' into ak/add_evaluate_script
amrit110 Feb 13, 2026
c912781
Fix and simplify grader
amrit110 Feb 13, 2026
c7bd2cc
Merge branch 'ak/add_evaluate_script' of github.com:VectorInstitute/e…
amrit110 Feb 13, 2026
43171bc
Merge branch 'main' into ak/add_evaluate_script
amrit110 Feb 17, 2026
3be5c8a
Merge branch 'main' into ak/add_evaluate_script
amrit110 Feb 17, 2026
df01dcc
Merge branch 'main' into ak/add_evaluate_script
amrit110 Feb 17, 2026
5a08491
Remove mypy ignore
amrit110 Feb 17, 2026
e9cccb1
Merge branch 'ak/add_evaluate_script' of github.com:VectorInstitute/e…
amrit110 Feb 17, 2026
06b536f
Merge branch 'main' into ak/add_evaluate_script
amrit110 Feb 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,18 @@ class Configs(BaseSettings):
default="gemini-2.5-pro",
description="Model name for LLM-as-judge evaluation tasks.",
)
default_temperature: float = Field(
default=1.0,
ge=0.0,
le=2.0,
description="Default temperature for LLM generation. Lower values (0.0-0.3) produce more consistent outputs.",
)
default_evaluator_temperature: float = Field(
default=0.0,
ge=0.0,
le=2.0,
description="Temperature for LLM-as-judge evaluations. Default 0.0 for deterministic judging.",
)

# === Tracing (Langfuse) ===
langfuse_public_key: str | None = Field(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,365 @@
"""DeepSearchQA grader for evaluating knowledge agent responses.

This module provides the official DeepSearchQA evaluation methodology using
an LLM grader (autorater) to assess answer correctness with precision, recall,
and F1 metrics.

References
----------
- Paper: DeepSearchQA: Bridging the Comprehensiveness Gap for Deep Research Agents
- Dataset: https://huggingface.co/datasets/google/deepsearchqa
- Leaderboard: https://www.kaggle.com/benchmarks/google/dsqa
"""

import logging
from enum import Enum
from typing import Any

from aieng.agent_evals.async_client_manager import AsyncClientManager
from aieng.agent_evals.evaluation.graders._utils import run_structured_parse_call
from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig
from aieng.agent_evals.evaluation.types import Evaluation
from pydantic import BaseModel, Field


logger = logging.getLogger(__name__)


class EvaluationOutcome(str, Enum):
"""Possible outcomes for DeepSearchQA evaluation.

The four disjoint categories represent the relationship between
the submitted answer set (S) and ground truth set (G).
"""

FULLY_CORRECT = "fully_correct"
CORRECT_WITH_EXTRANEOUS = "correct_with_extraneous"
PARTIALLY_CORRECT = "partially_correct"
FULLY_INCORRECT = "fully_incorrect"


class DeepSearchQAResult(BaseModel):
"""Result from DeepSearchQA evaluation with IR metrics.

This follows the official DeepSearchQA evaluation methodology from:
https://www.kaggle.com/benchmarks/google/dsqa
"""

precision: float = Field(
default=0.0, description="Fraction of predicted items that are correct (0-1). P = |S ∩ G| / |S|"
)
recall: float = Field(
default=0.0, description="Fraction of ground truth items that were found (0-1). R = |S ∩ G| / |G|"
)
f1_score: float = Field(
default=0.0, description="Harmonic mean of precision and recall (0-1). F1 = 2 * P * R / (P + R)"
)
outcome: EvaluationOutcome = Field(
default=EvaluationOutcome.FULLY_INCORRECT,
description="Evaluation outcome indicating the relationship between submitted and ground truth answer sets",
)
correctness_details: dict[str, bool] = Field(
default_factory=dict, description="For each ground truth item, whether it was found in the response"
)
extraneous_items: list[str] = Field(
default_factory=list, description="Items in the response that are not in the ground truth"
)
explanation: str = Field(default="", description="Explanation from the grader about the evaluation")

def to_evaluations(self) -> list[Evaluation]:
"""Convert this result to Langfuse Evaluation objects.

Returns
-------
list[Evaluation]
Four evaluations: Outcome (categorical), F1, Precision, Recall (numeric).
"""
comment_parts = [
f"Outcome: {self.outcome}",
f"Precision: {self.precision:.2f}",
f"Recall: {self.recall:.2f}",
f"F1: {self.f1_score:.2f}",
]

if self.explanation:
comment_parts.append(f"\nExplanation: {self.explanation}")

if self.correctness_details:
found = sum(1 for v in self.correctness_details.values() if v)
total = len(self.correctness_details)
comment_parts.append(f"\nCorrectness: {found}/{total} items found")

if self.extraneous_items:
comment_parts.append(f"\nExtraneous: {len(self.extraneous_items)} items")

comment = "\n".join(comment_parts)

outcome_display = {
EvaluationOutcome.FULLY_CORRECT: "Fully Correct",
EvaluationOutcome.CORRECT_WITH_EXTRANEOUS: "Correct with Extraneous",
EvaluationOutcome.PARTIALLY_CORRECT: "Partially Correct",
EvaluationOutcome.FULLY_INCORRECT: "Fully Incorrect",
}

return [
Evaluation(
name="Outcome",
value=outcome_display.get(self.outcome, self.outcome.value),
comment=self.explanation,
),
Evaluation(name="F1", value=self.f1_score, comment=comment),
Evaluation(name="Precision", value=self.precision, comment=comment),
Evaluation(name="Recall", value=self.recall, comment=comment),
]

@staticmethod
def error_evaluations(error_msg: str) -> list[Evaluation]:
"""Create error evaluations when evaluation fails.

Parameters
----------
error_msg : str
Description of the error that occurred.

Returns
-------
list[Evaluation]
Three evaluations (F1, Precision, Recall) all set to 0.0.
"""
comment = f"Evaluation error: {error_msg}"
return [
Evaluation(name="F1", value=0.0, comment=comment),
Evaluation(name="Precision", value=0.0, comment=comment),
Evaluation(name="Recall", value=0.0, comment=comment),
]


class DeepSearchQAGraderResponse(BaseModel):
"""Structured response from the DeepSearchQA grader.

This matches the official DeepSearchQA grader output format.
"""

answer_correctness: dict[str, Any] = Field(
alias="Answer Correctness",
description=(
"Dictionary containing: "
"Explanation (str) - Explanation of the evaluation; "
"Correctness Details (dict[str, bool]) - Per-item correctness; "
"Excessive Answers (list[str]) - Extra items not in ground truth"
),
)


# Official DeepSearchQA grader prompt from Appendix A of the paper
DEEPSEARCHQA_GRADER_PROMPT = """\
Your task is to evaluate whether a given "AI Response" for a specific "User Prompt"
arrived at the correct answer.

**Answer Correctness Task**
* **Purpose:** Assess whether the AI response provides the correct answer(s) based on
the provided "Correct Answer" and "Prompt Type".
* **Process:**
* Identify the "Prompt Type": "{prompt_type}".
* Refer to the "Correct Answer": "{answer}".
* Based on the "Prompt Type", determine if the "AI Response" contains the expected
answer(s).
* **'Single Answer'**: Check if the response provides the answer that addresses
the user's question. It does not have to match the exact wording of the provided
answer.
* **'Set Answer'**: Check if the response includes *each* item from the provided
ground truth answers. The order might not matter unless specified otherwise. The
response might include more answers than the list. Determine the correctness *
only* based on the list first and then check if the response includes answers not
in the list.
* **Explanation:** Provide a brief explanation justifying your assessment of answer
correctness, referencing specific parts of the AI response and the correct answer.
* **Correctness Details:** Provide a dictionary, one key for each expected answer
part, and value is a boolean indicating whether each expected answer part was found.
* For 'Set Answer', this will be a list of attributes, one for each item/part in
the "Correct Answer". Each key will be a string indicating the expected answer
part, and the value will be a boolean indicating whether that part was found in
the response.
* **Excessive Answers:** Provide a list of strings, each indicating an excessive
answer part. If the response provides answers that are **not** in the "Correct Answer
" list, add these answers as excessive answers. Return an empty list when there's no
excessive answers in the response.

**Output Format:**
Your evaluation *must* be structured as a nested JSON dictionary with the following top-
level keys: '"Answer Correctness"'. Please return NULL if any of "Prompt", "AI Response"
or "Correct Answer" is empty.
The value for '"Answer Correctness"' should be a dictionary containing '"Explanation"' (
a string), '"Correctness Details"' (a dictionary where each key is the expected correct
answer, and the value is a boolean indicating whether the response contains the correct
answer), and '"Excessive Answers"' (a list of strings indicating the excessive answers).

Make sure you return a valid JSON string. Pay special attention to quotes, commas and
special characters in the JSON string. Make sure to escape all special characters and
quotes in the JSON string.

**Example (Partial):**
```json
{{
"Answer Correctness": {{
"Explanation": "The response correctly identified Belgium and France but also
includes an excessive answer, Italy.",
"Correctness Details": {{
"Belgium": true,
"France": true
}},
"Excessive Answers": [ "Italy" ]
}}
}}
```

**Now, proceed with the evaluation using the provided User Prompt, AI Response, and
Correct Answer.**

User Prompt (Wrapped in <prompt> and </prompt>):
<prompt>
{prompt}
</prompt>
--------------------
** Correct Answer (Wrapped in <answer> and </answer>):
Prompt Type: {prompt_type}
<answer>
{answer}
</answer>
--------------------
AI assistant response (Wrapped in <response> and </response>):
<response>
{response}
</response>
--------------------
Rating:
"""


def _calculate_metrics_from_grader(
grader_result: dict[str, Any],
) -> DeepSearchQAResult:
"""Calculate precision, recall, F1 from grader output.

This follows the exact methodology from the paper:
- Precision = |S∩G| / |S|
- Recall = |S∩G| / |G|
- F1 = 2*P*R / (P+R)

Parameters
----------
grader_result : dict
Output from the LLM grader with Correctness Details and Excessive Answers.

Returns
-------
DeepSearchQAResult
Computed metrics and classifications.
"""
correctness_details = grader_result.get("Correctness Details", {})
extraneous_items = grader_result.get("Excessive Answers", [])
explanation = grader_result.get("Explanation", "")

# Count matched ground truth items
num_ground_truth = len(correctness_details)
num_matched = sum(1 for v in correctness_details.values() if v)
num_extraneous = len(extraneous_items)

# Total predicted items = matched + extraneous
num_predicted = num_matched + num_extraneous

# Calculate metrics
precision = num_matched / num_predicted if num_predicted > 0 else 0.0
recall = num_matched / num_ground_truth if num_ground_truth > 0 else 1.0
f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0

# Determine outcome based on set relationships
if num_matched == num_ground_truth and num_extraneous == 0:
outcome = EvaluationOutcome.FULLY_CORRECT
elif num_matched == num_ground_truth and num_extraneous > 0:
outcome = EvaluationOutcome.CORRECT_WITH_EXTRANEOUS
elif num_matched > 0:
outcome = EvaluationOutcome.PARTIALLY_CORRECT
else:
outcome = EvaluationOutcome.FULLY_INCORRECT

return DeepSearchQAResult(
precision=precision,
recall=recall,
f1_score=f1_score,
outcome=outcome,
correctness_details=correctness_details,
extraneous_items=extraneous_items,
explanation=explanation,
)


async def evaluate_deepsearchqa_async(
*,
question: str,
answer: str,
ground_truth: str,
answer_type: str = "Single Answer",
model_config: LLMRequestConfig | None = None,
) -> DeepSearchQAResult:
"""Evaluate an answer using DeepSearchQA methodology.

This async evaluator uses shared infrastructure for LLM evaluation.

Parameters
----------
question : str
The original question.
answer : str
The agent's answer.
ground_truth : str
The expected ground truth answer.
answer_type : str
Type of answer: "Single Answer" or "Set Answer".
model_config : LLMRequestConfig | None, optional
Optional model configuration. If None, defaults are used.

Returns
-------
DeepSearchQAResult
The evaluation result with precision, recall, and F1 metrics.
"""
config = model_config or LLMRequestConfig()
client_manager = AsyncClientManager.get_instance()

# Build the grader prompt
user_prompt = DEEPSEARCHQA_GRADER_PROMPT.format(
prompt=question,
response=answer,
answer=ground_truth,
prompt_type=answer_type,
)

try:
completion = await run_structured_parse_call(
openai_client=client_manager.openai_client,
default_model=client_manager.configs.default_evaluator_model,
model_config=config,
system_prompt="", # DeepSearchQA uses all instructions in user prompt
user_prompt=user_prompt,
response_format=DeepSearchQAGraderResponse,
)

grader_response: DeepSearchQAGraderResponse | None = completion.choices[0].message.parsed

if grader_response is None:
raise ValueError("Grader returned null response")

return _calculate_metrics_from_grader(grader_response.answer_correctness)

except Exception as e:
logger.warning(f"Failed to evaluate with DeepSearchQA grader: {e}")
return DeepSearchQAResult(
precision=0.0,
recall=0.0,
f1_score=0.0,
outcome=EvaluationOutcome.FULLY_INCORRECT,
correctness_details={},
extraneous_items=[],
explanation=f"Grader error: {e}",
)
Loading