2 changes: 2 additions & 0 deletions src/ragas/metrics/collections/__init__.py
@@ -3,6 +3,7 @@
from ragas.metrics.collections._answer_relevancy import AnswerRelevancy
from ragas.metrics.collections._answer_similarity import AnswerSimilarity
from ragas.metrics.collections._bleu_score import BleuScore
from ragas.metrics.collections._context_recall import ContextRecall
from ragas.metrics.collections._rouge_score import RougeScore
from ragas.metrics.collections._semantic_similarity import SemanticSimilarity
from ragas.metrics.collections._string import (
@@ -18,6 +19,7 @@
"AnswerRelevancy",
"AnswerSimilarity",
"BleuScore",
"ContextRecall",
"DistanceMeasure",
"ExactMatch",
"NonLLMStringSimilarity",
124 changes: 124 additions & 0 deletions src/ragas/metrics/collections/_context_recall.py
@@ -0,0 +1,124 @@
"""Context Recall metric v2 - Class-based implementation with modern components."""

import typing as t

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.context_recall import context_recall_prompt

if t.TYPE_CHECKING:
from ragas.llms.base import InstructorBaseRagasLLM


class ContextRecallClassification(BaseModel):
"""Structured output for a single statement classification."""

statement: str  # Sentence taken from the reference answer
reason: str  # Explanation for the attribution decision
attributed: int  # 1 if the statement is supported by the context, 0 otherwise


class ContextRecallOutput(BaseModel):
"""Structured output for context recall classifications."""

classifications: t.List[ContextRecallClassification]


class ContextRecall(BaseMetric):
"""
Evaluate context recall by classifying whether statements in the reference can be attributed to the retrieved context.

This implementation uses modern instructor LLMs with structured output.
Only modern components are supported; legacy wrappers are rejected with clear error messages.

Usage:
>>> import instructor
>>> from openai import AsyncOpenAI
>>> from ragas.llms.base import instructor_llm_factory
>>> from ragas.metrics.collections import ContextRecall
>>>
>>> # Setup dependencies
>>> client = AsyncOpenAI()
>>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
>>>
>>> # Create metric instance
>>> metric = ContextRecall(llm=llm)
>>>
>>> # Single evaluation
>>> result = await metric.ascore(
... user_input="What is the capital of France?",
... retrieved_contexts=["Paris is the capital of France."],
... reference="Paris is the capital and largest city of France."
... )
>>> print(f"Score: {result.value}")
>>>
>>> # Batch evaluation
>>> results = await metric.abatch_score([
... {"user_input": "Q1", "retrieved_contexts": ["C1"], "reference": "A1"},
... {"user_input": "Q2", "retrieved_contexts": ["C2"], "reference": "A2"},
... ])

Attributes:
llm: Modern instructor-based LLM for classification
name: The metric name
allowed_values: Score range (0.0 to 1.0)
"""

# Type hints for linter (attributes are set in __init__)
llm: "InstructorBaseRagasLLM"

def __init__(
self,
llm: "InstructorBaseRagasLLM",
name: str = "context_recall",
**kwargs,
):
"""Initialize ContextRecall metric with required components."""
# Set attributes explicitly before calling super()
self.llm = llm

# Call super() for validation
super().__init__(name=name, **kwargs)

async def ascore(
self,
user_input: str,
retrieved_contexts: t.List[str],
reference: str,
) -> MetricResult:
"""
Calculate context recall score asynchronously.

Components are guaranteed to be validated and non-None by the base class.

Args:
user_input: The original question
retrieved_contexts: List of retrieved context strings
reference: The reference answer to evaluate

Returns:
MetricResult with recall score (0.0-1.0)
"""
# Combine contexts into a single string
context = "\n".join(retrieved_contexts) if retrieved_contexts else ""

# Generate prompt
prompt = context_recall_prompt(
question=user_input, context=context, answer=reference
)

# Get classifications from LLM
result = await self.llm.agenerate(prompt, ContextRecallOutput)

# Calculate score
if not result.classifications:
return MetricResult(value=np.nan)

# Count attributions
attributions = [c.attributed for c in result.classifications]
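# Recall = number of attributed statements / total statements (each `attributed` flag is 0 or 1)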
score = sum(attributions) / len(attributions) if attributions else np.nan

return MetricResult(value=float(score))
64 changes: 64 additions & 0 deletions src/ragas/prompt/metrics/context_recall.py
@@ -0,0 +1,64 @@
"""Context Recall prompt for classifying statement attributions."""

import json


def context_recall_prompt(question: str, context: str, answer: str) -> str:
"""
Generate the prompt for context recall evaluation.

Args:
question: The original question
context: The retrieved context to evaluate against
answer: The reference answer containing statements to classify

Returns:
Formatted prompt string for the LLM
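
Example (illustrative values, not from the test suite):
    >>> prompt = context_recall_prompt(
    ...     question="What is the capital of France?",
    ...     context="Paris is the capital and largest city of France.",
    ...     answer="Paris is the capital of France.",
    ... )
    >>> # The prompt instructs the LLM to return JSON with a "classifications"
    >>> # list, which ContextRecall parses into ContextRecallOutput.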
"""
# Use json.dumps() to safely escape the strings
safe_question = json.dumps(question)
safe_context = json.dumps(context)
safe_answer = json.dumps(answer)

return f"""Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason.

--------EXAMPLES-----------
Example 1
Input: {{
"question": "What can you tell me about Albert Einstein?",
"context": "Albert Einstein (14 March 1879 - 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass-energy equivalence formula E = mc2, which arises from relativity theory, has been called 'the world's most famous equation'. He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect', a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.",
"answer": "Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895."
}}
Output: {{
"classifications": [
{{
"statement": "Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.",
"reason": "The date of birth of Einstein is mentioned clearly in the context.",
"attributed": 1
}},
{{
"statement": "He received the 1921 Nobel Prize in Physics for his services to theoretical physics.",
"reason": "The exact sentence is present in the given context.",
"attributed": 1
}},
{{
"statement": "He published 4 papers in 1905.",
"reason": "There is no mention about papers he wrote in the given context.",
"attributed": 0
}},
{{
"statement": "Einstein moved to Switzerland in 1895.",
"reason": "There is no supporting evidence for this in the given context.",
"attributed": 0
}}
]
}}
-----------------------------

Now perform the same with the following input
Input: {{
"question": {safe_question},
"context": {safe_context},
"answer": {safe_answer}
}}
Output: """
158 changes: 158 additions & 0 deletions tests/e2e/metrics_migration/test_context_recall_migration.py
@@ -0,0 +1,158 @@
"""E2E tests for Context Recall metric migration from v1 (class-based) to v2 (class-based with automatic validation)."""

import pytest

from ragas.metrics import LLMContextRecall as LegacyContextRecall
from ragas.metrics.collections import ContextRecall

from .base_migration_test import BaseMigrationTest


class TestContextRecallE2EMigration(BaseMigrationTest):
"""E2E test compatibility between legacy ContextRecall class and new V2 ContextRecall class with automatic validation."""

@pytest.fixture
def sample_data(self):
"""Real-world test cases for context recall evaluation."""
return [
{
"user_input": "What is the capital of France?",
"retrieved_contexts": [
"Paris is the capital and largest city of France.",
"France is a country in Western Europe.",
],
"reference": "Paris is the capital of France. It is located in northern France.",
"description": "Full attribution - all statements should be found in context",
},
{
"user_input": "Tell me about Albert Einstein",
"retrieved_contexts": [
"Albert Einstein was born in 1879. He developed the theory of relativity."
],
"reference": "Einstein was born in 1879. He won the Nobel Prize in 1921. He developed relativity theory.",
"description": "Partial attribution - Nobel Prize not mentioned in context",
},
{
"user_input": "What are the main causes of climate change?",
"retrieved_contexts": [
"Climate change is primarily caused by greenhouse gas emissions from burning fossil fuels.",
"Deforestation also contributes to climate change by reducing CO2 absorption.",
],
"reference": "The main causes include fossil fuel emissions and deforestation.",
"description": "Multiple contexts - all statements attributed",
},
{
"user_input": "How does photosynthesis work?",
"retrieved_contexts": [
"Photosynthesis is a process where plants use sunlight to produce glucose."
],
"reference": "Plants convert sunlight into glucose through photosynthesis. This process also produces oxygen and occurs in chloroplasts.",
"description": "Partial attribution - oxygen and chloroplasts not in context",
},
{
"user_input": "What is quantum computing?",
"retrieved_contexts": [
"Quantum computers use quantum bits or qubits instead of classical bits."
],
"reference": "Quantum computing uses qubits.",
"description": "Simple case - direct attribution",
},
]

@pytest.mark.asyncio
async def test_legacy_context_recall_vs_v2_context_recall_e2e_compatibility(
self,
sample_data,
legacy_llm,
modern_llm,
):
"""E2E test that legacy and v2 implementations produce similar scores with real LLM."""
await self.run_e2e_compatibility_test(
sample_data=sample_data,
legacy_metric_factory=LegacyContextRecall,
v2_metric_factory=ContextRecall,
legacy_components={"llm": legacy_llm},
v2_components={"llm": modern_llm},
tolerance=0.3,
metric_name="Context Recall",
additional_info_keys=["user_input", "reference"],
)

@pytest.mark.asyncio
async def test_context_recall_attribution_detection(self, legacy_llm, modern_llm):
"""Test that both implementations correctly detect statement attributions."""

if legacy_llm is None or modern_llm is None:
pytest.skip("LLM required for E2E testing")

# Test cases specifically for attribution detection
test_cases = [
{
"user_input": "What is the capital of France?",
"retrieved_contexts": ["Paris is the capital of France."],
"reference": "Paris is the capital of France.",
"expected_high": True,
"description": "Perfect attribution - should get high score",
},
{
"user_input": "What is the capital of France?",
"retrieved_contexts": ["France is a European country."],
"reference": "Paris is the capital of France.",
"expected_high": False,
"description": "No attribution - should get low score",
},
{
"user_input": "Tell me about Einstein",
"retrieved_contexts": ["Einstein was born in 1879."],
"reference": "Einstein was born in 1879. He won the Nobel Prize.",
"expected_high": False,
"description": "Partial attribution - should get medium score (50%)",
},
]

# Define custom assertion function
def assertion_fn(case, legacy_score, v2_result):
print(f" Reference: {case['reference']}")

if case.get("expected_high"):
# High attribution should get high scores (> 0.8)
assert legacy_score > 0.8, (
f"Legacy should detect high attribution: {legacy_score}"
)
assert v2_result.value > 0.8, (
f"V2 class should detect high attribution: {v2_result.value}"
)
print(" ✅ All detected high attribution")
else:
# Low/partial attribution should get lower scores
# Note: We don't enforce strict thresholds here as it depends on the specific case
print(
f" ✅ Scores reflect attribution level (Legacy: {legacy_score:.2f}, V2: {v2_result.value:.2f})"
)

await self.run_metric_specific_test(
test_cases=test_cases,
legacy_metric_factory=LegacyContextRecall,
v2_metric_factory=ContextRecall,
legacy_components={"llm": legacy_llm},
v2_components={"llm": modern_llm},
test_name="attribution detection",
assertion_fn=assertion_fn,
)

def test_context_recall_migration_requirements_documented(self):
"""Document the requirements for running full E2E context recall tests."""

requirements = {
"llm": "OpenAI GPT, Anthropic Claude, or other LangChain-compatible LLM",
"environment": "API keys configured for LLM providers",
"purpose": "Verify that v2 class-based implementation with automatic validation produces similar results to legacy class-based implementation",
}

self.create_requirements_documentation(
metric_name="Context Recall",
requirements=requirements,
test_file_name="test_context_recall_migration.py",
)

assert True