Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
:type similarity_threshold: int
:param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
:type f1_score_threshold: float
:keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
:paramtype is_reasoning_model: bool
:return: A callable class that evaluates and generates metrics for "question-answering" scenario.
:param kwargs: Additional arguments to pass to the evaluator.
:type kwargs: Any
Expand Down Expand Up @@ -87,6 +90,7 @@ def __init__(
fluency_threshold: int = 3,
similarity_threshold: int = 3,
f1_score_threshold: float = 0.5,
is_reasoning_model: bool = False,
**kwargs,
):
# Type checking
Expand All @@ -102,11 +106,31 @@ def __init__(
raise TypeError(f"{name} must be an int or float, got {type(value)}")

evaluators = [
GroundednessEvaluator(model_config, threshold=groundedness_threshold),
RelevanceEvaluator(model_config, threshold=relevance_threshold),
CoherenceEvaluator(model_config, threshold=coherence_threshold),
FluencyEvaluator(model_config, threshold=fluency_threshold),
SimilarityEvaluator(model_config, threshold=similarity_threshold),
GroundednessEvaluator(
model_config,
threshold=groundedness_threshold,
is_reasoning_model=is_reasoning_model,
),
RelevanceEvaluator(
model_config,
threshold=relevance_threshold,
is_reasoning_model=is_reasoning_model,
),
CoherenceEvaluator(
model_config,
threshold=coherence_threshold,
is_reasoning_model=is_reasoning_model,
),
FluencyEvaluator(
model_config,
threshold=fluency_threshold,
is_reasoning_model=is_reasoning_model,
),
SimilarityEvaluator(
model_config,
threshold=similarity_threshold,
is_reasoning_model=is_reasoning_model,
),
F1ScoreEvaluator(threshold=f1_score_threshold),
]
super().__init__(evaluators=evaluators, **kwargs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
RetrievalEvaluator,
RelevanceEvaluator,
GroundednessEvaluator,
QAEvaluator,
)


Expand Down Expand Up @@ -243,3 +244,19 @@ def test_groundedness_evaluator_missing_required_inputs(self, mock_model_config)
"Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
in exc_info.value.args[0]
)

def test_qa_evaluator_is_reasoning_model_default(self, mock_model_config):
    """Test QAEvaluator initializes with is_reasoning_model defaulting to False"""
    qa_eval = QAEvaluator(model_config=mock_model_config)
    # Only sub-evaluators that expose the flag can be inspected; not every
    # sub-evaluator receives it (F1ScoreEvaluator is built with a threshold only).
    flags = [
        evaluator._is_reasoning_model
        for evaluator in qa_eval._evaluators
        if hasattr(evaluator, "_is_reasoning_model")
    ]
    # Guard against a vacuous pass: if nothing exposes the flag, the loop-style
    # check would succeed without verifying anything.
    assert flags, "no sub-evaluator exposes _is_reasoning_model; nothing was verified"
    # Check that all model-based evaluators have is_reasoning_model set to False
    assert all(flag is False for flag in flags), flags

def test_qa_evaluator_is_reasoning_model_true(self, mock_model_config):
    """Test QAEvaluator properly passes is_reasoning_model=True to sub-evaluators"""
    qa_eval = QAEvaluator(model_config=mock_model_config, is_reasoning_model=True)
    # Only sub-evaluators that expose the flag can be inspected; not every
    # sub-evaluator receives it (F1ScoreEvaluator is built with a threshold only).
    flags = [
        evaluator._is_reasoning_model
        for evaluator in qa_eval._evaluators
        if hasattr(evaluator, "_is_reasoning_model")
    ]
    # Guard against a vacuous pass: without this, a regression that stops the
    # flag from reaching any sub-evaluator would leave the test green.
    assert flags, "no sub-evaluator exposes _is_reasoning_model; nothing was verified"
    # Check that all model-based evaluators have is_reasoning_model set to True
    assert all(flag is True for flag in flags), flags
Loading