diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py
index df095f67ba97..c300552a32bc 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py
@@ -35,6 +35,9 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :type similarity_threshold: int
     :param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
     :type f1_score_threshold: float
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
     :return: A callable class that evaluates and generates metrics for "question-answering" scenario.
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
@@ -87,6 +90,7 @@ def __init__(
         fluency_threshold: int = 3,
         similarity_threshold: int = 3,
         f1_score_threshold: float = 0.5,
+        is_reasoning_model: bool = False,
         **kwargs,
     ):
         # Type checking
@@ -102,11 +106,31 @@
                 raise TypeError(f"{name} must be an int or float, got {type(value)}")
 
         evaluators = [
-            GroundednessEvaluator(model_config, threshold=groundedness_threshold),
-            RelevanceEvaluator(model_config, threshold=relevance_threshold),
-            CoherenceEvaluator(model_config, threshold=coherence_threshold),
-            FluencyEvaluator(model_config, threshold=fluency_threshold),
-            SimilarityEvaluator(model_config, threshold=similarity_threshold),
+            GroundednessEvaluator(
+                model_config,
+                threshold=groundedness_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
+            RelevanceEvaluator(
+                model_config,
+                threshold=relevance_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
+            CoherenceEvaluator(
+                model_config,
+                threshold=coherence_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
+            FluencyEvaluator(
+                model_config,
+                threshold=fluency_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
+            SimilarityEvaluator(
+                model_config,
+                threshold=similarity_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
             F1ScoreEvaluator(threshold=f1_score_threshold),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
index 9bfbc85721eb..4af60eefbca2 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
@@ -9,6 +9,7 @@
     RetrievalEvaluator,
     RelevanceEvaluator,
     GroundednessEvaluator,
+    QAEvaluator,
 )
 
 
@@ -243,3 +244,19 @@ def test_groundedness_evaluator_missing_required_inputs(self, mock_model_config)
             "Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
             in exc_info.value.args[0]
         )
+
+    def test_qa_evaluator_is_reasoning_model_default(self, mock_model_config):
+        """Test QAEvaluator initializes with is_reasoning_model defaulting to False"""
+        qa_eval = QAEvaluator(model_config=mock_model_config)
+        # Check that all model-based evaluators have is_reasoning_model set to False
+        for evaluator in qa_eval._evaluators:
+            if hasattr(evaluator, '_is_reasoning_model'):
+                assert evaluator._is_reasoning_model is False
+
+    def test_qa_evaluator_is_reasoning_model_true(self, mock_model_config):
+        """Test QAEvaluator properly passes is_reasoning_model=True to sub-evaluators"""
+        qa_eval = QAEvaluator(model_config=mock_model_config, is_reasoning_model=True)
+        # Check that all model-based evaluators have is_reasoning_model set to True
+        for evaluator in qa_eval._evaluators:
+            if hasattr(evaluator, '_is_reasoning_model'):
+                assert evaluator._is_reasoning_model is True
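
For review context, here is a minimal usage sketch of the new keyword against this branch. It is an illustration under assumptions, not part of the change: the endpoint, API key, and `o3-mini` deployment name are placeholders standing in for any o1/o3-family reasoning deployment.

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration, QAEvaluator

# Placeholder model configuration; substitute a real endpoint, key, and
# reasoning-model deployment (e.g. an o1/o3 family deployment).
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="o3-mini",  # hypothetical deployment name
)

# is_reasoning_model=True is forwarded to the five model-based sub-evaluators
# (groundedness, relevance, coherence, fluency, similarity); F1ScoreEvaluator
# takes no model config and is unaffected.
qa_evaluator = QAEvaluator(model_config=model_config, is_reasoning_model=True)

result = qa_evaluator(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
    ground_truth="Paris",
)
print(result)
```

Omitting the keyword keeps the previous behavior, since `is_reasoning_model` defaults to False.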