From ca7286993da4397d40a75f05249f6d37ed21a94b Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 9 Dec 2025 17:21:58 +0000
Subject: [PATCH 1/5] Initial plan

From f5be456b055e98de84b4202b850959a6169dea4e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 9 Dec 2025 17:31:55 +0000
Subject: [PATCH 2/5] Add is_reasoning_model parameter support to QAEvaluator

Co-authored-by: nagkumar91 <4727422+nagkumar91@users.noreply.github.com>
---
 .../ai/evaluation/_evaluators/_qa/_qa.py       | 14 +++++++++-----
 .../unittests/test_built_in_evaluator.py       | 19 +++++++++++++++++++
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py
index df095f67ba97..f582266b1cfb 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py
@@ -35,6 +35,9 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :type similarity_threshold: int
     :param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
     :type f1_score_threshold: float
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
     :return: A callable class that evaluates and generates metrics for "question-answering" scenario.
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
@@ -87,6 +90,7 @@ def __init__(
         fluency_threshold: int = 3,
         similarity_threshold: int = 3,
         f1_score_threshold: float = 0.5,
+        is_reasoning_model: bool = False,
         **kwargs,
     ):
         # Type checking
@@ -102,11 +106,11 @@ def __init__(
             raise TypeError(f"{name} must be an int or float, got {type(value)}")
 
         evaluators = [
-            GroundednessEvaluator(model_config, threshold=groundedness_threshold),
-            RelevanceEvaluator(model_config, threshold=relevance_threshold),
-            CoherenceEvaluator(model_config, threshold=coherence_threshold),
-            FluencyEvaluator(model_config, threshold=fluency_threshold),
-            SimilarityEvaluator(model_config, threshold=similarity_threshold),
+            GroundednessEvaluator(model_config, threshold=groundedness_threshold, is_reasoning_model=is_reasoning_model),
+            RelevanceEvaluator(model_config, threshold=relevance_threshold, is_reasoning_model=is_reasoning_model),
+            CoherenceEvaluator(model_config, threshold=coherence_threshold, is_reasoning_model=is_reasoning_model),
+            FluencyEvaluator(model_config, threshold=fluency_threshold, is_reasoning_model=is_reasoning_model),
+            SimilarityEvaluator(model_config, threshold=similarity_threshold, is_reasoning_model=is_reasoning_model),
             F1ScoreEvaluator(threshold=f1_score_threshold),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
index 9bfbc85721eb..bb0d276b0b30 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
@@ -9,6 +9,7 @@
     RetrievalEvaluator,
     RelevanceEvaluator,
     GroundednessEvaluator,
+    QAEvaluator,
 )
 
 
@@ -243,3 +244,21 @@ def test_groundedness_evaluator_missing_required_inputs(self, mock_model_config)
             "Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
             in exc_info.value.args[0]
         )
+
+    def test_qa_evaluator_is_reasoning_model_default(self, mock_model_config):
+        """Test QAEvaluator initializes with is_reasoning_model defaulting to False"""
+        qa_eval = QAEvaluator(model_config=mock_model_config)
+
+        # Check that all model-based evaluators have is_reasoning_model set to False
+        for evaluator in qa_eval._evaluators:
+            if hasattr(evaluator, '_is_reasoning_model'):
+                assert evaluator._is_reasoning_model is False
+
+    def test_qa_evaluator_is_reasoning_model_true(self, mock_model_config):
+        """Test QAEvaluator properly passes is_reasoning_model=True to sub-evaluators"""
+        qa_eval = QAEvaluator(model_config=mock_model_config, is_reasoning_model=True)
+
+        # Check that all model-based evaluators have is_reasoning_model set to True
+        for evaluator in qa_eval._evaluators:
+            if hasattr(evaluator, '_is_reasoning_model'):
+                assert evaluator._is_reasoning_model is True

From 89aa78575b7a161cbbc167b38429e6314f06a7ff Mon Sep 17 00:00:00 2001
From: Nagkumar Arkalgud
Date: Tue, 9 Dec 2025 15:24:08 -0800
Subject: [PATCH 3/5] Update
 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../tests/unittests/test_built_in_evaluator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
index bb0d276b0b30..72a7908754ad 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
@@ -248,7 +248,6 @@ def test_groundedness_evaluator_missing_required_inputs(self, mock_model_config)
     def test_qa_evaluator_is_reasoning_model_default(self, mock_model_config):
         """Test QAEvaluator initializes with is_reasoning_model defaulting to False"""
         qa_eval = QAEvaluator(model_config=mock_model_config)
-
         # Check that all model-based evaluators have is_reasoning_model set to False
         for evaluator in qa_eval._evaluators:
             if hasattr(evaluator, '_is_reasoning_model'):

From fb69e37725626f01f7e9dc6f19ce9711e8a5a61f Mon Sep 17 00:00:00 2001
From: Nagkumar Arkalgud
Date: Tue, 9 Dec 2025 15:24:18 -0800
Subject: [PATCH 4/5] Update
 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../tests/unittests/test_built_in_evaluator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
index 72a7908754ad..4af60eefbca2 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
@@ -256,7 +256,6 @@ def test_qa_evaluator_is_reasoning_model_default(self, mock_model_config):
     def test_qa_evaluator_is_reasoning_model_true(self, mock_model_config):
         """Test QAEvaluator properly passes is_reasoning_model=True to sub-evaluators"""
         qa_eval = QAEvaluator(model_config=mock_model_config, is_reasoning_model=True)
-
         # Check that all model-based evaluators have is_reasoning_model set to True
         for evaluator in qa_eval._evaluators:
             if hasattr(evaluator, '_is_reasoning_model'):

From 363bec4db9e63bce4a74fb4c635c11fd4e3f1419 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 9 Dec 2025 23:30:32 +0000
Subject: [PATCH 5/5] Apply black formatting to _qa.py evaluators list

Co-authored-by: nagkumar91 <4727422+nagkumar91@users.noreply.github.com>
---
 .../ai/evaluation/_evaluators/_qa/_qa.py | 30 +++++++++++++++----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py
index f582266b1cfb..c300552a32bc 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_qa/_qa.py
@@ -106,11 +106,31 @@ def __init__(
             raise TypeError(f"{name} must be an int or float, got {type(value)}")
 
         evaluators = [
-            GroundednessEvaluator(model_config, threshold=groundedness_threshold, is_reasoning_model=is_reasoning_model),
-            RelevanceEvaluator(model_config, threshold=relevance_threshold, is_reasoning_model=is_reasoning_model),
-            CoherenceEvaluator(model_config, threshold=coherence_threshold, is_reasoning_model=is_reasoning_model),
-            FluencyEvaluator(model_config, threshold=fluency_threshold, is_reasoning_model=is_reasoning_model),
-            SimilarityEvaluator(model_config, threshold=similarity_threshold, is_reasoning_model=is_reasoning_model),
+            GroundednessEvaluator(
+                model_config,
+                threshold=groundedness_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
+            RelevanceEvaluator(
+                model_config,
+                threshold=relevance_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
+            CoherenceEvaluator(
+                model_config,
+                threshold=coherence_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
+            FluencyEvaluator(
+                model_config,
+                threshold=fluency_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
+            SimilarityEvaluator(
+                model_config,
+                threshold=similarity_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
             F1ScoreEvaluator(threshold=f1_score_threshold),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
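
Usage note (not part of the patches above): a minimal sketch of how the new flag would be exercised end to end. The endpoint, API key, and deployment values are placeholders, and the query/response/context/ground_truth fields are QAEvaluator's standard inputs; the behavior attributed to is_reasoning_model below is taken from the docstring added in patch 2.

from azure.ai.evaluation import AzureOpenAIModelConfiguration, QAEvaluator

# Placeholder configuration -- substitute your own Azure OpenAI resource values.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<your-api-key>",
    azure_deployment="<your-reasoning-model-deployment>",  # e.g. an o1/o3 deployment
)

# Per the patch, is_reasoning_model=True is forwarded to every prompt-based
# sub-evaluator (groundedness, relevance, coherence, fluency, similarity),
# adjusting parameters like max_completion_tokens and dropping unsupported
# ones; the prompt-free F1ScoreEvaluator is unaffected.
qa_eval = QAEvaluator(model_config=model_config, is_reasoning_model=True)

result = qa_eval(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
    ground_truth="Paris",
)
print(result)

This mirrors what the new unit tests assert: the flag set on QAEvaluator propagates to each sub-evaluator's _is_reasoning_model attribute rather than being configured per evaluator.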