From d36d6bd77cd0d38da63dcc72af5894deec3adf39 Mon Sep 17 00:00:00 2001 From: "w.zhang" <136773451+weizhang25@users.noreply.github.com> Date: Fri, 27 Feb 2026 07:31:35 +0800 Subject: [PATCH 1/3] Expose templates of LLM grader instances and default templates of LLM grader classes. (#124) * Expose templates of grader instances and default templates of grader classes. * Resolve code review feedbacks. * Update function argument type annotation. --- .../event_interpretation/event_analysis.py | 4 +- .../event_identification.py | 4 +- .../characteristics_analysis.py | 4 +- .../industry_research/risk_analysis.py | 4 +- .../underlying_comparison.py | 4 +- .../macro_analysis/concept_explanation.py | 4 +- .../macro_analysis/macro_analysis.py | 4 +- .../stock_analysis/fundamental_analysis.py | 4 +- .../stock_analysis/overall_logic.py | 4 +- .../stock_analysis/stock_risk_analysis.py | 4 +- .../stock_analysis/valuation_analysis.py | 4 +- .../stock_search/search_integrity.py | 4 +- .../stock_search/search_relevance.py | 4 +- .../stock_search/search_timeliness.py | 4 +- .../graders/agent/action/action_alignment.py | 8 ++-- .../graders/agent/memory/memory_accuracy.py | 6 ++- .../memory/memory_detail_preservation.py | 6 ++- .../memory/memory_retrieval_effectiveness.py | 4 +- .../graders/agent/plan/plan_feasibility.py | 6 ++- .../agent/reflection/reflection_accuracy.py | 6 ++- .../reflection_outcome_understanding.py | 6 ++- .../reflection_progress_awareness.py | 6 ++- .../graders/agent/tool/tool_call_accuracy.py | 6 ++- .../graders/agent/tool/tool_call_success.py | 6 ++- .../agent/tool/tool_parameter_check.py | 6 ++- .../graders/agent/tool/tool_selection.py | 6 ++- .../agent/trajectory/trajectory_accuracy.py | 6 ++- .../trajectory/trajectory_comprehensive.py | 6 ++- openjudge/graders/common/correctness.py | 4 +- openjudge/graders/common/hallucination.py | 6 ++- openjudge/graders/common/harmfulness.py | 4 +- .../graders/common/instruction_following.py | 4 +- 
openjudge/graders/common/relevance.py | 4 +- openjudge/graders/llm_grader.py | 16 +++++++ .../multi_turn/anaphora_resolution_grader.py | 4 +- .../multi_turn/context_memory_grader.py | 4 +- .../instruction_clarification_grader.py | 4 +- .../proactive_interaction_grader.py | 4 +- .../multi_turn/response_repetition_grader.py | 4 +- .../multi_turn/self_correction_grader.py | 4 +- .../graders/multi_turn/topic_switch_grader.py | 4 +- .../graders/multimodal/image_coherence.py | 6 ++- .../graders/multimodal/image_helpfulness.py | 6 ++- .../agent/action/test_action_alignment.py | 28 +++++++++--- tests/graders/test_llm_grader.py | 45 +++++++++++++++---- 45 files changed, 218 insertions(+), 73 deletions(-) diff --git a/cookbooks/finance_grader/event_interpretation/event_analysis.py b/cookbooks/finance_grader/event_interpretation/event_analysis.py index 052696366..45fc33f28 100644 --- a/cookbooks/finance_grader/event_interpretation/event_analysis.py +++ b/cookbooks/finance_grader/event_interpretation/event_analysis.py @@ -219,6 +219,8 @@ class EventAnalysisGrader(LLMGrader): >>> print(result.rank) # [2, 1] means answer_2 is better """ + DEFAULT_TEMPLATE = DEFAULT_EVENT_ANALYSIS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -241,7 +243,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate financial event analysis quality by comparing two responses", model=model, - template=template or DEFAULT_EVENT_ANALYSIS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/event_interpretation/event_identification.py b/cookbooks/finance_grader/event_interpretation/event_identification.py index fd0cba3f6..b49436071 100644 --- a/cookbooks/finance_grader/event_interpretation/event_identification.py +++ b/cookbooks/finance_grader/event_interpretation/event_identification.py @@ -196,6 +196,8 @@ class EventIdentificationGrader(LLMGrader): >>> print(result.rank) # [2, 1] means answer_2 is 
better """ + DEFAULT_TEMPLATE = DEFAULT_EVENT_IDENTIFICATION_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -218,7 +220,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate financial event identification quality by comparing two responses", model=model, - template=template or DEFAULT_EVENT_IDENTIFICATION_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/industry_research/characteristics_analysis.py b/cookbooks/finance_grader/industry_research/characteristics_analysis.py index dcc03211b..bed67806c 100644 --- a/cookbooks/finance_grader/industry_research/characteristics_analysis.py +++ b/cookbooks/finance_grader/industry_research/characteristics_analysis.py @@ -216,6 +216,8 @@ class CharacteristicsAnalysisGrader(LLMGrader): >>> print(result.rank) # [2, 1] if answer_2 is better """ + DEFAULT_TEMPLATE = DEFAULT_CHARACTERISTICS_ANALYSIS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -236,7 +238,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate industry characteristics analysis quality by comparing two responses", model=model, - template=template or DEFAULT_CHARACTERISTICS_ANALYSIS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/industry_research/risk_analysis.py b/cookbooks/finance_grader/industry_research/risk_analysis.py index 9deb46d7d..0cfe01897 100644 --- a/cookbooks/finance_grader/industry_research/risk_analysis.py +++ b/cookbooks/finance_grader/industry_research/risk_analysis.py @@ -208,6 +208,8 @@ class RiskAnalysisGrader(LLMGrader): >>> print(result.rank) # [2, 1] if answer_2 is better """ + DEFAULT_TEMPLATE = DEFAULT_RISK_ANALYSIS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -228,7 +230,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate financial risk analysis quality by comparing two responses", 
model=model, - template=template or DEFAULT_RISK_ANALYSIS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/industry_research/underlying_comparison.py b/cookbooks/finance_grader/industry_research/underlying_comparison.py index 635010e63..0db78e8b8 100644 --- a/cookbooks/finance_grader/industry_research/underlying_comparison.py +++ b/cookbooks/finance_grader/industry_research/underlying_comparison.py @@ -212,6 +212,8 @@ class UnderlyingComparisonGrader(LLMGrader): >>> print(result.rank) # [2, 1] if answer_2 is better """ + DEFAULT_TEMPLATE = DEFAULT_UNDERLYING_COMPARISON_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -232,7 +234,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate underlying comparison analysis quality by comparing two responses", model=model, - template=template or DEFAULT_UNDERLYING_COMPARISON_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/macro_analysis/concept_explanation.py b/cookbooks/finance_grader/macro_analysis/concept_explanation.py index 57011b681..3e4fe1f22 100644 --- a/cookbooks/finance_grader/macro_analysis/concept_explanation.py +++ b/cookbooks/finance_grader/macro_analysis/concept_explanation.py @@ -187,6 +187,8 @@ class ConceptExplanationGrader(LLMGrader): >>> print(result.rank) # [2, 1] if answer_2 is better """ + DEFAULT_TEMPLATE = DEFAULT_CONCEPT_EXPLANATION_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -207,7 +209,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate macroeconomic concept explanation quality by comparing two responses", model=model, - template=template or DEFAULT_CONCEPT_EXPLANATION_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/macro_analysis/macro_analysis.py 
b/cookbooks/finance_grader/macro_analysis/macro_analysis.py index 246557493..f9f99bd4c 100644 --- a/cookbooks/finance_grader/macro_analysis/macro_analysis.py +++ b/cookbooks/finance_grader/macro_analysis/macro_analysis.py @@ -214,6 +214,8 @@ class MacroAnalysisGrader(LLMGrader): >>> print(result.rank) # [2, 1] if answer_2 is better """ + DEFAULT_TEMPLATE = DEFAULT_MACRO_ANALYSIS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -234,7 +236,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate macroeconomic analysis quality by comparing two responses", model=model, - template=template or DEFAULT_MACRO_ANALYSIS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/stock_analysis/fundamental_analysis.py b/cookbooks/finance_grader/stock_analysis/fundamental_analysis.py index 2fab5e7b9..26394fad8 100644 --- a/cookbooks/finance_grader/stock_analysis/fundamental_analysis.py +++ b/cookbooks/finance_grader/stock_analysis/fundamental_analysis.py @@ -212,6 +212,8 @@ class FundamentalAnalysisGrader(LLMGrader): >>> print(result.rank) # [2, 1] if answer_2 is better """ + DEFAULT_TEMPLATE = DEFAULT_FUNDAMENTAL_ANALYSIS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -232,7 +234,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate fundamental analysis quality by comparing two responses", model=model, - template=template or DEFAULT_FUNDAMENTAL_ANALYSIS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/stock_analysis/overall_logic.py b/cookbooks/finance_grader/stock_analysis/overall_logic.py index 0e14ae8ec..2b5d01b3a 100644 --- a/cookbooks/finance_grader/stock_analysis/overall_logic.py +++ b/cookbooks/finance_grader/stock_analysis/overall_logic.py @@ -204,6 +204,8 @@ class OverallLogicGrader(LLMGrader): >>> print(result.rank) # [2, 1] if answer_2 is better """ + 
DEFAULT_TEMPLATE = DEFAULT_OVERALL_LOGIC_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -224,7 +226,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate overall logic and structure quality by comparing two responses", model=model, - template=template or DEFAULT_OVERALL_LOGIC_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/stock_analysis/stock_risk_analysis.py b/cookbooks/finance_grader/stock_analysis/stock_risk_analysis.py index d8cf8af62..c7c79976d 100644 --- a/cookbooks/finance_grader/stock_analysis/stock_risk_analysis.py +++ b/cookbooks/finance_grader/stock_analysis/stock_risk_analysis.py @@ -211,6 +211,8 @@ class StockRiskAnalysisGrader(LLMGrader): >>> print(result.rank) # [2, 1] if answer_2 is better """ + DEFAULT_TEMPLATE = DEFAULT_STOCK_RISK_ANALYSIS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -231,7 +233,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate stock risk analysis quality by comparing two responses", model=model, - template=template or DEFAULT_STOCK_RISK_ANALYSIS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/stock_analysis/valuation_analysis.py b/cookbooks/finance_grader/stock_analysis/valuation_analysis.py index 0a0b14759..ee81de527 100644 --- a/cookbooks/finance_grader/stock_analysis/valuation_analysis.py +++ b/cookbooks/finance_grader/stock_analysis/valuation_analysis.py @@ -202,6 +202,8 @@ class ValuationAnalysisGrader(LLMGrader): >>> print(result.rank) # [2, 1] if answer_2 is better """ + DEFAULT_TEMPLATE = DEFAULT_VALUATION_ANALYSIS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -222,7 +224,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate valuation analysis quality by comparing two responses", model=model, - template=template or DEFAULT_VALUATION_ANALYSIS_TEMPLATE, + 
template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/stock_search/search_integrity.py b/cookbooks/finance_grader/stock_search/search_integrity.py index 82a86c6aa..05c488c53 100644 --- a/cookbooks/finance_grader/stock_search/search_integrity.py +++ b/cookbooks/finance_grader/stock_search/search_integrity.py @@ -187,6 +187,8 @@ class SearchIntegrityGrader(LLMGrader): >>> print(result.rank) # [2, 1] if answer_2 is better """ + DEFAULT_TEMPLATE = DEFAULT_SEARCH_INTEGRITY_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -207,7 +209,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate stock search integrity and completeness by comparing two responses", model=model, - template=template or DEFAULT_SEARCH_INTEGRITY_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/stock_search/search_relevance.py b/cookbooks/finance_grader/stock_search/search_relevance.py index 220be351a..b86c406ea 100644 --- a/cookbooks/finance_grader/stock_search/search_relevance.py +++ b/cookbooks/finance_grader/stock_search/search_relevance.py @@ -187,6 +187,8 @@ class SearchRelevanceGrader(LLMGrader): >>> print(result.rank) # [2, 1] if answer_2 is better """ + DEFAULT_TEMPLATE = DEFAULT_SEARCH_RELEVANCE_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -207,7 +209,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate stock search relevance by comparing two responses", model=model, - template=template or DEFAULT_SEARCH_RELEVANCE_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/cookbooks/finance_grader/stock_search/search_timeliness.py b/cookbooks/finance_grader/stock_search/search_timeliness.py index 7282634de..eba4a04ac 100644 --- a/cookbooks/finance_grader/stock_search/search_timeliness.py +++ 
b/cookbooks/finance_grader/stock_search/search_timeliness.py @@ -187,6 +187,8 @@ class SearchTimelinessGrader(LLMGrader): >>> print(result.rank) # [2, 1] if answer_2 is better """ + DEFAULT_TEMPLATE = DEFAULT_SEARCH_TIMELINESS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -207,7 +209,7 @@ def __init__( mode=GraderMode.LISTWISE, description="Evaluate stock search timeliness by comparing two responses", model=model, - template=template or DEFAULT_SEARCH_TIMELINESS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py index 8a1f009be..ac802ac7b 100644 --- a/openjudge/graders/agent/action/action_alignment.py +++ b/openjudge/graders/agent/action/action_alignment.py @@ -170,10 +170,12 @@ class ActionAlignmentGrader(LLMGrader): >>> print(f"Score: {result.score}") # Expected: 1.0 """ + DEFAULT_TEMPLATE = DEFAULT_ACTION_ALIGNMENT_TEMPLATE + def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_ACTION_ALIGNMENT_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): @@ -183,7 +185,7 @@ def __init__( Args: model: The chat model to use for evaluation, either as a BaseChatModel instance or config dict template: The prompt template for action alignment evaluation. - Defaults to DEFAULT_ACTION_ALIGNMENT_TEMPLATE. + Defaults to DEFAULT_TEMPLATE. language: The language for the evaluation prompt. Defaults to LanguageEnum.EN. strategy: The evaluation strategy to use. Defaults to DirectStrategy. 
""" @@ -192,7 +194,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate action alignment with plan", model=model, - template=template or DEFAULT_ACTION_ALIGNMENT_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py index ae9c72602..22aa2b0f1 100644 --- a/openjudge/graders/agent/memory/memory_accuracy.py +++ b/openjudge/graders/agent/memory/memory_accuracy.py @@ -170,10 +170,12 @@ class MemoryAccuracyGrader(LLMGrader): >>> print(f"Score: {result.score}") # Expected: 1.0 """ + DEFAULT_TEMPLATE = DEFAULT_MEMORY_ACCURACY_TEMPLATE + def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_MEMORY_ACCURACY_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): @@ -192,7 +194,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate memory accuracy", model=model, - template=template or DEFAULT_MEMORY_ACCURACY_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/memory/memory_detail_preservation.py b/openjudge/graders/agent/memory/memory_detail_preservation.py index 6a31da228..4bc7b2707 100644 --- a/openjudge/graders/agent/memory/memory_detail_preservation.py +++ b/openjudge/graders/agent/memory/memory_detail_preservation.py @@ -170,10 +170,12 @@ class MemoryDetailPreservationGrader(LLMGrader): >>> print(f"Score: {result.score}") # Expected: 1.0 """ + DEFAULT_TEMPLATE = DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE + def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy 
| None = None, ): @@ -192,7 +194,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate memory detail preservation", model=model, - template=template or DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py index a3dd622d0..0a4f97b53 100644 --- a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py +++ b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py @@ -172,6 +172,8 @@ class MemoryRetrievalEffectivenessGrader(LLMGrader): >>> print(f"Score: {result.score}") # Expected: 1.0 """ + DEFAULT_TEMPLATE = DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -193,7 +195,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate memory retrieval effectiveness", model=model, - template=template or DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/plan/plan_feasibility.py b/openjudge/graders/agent/plan/plan_feasibility.py index 764b574ea..33a9b42f1 100644 --- a/openjudge/graders/agent/plan/plan_feasibility.py +++ b/openjudge/graders/agent/plan/plan_feasibility.py @@ -173,10 +173,12 @@ class PlanFeasibilityGrader(LLMGrader): >>> print(f"Score: {result.score}") # Expected: 1.0 """ + DEFAULT_TEMPLATE = DEFAULT_PLAN_FEASIBILITY_TEMPLATE + def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_PLAN_FEASIBILITY_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): @@ -195,7 +197,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate plan feasibility", model=model, - template=template or 
DEFAULT_PLAN_FEASIBILITY_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/reflection/reflection_accuracy.py b/openjudge/graders/agent/reflection/reflection_accuracy.py index 8248e4b19..f68043252 100644 --- a/openjudge/graders/agent/reflection/reflection_accuracy.py +++ b/openjudge/graders/agent/reflection/reflection_accuracy.py @@ -170,10 +170,12 @@ class ReflectionAccuracyGrader(LLMGrader): >>> print(f"Score: {result.score}") # Expected: 1.0 """ + DEFAULT_TEMPLATE = DEFAULT_REFLECTION_ACCURACY_TEMPLATE + def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_REFLECTION_ACCURACY_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): @@ -191,7 +193,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate reflection accuracy", model=model, - template=template or DEFAULT_REFLECTION_ACCURACY_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py index 505310941..03f4da4b4 100644 --- a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py +++ b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py @@ -304,10 +304,12 @@ class ReflectionOutcomeUnderstandingGrader(LLMGrader): >>> print(f"Score: {result.score}") # Expected: 1.0 """ + DEFAULT_TEMPLATE = DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE + def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): @@ -326,7 +328,7 @@ def 
__init__( mode=GraderMode.POINTWISE, description="Evaluate reflection outcome understanding", model=model, - template=template or DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/reflection/reflection_progress_awareness.py b/openjudge/graders/agent/reflection/reflection_progress_awareness.py index f92dff0c3..1776546a9 100644 --- a/openjudge/graders/agent/reflection/reflection_progress_awareness.py +++ b/openjudge/graders/agent/reflection/reflection_progress_awareness.py @@ -215,10 +215,12 @@ class ReflectionProgressAwarenessGrader(LLMGrader): >>> print(f"Score: {result.score}") # Expected: 1.0 """ + DEFAULT_TEMPLATE = DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): @@ -236,7 +238,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate reflection progress awareness", model=model, - template=template or DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/tool/tool_call_accuracy.py b/openjudge/graders/agent/tool/tool_call_accuracy.py index 42239cbb8..d433364f3 100644 --- a/openjudge/graders/agent/tool/tool_call_accuracy.py +++ b/openjudge/graders/agent/tool/tool_call_accuracy.py @@ -204,10 +204,12 @@ class ToolCallAccuracyGrader(LLMGrader): 5.0 """ + DEFAULT_TEMPLATE = DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE + def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, strategy: 
BaseEvaluationStrategy | None = None, ): @@ -227,7 +229,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluates the accuracy of tool calls made by an agent", model=model, - template=template or DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/tool/tool_call_success.py b/openjudge/graders/agent/tool/tool_call_success.py index 3c1aa5244..abd2f6706 100644 --- a/openjudge/graders/agent/tool/tool_call_success.py +++ b/openjudge/graders/agent/tool/tool_call_success.py @@ -222,10 +222,12 @@ class ToolCallSuccessGrader(LLMGrader): 1.0 """ + DEFAULT_TEMPLATE = DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE + def __init__( self, model: Union[BaseChatModel, Dict[str, Any]], - template: Optional[PromptTemplate] = DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): @@ -245,7 +247,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluates whether tool calls done by an AI agent includes failures or not", model=model, - template=template or DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/tool/tool_parameter_check.py b/openjudge/graders/agent/tool/tool_parameter_check.py index 1b2c646bc..f4dd50309 100644 --- a/openjudge/graders/agent/tool/tool_parameter_check.py +++ b/openjudge/graders/agent/tool/tool_parameter_check.py @@ -176,10 +176,12 @@ class ToolParameterCheckGrader(LLMGrader): >>> print(f"Score: {result.score}") # 1.0 (correct parameters) """ + DEFAULT_TEMPLATE = DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE + def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = 
LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): @@ -197,7 +199,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate tool parameter extraction correctness", model=model, - template=template or DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/tool/tool_selection.py b/openjudge/graders/agent/tool/tool_selection.py index 41c045e89..ec83bd16a 100644 --- a/openjudge/graders/agent/tool/tool_selection.py +++ b/openjudge/graders/agent/tool/tool_selection.py @@ -187,10 +187,12 @@ class ToolSelectionGrader(LLMGrader): >>> print(f"Score: {result.score}") # Score from 1 to 5 """ + DEFAULT_TEMPLATE = DEFAULT_TOOL_SELECTION_TEMPLATE + def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TOOL_SELECTION_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): @@ -208,7 +210,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate tool selection", model=model, - template=template or DEFAULT_TOOL_SELECTION_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/trajectory/trajectory_accuracy.py b/openjudge/graders/agent/trajectory/trajectory_accuracy.py index b1e1f087c..2ca23096c 100644 --- a/openjudge/graders/agent/trajectory/trajectory_accuracy.py +++ b/openjudge/graders/agent/trajectory/trajectory_accuracy.py @@ -191,10 +191,12 @@ class TrajectoryAccuracyGrader(LLMGrader): 3.0 """ + DEFAULT_TEMPLATE = DEFAULT_TRAJECTORY_ACCURACY_TEMPLATE + def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TRAJECTORY_ACCURACY_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = 
None, ): @@ -213,7 +215,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluates the accuracy of agent trajectories in solving user queries", model=model, - template=template or DEFAULT_TRAJECTORY_ACCURACY_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py index c5dc9493b..6c924cac1 100644 --- a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py +++ b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py @@ -304,6 +304,8 @@ class TrajectoryComprehensiveGrader(LLMGrader): >>> print(f"Score: {result.score}") # computed from step averages """ + DEFAULT_TEMPLATE = DEFAULT_TRAJECTORY_COMPREHENSIVE_TEMPLATE + @staticmethod def _create_trajectory_callback( language: LanguageEnum = LanguageEnum.ZH, @@ -393,7 +395,7 @@ def callback(response: ChatResponse) -> Dict[str, Any]: def __init__( self, model: Union[BaseChatModel, dict], - template: Optional[PromptTemplate] = DEFAULT_TRAJECTORY_COMPREHENSIVE_TEMPLATE, + template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, resolution_threshold: float = 0.8, strategy: BaseEvaluationStrategy | None = None, @@ -423,7 +425,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Comprehensive evaluation for agent trajectories including step-level and overall problem-solving assessment", model=model, - template=template or DEFAULT_TRAJECTORY_COMPREHENSIVE_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, structured_model=TrajectoryEvaluationOutput, callback=self._create_trajectory_callback(language=language), diff --git a/openjudge/graders/common/correctness.py b/openjudge/graders/common/correctness.py index f194a2b17..ed43fe973 100644 --- a/openjudge/graders/common/correctness.py +++ b/openjudge/graders/common/correctness.py @@ -266,6 +266,8 @@ class 
CorrectnessGrader(LLMGrader): >>> print(result.score) # 2 - deviates from reference response """ + DEFAULT_TEMPLATE = DEFAULT_CORRECTNESS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -295,7 +297,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate whether response matches the provided reference response", model=model, - template=template or DEFAULT_CORRECTNESS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/common/hallucination.py b/openjudge/graders/common/hallucination.py index 96939a158..229ca47c2 100644 --- a/openjudge/graders/common/hallucination.py +++ b/openjudge/graders/common/hallucination.py @@ -249,6 +249,8 @@ class HallucinationGrader(LLMGrader): >>> print(result.score) # 5 - factually correct """ + DEFAULT_TEMPLATE = DEFAULT_HALLUCINATION_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -278,7 +280,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate whether response contains hallucinations", model=model, - template=template or DEFAULT_HALLUCINATION_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) @@ -343,7 +345,7 @@ async def _aevaluate( @staticmethod def get_metadata() -> Dict[str, Any]: - prompt = DEFAULT_HALLUCINATION_TEMPLATE.get_prompt() + prompt = HallucinationGrader.DEFAULT_TEMPLATE.get_prompt() return {"aevaluate": HallucinationGrader._aevaluate.__doc__, "prompt": prompt} diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py index b4eb217c2..b1bb49b48 100644 --- a/openjudge/graders/common/harmfulness.py +++ b/openjudge/graders/common/harmfulness.py @@ -243,6 +243,8 @@ class HarmfulnessGrader(LLMGrader): >>> print(result.reason) # "Contains derogatory language targeting drivers" """ + DEFAULT_TEMPLATE = DEFAULT_HARMFULNESS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -272,7 +274,7 @@ def __init__( 
mode=GraderMode.POINTWISE, description="Evaluate whether response contains harmful or inappropriate content", model=model, - template=template or DEFAULT_HARMFULNESS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py index f43ef221a..f5e4a6e6f 100644 --- a/openjudge/graders/common/instruction_following.py +++ b/openjudge/graders/common/instruction_following.py @@ -259,6 +259,8 @@ class InstructionFollowingGrader(LLMGrader): >>> print(result.score) # 2 - informal tone, poor structure """ + DEFAULT_TEMPLATE = DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -288,7 +290,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate whether response follows the given instructions", model=model, - template=template or DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/common/relevance.py b/openjudge/graders/common/relevance.py index 1bd786e12..e5eb1a400 100644 --- a/openjudge/graders/common/relevance.py +++ b/openjudge/graders/common/relevance.py @@ -259,6 +259,8 @@ class RelevanceGrader(LLMGrader): >>> print(result.score) # 5 - relevant with conversation context """ + DEFAULT_TEMPLATE = DEFAULT_RELEVANCE_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -288,7 +290,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate relevance of response to user query", model=model, - template=template or DEFAULT_RELEVANCE_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/llm_grader.py b/openjudge/graders/llm_grader.py index 1bf81a6a6..3fc37df00 100644 --- a/openjudge/graders/llm_grader.py +++ b/openjudge/graders/llm_grader.py @@ -53,6 +53,10 @@ class 
LLMGrader(BaseGrader): callback (Callable): Function to process model response metadata. """ + # The default template value is just a placeholder. + # Extended classes must set proper value to DEFAULT_TEMPLATE + DEFAULT_TEMPLATE = PromptTemplate(messages={}) + def __init__( self, model: BaseChatModel | dict, @@ -108,6 +112,9 @@ def __init__( else: self.language = language + if not template: + raise ValueError("Missing template argument value") + if isinstance(template, str): self.template = PromptTemplate( messages={ @@ -343,6 +350,15 @@ async def _aevaluate(self, **kwargs: Any) -> GraderScore | GraderRank: raise ValueError(f"Unsupported grader mode: {self.mode}") return result + def get_template(self, language: LanguageEnum = LanguageEnum.EN) -> Dict[str, Any]: + """Return the template of the specified language in this grader instance""" + return self.template.get_prompt(language) + + @classmethod + def get_default_template(cls, language: LanguageEnum = LanguageEnum.EN) -> Dict[str, Any]: + """Return the default template of the specified language in this grader class""" + return cls.DEFAULT_TEMPLATE.get_prompt(language) + @staticmethod def get_metadata() -> Dict[str, Any]: """Return the docstring of the aevaluate method to explain how LLMGrader works with LLM.""" diff --git a/openjudge/graders/multi_turn/anaphora_resolution_grader.py b/openjudge/graders/multi_turn/anaphora_resolution_grader.py index 9e4685f6d..16b64fd72 100644 --- a/openjudge/graders/multi_turn/anaphora_resolution_grader.py +++ b/openjudge/graders/multi_turn/anaphora_resolution_grader.py @@ -220,6 +220,8 @@ class AnaphoraResolutionGrader(LLMGrader): >>> print(result.score) # Expected: high score for correct resolution """ + DEFAULT_TEMPLATE = DEFAULT_ANAPHORA_RESOLUTION_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -243,7 +245,7 @@ def __init__( name="anaphora_resolution", mode=GraderMode.POINTWISE, description="Evaluate anaphora resolution ability in multi-turn conversations", - 
template=template or DEFAULT_ANAPHORA_RESOLUTION_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, **kwargs, diff --git a/openjudge/graders/multi_turn/context_memory_grader.py b/openjudge/graders/multi_turn/context_memory_grader.py index d8dc36b90..acd092fc8 100644 --- a/openjudge/graders/multi_turn/context_memory_grader.py +++ b/openjudge/graders/multi_turn/context_memory_grader.py @@ -217,6 +217,8 @@ class ContextMemoryGrader(LLMGrader): >>> print(result.score) # Expected: low score due to forgetting constraint """ + DEFAULT_TEMPLATE = DEFAULT_CONTEXT_MEMORY_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -240,7 +242,7 @@ def __init__( name="context_memory", mode=GraderMode.POINTWISE, description="Evaluate context memory ability in multi-turn conversations", - template=template or DEFAULT_CONTEXT_MEMORY_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, **kwargs, diff --git a/openjudge/graders/multi_turn/instruction_clarification_grader.py b/openjudge/graders/multi_turn/instruction_clarification_grader.py index 9e65941b7..cd0166966 100644 --- a/openjudge/graders/multi_turn/instruction_clarification_grader.py +++ b/openjudge/graders/multi_turn/instruction_clarification_grader.py @@ -220,6 +220,8 @@ class InstructionClarificationGrader(LLMGrader): >>> print(result.score) # Expected: high score for appropriate clarification """ + DEFAULT_TEMPLATE = DEFAULT_INSTRUCTION_CLARIFICATION_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -243,7 +245,7 @@ def __init__( name="instruction_clarification", mode=GraderMode.POINTWISE, description="Evaluate instruction clarification ability in multi-turn conversations", - template=template or DEFAULT_INSTRUCTION_CLARIFICATION_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, **kwargs, diff --git a/openjudge/graders/multi_turn/proactive_interaction_grader.py 
b/openjudge/graders/multi_turn/proactive_interaction_grader.py index 17c1ef6be..7642d642d 100644 --- a/openjudge/graders/multi_turn/proactive_interaction_grader.py +++ b/openjudge/graders/multi_turn/proactive_interaction_grader.py @@ -225,6 +225,8 @@ class ProactiveInteractionGrader(LLMGrader): >>> print(result.score) # Expected: high score for proactive engagement """ + DEFAULT_TEMPLATE = DEFAULT_PROACTIVE_INTERACTION_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -248,7 +250,7 @@ def __init__( name="proactive_interaction", mode=GraderMode.POINTWISE, description="Evaluate proactive interaction ability in multi-turn conversations", - template=template or DEFAULT_PROACTIVE_INTERACTION_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, **kwargs, diff --git a/openjudge/graders/multi_turn/response_repetition_grader.py b/openjudge/graders/multi_turn/response_repetition_grader.py index 48b068cf6..17355a3e1 100644 --- a/openjudge/graders/multi_turn/response_repetition_grader.py +++ b/openjudge/graders/multi_turn/response_repetition_grader.py @@ -225,6 +225,8 @@ class ResponseRepetitionGrader(LLMGrader): >>> print(result.score) # Expected: low score due to repetition """ + DEFAULT_TEMPLATE = DEFAULT_RESPONSE_REPETITION_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -248,7 +250,7 @@ def __init__( name="response_repetition", mode=GraderMode.POINTWISE, description="Evaluate response repetition in multi-turn conversations", - template=template or DEFAULT_RESPONSE_REPETITION_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, **kwargs, diff --git a/openjudge/graders/multi_turn/self_correction_grader.py b/openjudge/graders/multi_turn/self_correction_grader.py index 48a99a961..b930a2587 100644 --- a/openjudge/graders/multi_turn/self_correction_grader.py +++ b/openjudge/graders/multi_turn/self_correction_grader.py @@ -222,6 +222,8 @@ class 
SelfCorrectionGrader(LLMGrader): >>> print(result.score) # Expected: high score for good correction """ + DEFAULT_TEMPLATE = DEFAULT_SELF_CORRECTION_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -245,7 +247,7 @@ def __init__( name="self_correction", mode=GraderMode.POINTWISE, description="Evaluate self-correction ability in multi-turn conversations", - template=template or DEFAULT_SELF_CORRECTION_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, **kwargs, diff --git a/openjudge/graders/multi_turn/topic_switch_grader.py b/openjudge/graders/multi_turn/topic_switch_grader.py index 2651e6328..43d404004 100644 --- a/openjudge/graders/multi_turn/topic_switch_grader.py +++ b/openjudge/graders/multi_turn/topic_switch_grader.py @@ -220,6 +220,8 @@ class TopicSwitchGrader(LLMGrader): >>> print(result.score) # Expected: high score for handling topic switch """ + DEFAULT_TEMPLATE = DEFAULT_TOPIC_SWITCH_TEMPLATE + def __init__( self, model: BaseChatModel | dict, @@ -243,7 +245,7 @@ def __init__( name="topic_switch", mode=GraderMode.POINTWISE, description="Evaluate topic switch handling ability in multi-turn conversations", - template=template or DEFAULT_TOPIC_SWITCH_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, **kwargs, diff --git a/openjudge/graders/multimodal/image_coherence.py b/openjudge/graders/multimodal/image_coherence.py index d9e104e30..9e96ffac1 100644 --- a/openjudge/graders/multimodal/image_coherence.py +++ b/openjudge/graders/multimodal/image_coherence.py @@ -193,12 +193,14 @@ class ImageCoherenceGrader(LLMGrader): >>> print(result.score) # 4.8 - image coherent with sales context """ + DEFAULT_TEMPLATE = DEFAULT_IMAGE_COHERENCE_TEMPLATE + def __init__( self, model: BaseChatModel | dict, max_context_size: int = 500, threshold: float = 0.7, - template: PromptTemplate = DEFAULT_IMAGE_COHERENCE_TEMPLATE, + template: PromptTemplate = DEFAULT_TEMPLATE, language: 
LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): @@ -218,7 +220,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate image-text coherence", model=model, - template=template or DEFAULT_IMAGE_COHERENCE_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/openjudge/graders/multimodal/image_helpfulness.py b/openjudge/graders/multimodal/image_helpfulness.py index 8e81677ab..67f37b13e 100644 --- a/openjudge/graders/multimodal/image_helpfulness.py +++ b/openjudge/graders/multimodal/image_helpfulness.py @@ -195,12 +195,14 @@ class ImageHelpfulnessGrader(LLMGrader): >>> print(result.score) # 4.5 - diagram very helpful for understanding """ + DEFAULT_TEMPLATE = DEFAULT_IMAGE_HELPFULNESS_TEMPLATE + def __init__( self, model: BaseChatModel | dict, max_context_size: int = 500, threshold: float = 0.7, - template: PromptTemplate = DEFAULT_IMAGE_HELPFULNESS_TEMPLATE, + template: PromptTemplate = DEFAULT_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): @@ -220,7 +222,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate image helpfulness for understanding text", model=model, - template=template or DEFAULT_IMAGE_HELPFULNESS_TEMPLATE, + template=template or self.DEFAULT_TEMPLATE, language=language, strategy=strategy, ) diff --git a/tests/graders/agent/action/test_action_alignment.py b/tests/graders/agent/action/test_action_alignment.py index 137ba1b7b..926a8c571 100644 --- a/tests/graders/agent/action/test_action_alignment.py +++ b/tests/graders/agent/action/test_action_alignment.py @@ -61,6 +61,28 @@ def test_initialization(self): assert grader.name == "action_alignment" assert grader.model == mock_model + language_template = grader.get_template(LanguageEnum.ZH) + assert len(language_template) == 1 + assert "zh" in language_template + template = language_template["zh"] + assert len(template) == 1 + assert 
len(template[0]) == 2 + assert template[0]["role"] == "user" + assert template[0]["content"].startswith( + "你是一名分析智能体行为的专家。你的任务是评估智能体是否执行了与其声明的计划或推理一致的动作。" + ) + + language_template = grader.get_default_template(LanguageEnum.EN) + assert len(language_template) == 1 + assert "en" in language_template + template = language_template["en"] + assert len(template) == 1 + assert len(template[0]) == 2 + assert template[0]["role"] == "user" + assert template[0]["content"].startswith( + "You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent executes an action that aligns with its stated plan or reasoning." + ) + @pytest.mark.asyncio async def test_successful_evaluation_aligned(self): """Test successful evaluation with good alignment""" @@ -156,12 +178,8 @@ async def test_error_handling(self): OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL") RUN_QUALITY_TESTS = bool(OPENAI_API_KEY and OPENAI_BASE_URL) -pytestmark = pytest.mark.skipif( - not RUN_QUALITY_TESTS, - reason="Requires API keys and base URL to run quality tests", -) - +@pytest.mark.skipif(not RUN_QUALITY_TESTS, reason="Requires API keys and base URL to run quality tests") @pytest.mark.quality class TestActionAlignmentGraderQuality: """Quality tests for ActionAlignmentGrader - testing evaluation quality""" diff --git a/tests/graders/test_llm_grader.py b/tests/graders/test_llm_grader.py index baf0e050a..1a3a49f03 100644 --- a/tests/graders/test_llm_grader.py +++ b/tests/graders/test_llm_grader.py @@ -42,6 +42,7 @@ from openjudge.graders.llm_grader import LLMGrader from openjudge.graders.schema import GraderError from openjudge.models.openai_chat_model import OpenAIChatModel +from openjudge.models.schema.prompt_template import LanguageEnum from openjudge.runner.grading_runner import GraderConfig, GradingRunner # ==================== UNIT TESTS ==================== @@ -60,12 +61,18 @@ def test_initialization_failure_without_template(self): model=AsyncMock(), name="foo", ) + assert 
"Missing template argument value" in str(error_obj.value) + + def test_initialization_failure_with_invalid_template_type(self): + """Test initialization failure without template""" + with pytest.raises(ValueError) as error_obj: + LLMGrader(model=AsyncMock(), name="foo", template=AsyncMock()) assert "Template must be a str, list, dict or PromptTemplate object" in str(error_obj.value) def test_initialization_with_string_template(self): """Test successful initialization with string template""" mock_model = AsyncMock() - template_str = """You're a LLM query answer relevance grader, you'll received Query/Response: + template_str = """You're a LLM query answer relevance grader, you'll receive Query/Response: Query: {query} Response: {response} Please read query/response, if the Response answers the Query, return 1, return 0 if no. @@ -98,7 +105,7 @@ def test_initialization_with_dict_template(self): }, { "role": "user", - "content": """You'll received Query/Response: + "content": """You'll receive Query/Response: Query: {query} Response: {response} Please read query/response, if the Response answers the Query, return 1, return 0 if no. @@ -139,7 +146,7 @@ def test_initialization_with_model_dict(self): "api_key": "test-key", } - template_str = """You're a LLM query answer relevance grader, you'll received Query/Response: + template_str = """You're a LLM query answer relevance grader, you'll receive Query/Response: Query: {query} Response: {response} Please read query/response, if the Response answers the Query, return 1, return 0 if no. 
@@ -158,8 +165,29 @@ def test_initialization_with_model_dict(self): ) assert grader.name == "test_llm_grader" - assert isinstance(grader.model, OpenAIChatModel) # Note: We can't easily check the model config since it's private + assert isinstance(grader.model, OpenAIChatModel) + + language_template = grader.get_template() + assert len(language_template) == 1 + assert LanguageEnum.EN in language_template + templates = language_template[LanguageEnum.EN] + assert len(templates) == 2 + for t in templates: + assert len(t) == 2 + assert "role" in t + assert "content" in t + + if t["role"] == "system": + assert ( + "You are a professional evaluation assistant. Please evaluate according to the user's requirements." + in t["content"] + ) + elif t["role"] == "user": + assert "You're a LLM query answer relevance grader, you'll receive Query/Response" in t["content"] + + default_template = grader.get_default_template() + assert len(default_template) == 0 @pytest.mark.asyncio async def test_pointwise_evaluation_success(self): @@ -217,7 +245,7 @@ async def test_listwise_evaluation_success(self): mock_model.achat = AsyncMock(return_value=mock_response) # Create grader with template that follows the specification in docs - template = """You're a LLM query answer ranking grader, you'll received Query and multiple Responses: + template = """You're a LLM query answer ranking grader, you'll receive Query and multiple Responses: Query: {query} Responses: 1. 
{response_1} @@ -308,9 +336,8 @@ def test_serialization_methods(self): OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL") RUN_QUALITY_TESTS = bool(OPENAI_API_KEY and OPENAI_BASE_URL) -pytestmark = pytest.mark.skipif(not RUN_QUALITY_TESTS, reason="Requires API keys and base URL to run quality tests") - +@pytest.mark.skipif(not RUN_QUALITY_TESTS, reason="Requires API keys and base URL to run quality tests") @pytest.mark.quality class TestLLMGraderQuality: """Quality tests for LLMGrader - testing evaluation quality using golden dataset""" @@ -361,7 +388,7 @@ def model(self): async def test_discriminative_power_with_runner(self, dataset, model): """Test the grader's ability to distinguish between accurate and inaccurate responses (using Runner)""" # Create grader with real model following the specification in docs - template = """You're a LLM query answer accuracy grader, you'll received Query/Response and Context: + template = """You're a LLM query answer accuracy grader, you'll receive Query/Response and Context: Query: {query} Response: {response} Context: {context} @@ -420,7 +447,7 @@ async def test_discriminative_power_with_runner(self, dataset, model): async def test_consistency_with_runner(self, dataset, model): """Test grader evaluation consistency (using Runner)""" # Create grader with real model following the specification in docs - template = """You're a LLM query answer accuracy grader, you'll received Query/Response and Context: + template = """You're a LLM query answer accuracy grader, you'll receive Query/Response and Context: Query: {query} Response: {response} Context: {context} From a94cb039bdd3cc89fe7fdd12b67bb2f855d90121 Mon Sep 17 00:00:00 2001 From: "w.zhang" <136773451+weizhang25@users.noreply.github.com> Date: Sat, 28 Feb 2026 02:55:06 +0800 Subject: [PATCH 2/3] fix 1 --- openjudge/graders/agent/action/action_alignment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openjudge/graders/agent/action/action_alignment.py 
b/openjudge/graders/agent/action/action_alignment.py index ac802ac7b..31fc1edc6 100644 --- a/openjudge/graders/agent/action/action_alignment.py +++ b/openjudge/graders/agent/action/action_alignment.py @@ -175,7 +175,7 @@ class ActionAlignmentGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): From f174c9a2c2fc7fa71dc52ed5759ab97dda5f050a Mon Sep 17 00:00:00 2001 From: "w.zhang" <136773451+weizhang25@users.noreply.github.com> Date: Sat, 28 Feb 2026 03:02:55 +0800 Subject: [PATCH 3/3] Set default templates in all subclasses of LLMGrader. --- openjudge/graders/agent/memory/memory_accuracy.py | 2 +- openjudge/graders/agent/memory/memory_detail_preservation.py | 2 +- openjudge/graders/agent/plan/plan_feasibility.py | 2 +- openjudge/graders/agent/reflection/reflection_accuracy.py | 2 +- .../agent/reflection/reflection_outcome_understanding.py | 2 +- .../graders/agent/reflection/reflection_progress_awareness.py | 2 +- openjudge/graders/agent/tool/tool_call_accuracy.py | 2 +- openjudge/graders/agent/tool/tool_call_success.py | 2 +- openjudge/graders/agent/tool/tool_parameter_check.py | 2 +- openjudge/graders/agent/tool/tool_selection.py | 2 +- openjudge/graders/agent/trajectory/trajectory_accuracy.py | 2 +- openjudge/graders/agent/trajectory/trajectory_comprehensive.py | 2 +- openjudge/graders/multimodal/image_coherence.py | 2 +- openjudge/graders/multimodal/image_helpfulness.py | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py index 22aa2b0f1..833778570 100644 --- a/openjudge/graders/agent/memory/memory_accuracy.py +++ b/openjudge/graders/agent/memory/memory_accuracy.py @@ -175,7 +175,7 @@ class MemoryAccuracyGrader(LLMGrader): def __init__( 
self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): diff --git a/openjudge/graders/agent/memory/memory_detail_preservation.py b/openjudge/graders/agent/memory/memory_detail_preservation.py index 4bc7b2707..60364b2db 100644 --- a/openjudge/graders/agent/memory/memory_detail_preservation.py +++ b/openjudge/graders/agent/memory/memory_detail_preservation.py @@ -175,7 +175,7 @@ class MemoryDetailPreservationGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): diff --git a/openjudge/graders/agent/plan/plan_feasibility.py b/openjudge/graders/agent/plan/plan_feasibility.py index 33a9b42f1..8602bd89b 100644 --- a/openjudge/graders/agent/plan/plan_feasibility.py +++ b/openjudge/graders/agent/plan/plan_feasibility.py @@ -178,7 +178,7 @@ class PlanFeasibilityGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): diff --git a/openjudge/graders/agent/reflection/reflection_accuracy.py b/openjudge/graders/agent/reflection/reflection_accuracy.py index f68043252..b35749a9b 100644 --- a/openjudge/graders/agent/reflection/reflection_accuracy.py +++ b/openjudge/graders/agent/reflection/reflection_accuracy.py @@ -175,7 +175,7 @@ class ReflectionAccuracyGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = 
None, ): diff --git a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py index 03f4da4b4..b74612208 100644 --- a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py +++ b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py @@ -309,7 +309,7 @@ class ReflectionOutcomeUnderstandingGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): diff --git a/openjudge/graders/agent/reflection/reflection_progress_awareness.py b/openjudge/graders/agent/reflection/reflection_progress_awareness.py index 1776546a9..ff52f103f 100644 --- a/openjudge/graders/agent/reflection/reflection_progress_awareness.py +++ b/openjudge/graders/agent/reflection/reflection_progress_awareness.py @@ -220,7 +220,7 @@ class ReflectionProgressAwarenessGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): diff --git a/openjudge/graders/agent/tool/tool_call_accuracy.py b/openjudge/graders/agent/tool/tool_call_accuracy.py index d433364f3..61e6bd25e 100644 --- a/openjudge/graders/agent/tool/tool_call_accuracy.py +++ b/openjudge/graders/agent/tool/tool_call_accuracy.py @@ -209,7 +209,7 @@ class ToolCallAccuracyGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): diff --git a/openjudge/graders/agent/tool/tool_call_success.py 
b/openjudge/graders/agent/tool/tool_call_success.py index abd2f6706..9b6cec085 100644 --- a/openjudge/graders/agent/tool/tool_call_success.py +++ b/openjudge/graders/agent/tool/tool_call_success.py @@ -227,7 +227,7 @@ class ToolCallSuccessGrader(LLMGrader): def __init__( self, model: Union[BaseChatModel, Dict[str, Any]], - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): diff --git a/openjudge/graders/agent/tool/tool_parameter_check.py b/openjudge/graders/agent/tool/tool_parameter_check.py index f4dd50309..f77f6a3f2 100644 --- a/openjudge/graders/agent/tool/tool_parameter_check.py +++ b/openjudge/graders/agent/tool/tool_parameter_check.py @@ -181,7 +181,7 @@ class ToolParameterCheckGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): diff --git a/openjudge/graders/agent/tool/tool_selection.py b/openjudge/graders/agent/tool/tool_selection.py index ec83bd16a..a4023f29b 100644 --- a/openjudge/graders/agent/tool/tool_selection.py +++ b/openjudge/graders/agent/tool/tool_selection.py @@ -192,7 +192,7 @@ class ToolSelectionGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): diff --git a/openjudge/graders/agent/trajectory/trajectory_accuracy.py b/openjudge/graders/agent/trajectory/trajectory_accuracy.py index 2ca23096c..1ec1a1e1d 100644 --- a/openjudge/graders/agent/trajectory/trajectory_accuracy.py +++ b/openjudge/graders/agent/trajectory/trajectory_accuracy.py @@ -196,7 +196,7 @@ class 
TrajectoryAccuracyGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): diff --git a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py index 6c924cac1..93a24b041 100644 --- a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py +++ b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py @@ -395,7 +395,7 @@ def callback(response: ChatResponse) -> Dict[str, Any]: def __init__( self, model: Union[BaseChatModel, dict], - template: Optional[PromptTemplate] = DEFAULT_TEMPLATE, + template: Optional[PromptTemplate] = None, language: LanguageEnum = LanguageEnum.EN, resolution_threshold: float = 0.8, strategy: BaseEvaluationStrategy | None = None, diff --git a/openjudge/graders/multimodal/image_coherence.py b/openjudge/graders/multimodal/image_coherence.py index 9e96ffac1..1d4b49054 100644 --- a/openjudge/graders/multimodal/image_coherence.py +++ b/openjudge/graders/multimodal/image_coherence.py @@ -200,7 +200,7 @@ def __init__( model: BaseChatModel | dict, max_context_size: int = 500, threshold: float = 0.7, - template: PromptTemplate = DEFAULT_TEMPLATE, + template: PromptTemplate | None = None, language: LanguageEnum = LanguageEnum.EN, strategy: BaseEvaluationStrategy | None = None, ): diff --git a/openjudge/graders/multimodal/image_helpfulness.py b/openjudge/graders/multimodal/image_helpfulness.py index 67f37b13e..109ed8780 100644 --- a/openjudge/graders/multimodal/image_helpfulness.py +++ b/openjudge/graders/multimodal/image_helpfulness.py @@ -202,7 +202,7 @@ def __init__( model: BaseChatModel | dict, max_context_size: int = 500, threshold: float = 0.7, - template: PromptTemplate = DEFAULT_TEMPLATE, + template: PromptTemplate | None = None, language: LanguageEnum = LanguageEnum.EN,
strategy: BaseEvaluationStrategy | None = None, ):