diff --git a/cookbooks/finance_grader/event_interpretation/event_analysis.py b/cookbooks/finance_grader/event_interpretation/event_analysis.py
index 052696366..45fc33f28 100644
--- a/cookbooks/finance_grader/event_interpretation/event_analysis.py
+++ b/cookbooks/finance_grader/event_interpretation/event_analysis.py
@@ -219,6 +219,8 @@ class EventAnalysisGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] means answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_EVENT_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -241,7 +243,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate financial event analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_EVENT_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/event_interpretation/event_identification.py b/cookbooks/finance_grader/event_interpretation/event_identification.py
index fd0cba3f6..b49436071 100644
--- a/cookbooks/finance_grader/event_interpretation/event_identification.py
+++ b/cookbooks/finance_grader/event_interpretation/event_identification.py
@@ -196,6 +196,8 @@ class EventIdentificationGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] means answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_EVENT_IDENTIFICATION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -218,7 +220,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate financial event identification quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_EVENT_IDENTIFICATION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/industry_research/characteristics_analysis.py b/cookbooks/finance_grader/industry_research/characteristics_analysis.py
index dcc03211b..bed67806c 100644
--- a/cookbooks/finance_grader/industry_research/characteristics_analysis.py
+++ b/cookbooks/finance_grader/industry_research/characteristics_analysis.py
@@ -216,6 +216,8 @@ class CharacteristicsAnalysisGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_CHARACTERISTICS_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -236,7 +238,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate industry characteristics analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_CHARACTERISTICS_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/industry_research/risk_analysis.py b/cookbooks/finance_grader/industry_research/risk_analysis.py
index 9deb46d7d..0cfe01897 100644
--- a/cookbooks/finance_grader/industry_research/risk_analysis.py
+++ b/cookbooks/finance_grader/industry_research/risk_analysis.py
@@ -208,6 +208,8 @@ class RiskAnalysisGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_RISK_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -228,7 +230,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate financial risk analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_RISK_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/industry_research/underlying_comparison.py b/cookbooks/finance_grader/industry_research/underlying_comparison.py
index 635010e63..0db78e8b8 100644
--- a/cookbooks/finance_grader/industry_research/underlying_comparison.py
+++ b/cookbooks/finance_grader/industry_research/underlying_comparison.py
@@ -212,6 +212,8 @@ class UnderlyingComparisonGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_UNDERLYING_COMPARISON_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -232,7 +234,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate underlying comparison analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_UNDERLYING_COMPARISON_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/macro_analysis/concept_explanation.py b/cookbooks/finance_grader/macro_analysis/concept_explanation.py
index 57011b681..3e4fe1f22 100644
--- a/cookbooks/finance_grader/macro_analysis/concept_explanation.py
+++ b/cookbooks/finance_grader/macro_analysis/concept_explanation.py
@@ -187,6 +187,8 @@ class ConceptExplanationGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_CONCEPT_EXPLANATION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -207,7 +209,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate macroeconomic concept explanation quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_CONCEPT_EXPLANATION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/macro_analysis/macro_analysis.py b/cookbooks/finance_grader/macro_analysis/macro_analysis.py
index 246557493..f9f99bd4c 100644
--- a/cookbooks/finance_grader/macro_analysis/macro_analysis.py
+++ b/cookbooks/finance_grader/macro_analysis/macro_analysis.py
@@ -214,6 +214,8 @@ class MacroAnalysisGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_MACRO_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -234,7 +236,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate macroeconomic analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_MACRO_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/stock_analysis/fundamental_analysis.py b/cookbooks/finance_grader/stock_analysis/fundamental_analysis.py
index 2fab5e7b9..26394fad8 100644
--- a/cookbooks/finance_grader/stock_analysis/fundamental_analysis.py
+++ b/cookbooks/finance_grader/stock_analysis/fundamental_analysis.py
@@ -212,6 +212,8 @@ class FundamentalAnalysisGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_FUNDAMENTAL_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -232,7 +234,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate fundamental analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_FUNDAMENTAL_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/stock_analysis/overall_logic.py b/cookbooks/finance_grader/stock_analysis/overall_logic.py
index 0e14ae8ec..2b5d01b3a 100644
--- a/cookbooks/finance_grader/stock_analysis/overall_logic.py
+++ b/cookbooks/finance_grader/stock_analysis/overall_logic.py
@@ -204,6 +204,8 @@ class OverallLogicGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_OVERALL_LOGIC_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -224,7 +226,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate overall logic and structure quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_OVERALL_LOGIC_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/stock_analysis/stock_risk_analysis.py b/cookbooks/finance_grader/stock_analysis/stock_risk_analysis.py
index d8cf8af62..c7c79976d 100644
--- a/cookbooks/finance_grader/stock_analysis/stock_risk_analysis.py
+++ b/cookbooks/finance_grader/stock_analysis/stock_risk_analysis.py
@@ -211,6 +211,8 @@ class StockRiskAnalysisGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_STOCK_RISK_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -231,7 +233,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate stock risk analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_STOCK_RISK_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/stock_analysis/valuation_analysis.py b/cookbooks/finance_grader/stock_analysis/valuation_analysis.py
index 0a0b14759..ee81de527 100644
--- a/cookbooks/finance_grader/stock_analysis/valuation_analysis.py
+++ b/cookbooks/finance_grader/stock_analysis/valuation_analysis.py
@@ -202,6 +202,8 @@ class ValuationAnalysisGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_VALUATION_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -222,7 +224,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate valuation analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_VALUATION_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/stock_search/search_integrity.py b/cookbooks/finance_grader/stock_search/search_integrity.py
index 82a86c6aa..05c488c53 100644
--- a/cookbooks/finance_grader/stock_search/search_integrity.py
+++ b/cookbooks/finance_grader/stock_search/search_integrity.py
@@ -187,6 +187,8 @@ class SearchIntegrityGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_SEARCH_INTEGRITY_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -207,7 +209,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate stock search integrity and completeness by comparing two responses",
             model=model,
-            template=template or DEFAULT_SEARCH_INTEGRITY_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/stock_search/search_relevance.py b/cookbooks/finance_grader/stock_search/search_relevance.py
index 220be351a..b86c406ea 100644
--- a/cookbooks/finance_grader/stock_search/search_relevance.py
+++ b/cookbooks/finance_grader/stock_search/search_relevance.py
@@ -187,6 +187,8 @@ class SearchRelevanceGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_SEARCH_RELEVANCE_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -207,7 +209,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate stock search relevance by comparing two responses",
             model=model,
-            template=template or DEFAULT_SEARCH_RELEVANCE_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/cookbooks/finance_grader/stock_search/search_timeliness.py b/cookbooks/finance_grader/stock_search/search_timeliness.py
index 7282634de..eba4a04ac 100644
--- a/cookbooks/finance_grader/stock_search/search_timeliness.py
+++ b/cookbooks/finance_grader/stock_search/search_timeliness.py
@@ -187,6 +187,8 @@ class SearchTimelinessGrader(LLMGrader):
     >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_SEARCH_TIMELINESS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -207,7 +209,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate stock search timeliness by comparing two responses",
             model=model,
-            template=template or DEFAULT_SEARCH_TIMELINESS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py
index ac802ac7b..31fc1edc6 100644
--- a/openjudge/graders/agent/action/action_alignment.py
+++ b/openjudge/graders/agent/action/action_alignment.py
@@ -175,7 +175,7 @@ class ActionAlignmentGrader(LLMGrader):
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py
index ae9c72602..833778570 100644
--- a/openjudge/graders/agent/memory/memory_accuracy.py
+++ b/openjudge/graders/agent/memory/memory_accuracy.py
@@ -170,10 +170,12 @@ class MemoryAccuracyGrader(LLMGrader):
     >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_MEMORY_ACCURACY_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_MEMORY_ACCURACY_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -192,7 +194,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate memory accuracy",
             model=model,
-            template=template or DEFAULT_MEMORY_ACCURACY_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/memory/memory_detail_preservation.py b/openjudge/graders/agent/memory/memory_detail_preservation.py
index 6a31da228..60364b2db 100644
--- a/openjudge/graders/agent/memory/memory_detail_preservation.py
+++ b/openjudge/graders/agent/memory/memory_detail_preservation.py
@@ -170,10 +170,12 @@ class MemoryDetailPreservationGrader(LLMGrader):
     >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -192,7 +194,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate memory detail preservation",
             model=model,
-            template=template or DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
index a3dd622d0..0a4f97b53 100644
--- a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
+++ b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
@@ -172,6 +172,8 @@ class MemoryRetrievalEffectivenessGrader(LLMGrader):
     >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -193,7 +195,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate memory retrieval effectiveness",
             model=model,
-            template=template or DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/plan/plan_feasibility.py b/openjudge/graders/agent/plan/plan_feasibility.py
index 764b574ea..8602bd89b 100644
--- a/openjudge/graders/agent/plan/plan_feasibility.py
+++ b/openjudge/graders/agent/plan/plan_feasibility.py
@@ -173,10 +173,12 @@ class PlanFeasibilityGrader(LLMGrader):
     >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_PLAN_FEASIBILITY_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_PLAN_FEASIBILITY_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -195,7 +197,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate plan feasibility",
             model=model,
-            template=template or DEFAULT_PLAN_FEASIBILITY_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/reflection/reflection_accuracy.py b/openjudge/graders/agent/reflection/reflection_accuracy.py
index 8248e4b19..b35749a9b 100644
--- a/openjudge/graders/agent/reflection/reflection_accuracy.py
+++ b/openjudge/graders/agent/reflection/reflection_accuracy.py
@@ -170,10 +170,12 @@ class ReflectionAccuracyGrader(LLMGrader):
     >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_REFLECTION_ACCURACY_TEMPLATE
+
     def __init__(
        self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_REFLECTION_ACCURACY_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -191,7 +193,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate reflection accuracy",
             model=model,
-            template=template or DEFAULT_REFLECTION_ACCURACY_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py
index 505310941..b74612208 100644
--- a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py
+++ b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py
@@ -304,10 +304,12 @@ class ReflectionOutcomeUnderstandingGrader(LLMGrader):
     >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -326,7 +328,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate reflection outcome understanding",
             model=model,
-            template=template or DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/reflection/reflection_progress_awareness.py b/openjudge/graders/agent/reflection/reflection_progress_awareness.py
index f92dff0c3..ff52f103f 100644
--- a/openjudge/graders/agent/reflection/reflection_progress_awareness.py
+++ b/openjudge/graders/agent/reflection/reflection_progress_awareness.py
@@ -215,10 +215,12 @@ class ReflectionProgressAwarenessGrader(LLMGrader):
     >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -236,7 +238,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate reflection progress awareness",
             model=model,
-            template=template or DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/tool/tool_call_accuracy.py b/openjudge/graders/agent/tool/tool_call_accuracy.py
index 42239cbb8..61e6bd25e 100644
--- a/openjudge/graders/agent/tool/tool_call_accuracy.py
+++ b/openjudge/graders/agent/tool/tool_call_accuracy.py
@@ -204,10 +204,12 @@ class ToolCallAccuracyGrader(LLMGrader):
     5.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -227,7 +229,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluates the accuracy of tool calls made by an agent",
             model=model,
-            template=template or DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/tool/tool_call_success.py b/openjudge/graders/agent/tool/tool_call_success.py
index 3c1aa5244..9b6cec085 100644
--- a/openjudge/graders/agent/tool/tool_call_success.py
+++ b/openjudge/graders/agent/tool/tool_call_success.py
@@ -222,10 +222,12 @@ class ToolCallSuccessGrader(LLMGrader):
     1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE
+
     def __init__(
         self,
         model: Union[BaseChatModel, Dict[str, Any]],
-        template: Optional[PromptTemplate] = DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -245,7 +247,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluates whether tool calls done by an AI agent includes failures or not",
             model=model,
-            template=template or DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/tool/tool_parameter_check.py b/openjudge/graders/agent/tool/tool_parameter_check.py
index 1b2c646bc..f77f6a3f2 100644
--- a/openjudge/graders/agent/tool/tool_parameter_check.py
+++ b/openjudge/graders/agent/tool/tool_parameter_check.py
@@ -176,10 +176,12 @@ class ToolParameterCheckGrader(LLMGrader):
     >>> print(f"Score: {result.score}")  # 1.0 (correct parameters)
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -197,7 +199,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate tool parameter extraction correctness",
             model=model,
-            template=template or DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/tool/tool_selection.py b/openjudge/graders/agent/tool/tool_selection.py
index 41c045e89..a4023f29b 100644
--- a/openjudge/graders/agent/tool/tool_selection.py
+++ b/openjudge/graders/agent/tool/tool_selection.py
@@ -187,10 +187,12 @@ class ToolSelectionGrader(LLMGrader):
     >>> print(f"Score: {result.score}")  # Score from 1 to 5
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_TOOL_SELECTION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_TOOL_SELECTION_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -208,7 +210,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate tool selection",
             model=model,
-            template=template or DEFAULT_TOOL_SELECTION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/trajectory/trajectory_accuracy.py b/openjudge/graders/agent/trajectory/trajectory_accuracy.py
index b1e1f087c..1ec1a1e1d 100644
--- a/openjudge/graders/agent/trajectory/trajectory_accuracy.py
+++ b/openjudge/graders/agent/trajectory/trajectory_accuracy.py
@@ -191,10 +191,12 @@ class TrajectoryAccuracyGrader(LLMGrader):
     3.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_TRAJECTORY_ACCURACY_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_TRAJECTORY_ACCURACY_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -213,7 +215,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluates the accuracy of agent trajectories in solving user queries",
             model=model,
-            template=template or DEFAULT_TRAJECTORY_ACCURACY_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py
index c5dc9493b..93a24b041 100644
--- a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py
+++ b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py
@@ -304,6 +304,8 @@ class TrajectoryComprehensiveGrader(LLMGrader):
     >>> print(f"Score: {result.score}")  # computed from step averages
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_TRAJECTORY_COMPREHENSIVE_TEMPLATE
+
     @staticmethod
     def _create_trajectory_callback(
         language: LanguageEnum = LanguageEnum.ZH,
@@ -393,7 +395,7 @@ def callback(response: ChatResponse) -> Dict[str, Any]:
     def __init__(
         self,
         model: Union[BaseChatModel, dict],
-        template: Optional[PromptTemplate] = DEFAULT_TRAJECTORY_COMPREHENSIVE_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         resolution_threshold: float = 0.8,
         strategy: BaseEvaluationStrategy | None = None,
@@ -423,7 +425,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Comprehensive evaluation for agent trajectories including step-level and overall problem-solving assessment",
             model=model,
-            template=template or DEFAULT_TRAJECTORY_COMPREHENSIVE_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             structured_model=TrajectoryEvaluationOutput,
             callback=self._create_trajectory_callback(language=language),
diff --git a/openjudge/graders/common/correctness.py b/openjudge/graders/common/correctness.py
index f194a2b17..ed43fe973 100644
--- a/openjudge/graders/common/correctness.py
+++ b/openjudge/graders/common/correctness.py
@@ -266,6 +266,8 @@ class CorrectnessGrader(LLMGrader):
     >>> print(result.score)  # 2 - deviates from reference response
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_CORRECTNESS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -295,7 +297,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response matches the provided reference response",
             model=model,
-            template=template or DEFAULT_CORRECTNESS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/common/hallucination.py b/openjudge/graders/common/hallucination.py
index 96939a158..229ca47c2 100644
--- a/openjudge/graders/common/hallucination.py
+++ b/openjudge/graders/common/hallucination.py
@@ -249,6 +249,8 @@ class HallucinationGrader(LLMGrader):
     >>> print(result.score)  # 5 - factually correct
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_HALLUCINATION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -278,7 +280,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response contains hallucinations",
             model=model,
-            template=template or DEFAULT_HALLUCINATION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
@@ -343,7 +345,7 @@ async def _aevaluate(
 
     @staticmethod
     def get_metadata() -> Dict[str, Any]:
-        prompt = DEFAULT_HALLUCINATION_TEMPLATE.get_prompt()
+        prompt = HallucinationGrader.DEFAULT_TEMPLATE.get_prompt()
         return {"aevaluate": HallucinationGrader._aevaluate.__doc__, "prompt": prompt}
 
 
diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py
index b4eb217c2..b1bb49b48 100644
--- a/openjudge/graders/common/harmfulness.py
+++ b/openjudge/graders/common/harmfulness.py
@@ -243,6 +243,8 @@ class HarmfulnessGrader(LLMGrader):
     >>> print(result.reason)  # "Contains derogatory language targeting drivers"
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_HARMFULNESS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -272,7 +274,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response contains harmful or inappropriate content",
             model=model,
-            template=template or DEFAULT_HARMFULNESS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py
index f43ef221a..f5e4a6e6f 100644
--- a/openjudge/graders/common/instruction_following.py
+++ b/openjudge/graders/common/instruction_following.py
@@ -259,6 +259,8 @@ class InstructionFollowingGrader(LLMGrader):
     >>> print(result.score)  # 2 - informal tone, poor structure
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -288,7 +290,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response follows the given instructions",
             model=model,
-            template=template or DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/common/relevance.py b/openjudge/graders/common/relevance.py
index 1bd786e12..e5eb1a400 100644
--- a/openjudge/graders/common/relevance.py
+++ b/openjudge/graders/common/relevance.py
@@ -259,6 +259,8 @@ class RelevanceGrader(LLMGrader):
     >>> print(result.score)  # 5 - relevant with conversation context
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_RELEVANCE_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -288,7 +290,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate relevance of response to user query",
             model=model,
-            template=template or DEFAULT_RELEVANCE_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/multi_turn/anaphora_resolution_grader.py b/openjudge/graders/multi_turn/anaphora_resolution_grader.py
index 9e4685f6d..16b64fd72 100644
--- a/openjudge/graders/multi_turn/anaphora_resolution_grader.py
+++ b/openjudge/graders/multi_turn/anaphora_resolution_grader.py
@@ -220,6 +220,8 @@ class AnaphoraResolutionGrader(LLMGrader):
     >>> print(result.score)  # Expected: high score for correct resolution
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_ANAPHORA_RESOLUTION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -243,7 +245,7 @@ def __init__(
             name="anaphora_resolution",
             mode=GraderMode.POINTWISE,
             description="Evaluate anaphora resolution ability in multi-turn conversations",
-            template=template or DEFAULT_ANAPHORA_RESOLUTION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
             **kwargs,
diff --git a/openjudge/graders/multi_turn/context_memory_grader.py b/openjudge/graders/multi_turn/context_memory_grader.py
index d8dc36b90..acd092fc8 100644
--- a/openjudge/graders/multi_turn/context_memory_grader.py
+++ b/openjudge/graders/multi_turn/context_memory_grader.py
@@ -217,6 +217,8 @@ class ContextMemoryGrader(LLMGrader):
     >>> print(result.score)  # Expected: low score due to forgetting constraint
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_CONTEXT_MEMORY_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -240,7 +242,7 @@ def __init__(
             name="context_memory",
             mode=GraderMode.POINTWISE,
             description="Evaluate context memory ability in multi-turn conversations",
-            template=template or DEFAULT_CONTEXT_MEMORY_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
             **kwargs,
diff --git a/openjudge/graders/multi_turn/instruction_clarification_grader.py b/openjudge/graders/multi_turn/instruction_clarification_grader.py
index 9e65941b7..cd0166966 100644
--- a/openjudge/graders/multi_turn/instruction_clarification_grader.py
+++ b/openjudge/graders/multi_turn/instruction_clarification_grader.py
@@ -220,6 +220,8 @@ class InstructionClarificationGrader(LLMGrader):
     >>> print(result.score)  # Expected: high score for appropriate clarification
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_INSTRUCTION_CLARIFICATION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -243,7 +245,7 @@ def __init__(
             name="instruction_clarification",
             mode=GraderMode.POINTWISE,
             description="Evaluate instruction clarification ability in multi-turn conversations",
-            template=template or DEFAULT_INSTRUCTION_CLARIFICATION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
             **kwargs,
diff --git a/openjudge/graders/multi_turn/proactive_interaction_grader.py b/openjudge/graders/multi_turn/proactive_interaction_grader.py
index 17c1ef6be..7642d642d 100644
--- a/openjudge/graders/multi_turn/proactive_interaction_grader.py
+++ b/openjudge/graders/multi_turn/proactive_interaction_grader.py
@@ -225,6 +225,8 @@ class ProactiveInteractionGrader(LLMGrader):
     >>> print(result.score)  # Expected: high score for proactive engagement
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_PROACTIVE_INTERACTION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -248,7 +250,7 @@ def __init__(
             name="proactive_interaction",
             mode=GraderMode.POINTWISE,
             description="Evaluate proactive interaction ability in multi-turn conversations",
-            template=template or DEFAULT_PROACTIVE_INTERACTION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
             **kwargs,
diff --git a/openjudge/graders/multi_turn/response_repetition_grader.py b/openjudge/graders/multi_turn/response_repetition_grader.py
index 48b068cf6..17355a3e1 100644
--- a/openjudge/graders/multi_turn/response_repetition_grader.py
+++ b/openjudge/graders/multi_turn/response_repetition_grader.py
@@ -225,6 +225,8 @@ class ResponseRepetitionGrader(LLMGrader):
     >>> print(result.score)  # Expected: low score due to repetition
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_RESPONSE_REPETITION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -248,7 +250,7 @@ def __init__(
             name="response_repetition",
             mode=GraderMode.POINTWISE,
             description="Evaluate response repetition in multi-turn conversations",
-            template=template or DEFAULT_RESPONSE_REPETITION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
             **kwargs,
diff --git a/openjudge/graders/multi_turn/self_correction_grader.py b/openjudge/graders/multi_turn/self_correction_grader.py
index 48a99a961..b930a2587 100644
--- a/openjudge/graders/multi_turn/self_correction_grader.py
+++ b/openjudge/graders/multi_turn/self_correction_grader.py
@@ -222,6 +222,8 @@ class SelfCorrectionGrader(LLMGrader):
     >>> print(result.score)  # Expected: high score for good correction
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_SELF_CORRECTION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -245,7 +247,7 @@ def __init__(
             name="self_correction",
             mode=GraderMode.POINTWISE,
             description="Evaluate self-correction ability in multi-turn conversations",
-            template=template or DEFAULT_SELF_CORRECTION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
             **kwargs,
diff --git a/openjudge/graders/multi_turn/topic_switch_grader.py b/openjudge/graders/multi_turn/topic_switch_grader.py
index 2651e6328..43d404004 100644
--- a/openjudge/graders/multi_turn/topic_switch_grader.py
+++ b/openjudge/graders/multi_turn/topic_switch_grader.py
@@ -220,6 +220,8 @@ class TopicSwitchGrader(LLMGrader):
     >>> print(result.score)  # Expected: high score for handling topic switch
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_TOPIC_SWITCH_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -243,7 +245,7 @@ def __init__(
             name="topic_switch",
             mode=GraderMode.POINTWISE,
             description="Evaluate topic switch handling ability in multi-turn conversations",
-            template=template or DEFAULT_TOPIC_SWITCH_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
             **kwargs,
diff --git a/openjudge/graders/multimodal/image_coherence.py b/openjudge/graders/multimodal/image_coherence.py
index d9e104e30..1d4b49054 100644
--- a/openjudge/graders/multimodal/image_coherence.py
+++ b/openjudge/graders/multimodal/image_coherence.py
@@ -193,12 +193,14 @@ class ImageCoherenceGrader(LLMGrader):
     >>> print(result.score)  # 4.8 - image coherent with sales context
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_IMAGE_COHERENCE_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
         max_context_size: int = 500,
         threshold: float = 0.7,
-        template: PromptTemplate = DEFAULT_IMAGE_COHERENCE_TEMPLATE,
+        template: PromptTemplate = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -218,7 +220,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate image-text coherence",
             model=model,
-            template=template or DEFAULT_IMAGE_COHERENCE_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
diff --git a/openjudge/graders/multimodal/image_helpfulness.py b/openjudge/graders/multimodal/image_helpfulness.py
index 8e81677ab..109ed8780 100644
--- a/openjudge/graders/multimodal/image_helpfulness.py
+++ b/openjudge/graders/multimodal/image_helpfulness.py
@@ -195,12 +195,14 @@ class ImageHelpfulnessGrader(LLMGrader):
     >>> print(result.score)  # 4.5 - diagram very helpful for understanding
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_IMAGE_HELPFULNESS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
         max_context_size: int = 500,
         threshold: float = 0.7,
-        template: PromptTemplate = DEFAULT_IMAGE_HELPFULNESS_TEMPLATE,
+        template: PromptTemplate = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -220,7 +222,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate image helpfulness for understanding text",
             model=model,
-            template=template or DEFAULT_IMAGE_HELPFULNESS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )
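The whole change applies a single pattern: each grader's default prompt template is hoisted into a DEFAULT_TEMPLATE class attribute, the constructor's template parameter defaults to None, and the super().__init__ call falls back to template or self.DEFAULT_TEMPLATE. A minimal sketch of what this enables follows; the subclass name, template object, and model instance are hypothetical illustrations, not part of the diff (only ToolSelectionGrader and its module path come from the change above).

    from openjudge.graders.agent.tool.tool_selection import ToolSelectionGrader

    class StrictToolSelectionGrader(ToolSelectionGrader):
        # Swapping the prompt now only requires overriding the class attribute;
        # the parent __init__ resolves `template or self.DEFAULT_TEMPLATE` at
        # construction time, so no constructor override is needed.
        DEFAULT_TEMPLATE = MY_STRICT_TOOL_SELECTION_TEMPLATE  # hypothetical PromptTemplate

    grader = StrictToolSelectionGrader(model=my_chat_model)  # my_chat_model: an assumed BaseChatModel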