agentscope-ai · weizhang25 · Feb 26, 2026 · Feb 26, 2026 · Feb 27, 2026 · Feb 27, 2026
diff --git a/cookbooks/finance_grader/event_interpretation/event_analysis.py b/cookbooks/finance_grader/event_interpretation/event_analysis.py
@@ -219,6 +219,8 @@ class EventAnalysisGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] means answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_EVENT_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -241,7 +243,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate financial event analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_EVENT_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/event_interpretation/event_identification.py b/cookbooks/finance_grader/event_interpretation/event_identification.py
@@ -196,6 +196,8 @@ class EventIdentificationGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] means answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_EVENT_IDENTIFICATION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -218,7 +220,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate financial event identification quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_EVENT_IDENTIFICATION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/industry_research/characteristics_analysis.py b/cookbooks/finance_grader/industry_research/characteristics_analysis.py
@@ -216,6 +216,8 @@ class CharacteristicsAnalysisGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_CHARACTERISTICS_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -236,7 +238,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate industry characteristics analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_CHARACTERISTICS_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/industry_research/risk_analysis.py b/cookbooks/finance_grader/industry_research/risk_analysis.py
@@ -208,6 +208,8 @@ class RiskAnalysisGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_RISK_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -228,7 +230,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate financial risk analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_RISK_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/industry_research/underlying_comparison.py b/cookbooks/finance_grader/industry_research/underlying_comparison.py
@@ -212,6 +212,8 @@ class UnderlyingComparisonGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_UNDERLYING_COMPARISON_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -232,7 +234,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate underlying comparison analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_UNDERLYING_COMPARISON_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/macro_analysis/concept_explanation.py b/cookbooks/finance_grader/macro_analysis/concept_explanation.py
@@ -187,6 +187,8 @@ class ConceptExplanationGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_CONCEPT_EXPLANATION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -207,7 +209,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate macroeconomic concept explanation quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_CONCEPT_EXPLANATION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/macro_analysis/macro_analysis.py b/cookbooks/finance_grader/macro_analysis/macro_analysis.py
@@ -214,6 +214,8 @@ class MacroAnalysisGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_MACRO_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -234,7 +236,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate macroeconomic analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_MACRO_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/stock_analysis/fundamental_analysis.py b/cookbooks/finance_grader/stock_analysis/fundamental_analysis.py
@@ -212,6 +212,8 @@ class FundamentalAnalysisGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_FUNDAMENTAL_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -232,7 +234,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate fundamental analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_FUNDAMENTAL_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/stock_analysis/overall_logic.py b/cookbooks/finance_grader/stock_analysis/overall_logic.py
@@ -204,6 +204,8 @@ class OverallLogicGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_OVERALL_LOGIC_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -224,7 +226,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate overall logic and structure quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_OVERALL_LOGIC_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/stock_analysis/stock_risk_analysis.py b/cookbooks/finance_grader/stock_analysis/stock_risk_analysis.py
@@ -211,6 +211,8 @@ class StockRiskAnalysisGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_STOCK_RISK_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -231,7 +233,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate stock risk analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_STOCK_RISK_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/stock_analysis/valuation_analysis.py b/cookbooks/finance_grader/stock_analysis/valuation_analysis.py
@@ -202,6 +202,8 @@ class ValuationAnalysisGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_VALUATION_ANALYSIS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -222,7 +224,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate valuation analysis quality by comparing two responses",
             model=model,
-            template=template or DEFAULT_VALUATION_ANALYSIS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/stock_search/search_integrity.py b/cookbooks/finance_grader/stock_search/search_integrity.py
@@ -187,6 +187,8 @@ class SearchIntegrityGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_SEARCH_INTEGRITY_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -207,7 +209,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate stock search integrity and completeness by comparing two responses",
             model=model,
-            template=template or DEFAULT_SEARCH_INTEGRITY_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/stock_search/search_relevance.py b/cookbooks/finance_grader/stock_search/search_relevance.py
@@ -187,6 +187,8 @@ class SearchRelevanceGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_SEARCH_RELEVANCE_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -207,7 +209,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate stock search relevance by comparing two responses",
             model=model,
-            template=template or DEFAULT_SEARCH_RELEVANCE_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/cookbooks/finance_grader/stock_search/search_timeliness.py b/cookbooks/finance_grader/stock_search/search_timeliness.py
@@ -187,6 +187,8 @@ class SearchTimelinessGrader(LLMGrader):
         >>> print(result.rank)  # [2, 1] if answer_2 is better
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_SEARCH_TIMELINESS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -207,7 +209,7 @@ def __init__(
             mode=GraderMode.LISTWISE,
             description="Evaluate stock search timeliness by comparing two responses",
             model=model,
-            template=template or DEFAULT_SEARCH_TIMELINESS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py
@@ -175,7 +175,7 @@ class ActionAlignmentGrader(LLMGrader):
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):

diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py
@@ -170,10 +170,12 @@ class MemoryAccuracyGrader(LLMGrader):
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_MEMORY_ACCURACY_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_MEMORY_ACCURACY_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -192,7 +194,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate memory accuracy",
             model=model,
-            template=template or DEFAULT_MEMORY_ACCURACY_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/openjudge/graders/agent/memory/memory_detail_preservation.py b/openjudge/graders/agent/memory/memory_detail_preservation.py
@@ -170,10 +170,12 @@ class MemoryDetailPreservationGrader(LLMGrader):
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -192,7 +194,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate memory detail preservation",
             model=model,
-            template=template or DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
@@ -172,6 +172,8 @@ class MemoryRetrievalEffectivenessGrader(LLMGrader):
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
@@ -193,7 +195,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate memory retrieval effectiveness",
             model=model,
-            template=template or DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/openjudge/graders/agent/plan/plan_feasibility.py b/openjudge/graders/agent/plan/plan_feasibility.py
@@ -173,10 +173,12 @@ class PlanFeasibilityGrader(LLMGrader):
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_PLAN_FEASIBILITY_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_PLAN_FEASIBILITY_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -195,7 +197,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate plan feasibility",
             model=model,
-            template=template or DEFAULT_PLAN_FEASIBILITY_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/openjudge/graders/agent/reflection/reflection_accuracy.py b/openjudge/graders/agent/reflection/reflection_accuracy.py
@@ -170,10 +170,12 @@ class ReflectionAccuracyGrader(LLMGrader):
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_REFLECTION_ACCURACY_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_REFLECTION_ACCURACY_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -191,7 +193,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate reflection accuracy",
             model=model,
-            template=template or DEFAULT_REFLECTION_ACCURACY_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )

diff --git a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py
@@ -304,10 +304,12 @@ class ReflectionOutcomeUnderstandingGrader(LLMGrader):
         >>> print(f"Score: {result.score}")  # Expected: 1.0
     """
 
+    DEFAULT_TEMPLATE = DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE
+
     def __init__(
         self,
         model: BaseChatModel | dict,
-        template: Optional[PromptTemplate] = DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
         strategy: BaseEvaluationStrategy | None = None,
     ):
@@ -326,7 +328,7 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate reflection outcome understanding",
             model=model,
-            template=template or DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE,
+            template=template or self.DEFAULT_TEMPLATE,
             language=language,
             strategy=strategy,
         )