diff --git a/backend/examples/run_prometheus_direct_judge.py b/backend/examples/run_prometheus_direct_judge.py index 7d1c7fa7..6b6d2dcf 100644 --- a/backend/examples/run_prometheus_direct_judge.py +++ b/backend/examples/run_prometheus_direct_judge.py @@ -1,7 +1,7 @@ from evalassist.judges import Criteria, CriteriaOption, Instance, MPrometheusDirectJudge if __name__ == "__main__": - judge = MPrometheusDirectJudge(m_prometheus_b_params=3) + judge = MPrometheusDirectJudge(billions_of_params=3) instances = [ Instance( diff --git a/backend/examples/run_prometheus_pairwise_judge.py b/backend/examples/run_prometheus_pairwise_judge.py index 18e04322..0f05ae8c 100644 --- a/backend/examples/run_prometheus_pairwise_judge.py +++ b/backend/examples/run_prometheus_pairwise_judge.py @@ -1,7 +1,7 @@ from evalassist.judges import Criteria, Instance, MPrometheusPairwiseJudge if __name__ == "__main__": - judge = MPrometheusPairwiseJudge(m_prometheus_b_params=3) + judge = MPrometheusPairwiseJudge(billions_of_params=3) instances = [ Instance( diff --git a/backend/src/evalassist/api_types.py b/backend/src/evalassist/api_types.py index 038a55ac..d25c8dc4 100644 --- a/backend/src/evalassist/api_types.py +++ b/backend/src/evalassist/api_types.py @@ -211,6 +211,8 @@ class NotebookParams(BaseModel): instances: list[DirectInstanceDTO] | list[PairwiseInstanceDTO] examples: list[DirectInstanceDTO] | list[PairwiseInstanceDTO] criteria: CriteriaDTO | CriteriaWithOptionsDTO + judge: str + judge_params: dict # class DownloadTestCaseParams(BaseModel): diff --git a/backend/src/evalassist/judges/direct_judge.py b/backend/src/evalassist/judges/direct_judge.py index 339edfb0..0179739d 100644 --- a/backend/src/evalassist/judges/direct_judge.py +++ b/backend/src/evalassist/judges/direct_judge.py @@ -56,7 +56,7 @@ def __init__( "Either provide set generate_synthetic_persona to False or don't provide a judge_description_prompt." 
) - if self.self_consistency: + if self.self_consistency is True or self.self_consistency > 1: temp = getattr(self.inference_engine, "temperature", None) if temp is not None: try: diff --git a/backend/src/evalassist/judges/mprometheus_judge.py b/backend/src/evalassist/judges/mprometheus_judge.py index dd3be704..9f5fc35a 100644 --- a/backend/src/evalassist/judges/mprometheus_judge.py +++ b/backend/src/evalassist/judges/mprometheus_judge.py @@ -1,18 +1,19 @@ from typing import Literal, cast -from evalassist.judges.utils import get_to_evaluate_text +from fastapi import HTTPException from .base import BaseDirectJudge, BasePairwiseJudge from .types import Criteria, DirectInstanceResult, Instance, PairwiseInstanceResult +from .utils import get_to_evaluate_text class MPrometheusJudge: m_prometheus_model_name: str - def __init__(self, m_prometheus_b_params: Literal[3, 7, 14] = 3, **kwargs): + def __init__(self, billions_of_params: Literal[3, 7, 14] = 3, **kwargs): super().__init__(**kwargs) self.m_prometheus_model_name = ( - f"Unbabel/M-Prometheus-{str(m_prometheus_b_params)}B" + f"Unbabel/M-Prometheus-{str(billions_of_params)}B" ) @@ -42,12 +43,18 @@ def _run( instances: list[Instance], criteria: list[Criteria], ) -> list[DirectInstanceResult]: - from prometheus_eval import PrometheusEval - from prometheus_eval.prompts import ( - ABSOLUTE_PROMPT_WO_REF, - SCORE_RUBRIC_TEMPLATE, - ) - from prometheus_eval.vllm import VLLM + try: + from prometheus_eval import PrometheusEval + from prometheus_eval.prompts import ( + ABSOLUTE_PROMPT_WO_REF, + SCORE_RUBRIC_TEMPLATE, + ) + from prometheus_eval.vllm import VLLM + except ModuleNotFoundError: + raise HTTPException( + status_code=500, + detail="Failed to import 'prometheus_eval' package. 
Make sure it is installed correctly.", + ) self._validate_criteria(criteria) self._validate_instances(instances) diff --git a/backend/src/evalassist/main.py b/backend/src/evalassist/main.py index 2dccde6c..8cdc4ec3 100644 --- a/backend/src/evalassist/main.py +++ b/backend/src/evalassist/main.py @@ -7,6 +7,7 @@ import nbformat as nbf import nest_asyncio import pandas as pd +from evalassist.judges.base import UnitxtInferenceEngineMixin from fastapi import ( APIRouter, BackgroundTasks, @@ -311,16 +312,23 @@ def run(): else: temperature = 0.0 - inference_engine: InferenceEngine = get_inference_engine_from_judge_metadata( - evaluator_name=evaluator_name, - custom_model_name=custom_model_name, - provider=req.provider, - llm_provider_credentials=req.llm_provider_credentials, - custom_params={ - **DEFAULT_JUDGE_INFERENCE_PARAMS, - "temperature": temperature, - }, - ) + judge_class = JUDGE_CLASS_MAP[req.type][req.judge] + judge_requires_model = issubclass(judge_class, UnitxtInferenceEngineMixin) + + if judge_requires_model: + inference_engine: InferenceEngine = ( + get_inference_engine_from_judge_metadata( + evaluator_name=evaluator_name, + custom_model_name=custom_model_name, + provider=req.provider, + llm_provider_credentials=req.llm_provider_credentials, + custom_params={ + **DEFAULT_JUDGE_INFERENCE_PARAMS, + "temperature": temperature, + }, + ) + ) + if ( req.criteria.to_evaluate_field is None or req.criteria.context_fields is None @@ -337,14 +345,17 @@ def run(): example.to_instance_result(req.criteria.to_evaluate_field) for example in req.examples ] - judge = JUDGE_CLASS_MAP[req.type][req.judge]( - **{"inference_engine": inference_engine, **req.judge_params} - ) # type: ignore + + params = req.judge_params + if judge_requires_model: + params["inference_engine"] = inference_engine # type: ignore + + judge = judge_class(**params) # type: ignore if req.type == EvaluatorTypeEnum.DIRECT: criteria = req.criteria.to_criteria(examples=examples) if 
evaluator_name.name.startswith("GRANITE_GUARDIAN"): - judge = GraniteGuardianJudge(inference_engine=inference_engine) + judge = GraniteGuardianJudge(inference_engine=inference_engine) # type: ignore per_instance_result = judge.evaluate( instances=instances, criteria=criteria, @@ -534,7 +545,10 @@ def download_notebook(params: NotebookParams, background_tasks: BackgroundTasks) "evaluator_type": params.evaluator_type, "model_name": model_name, "plain_python_script": params.plain_python_script, + "judge": params.judge, + "judge_params": params.judge_params, } + if params.evaluator_type == EvaluatorTypeEnum.DIRECT: nb = DirectEvaluationNotebook(**p).generate_notebook() # type: ignore else: diff --git a/backend/src/evalassist/notebook_generation.py b/backend/src/evalassist/notebook_generation.py index 74ab6494..08e0a126 100644 --- a/backend/src/evalassist/notebook_generation.py +++ b/backend/src/evalassist/notebook_generation.py @@ -4,7 +4,8 @@ from typing import Literal, cast import nbformat as nbf -from evalassist.judges import Criteria, Instance +from evalassist.judges import JUDGE_CLASS_MAP, Criteria, Instance +from evalassist.judges.base import UnitxtInferenceEngineMixin from .api_types import ( EvaluatorNameEnum, @@ -16,6 +17,58 @@ from .utils import get_cross_inference_engine_params +def format_value(value, indent=0): + """ + Format values as valid Python code. + Supports nested dicts, literals, and variable references. 
+ """ + pad = " " * indent + + # Variable reference + if isinstance(value, VariableRef): + return value.name + + # Basic literals + if isinstance(value, (bool, int, float)): + return repr(value) + + # Strings + if isinstance(value, str): + return repr(value) + + # Nested dictionary + if isinstance(value, dict): + if not value: + return "{}" + + inner_indent = indent + 4 + inner_pad = " " * inner_indent + + items = [] + for k, v in value.items(): + items.append(f"{inner_pad}{repr(k)}: {format_value(v, inner_indent)},") + + return "{\n" + "\n".join(items) + f"\n{pad}}}" + + raise ValueError(f"Unsupported param type: {type(value)}") + + +def generate_constructor_code(class_name: str, params: dict) -> str: + lines = [f"{class_name}("] + for key, value in params.items(): + formatted = format_value(value, indent=4) + lines.append(f" {key}={formatted},") + lines.append(")") + return "\n".join(lines) + + +class VariableRef: + """Wrapper representing a variable name in generated code.""" + + def __init__(self, name: str): + self.name = name + + class Cell: type: Literal["code", "md"] content: str @@ -37,6 +90,8 @@ def __init__( evaluator_type: EvaluatorTypeEnum, model_name: str, plain_python_script: bool, + judge: str, + judge_params: dict, ): self.instances = instances self.criteria = criteria @@ -44,12 +99,25 @@ def __init__( self.evaluator_name = evaluator_name self.evaluator_type = evaluator_type self.plain_python_script = plain_python_script - self.inference_engine_params = get_cross_inference_engine_params( - credentials=credentials, - provider=provider, - model_name=model_name, - custom_params=DEFAULT_JUDGE_INFERENCE_PARAMS, + self.judge = judge + self.judge_params = judge_params + + self.judge_class = JUDGE_CLASS_MAP[evaluator_type][judge] + + self.judge_requires_model = issubclass( + self.judge_class, UnitxtInferenceEngineMixin ) + + if self.judge_requires_model: + self.inference_engine_params = get_cross_inference_engine_params( + credentials=credentials, + 
provider=provider, + model_name=model_name, + custom_params=DEFAULT_JUDGE_INFERENCE_PARAMS, + ) + else: + self.inference_engine_params = None + self.cells: list[Cell] = [] def generate_notebook(self): @@ -148,15 +216,24 @@ def get_import_code(self): """ def get_setup_and_run_eval_code(self): - params = re.sub( - r"\btrue\b", "True", json.dumps(self.inference_engine_params, indent=4) + if self.judge_requires_model: + inference_engine_construct_str = generate_constructor_code( + "CrossProviderInferenceEngine", params=self.inference_engine_params + ) # type: ignore + else: + inference_engine_construct_str = "" + + judge_params = self.judge_params + if self.judge_requires_model: + judge_params["inference_engine"] = VariableRef("inference_engine") + judge_construct_str = generate_constructor_code( + self.judge_class.__name__, params=judge_params ) + return f"""\ -inference_engine = CrossProviderInferenceEngine(**{params}) +inference_engine = {inference_engine_construct_str} -judge = DirectJudge( - inference_engine=inference_engine, -) +judge = {judge_construct_str} results: list[DirectInstanceResult] = judge(instances, criteria) diff --git a/frontend/src/components/SingleExampleEvaluation/Modals/ConfigurationModal.module.scss b/frontend/src/components/SingleExampleEvaluation/Modals/ConfigurationModal.module.scss index f705d7d8..ae35a379 100644 --- a/frontend/src/components/SingleExampleEvaluation/Modals/ConfigurationModal.module.scss +++ b/frontend/src/components/SingleExampleEvaluation/Modals/ConfigurationModal.module.scss @@ -3,18 +3,28 @@ .container { display: flex; flex-direction: column; - gap: 2rem; } .configOptions { display: grid; - grid-template-columns: repeat(2, 1fr); - gap: 1rem; /* optional spacing */ + grid-template-columns: repeat(3, 1fr); + gap: 1rem; align-items: center; } .section { display: flex; flex-direction: column; - gap: 0.75rem; + gap: 1rem; +} + +.subSection { + display: flex; + flex-direction: column; + gap: 1rem; +} + +.topDivider { + 
border-top: 1px solid $border-subtle-00; + padding-top: 1rem; } diff --git a/frontend/src/components/SingleExampleEvaluation/Modals/ConfigurationModal.tsx b/frontend/src/components/SingleExampleEvaluation/Modals/ConfigurationModal.tsx index e0efca51..7a651ea5 100644 --- a/frontend/src/components/SingleExampleEvaluation/Modals/ConfigurationModal.tsx +++ b/frontend/src/components/SingleExampleEvaluation/Modals/ConfigurationModal.tsx @@ -10,6 +10,7 @@ import { BASE_JUDGE_PARAMS_MAP, JUDGE_DEFAULT_PARAMS_MAP, JUDGE_PARAMS_MAP, + JUDGE_REQUIRES_MODEL_SELECTION_MAP, } from '@constants' import { useCurrentTestCase } from '@providers/CurrentTestCaseProvider' import { useEvaluatorOptionsContext } from '@providers/EvaluatorOptionsProvider' @@ -32,15 +33,25 @@ export const ConfigurationModal = ({ open, setOpen }: Props) => { const onJudgeSelect = useCallback( (e: { target: { value: string } }) => { - setCurrentTestCase({ - ...currentTestCase, - judge: { - name: e.target.value, - params: { - ...BASE_JUDGE_DEFAULT_PARAMS_MAP, - ...JUDGE_DEFAULT_PARAMS_MAP[currentTestCase.type][e.target.value], + setCurrentTestCase((prev) => { + const params = { + ...BASE_JUDGE_DEFAULT_PARAMS_MAP, + ...JUDGE_DEFAULT_PARAMS_MAP[currentTestCase.type][e.target.value], + } + + Object.entries(prev.judge.params).forEach(([prevParam, prevValue]) => { + if (prevParam in params) { + params[prevParam] = prevValue + } + }) + + return { + ...currentTestCase, + judge: { + name: e.target.value, + params, }, - }, + } }) }, [currentTestCase, setCurrentTestCase], @@ -105,124 +116,133 @@ export const ConfigurationModal = ({ open, setOpen }: Props) => { >
-
-
{'Select judge'}
- -
-
-
{'Judge configuration'}
-
- {Object.entries(currentTestCase.judge.params) - // sort the params by type alphabetically to make it look more clean - .sort(([k, v], [k2, v2]) => { - return allParamTypes[k].toString().localeCompare(allParamTypes[k2].toString()) - }) - .toReversed() - .map(([param, value], i) => - allParamTypes[param] === 'boolean' ? ( - onChangeParamValue(param, state.checked)} - checked={currentTestCase.judge.params[param]} - /> - ) : Array.isArray(allParamTypes[param]) ? ( - - ) : allParamTypes[param] === 'number' ? ( - onChangeParamValue(param, state.value)} - value={currentTestCase.judge.params[param]} - label={capitalizeFirstWord(param)} - /> - ) : ( - 'unknown type config' - ), - )} +
+

{'Judge Configuration'}

+
+
{'Select judge'}
+ +
+
+
{'Configuration'}
+
+ {Object.entries(currentTestCase.judge.params) + // sort the params by type alphabetically to make it look more clean + .sort(([k, v], [k2, v2]) => { + return allParamTypes[k].toString().localeCompare(allParamTypes[k2].toString()) + }) + .toReversed() + .map(([param, value], i) => + allParamTypes[param] === 'boolean' ? ( + onChangeParamValue(param, state.checked)} + checked={currentTestCase.judge.params[param]} + /> + ) : Array.isArray(allParamTypes[param]) ? ( + + ) : allParamTypes[param] === 'number' ? ( + onChangeParamValue(param, state.value)} + value={currentTestCase.judge.params[param]} + label={capitalizeFirstWord(param)} + /> + ) : ( + 'unknown type config' + ), + )} +
+
+
+ {JUDGE_REQUIRES_MODEL_SELECTION_MAP[currentTestCase.type][currentTestCase.judge.name] && ( + <> +
{'Judge model'}
+ { + setCurrentTestCase({ ...currentTestCase, evaluator }) + }} + helperChildren={ + <> + + {'How do evaluators work?'} + + + + } + /> + + )}
-
-
{'Judge model'}
-
+ +
+

{'Synthetic generation'}

+
+ {/*
{'Model'}
*/} { - setCurrentTestCase({ ...currentTestCase, evaluator }) - }} + selectedEvaluator={currentTestCase.syntheticGenerationConfig.evaluator} + setSelectedEvaluator={(newValue) => + setCurrentTestCase({ + ...currentTestCase, + syntheticGenerationConfig: { + ...currentTestCase.syntheticGenerationConfig, + evaluator: newValue, + }, + }) + } + evaluatorOptions={ + returnByPipelineType( + currentTestCase.type, + nonGraniteGuardianDirectEvaluators, + nonGraniteGuardianPairwiseEvaluators, + ) || [] + } + dropdownLabel={'Synthetic generation model'} + selectionComponentNameWithArticle="a model" + selectionComponentName="model" helperChildren={ <> - {'How do evaluators work?'} + {'What is synthetic generation?'} - + } /> -
-
{'Synthetic generation model'}
- - setCurrentTestCase({ - ...currentTestCase, - syntheticGenerationConfig: { - ...currentTestCase.syntheticGenerationConfig, - evaluator: newValue, - }, - }) - } - evaluatorOptions={ - returnByPipelineType( - currentTestCase.type, - nonGraniteGuardianDirectEvaluators, - nonGraniteGuardianPairwiseEvaluators, - ) || [] - } - dropdownLabel={'Synthetic generation'} - selectionComponentNameWithArticle="a model" - selectionComponentName="model" - helperChildren={ - <> - - {'What is synthetic generation?'} - - - - } - /> -
diff --git a/frontend/src/components/SingleExampleEvaluation/Modals/DownloadModal.tsx b/frontend/src/components/SingleExampleEvaluation/Modals/DownloadModal.tsx index 6e203968..9c75684e 100644 --- a/frontend/src/components/SingleExampleEvaluation/Modals/DownloadModal.tsx +++ b/frontend/src/components/SingleExampleEvaluation/Modals/DownloadModal.tsx @@ -35,9 +35,11 @@ export const DownloadModal = ({ open, setOpen }: Props) => { if (['ipynb', 'py'].includes(selected)) { downloadUnitxtCode({ downloadAsScript: selected === 'py' }) } else if (selected === 'test_case') { + downloadTestCase() } else { + downloadTestData() } - }, [setOpen, selected, downloadUnitxtCode]) + }, [setOpen, selected, downloadUnitxtCode, downloadTestCase, downloadTestData]) return ( > = { generate_feedback: 'boolean', on_generation_failure: ['raise', 'random'], }, - m_prometheus: {}, + m_prometheus: { + billions_of_params: ['3', '7', '14'], + }, criticized: {}, thesis_antithesis: {}, unitxt: {}, @@ -160,7 +162,9 @@ export const JUDGE_PARAMS_MAP: Record> = { }, pairwise: { eval_assist: {}, - m_prometheus: {}, + m_prometheus: { + billions_of_params: ['3', '7', '14'], + }, unitxt: {}, }, } @@ -172,7 +176,9 @@ export const JUDGE_DEFAULT_PARAMS_MAP: Record> = { generate_feedback: true, on_generation_failure: 'random', }, - m_prometheus: {}, + m_prometheus: { + billions_of_params: '3', + }, criticized: {}, thesis_antithesis: {}, unitxt: {}, @@ -180,7 +186,25 @@ export const JUDGE_DEFAULT_PARAMS_MAP: Record> = { }, pairwise: { eval_assist: {}, - m_prometheus: {}, + m_prometheus: { + billions_of_params: '3', + }, unitxt: {}, }, } + +export const JUDGE_REQUIRES_MODEL_SELECTION_MAP: Record> = { + direct: { + eval_assist: true, + m_prometheus: false, + criticized: true, + thesis_antithesis: true, + unitxt: true, + granite_guardian: true, + }, + pairwise: { + eval_assist: true, + m_prometheus: false, + unitxt: true, + }, +} diff --git a/frontend/src/customHooks/useNotebookGeneration.ts 
b/frontend/src/customHooks/useNotebookGeneration.ts index 46785eee..d7685518 100644 --- a/frontend/src/customHooks/useNotebookGeneration.ts +++ b/frontend/src/customHooks/useNotebookGeneration.ts @@ -42,6 +42,8 @@ export const useUnitxtCodeGeneration = () => { evaluator_type: currentTestCase.type, test_case_name: currentTestCase.name || '', plain_python_script: downloadAsScript, + judge: currentTestCase.judge.name, + judge_params: currentTestCase.judge.params, }) if (!response.ok) {