Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/examples/run_prometheus_direct_judge.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from evalassist.judges import Criteria, CriteriaOption, Instance, MPrometheusDirectJudge

if __name__ == "__main__":
judge = MPrometheusDirectJudge(m_prometheus_b_params=3)
judge = MPrometheusDirectJudge(billions_of_params=3)

instances = [
Instance(
Expand Down
2 changes: 1 addition & 1 deletion backend/examples/run_prometheus_pairwise_judge.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from evalassist.judges import Criteria, Instance, MPrometheusPairwiseJudge

if __name__ == "__main__":
judge = MPrometheusPairwiseJudge(m_prometheus_b_params=3)
judge = MPrometheusPairwiseJudge(billions_of_params=3)

instances = [
Instance(
Expand Down
2 changes: 2 additions & 0 deletions backend/src/evalassist/api_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,8 @@ class NotebookParams(BaseModel):
instances: list[DirectInstanceDTO] | list[PairwiseInstanceDTO]
examples: list[DirectInstanceDTO] | list[PairwiseInstanceDTO]
criteria: CriteriaDTO | CriteriaWithOptionsDTO
judge: str
judge_params: dict


# class DownloadTestCaseParams(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion backend/src/evalassist/judges/direct_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def __init__(
"Either provide set generate_synthetic_persona to False or don't provide a judge_description_prompt."
)

if self.self_consistency:
if self.self_consistency is True or self.self_consistency > 1:
temp = getattr(self.inference_engine, "temperature", None)
if temp is not None:
try:
Expand Down
25 changes: 16 additions & 9 deletions backend/src/evalassist/judges/mprometheus_judge.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
from typing import Literal, cast

from evalassist.judges.utils import get_to_evaluate_text
from fastapi import HTTPException

from .base import BaseDirectJudge, BasePairwiseJudge
from .types import Criteria, DirectInstanceResult, Instance, PairwiseInstanceResult
from .utils import get_to_evaluate_text


class MPrometheusJudge:
    """Mixin that resolves the Unbabel M-Prometheus checkpoint name.

    Meant to be combined (via cooperative ``super().__init__``) with a judge
    base class; this mixin only stores the Hugging Face model identifier
    derived from the requested parameter count.
    """

    # Fully-qualified Hugging Face model id, e.g. "Unbabel/M-Prometheus-3B".
    m_prometheus_model_name: str

    def __init__(self, billions_of_params: Literal[3, 7, 14] = 3, **kwargs):
        """
        Args:
            billions_of_params: Size of the M-Prometheus checkpoint to use,
                in billions of parameters (3, 7 or 14). Defaults to 3.
            **kwargs: Forwarded to the next class in the MRO.
        """
        super().__init__(**kwargs)
        # The Literal value is an int; the f-string renders it directly.
        self.m_prometheus_model_name = f"Unbabel/M-Prometheus-{billions_of_params}B"


Expand Down Expand Up @@ -42,12 +43,18 @@ def _run(
instances: list[Instance],
criteria: list[Criteria],
) -> list[DirectInstanceResult]:
from prometheus_eval import PrometheusEval
from prometheus_eval.prompts import (
ABSOLUTE_PROMPT_WO_REF,
SCORE_RUBRIC_TEMPLATE,
)
from prometheus_eval.vllm import VLLM
try:
from prometheus_eval import PrometheusEval
from prometheus_eval.prompts import (
ABSOLUTE_PROMPT_WO_REF,
SCORE_RUBRIC_TEMPLATE,
)
from prometheus_eval.vllm import VLLM
except ModuleNotFoundError:
raise HTTPException(
status_code=404,
detail="Failed to import 'prometheus_eval' package. Make sure it is installed correctly.",
)

self._validate_criteria(criteria)
self._validate_instances(instances)
Expand Down
42 changes: 28 additions & 14 deletions backend/src/evalassist/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import nbformat as nbf
import nest_asyncio
import pandas as pd
from evalassist.judges.base import UnitxtInferenceEngineMixin
from fastapi import (
APIRouter,
BackgroundTasks,
Expand Down Expand Up @@ -311,16 +312,23 @@ def run():
else:
temperature = 0.0

inference_engine: InferenceEngine = get_inference_engine_from_judge_metadata(
evaluator_name=evaluator_name,
custom_model_name=custom_model_name,
provider=req.provider,
llm_provider_credentials=req.llm_provider_credentials,
custom_params={
**DEFAULT_JUDGE_INFERENCE_PARAMS,
"temperature": temperature,
},
)
judge_class = JUDGE_CLASS_MAP[req.type][req.judge]
judge_requires_model = issubclass(judge_class, UnitxtInferenceEngineMixin)

if judge_requires_model:
inference_engine: InferenceEngine = (
get_inference_engine_from_judge_metadata(
evaluator_name=evaluator_name,
custom_model_name=custom_model_name,
provider=req.provider,
llm_provider_credentials=req.llm_provider_credentials,
custom_params={
**DEFAULT_JUDGE_INFERENCE_PARAMS,
"temperature": temperature,
},
)
)

if (
req.criteria.to_evaluate_field is None
or req.criteria.context_fields is None
Expand All @@ -337,14 +345,17 @@ def run():
example.to_instance_result(req.criteria.to_evaluate_field)
for example in req.examples
]
judge = JUDGE_CLASS_MAP[req.type][req.judge](
**{"inference_engine": inference_engine, **req.judge_params}
) # type: ignore

params = req.judge_params
if judge_requires_model:
params["inference_engine"] = inference_engine # type: ignore

judge = judge_class(**params) # type: ignore

if req.type == EvaluatorTypeEnum.DIRECT:
criteria = req.criteria.to_criteria(examples=examples)
if evaluator_name.name.startswith("GRANITE_GUARDIAN"):
judge = GraniteGuardianJudge(inference_engine=inference_engine)
judge = GraniteGuardianJudge(inference_engine=inference_engine) # type: ignore
per_instance_result = judge.evaluate(
instances=instances,
criteria=criteria,
Expand Down Expand Up @@ -534,7 +545,10 @@ def download_notebook(params: NotebookParams, background_tasks: BackgroundTasks)
"evaluator_type": params.evaluator_type,
"model_name": model_name,
"plain_python_script": params.plain_python_script,
"judge": params.judge,
"judge_params": params.judge_params,
}

if params.evaluator_type == EvaluatorTypeEnum.DIRECT:
nb = DirectEvaluationNotebook(**p).generate_notebook() # type: ignore
else:
Expand Down
101 changes: 89 additions & 12 deletions backend/src/evalassist/notebook_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from typing import Literal, cast

import nbformat as nbf
from evalassist.judges import Criteria, Instance
from evalassist.judges import JUDGE_CLASS_MAP, Criteria, Instance
from evalassist.judges.base import UnitxtInferenceEngineMixin

from .api_types import (
EvaluatorNameEnum,
Expand All @@ -16,6 +17,58 @@
from .utils import get_cross_inference_engine_params


class VariableRef:
    """Wrapper representing a bare variable name in generated code.

    ``format_value`` emits the wrapped name verbatim (no quoting), so
    ``VariableRef("inference_engine")`` renders as ``inference_engine``.
    """

    def __init__(self, name: str):
        self.name = name


def format_value(value, indent=0):
    """Format ``value`` as valid Python source code.

    Supports variable references (``VariableRef``), ``None``, booleans,
    numbers, strings, lists/tuples, and (recursively) dictionaries.

    Args:
        value: The value to render.
        indent: Indentation level (in spaces) of the line the rendered
            value starts on; nested dicts indent 4 spaces further.

    Returns:
        A string of Python source that evaluates to ``value``.

    Raises:
        ValueError: If ``value`` is of an unsupported type.
    """
    pad = " " * indent

    # Variable reference: emit the name verbatim, unquoted.
    if isinstance(value, VariableRef):
        return value.name

    # None and scalar literals; repr() is the correct Python source for all.
    if value is None or isinstance(value, (bool, int, float, str)):
        return repr(value)

    # Sequences: render elements recursively so they may themselves
    # contain VariableRefs or nested dicts.
    if isinstance(value, (list, tuple)):
        inner = ", ".join(format_value(v, indent) for v in value)
        if isinstance(value, tuple):
            # A one-element tuple needs its trailing comma.
            return f"({inner},)" if len(value) == 1 else f"({inner})"
        return f"[{inner}]"

    # Nested dictionary: one "key: value," pair per line.
    if isinstance(value, dict):
        if not value:
            return "{}"

        inner_indent = indent + 4
        inner_pad = " " * inner_indent

        items = [
            f"{inner_pad}{repr(k)}: {format_value(v, inner_indent)},"
            for k, v in value.items()
        ]
        return "{\n" + "\n".join(items) + f"\n{pad}}}"

    raise ValueError(f"Unsupported param type: {type(value)}")


def generate_constructor_code(class_name: str, params: dict) -> str:
    """Render a multi-line constructor call ``class_name(key=value, ...)``.

    Args:
        class_name: Name of the class (or callable) to invoke.
        params: Keyword arguments; values are rendered by ``format_value``.

    Returns:
        Python source for the call, one keyword argument per line.
    """
    lines = [f"{class_name}("]
    for key, value in params.items():
        lines.append(f"    {key}={format_value(value, indent=4)},")
    lines.append(")")
    return "\n".join(lines)


class Cell:
type: Literal["code", "md"]
content: str
Expand All @@ -37,19 +90,34 @@ def __init__(
evaluator_type: EvaluatorTypeEnum,
model_name: str,
plain_python_script: bool,
judge: str,
judge_params: dict,
):
self.instances = instances
self.criteria = criteria
self.test_case_name = test_case_name
self.evaluator_name = evaluator_name
self.evaluator_type = evaluator_type
self.plain_python_script = plain_python_script
self.inference_engine_params = get_cross_inference_engine_params(
credentials=credentials,
provider=provider,
model_name=model_name,
custom_params=DEFAULT_JUDGE_INFERENCE_PARAMS,
self.judge = judge
self.judge_params = judge_params

self.judge_class = JUDGE_CLASS_MAP[evaluator_type][judge]

self.judge_requires_model = issubclass(
self.judge_class, UnitxtInferenceEngineMixin
)

if self.judge_requires_model:
self.inference_engine_params = get_cross_inference_engine_params(
credentials=credentials,
provider=provider,
model_name=model_name,
custom_params=DEFAULT_JUDGE_INFERENCE_PARAMS,
)
else:
self.inference_engine_params = None

self.cells: list[Cell] = []

def generate_notebook(self):
Expand Down Expand Up @@ -148,15 +216,24 @@ def get_import_code(self):
"""

def get_setup_and_run_eval_code(self):
params = re.sub(
r"\btrue\b", "True", json.dumps(self.inference_engine_params, indent=4)
if self.judge_requires_model:
inference_engine_construct_str = generate_constructor_code(
"CrossProviderInferenceEngine", params=self.inference_engine_params
) # type: ignore
else:
inference_engine_construct_str = ""

judge_params = self.judge_params
if self.judge_requires_model:
judge_params["inference_engine"] = VariableRef("inference_engine")
judge_construct_str = generate_constructor_code(
self.judge_class.__name__, params=judge_params
)

return f"""\
inference_engine = CrossProviderInferenceEngine(**{params})
inference_engine = {inference_engine_construct_str}

judge = DirectJudge(
inference_engine=inference_engine,
)
judge = {judge_construct_str}

results: list[DirectInstanceResult] = judge(instances, criteria)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,28 @@
// Page-level wrapper: stacks its child sections vertically, 2rem apart.
.container {
display: flex;
flex-direction: column;
gap: 2rem;
}

// Three-column grid of configuration controls, vertically centered per row.
.configOptions {
  display: grid;
  grid-template-columns: repeat(3, 1fr);
  gap: 1rem;
  align-items: center;
}

// Vertical layout for a top-level section of the form.
.section {
  display: flex;
  flex-direction: column;
  gap: 1rem;
}

// Vertical layout for a nested sub-section inside a .section.
.subSection {
display: flex;
flex-direction: column;
gap: 1rem;
}

// Horizontal rule above the element, spaced from the content below.
// NOTE(review): presumably $border-subtle-00 is a design-system (Carbon)
// token provided by an upstream import — confirm it is in scope here.
.topDivider {
border-top: 1px solid $border-subtle-00;
padding-top: 1rem;
}
Loading