11 changes: 8 additions & 3 deletions studies/retriever-only-financebench.yaml
@@ -1,4 +1,4 @@
name: "retriever-only-financebench"
name: "retriever-only-financebench-v2"
dataset:
dataset_dir: partitioned
description: Financial dataset that contains everything about finance, including
@@ -104,11 +104,16 @@ search_space:
num_queries_max: 20
num_queries_min: 2
num_queries_step: 2
top_k:
kmax: 128
kmin: 2
log: true
step: 1
splitter:
chunk_overlap_frac_max: 0.75
chunk_overlap_frac_min: 0.0
chunk_overlap_frac_step: 0.25
chunk_max_exp: 12
chunk_max_exp: 13
chunk_min_exp: 9
methods:
- html
@@ -121,7 +126,7 @@ optimization:
embedding_device: cuda
# use_hf_embedding_models: true
gpus_per_trial: 0.2
max_concurrent_trials: 50
max_concurrent_trials: 40
num_eval_samples: 100
num_eval_batch: 10
num_retries_unique_params: 10
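A quick sanity check on the numbers above. Assuming the splitter's chunk_min_exp/chunk_max_exp are base-2 exponents of the chunk size and that top_k with log: true is sampled log-uniformly between kmin and kmax (both are assumptions read off the field names, not confirmed by this diff), the updated search space resolves to the values sketched below.

# Minimal sketch, not syftr code: enumerate the values the updated YAML implies.
chunk_sizes = [2**e for e in range(9, 13 + 1)]   # chunk_min_exp=9 .. chunk_max_exp=13 -> 512..8192 (was capped at 4096)
overlap_fracs = [0.0, 0.25, 0.5, 0.75]           # chunk_overlap_frac min/step/max grid
top_k_range = (2, 128)                           # kmin..kmax, step 1, sampled on a log scale
print(chunk_sizes, overlap_fracs, top_k_range)
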
3 changes: 3 additions & 0 deletions syftr/configuration.py
@@ -133,6 +133,9 @@ class Paths(BaseModel):
tmp_dir / "huggingface"
)
index_cache: Annotated[Path, Field(validate_default=True)] = tmp_dir / "indexcache"
retrieval_cache: Annotated[Path, Field(validate_default=True)] = (
tmp_dir / "retrieval_cache"
)
onnx_dir: Annotated[Path, Field(validate_default=True)] = tmp_dir / "onnx"
sota_dir: Annotated[Path, Field(validate_default=True)] = data_dir / "sota"
lock_dir: Annotated[Path, Field(validate_default=True)] = tmp_dir / "syftr-locks"
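The new retrieval_cache entry follows the same pattern as the neighboring cache paths: a Path default under tmp_dir that Pydantic validates because of validate_default=True. A minimal standalone sketch of that pattern (the CachePaths model and the /tmp/syftr base are illustrative stand-ins, not the actual syftr defaults):

from pathlib import Path
from typing import Annotated
from pydantic import BaseModel, Field

tmp_dir = Path("/tmp/syftr")  # illustrative base directory

class CachePaths(BaseModel):
    # validate_default=True makes Pydantic run field validation on the default too.
    retrieval_cache: Annotated[Path, Field(validate_default=True)] = (
        tmp_dir / "retrieval_cache"
    )

print(CachePaths().retrieval_cache)  # /tmp/syftr/retrieval_cache
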
262 changes: 262 additions & 0 deletions syftr/custom_metrics.py
@@ -1,10 +1,16 @@
import asyncio
import math
import typing as T

import numpy as np
from llama_index.core.evaluation import BaseEvaluator
from llama_index.core.evaluation.base import EvaluationResult
from llama_index.core.evaluation.retrieval.metrics_base import (
BaseRetrievalMetric,
RetrievalMetricResult,
)
from llama_index.core.prompts.mixin import PromptDictType
from rapidfuzz.fuzz import partial_ratio
from rouge_score import rouge_scorer


@@ -87,3 +93,259 @@ def lognormal_confidence(values: T.List[float], zscore: float) -> float:
if len(values) == 0:
return np.nan
return zscore * float(np.std(values, ddof=1)) / np.sqrt(len(values))


class ExactMatchEvaluator(BaseEvaluator):
"""
Evaluator that calculates exact match by comparing reference contexts
with retrieved contexts.
"""

async def aevaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
"""
Evaluate exact match by computing the proportion of reference contexts
that are present in the retrieved contexts.
"""
reference = kwargs.get("reference")

if not reference:
raise ValueError("Reference contexts are empty.")
if not contexts:
raise ValueError("Retrieved contexts are empty.")

matched = sum(any(ref in context for context in contexts) for ref in reference)
recall = matched / len(reference) if reference else 0.0
return EvaluationResult(
passing=recall > 0,
score=recall,
)

def evaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
"""
Synchronous version of the evaluation method for compatibility with base class.
"""
loop = asyncio.get_event_loop()
return loop.run_until_complete(
self.aevaluate(query, response, contexts, **kwargs)
)

def _get_prompts(self) -> PromptDictType:
"""Get prompts."""
return {}

def _update_prompts(self, prompts_dict: PromptDictType) -> None:
"""Update prompts."""
pass
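
A minimal usage sketch for the class above; the query, contexts, and reference strings are made-up example data, and reference is passed through **kwargs as the code expects:

import asyncio

evaluator = ExactMatchEvaluator()
result = asyncio.run(
    evaluator.aevaluate(
        query="What was ACME Corp's FY2022 revenue?",  # hypothetical question
        contexts=[
            "ACME Corp reported total revenue of $1.2B in FY2022.",
            "An unrelated filler chunk.",
        ],
        reference=["total revenue of $1.2B"],
    )
)
print(result.score, result.passing)  # 1.0 True -- the single reference substring was found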


class FuzzyRecallEvaluator(BaseEvaluator):
"""
Evaluator that calculates fuzzy recall by comparing reference contexts
with retrieved contexts using partial_ratio from rapidfuzz.
"""

def __init__(self, threshold: float = 90.0):
self.threshold = threshold

async def fuzzy_match_async(self, ref: str, doc: str) -> bool:
return await asyncio.to_thread(partial_ratio, ref, doc) >= self.threshold

async def fuzzy_contains_async(self, ref: str, docs: T.Sequence[str]) -> bool:
tasks = [self.fuzzy_match_async(ref, doc) for doc in docs]
for coro in asyncio.as_completed(tasks):
if await coro:
return True
return False

async def aevaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
"""
Evaluate fuzzy recall by computing the proportion of reference contexts
that have a fuzzy match in the retrieved contexts.
"""
reference = kwargs.get("reference")

if not reference:
raise ValueError("Reference contexts are empty.")
if not contexts:
raise ValueError("Retrieved contexts are empty.")

tasks = [self.fuzzy_contains_async(ref, contexts) for ref in reference]
results = await asyncio.gather(*tasks)
matched = sum(results)
recall = matched / len(reference) if reference else 0.0
return EvaluationResult(
passing=recall > 0,
score=recall,
)

def evaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
"""
Synchronous version of the evaluation method for compatibility with base class.
"""
loop = asyncio.get_event_loop()
return loop.run_until_complete(
self.aevaluate(query, response, contexts, **kwargs)
)

def _get_prompts(self) -> PromptDictType:
"""Get prompts."""
return {}

def _update_prompts(self, prompts_dict: PromptDictType) -> None:
"""Update prompts."""
pass
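
As with the exact-match case, a small illustrative sketch; the strings and the lowered threshold are invented for the example:

import asyncio

evaluator = FuzzyRecallEvaluator(threshold=85.0)
result = asyncio.run(
    evaluator.aevaluate(
        contexts=["Net saless grew 8% year over year.", "An unrelated chunk."],
        reference=["Net sales grew 8%"],
    )
)
# partial_ratio tolerates the typo ("saless"), so the reference still counts
# as matched and recall is 1.0.
print(result.score)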


class MRREvaluator(BaseEvaluator):
"""
Evaluator that calculates Mean Reciprocal Rank (MRR) for a single query by
finding the first matching reference in the retrieved contexts.
"""

async def aevaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
"""
Evaluate reciprocal rank of the first relevant document.
Assumes `reference` is a list of correct answers and `contexts` is
a list of retrieved documents ordered by relevance.
"""
reference = kwargs.get("reference")

if not reference:
raise ValueError("Reference contexts are empty.")
if not contexts:
raise ValueError("Retrieved contexts are empty.")

reciprocal_rank = 0.0
for i, context in enumerate(contexts):
if any(ref in context for ref in reference):
reciprocal_rank = 1.0 / (i + 1)
break

return EvaluationResult(
passing=reciprocal_rank > 0,
score=reciprocal_rank,
)

def evaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
loop = asyncio.get_event_loop()
return loop.run_until_complete(
self.aevaluate(query, response, contexts, **kwargs)
)

def _get_prompts(self) -> PromptDictType:
return {}

def _update_prompts(self, prompts_dict: PromptDictType) -> None:
pass
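
A quick check of the rank arithmetic with invented data: the first context containing a reference sits at zero-based index 2, so the score is 1/3.

import asyncio

evaluator = MRREvaluator()
result = asyncio.run(
    evaluator.aevaluate(
        contexts=["chunk A", "chunk B", "the answer appears in this third chunk"],
        reference=["answer appears"],
    )
)
print(result.score)  # 0.333... -> 1.0 / (2 + 1)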


class NDCGEvaluator(BaseEvaluator):
"""
Evaluator that calculates Normalized Discounted Cumulative Gain (NDCG)
based on relevance of retrieved contexts.
"""

def __init__(self, k: int = 10):
self.k = k

def _dcg(self, relevance_scores: T.List[float]) -> float:
return sum(rel / math.log2(i + 2) for i, rel in enumerate(relevance_scores))

def _get_relevance(
self, context: str, reference: T.Union[T.Sequence[str], T.Mapping[str, float]]
) -> float:
if isinstance(reference, dict):
# Graded relevance
return max(
(score for ref, score in reference.items() if ref in context),
default=0.0,
)
else:
# Binary relevance
return 1.0 if any(ref in context for ref in reference) else 0.0

async def aevaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
reference = kwargs.get("reference")

if not reference:
raise ValueError("Reference contexts are empty.")
if not contexts:
raise ValueError("Retrieved contexts are empty.")

top_k_contexts = contexts[: self.k]
relevance_scores = [self._get_relevance(c, reference) for c in top_k_contexts]
dcg = self._dcg(relevance_scores)

# Ideal DCG: sorted relevance scores
if isinstance(reference, dict):
ideal_scores = sorted(reference.values(), reverse=True)[: self.k]
else:
ideal_scores = [1.0] * min(len(reference), self.k)

idcg = self._dcg(ideal_scores)
ndcg = dcg / idcg if idcg > 0 else 0.0

return EvaluationResult(
passing=ndcg > 0,
score=ndcg,
)

def evaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
loop = asyncio.get_event_loop()
return loop.run_until_complete(
self.aevaluate(query, response, contexts, **kwargs)
)

def _get_prompts(self) -> PromptDictType:
return {}

def _update_prompts(self, prompts_dict: PromptDictType) -> None:
pass
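
Finally, a sketch of the graded-relevance path, with a made-up relevance mapping passed as reference:

import asyncio

evaluator = NDCGEvaluator(k=3)
result = asyncio.run(
    evaluator.aevaluate(
        contexts=[
            "this chunk mentions revenue guidance",
            "filler text",
            "this chunk mentions gross margin",
        ],
        reference={"revenue guidance": 3.0, "gross margin": 1.0},
    )
)
# DCG  = 3/log2(2) + 0/log2(3) + 1/log2(4) = 3.5
# IDCG = 3/log2(2) + 1/log2(3)            ~= 3.63
print(result.score)  # ~0.96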