11 changes: 8 additions & 3 deletions studies/retriever-only-financebench.yaml
@@ -1,4 +1,4 @@
name: "retriever-only-financebench"
name: "retriever-only-financebench-v2"
dataset:
dataset_dir: partitioned
description: Financial dataset that contains everything about finance, including
@@ -104,11 +104,16 @@ search_space:
num_queries_max: 20
num_queries_min: 2
num_queries_step: 2
top_k:
kmax: 128
kmin: 2
log: true
step: 1
splitter:
chunk_overlap_frac_max: 0.75
chunk_overlap_frac_min: 0.0
chunk_overlap_frac_step: 0.25
chunk_max_exp: 12
chunk_max_exp: 13
chunk_min_exp: 9
methods:
- html
@@ -121,7 +126,7 @@ optimization:
embedding_device: cuda
# use_hf_embedding_models: true
gpus_per_trial: 0.2
max_concurrent_trials: 50
max_concurrent_trials: 40
num_eval_samples: 100
num_eval_batch: 10
num_retries_unique_params: 10
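A quick sanity check on the numbers above. Assuming the splitter's chunk_min_exp/chunk_max_exp are base-2 exponents of the chunk size and that top_k with log: true is sampled log-uniformly between kmin and kmax (both are assumptions read off the field names, not confirmed by this diff), the updated search space resolves to the values sketched below.

# Minimal sketch, not syftr code: enumerate the values the updated YAML implies.
chunk_sizes = [2**e for e in range(9, 13 + 1)]   # chunk_min_exp=9 .. chunk_max_exp=13 -> 512..8192 (was capped at 4096)
overlap_fracs = [0.0, 0.25, 0.5, 0.75]           # chunk_overlap_frac min/step/max grid
top_k_range = (2, 128)                           # kmin..kmax, step 1, sampled on a log scale
print(chunk_sizes, overlap_fracs, top_k_range)
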
3 changes: 3 additions & 0 deletions syftr/configuration.py
@@ -133,6 +133,9 @@ class Paths(BaseModel):
tmp_dir / "huggingface"
)
index_cache: Annotated[Path, Field(validate_default=True)] = tmp_dir / "indexcache"
retrieval_cache: Annotated[Path, Field(validate_default=True)] = (
tmp_dir / "retrieval_cache"
)
onnx_dir: Annotated[Path, Field(validate_default=True)] = tmp_dir / "onnx"
sota_dir: Annotated[Path, Field(validate_default=True)] = data_dir / "sota"
lock_dir: Annotated[Path, Field(validate_default=True)] = tmp_dir / "syftr-locks"
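The new retrieval_cache entry follows the same pattern as the neighboring cache paths: a Path default under tmp_dir that Pydantic validates because of validate_default=True. A minimal standalone sketch of that pattern (the CachePaths model and the /tmp/syftr base are illustrative stand-ins, not the actual syftr defaults):

from pathlib import Path
from typing import Annotated
from pydantic import BaseModel, Field

tmp_dir = Path("/tmp/syftr")  # illustrative base directory

class CachePaths(BaseModel):
    # validate_default=True makes Pydantic run field validation on the default too.
    retrieval_cache: Annotated[Path, Field(validate_default=True)] = (
        tmp_dir / "retrieval_cache"
    )

print(CachePaths().retrieval_cache)  # /tmp/syftr/retrieval_cache
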
262 changes: 262 additions & 0 deletions syftr/custom_metrics.py
@@ -1,10 +1,16 @@
import asyncio
import math
import typing as T

import numpy as np
from llama_index.core.evaluation import BaseEvaluator
from llama_index.core.evaluation.base import EvaluationResult
from llama_index.core.evaluation.retrieval.metrics_base import (
BaseRetrievalMetric,
RetrievalMetricResult,
)
from llama_index.core.prompts.mixin import PromptDictType
from rapidfuzz.fuzz import partial_ratio
from rouge_score import rouge_scorer


@@ -87,3 +93,259 @@ def lognormal_confidence(values: T.List[float], zscore: float) -> float:
if len(values) == 0:
return np.nan
return zscore * float(np.std(values, ddof=1)) / np.sqrt(len(values))


class ExactMatchEvaluator(BaseEvaluator):
"""
Evaluator that calculates exact match by comparing reference contexts
with retrieved contexts.
"""

async def aevaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
"""
Evaluate exact match by computing the proportion of reference contexts
that are present in the retrieved contexts.
"""
reference = kwargs.get("reference")

if not reference:
raise ValueError("Reference contexts are empty.")
if not contexts:
raise ValueError("Retrieved contexts are empty.")

matched = sum(any(ref in context for context in contexts) for ref in reference)
recall = matched / len(reference) if reference else 0.0
return EvaluationResult(
passing=recall > 0,
score=recall,
)

def evaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
"""
Synchronous version of the evaluation method for compatibility with base class.
"""
loop = asyncio.get_event_loop()
return loop.run_until_complete(
self.aevaluate(query, response, contexts, **kwargs)
)

def _get_prompts(self) -> PromptDictType:
"""Get prompts."""
return {}

def _update_prompts(self, prompts_dict: PromptDictType) -> None:
"""Update prompts."""
pass
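
A minimal usage sketch for the class above; the query, contexts, and reference strings are made-up example data, and reference is passed through **kwargs as the code expects:

import asyncio

evaluator = ExactMatchEvaluator()
result = asyncio.run(
    evaluator.aevaluate(
        query="What was ACME Corp's FY2022 revenue?",  # hypothetical question
        contexts=[
            "ACME Corp reported total revenue of $1.2B in FY2022.",
            "An unrelated filler chunk.",
        ],
        reference=["total revenue of $1.2B"],
    )
)
print(result.score, result.passing)  # 1.0 True -- the single reference substring was found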


class FuzzyRecallEvaluator(BaseEvaluator):
"""
Evaluator that calculates fuzzy recall by comparing reference contexts
with retrieved contexts using partial_ratio from rapidfuzz.
"""

def __init__(self, threshold: float = 90.0):
self.threshold = threshold

async def fuzzy_match_async(self, ref: str, doc: str) -> bool:
return await asyncio.to_thread(partial_ratio, ref, doc) >= self.threshold

async def fuzzy_contains_async(self, ref: str, docs: T.Sequence[str]) -> bool:
tasks = [self.fuzzy_match_async(ref, doc) for doc in docs]
for coro in asyncio.as_completed(tasks):
if await coro:
return True
return False

async def aevaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
"""
Evaluate fuzzy recall by computing the proportion of reference contexts
that have a fuzzy match in the retrieved contexts.
"""
reference = kwargs.get("reference")

if not reference:
raise ValueError("Reference contexts are empty.")
if not contexts:
raise ValueError("Retrieved contexts are empty.")

tasks = [self.fuzzy_contains_async(ref, contexts) for ref in reference]
results = await asyncio.gather(*tasks)
matched = sum(results)
recall = matched / len(reference) if reference else 0.0
return EvaluationResult(
passing=recall > 0,
score=recall,
)

def evaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
"""
Synchronous version of the evaluation method for compatibility with base class.
"""
loop = asyncio.get_event_loop()
return loop.run_until_complete(
self.aevaluate(query, response, contexts, **kwargs)
)

def _get_prompts(self) -> PromptDictType:
"""Get prompts."""
return {}

def _update_prompts(self, prompts_dict: PromptDictType) -> None:
"""Update prompts."""
pass
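
As with the exact-match case, a small illustrative sketch; the strings and the lowered threshold are invented for the example:

import asyncio

evaluator = FuzzyRecallEvaluator(threshold=85.0)
result = asyncio.run(
    evaluator.aevaluate(
        contexts=["Net saless grew 8% year over year.", "An unrelated chunk."],
        reference=["Net sales grew 8%"],
    )
)
# partial_ratio tolerates the typo ("saless"), so the reference still counts
# as matched and recall is 1.0.
print(result.score)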


class MRREvaluator(BaseEvaluator):
"""
Evaluator that calculates Mean Reciprocal Rank (MRR) for a single query by
finding the first matching reference in the retrieved contexts.
"""

async def aevaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
"""
Evaluate reciprocal rank of the first relevant document.
Assumes `reference` is a list of correct answers and `contexts` is
a list of retrieved documents ordered by relevance.
"""
reference = kwargs.get("reference")

if not reference:
raise ValueError("Reference contexts are empty.")
if not contexts:
raise ValueError("Retrieved contexts are empty.")

reciprocal_rank = 0.0
for i, context in enumerate(contexts):
if any(ref in context for ref in reference):
reciprocal_rank = 1.0 / (i + 1)
break

return EvaluationResult(
passing=reciprocal_rank > 0,
score=reciprocal_rank,
)

def evaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
loop = asyncio.get_event_loop()
return loop.run_until_complete(
self.aevaluate(query, response, contexts, **kwargs)
)

def _get_prompts(self) -> PromptDictType:
return {}

def _update_prompts(self, prompts_dict: PromptDictType) -> None:
pass
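
A quick check of the rank arithmetic with invented data: the first context containing a reference sits at zero-based index 2, so the score is 1/3.

import asyncio

evaluator = MRREvaluator()
result = asyncio.run(
    evaluator.aevaluate(
        contexts=["chunk A", "chunk B", "the answer appears in this third chunk"],
        reference=["answer appears"],
    )
)
print(result.score)  # 0.333... -> 1.0 / (2 + 1)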


class NDCGEvaluator(BaseEvaluator):
"""
Evaluator that calculates Normalized Discounted Cumulative Gain (NDCG)
based on relevance of retrieved contexts.
"""

def __init__(self, k: int = 10):
self.k = k

def _dcg(self, relevance_scores: T.List[float]) -> float:
return sum(rel / math.log2(i + 2) for i, rel in enumerate(relevance_scores))

def _get_relevance(
self, context: str, reference: T.Union[T.Sequence[str], T.Mapping[str, float]]
) -> float:
if isinstance(reference, dict):
# Graded relevance
return max(
(score for ref, score in reference.items() if ref in context),
default=0.0,
)
else:
# Binary relevance
return 1.0 if any(ref in context for ref in reference) else 0.0

async def aevaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
reference = kwargs.get("reference")

if not reference:
raise ValueError("Reference contexts are empty.")
if not contexts:
raise ValueError("Retrieved contexts are empty.")

top_k_contexts = contexts[: self.k]
relevance_scores = [self._get_relevance(c, reference) for c in top_k_contexts]
dcg = self._dcg(relevance_scores)

# Ideal DCG: sorted relevance scores
if isinstance(reference, dict):
ideal_scores = sorted(reference.values(), reverse=True)[: self.k]
else:
ideal_scores = [1.0] * min(len(reference), self.k)

idcg = self._dcg(ideal_scores)
ndcg = dcg / idcg if idcg > 0 else 0.0

return EvaluationResult(
passing=ndcg > 0,
score=ndcg,
)

def evaluate(
self,
query: T.Optional[str] = None,
response: T.Optional[str] = None,
contexts: T.Optional[T.Sequence[str]] = None,
**kwargs: T.Any,
) -> EvaluationResult:
loop = asyncio.get_event_loop()
return loop.run_until_complete(
self.aevaluate(query, response, contexts, **kwargs)
)

def _get_prompts(self) -> PromptDictType:
return {}

def _update_prompts(self, prompts_dict: PromptDictType) -> None:
pass
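
Finally, a sketch of the graded-relevance path, with a made-up relevance mapping passed as reference:

import asyncio

evaluator = NDCGEvaluator(k=3)
result = asyncio.run(
    evaluator.aevaluate(
        contexts=[
            "this chunk mentions revenue guidance",
            "filler text",
            "this chunk mentions gross margin",
        ],
        reference={"revenue guidance": 3.0, "gross margin": 1.0},
    )
)
# DCG  = 3/log2(2) + 0/log2(3) + 1/log2(4) = 3.5
# IDCG = 3/log2(2) + 1/log2(3)            ~= 3.63
print(result.score)  # ~0.96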