Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions docs/concepts/metrics/available_metrics/traditional.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,29 @@ Output
```
1.0
```

# CHRF Score

The `ChrfScore` metric evaluates the similarity between a `response` and a `reference` using the **character n-gram F-score (chrF)**. Unlike BLEU, which is precision-oriented, CHRF balances both **precision and recall** of character n-grams, making it more suitable for:

- Morphologically rich languages
- Responses with paraphrasing or flexible wording

CHRF scores range from 0 to 1, where 1 indicates a perfect match between the generated response and the reference. This is a non-LLM-based metric, relying entirely on deterministic comparisons.

```python
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import ChrfScore

sample = SingleTurnSample(
response="The Eiffel Tower is located in India.",
reference="The Eiffel Tower is located in Paris."
)

scorer = ChrfScore()
await scorer.single_turn_ascore(sample)
```
Output
```
0.8048
```
2 changes: 2 additions & 0 deletions src/ragas/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)
from ragas.metrics._aspect_critic import AspectCritic
from ragas.metrics._bleu_score import BleuScore
from ragas.metrics._chrf_score import ChrfScore
from ragas.metrics._context_entities_recall import (
ContextEntityRecall,
context_entity_recall,
Expand Down Expand Up @@ -121,6 +122,7 @@
"ExactMatch",
"StringPresence",
"BleuScore",
"ChrfScore",
"RougeScore",
"DataCompyScore",
"LLMSQLEquivalence",
Expand Down
51 changes: 51 additions & 0 deletions src/ragas/metrics/_chrf_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import typing as t
from dataclasses import dataclass, field

from langchain_core.callbacks import Callbacks

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics.base import MetricType, SingleTurnMetric
from ragas.run_config import RunConfig


@dataclass
class ChrfScore(SingleTurnMetric):
    """Character n-gram F-score (chrF) between a response and a reference.

    Non-LLM, deterministic metric backed by ``sacrebleu.corpus_chrf``.
    The raw sacrebleu score (0-100) is rescaled to the 0-1 range, where
    1.0 indicates a perfect match between response and reference.
    """

    name: str = "chrf_score"
    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
        default_factory=lambda: {MetricType.SINGLE_TURN: {"reference", "response"}}
    )
    # NOTE(review): `language` is currently unused — chrF is language-agnostic
    # at the character level. Kept only for constructor compatibility.
    language: str = "english"
    # Extra keyword arguments forwarded verbatim to sacrebleu.corpus_chrf
    # (e.g. char_order, word_order, beta).
    kwargs: t.Dict[str, t.Any] = field(default_factory=dict)

    def __post_init__(self):
        # Lazy import so ragas does not hard-depend on sacrebleu; surface a
        # clear install hint (chained to the original error) if it is missing.
        try:
            from sacrebleu import corpus_chrf
        except ImportError as e:
            raise ImportError(
                "sacrebleu is required for chrf score. Please install it using `pip install sacrebleu`"
            ) from e
        self.corpus_chrf = corpus_chrf

    def init(self, run_config: RunConfig):
        # No model/client setup required for this deterministic metric.
        pass

    async def _single_turn_ascore(
        self, sample: SingleTurnSample, callbacks: Callbacks
    ) -> float:
        """Score one sample; returns chrF in [0, 1].

        Raises AssertionError if `reference` or `response` is not a string.
        """
        reference, response = sample.reference, sample.response
        assert isinstance(reference, str), "ChrfScore expects a valid reference string"
        assert isinstance(response, str), "ChrfScore expects a valid response string"

        # chrF operates on character n-grams over the whole text, so no
        # sentence-level splitting is needed (splitting also produced
        # mismatched hypothesis/reference stream lengths for corpus_chrf).
        # corpus_chrf signature: (hypotheses, [reference_stream, ...]) where
        # each reference stream has one entry per hypothesis.
        score = self.corpus_chrf([response], [[reference]], **self.kwargs).score / 100
        assert isinstance(score, float), "Expecting a float"
        return score

    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
        # Row-dict entry point: adapt to the single-turn sample path.
        return await self._single_turn_ascore(SingleTurnSample(**row), callbacks)