Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions docs/concepts/metrics/available_metrics/traditional.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,29 @@ Output
```
1.0
```

# CHRF Score

The `ChrfScore` metric evaluates the similarity between a `response` and a `reference` using the **character n-gram F-score (chrF)**. Unlike BLEU, which is precision-oriented, CHRF balances both **precision and recall** of character n-grams, making it more suitable for:

- Morphologically rich languages
- Responses with paraphrasing or flexible wording

CHRF scores range from 0 to 1, where 1 indicates a perfect match between the generated response and the reference. This is a non-LLM-based metric, relying entirely on deterministic comparisons.

```python
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import ChrfScore

sample = SingleTurnSample(
response="The Eiffel Tower is located in India.",
reference="The Eiffel Tower is located in Paris."
)

scorer = ChrfScore()
await scorer.single_turn_ascore(sample)
```
Output
```
0.8048
```
2 changes: 2 additions & 0 deletions src/ragas/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)
from ragas.metrics._aspect_critic import AspectCritic
from ragas.metrics._bleu_score import BleuScore
from ragas.metrics._chrf_score import ChrfScore
from ragas.metrics._context_entities_recall import (
ContextEntityRecall,
context_entity_recall,
Expand Down Expand Up @@ -121,6 +122,7 @@
"ExactMatch",
"StringPresence",
"BleuScore",
"ChrfScore",
"RougeScore",
"DataCompyScore",
"LLMSQLEquivalence",
Expand Down
51 changes: 51 additions & 0 deletions src/ragas/metrics/_chrf_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import typing as t
from dataclasses import dataclass, field

from langchain_core.callbacks import Callbacks

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics.base import MetricType, SingleTurnMetric
from ragas.run_config import RunConfig


@dataclass
class ChrfScore(SingleTurnMetric):
    """Character n-gram F-score (chrF) between a response and a reference.

    Non-LLM, deterministic metric backed by ``sacrebleu.corpus_chrf``.
    The raw sacrebleu score (0-100) is rescaled to the 0-1 range, where
    1.0 indicates a perfect match between response and reference.
    """

    name: str = "chrf_score"
    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
        default_factory=lambda: {MetricType.SINGLE_TURN: {"reference", "response"}}
    )
    # NOTE(review): `language` is currently unused — chrF is language-agnostic
    # at the character level. Kept only for constructor compatibility.
    language: str = "english"
    # Extra keyword arguments forwarded verbatim to sacrebleu.corpus_chrf
    # (e.g. char_order, word_order, beta).
    kwargs: t.Dict[str, t.Any] = field(default_factory=dict)

    def __post_init__(self):
        # Lazy import so ragas does not hard-depend on sacrebleu; surface a
        # clear install hint (chained to the original error) if it is missing.
        try:
            from sacrebleu import corpus_chrf
        except ImportError as e:
            raise ImportError(
                "sacrebleu is required for chrf score. Please install it using `pip install sacrebleu`"
            ) from e
        self.corpus_chrf = corpus_chrf

    def init(self, run_config: RunConfig):
        # No model/client setup required for this deterministic metric.
        pass

    async def _single_turn_ascore(
        self, sample: SingleTurnSample, callbacks: Callbacks
    ) -> float:
        """Score one sample; returns chrF in [0, 1].

        Raises AssertionError if `reference` or `response` is not a string.
        """
        reference, response = sample.reference, sample.response
        assert isinstance(reference, str), "ChrfScore expects a valid reference string"
        assert isinstance(response, str), "ChrfScore expects a valid response string"

        # chrF operates on character n-grams over the whole text, so no
        # sentence-level splitting is needed (splitting also produced
        # mismatched hypothesis/reference stream lengths for corpus_chrf).
        # corpus_chrf signature: (hypotheses, [reference_stream, ...]) where
        # each reference stream has one entry per hypothesis.
        score = self.corpus_chrf([response], [[reference]], **self.kwargs).score / 100
        assert isinstance(score, float), "Expecting a float"
        return score

    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
        # Row-dict entry point: adapt to the single-turn sample path.
        return await self._single_turn_ascore(SingleTurnSample(**row), callbacks)