pyproject.toml (3 additions, 2 deletions)

@@ -6,7 +6,7 @@ license = {file = "LICENSE"}
 dependencies = [
     # Core dependencies
     "numpy",
-    "datasets",
+    "datasets>=4.0.0",
     "tiktoken",
     "pydantic>=2.0.0",
     "nest-asyncio",
@@ -158,7 +158,8 @@ dev = [
     "build>=1.3.0",
     # Additional tools for full dev
     "nbmake",
-    "notebook",
+    "notebook",
+    "unstructured[md]",
     "arize-phoenix>=6.1.0",
     "openinference-instrumentation-langchain>=0.1.29",
     # Include all optional features
src/ragas/metrics/_answer_relevance.py (1 addition, 5 deletions)

@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import asyncio
 import logging
 import typing as t
 from dataclasses import dataclass, field
@@ -141,10 +140,7 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         prompt_input = ResponseRelevanceInput(response=row["response"])
 
         responses = await self.question_generation.generate_multiple(
-            data=prompt_input,
-            llm=self.llm,
-            callbacks=callbacks,
-            n=self.strictness
+            data=prompt_input, llm=self.llm, callbacks=callbacks, n=self.strictness
         )
 
         return self._calculate_score(responses, row)
tests/__init__.py (1 addition, 0 deletions)

@@ -0,0 +1 @@
+# Tests package
tests/benchmarks/benchmark_eval.py (4 additions, 6 deletions)

@@ -1,7 +1,5 @@
 import time
 
-from datasets import DatasetDict, load_dataset
-
 from ragas import evaluate
 from ragas.metrics import (
     ContextUtilization,
@@ -13,12 +11,12 @@
     faithfulness,
 )
 
+from ..e2e.test_dataset_utils import load_amnesty_dataset_safe
+
 # from ragas.metrics.critique import harmfulness  # Import unavailable
 
-# data
-ds = load_dataset("explodinggradients/amnesty_qa", "english_v2")
-assert isinstance(ds, DatasetDict)
-eval_dataset = ds["eval"]
+# data - using safe dataset loading
+eval_dataset = load_amnesty_dataset_safe("english_v2")
 
 # metrics
 metrics = [
tests/e2e/__init__.py (1 addition, 0 deletions)

@@ -0,0 +1 @@
+# E2E tests package
tests/e2e/test_adaptation.py (6 additions, 0 deletions)

@@ -1,7 +1,13 @@
+import os
+
+import pytest
+
 from ragas.llms import llm_factory
 from ragas.metrics import context_recall
 
 
+@pytest.mark.asyncio
+@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
 async def test_adapt():
     llm = llm_factory("gpt-4o")
     await context_recall.adapt_prompts(llm=llm, language="spanish")
tests/e2e/test_amnesty_in_ci.py (5 additions, 3 deletions)

@@ -1,7 +1,7 @@
+import os
 import typing as t
 
 import pytest
-from datasets import load_dataset
 
 from ragas import EvaluationDataset, evaluate
 from ragas.metrics import (
@@ -10,12 +10,13 @@
     context_recall,
     faithfulness,
 )
+from tests.e2e.test_dataset_utils import load_amnesty_dataset_safe
 
 if t.TYPE_CHECKING:
     from datasets import Dataset
 
-# loading the V2 dataset
-amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v3")["eval"]  # type: ignore
+# loading the dataset
+amnesty_qa = load_amnesty_dataset_safe("english_v3")  # type: ignore
 
 
 def assert_in_range(score: float, value: float, plus_or_minus: float):
@@ -26,6 +27,7 @@ def assert_in_range(score: float, value: float, plus_or_minus: float):
 
 
 @pytest.mark.ragas_ci
+@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
 def test_amnesty_e2e():
     result = evaluate(
         EvaluationDataset.from_hf_dataset(t.cast("Dataset", amnesty_qa))[:1],
tests/e2e/test_dataset_utils.py (56 additions, 0 deletions)

@@ -0,0 +1,56 @@
+"""Utilities for creating test datasets in e2e tests."""
+
+import logging
+
+from datasets import Dataset, load_dataset
+
+logger = logging.getLogger(__name__)
+
+# Sample data structure matching the amnesty_qa dataset
+SAMPLE_AMNESTY_DATA = [
+    {
+        "user_input": "What are the global implications of the USA Supreme Court ruling on abortion?",
+        "reference": "The global implications of the USA Supreme Court ruling on abortion are significant. The ruling has led to limited or no access to abortion for one in three women and girls of reproductive age in states where abortion access is restricted. These states also have weaker maternal health support, higher maternal death rates, and higher child poverty rates. Additionally, the ruling has had an impact beyond national borders due to the USA's geopolitical and cultural influence globally.",
+        "response": "The global implications of the USA Supreme Court ruling on abortion can be significant, as it sets a precedent for other countries and influences the global discourse on reproductive rights. The Supreme Court's ruling can serve as a reference point for other countries grappling with their own abortion laws.",
+        "retrieved_contexts": [
+            "In 2022, the USA Supreme Court handed down a decision ruling that overturned 50 years of jurisprudence recognizing a constitutional right to abortion.",
+            "This decision has had a massive impact: one in three women and girls of reproductive age now live in states where abortion access is either totally or near-totally inaccessible.",
+            "The USA Supreme Court ruling has also had impacts beyond national borders due to the geopolitical and cultural influence wielded by the USA globally.",
+        ],
+    },
+    {
+        "user_input": "How does climate change affect human rights?",
+        "reference": "Climate change poses significant threats to human rights by affecting access to water, food security, health, and adequate housing. It disproportionately impacts vulnerable populations and can lead to displacement and migration.",
+        "response": "Climate change impacts human rights through multiple pathways including threats to life, health, food, water, and adequate standard of living. The effects are often most severe for marginalized communities.",
+        "retrieved_contexts": [
+            "Climate change threatens the effective enjoyment of human rights including life, water and sanitation, food, health, housing, and livelihoods.",
+            "The impacts of climate change will be felt most acutely by those segments of the population who are already in vulnerable situations.",
+            "Climate change is already displacing people and will continue to do so in the future.",
+        ],
+    },
+]
+
+
+def load_amnesty_dataset_safe(config: str = "english_v3"):
+    """
+    Safely load the amnesty_qa dataset, falling back to local data if remote fails.
+
+    Args:
+        config: Dataset configuration name (e.g., "english_v3", "english_v2")
+
+    Returns:
+        Dataset: The loaded dataset
+    """
+    try:
+        logger.info(f"Attempting to load amnesty_qa dataset with config '{config}'")
+        dataset = load_dataset("explodinggradients/amnesty_qa", config)["eval"]
+        logger.info(f"Successfully loaded dataset with {len(dataset)} samples")
+        return dataset
+    except Exception as e:
+        logger.warning(f"Failed to load remote dataset: {e}")
+        logger.info("Using local sample data as fallback")
+
+        # Create a local dataset from sample data
+        local_dataset = Dataset.from_list(SAMPLE_AMNESTY_DATA)
+        logger.info(f"Created local dataset with {len(local_dataset)} samples")
+        return local_dataset
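A hypothetical usage sketch (not part of this PR) showing how a caller consumes this helper; it mirrors the pattern the tests below adopt, and the fallback is transparent to the caller:

import typing as t

from ragas import EvaluationDataset
from tests.e2e.test_dataset_utils import load_amnesty_dataset_safe

if t.TYPE_CHECKING:
    from datasets import Dataset

# Returns the remote "eval" split when the download succeeds, or an
# in-memory Dataset built from SAMPLE_AMNESTY_DATA when it fails.
ds = load_amnesty_dataset_safe("english_v3")
eval_dataset = EvaluationDataset.from_hf_dataset(t.cast("Dataset", ds))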
tests/e2e/test_fullflow.py (5 additions, 2 deletions)

@@ -1,17 +1,20 @@
+import os
 import typing as t
 
-from datasets import load_dataset
+import pytest
 
 from ragas import EvaluationDataset, evaluate
 from ragas.metrics import answer_relevancy, context_precision, faithfulness
 from ragas.metrics._aspect_critic import harmfulness
+from tests.e2e.test_dataset_utils import load_amnesty_dataset_safe
 
 if t.TYPE_CHECKING:
     from datasets import Dataset
 
 
+@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
 def test_evaluate_e2e():
-    ds = load_dataset("explodinggradients/amnesty_qa", "english_v3")["eval"]  # type: ignore
+    ds = load_amnesty_dataset_safe("english_v3")  # type: ignore
     result = evaluate(
         EvaluationDataset.from_hf_dataset(t.cast("Dataset", ds))[:1],
         metrics=[answer_relevancy, context_precision, faithfulness, harmfulness],
tests/e2e/test_testset_generation.py (5 additions, 0 deletions)

@@ -1,6 +1,11 @@
+import os
+
+import pytest
+
 from ragas.testset import TestsetGenerator
 
 
+@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
 def test_testset_generation_e2e():
     # generate kg
     from langchain_community.document_loaders import DirectoryLoader
tests/unit/test_cache.py (5 additions, 3 deletions)

@@ -43,10 +43,10 @@ def sample_func(a, b):
 def test_generate_cache_key_bound_method():
     """Test that cache keys stay the same, when caching bound methods of different objects."""
 
-    class Clazz():
+    class Clazz:
         def __init__(self, irrelevant):
             self.irrelevant = irrelevant
 
         def sample_func(self, a, b):
             return a + b
 
@@ -55,7 +55,9 @@ def sample_func(self, a, b):
 
     key1 = _generate_cache_key(object.sample_func, (1, 2), {})
     key2 = _generate_cache_key(object2.sample_func, (1, 2), {})
-    assert key1 == key2, "Cache keys should match even if the originating objects the methods are bound to are not the same, as long as the arguments match"
+    assert key1 == key2, (
+        "Cache keys should match even if the originating objects the methods are bound to are not the same, as long as the arguments match"
+    )
 
 
 def test_no_cache_backend():