pyproject.toml (3 additions, 2 deletions)

@@ -6,7 +6,7 @@ license = {file = "LICENSE"}
 dependencies = [
     # Core dependencies
     "numpy",
-    "datasets",
+    "datasets>=4.0.0",
     "tiktoken",
     "pydantic>=2.0.0",
     "nest-asyncio",
@@ -158,7 +158,8 @@ dev = [
     "build>=1.3.0",
     # Additional tools for full dev
     "nbmake",
-    "notebook",
+    "notebook",
+    "unstructured[md]",
     "arize-phoenix>=6.1.0",
     "openinference-instrumentation-langchain>=0.1.29",
     # Include all optional features
src/ragas/metrics/_answer_relevance.py (1 addition, 5 deletions)

@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import asyncio
 import logging
 import typing as t
 from dataclasses import dataclass, field
@@ -141,10 +140,7 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         prompt_input = ResponseRelevanceInput(response=row["response"])
 
         responses = await self.question_generation.generate_multiple(
-            data=prompt_input,
-            llm=self.llm,
-            callbacks=callbacks,
-            n=self.strictness
+            data=prompt_input, llm=self.llm, callbacks=callbacks, n=self.strictness
         )
 
         return self._calculate_score(responses, row)
tests/__init__.py (1 addition, 0 deletions)

@@ -0,0 +1 @@
+# Tests package
tests/benchmarks/benchmark_eval.py (4 additions, 6 deletions)

@@ -1,7 +1,5 @@
 import time
 
-from datasets import DatasetDict, load_dataset
-
 from ragas import evaluate
 from ragas.metrics import (
     ContextUtilization,
@@ -13,12 +11,12 @@
     faithfulness,
 )
 
+from ..e2e.test_dataset_utils import load_amnesty_dataset_safe
+
 # from ragas.metrics.critique import harmfulness  # Import unavailable
 
-# data
-ds = load_dataset("explodinggradients/amnesty_qa", "english_v2")
-assert isinstance(ds, DatasetDict)
-eval_dataset = ds["eval"]
+# data - using safe dataset loading
+eval_dataset = load_amnesty_dataset_safe("english_v2")
 
 # metrics
 metrics = [
tests/e2e/__init__.py (1 addition, 0 deletions)

@@ -0,0 +1 @@
+# E2E tests package
tests/e2e/test_adaptation.py (6 additions, 0 deletions)

@@ -1,7 +1,13 @@
+import os
+
+import pytest
+
 from ragas.llms import llm_factory
 from ragas.metrics import context_recall
 
 
+@pytest.mark.asyncio
+@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
 async def test_adapt():
     llm = llm_factory("gpt-4o")
     await context_recall.adapt_prompts(llm=llm, language="spanish")
tests/e2e/test_amnesty_in_ci.py (5 additions, 3 deletions)

@@ -1,7 +1,7 @@
+import os
 import typing as t
 
 import pytest
-from datasets import load_dataset
 
 from ragas import EvaluationDataset, evaluate
 from ragas.metrics import (
@@ -10,12 +10,13 @@
     context_recall,
     faithfulness,
 )
+from tests.e2e.test_dataset_utils import load_amnesty_dataset_safe
 
 if t.TYPE_CHECKING:
     from datasets import Dataset
 
-# loading the V2 dataset
-amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v3")["eval"]  # type: ignore
+# loading the dataset
+amnesty_qa = load_amnesty_dataset_safe("english_v3")  # type: ignore
 
 
 def assert_in_range(score: float, value: float, plus_or_minus: float):
@@ -26,6 +27,7 @@ def assert_in_range(score: float, value: float, plus_or_minus: float):
 
 
 @pytest.mark.ragas_ci
+@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
 def test_amnesty_e2e():
     result = evaluate(
         EvaluationDataset.from_hf_dataset(t.cast("Dataset", amnesty_qa))[:1],
tests/e2e/test_dataset_utils.py (56 additions, 0 deletions)

@@ -0,0 +1,56 @@
+"""Utilities for creating test datasets in e2e tests."""
+
+import logging
+
+from datasets import Dataset, load_dataset
+
+logger = logging.getLogger(__name__)
+
+# Sample data structure matching the amnesty_qa dataset
+SAMPLE_AMNESTY_DATA = [
+    {
+        "user_input": "What are the global implications of the USA Supreme Court ruling on abortion?",
+        "reference": "The global implications of the USA Supreme Court ruling on abortion are significant. The ruling has led to limited or no access to abortion for one in three women and girls of reproductive age in states where abortion access is restricted. These states also have weaker maternal health support, higher maternal death rates, and higher child poverty rates. Additionally, the ruling has had an impact beyond national borders due to the USA's geopolitical and cultural influence globally.",
+        "response": "The global implications of the USA Supreme Court ruling on abortion can be significant, as it sets a precedent for other countries and influences the global discourse on reproductive rights. The Supreme Court's ruling can serve as a reference point for other countries grappling with their own abortion laws.",
+        "retrieved_contexts": [
+            "In 2022, the USA Supreme Court handed down a decision ruling that overturned 50 years of jurisprudence recognizing a constitutional right to abortion.",
+            "This decision has had a massive impact: one in three women and girls of reproductive age now live in states where abortion access is either totally or near-totally inaccessible.",
+            "The USA Supreme Court ruling has also had impacts beyond national borders due to the geopolitical and cultural influence wielded by the USA globally.",
+        ],
+    },
+    {
+        "user_input": "How does climate change affect human rights?",
+        "reference": "Climate change poses significant threats to human rights by affecting access to water, food security, health, and adequate housing. It disproportionately impacts vulnerable populations and can lead to displacement and migration.",
+        "response": "Climate change impacts human rights through multiple pathways including threats to life, health, food, water, and adequate standard of living. The effects are often most severe for marginalized communities.",
+        "retrieved_contexts": [
+            "Climate change threatens the effective enjoyment of human rights including life, water and sanitation, food, health, housing, and livelihoods.",
+            "The impacts of climate change will be felt most acutely by those segments of the population who are already in vulnerable situations.",
+            "Climate change is already displacing people and will continue to do so in the future.",
+        ],
+    },
+]
+
+
+def load_amnesty_dataset_safe(config: str = "english_v3"):
+    """
+    Safely load the amnesty_qa dataset, falling back to local data if remote fails.
+
+    Args:
+        config: Dataset configuration name (e.g., "english_v3", "english_v2")
+
+    Returns:
+        Dataset: The loaded dataset
+    """
+    try:
+        logger.info(f"Attempting to load amnesty_qa dataset with config '{config}'")
+        dataset = load_dataset("explodinggradients/amnesty_qa", config)["eval"]
+        logger.info(f"Successfully loaded dataset with {len(dataset)} samples")
+        return dataset
+    except Exception as e:
+        logger.warning(f"Failed to load remote dataset: {e}")
+        logger.info("Using local sample data as fallback")
+
+        # Create a local dataset from sample data
+        local_dataset = Dataset.from_list(SAMPLE_AMNESTY_DATA)
+        logger.info(f"Created local dataset with {len(local_dataset)} samples")
+        return local_dataset
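A hypothetical usage sketch (not part of this PR) showing how a caller consumes this helper; it mirrors the pattern the tests below adopt, and the fallback is transparent to the caller:

import typing as t

from ragas import EvaluationDataset
from tests.e2e.test_dataset_utils import load_amnesty_dataset_safe

if t.TYPE_CHECKING:
    from datasets import Dataset

# Returns the remote "eval" split when the download succeeds, or an
# in-memory Dataset built from SAMPLE_AMNESTY_DATA when it fails.
ds = load_amnesty_dataset_safe("english_v3")
eval_dataset = EvaluationDataset.from_hf_dataset(t.cast("Dataset", ds))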
tests/e2e/test_fullflow.py (5 additions, 2 deletions)

@@ -1,17 +1,20 @@
+import os
 import typing as t
 
-from datasets import load_dataset
+import pytest
 
 from ragas import EvaluationDataset, evaluate
 from ragas.metrics import answer_relevancy, context_precision, faithfulness
 from ragas.metrics._aspect_critic import harmfulness
+from tests.e2e.test_dataset_utils import load_amnesty_dataset_safe
 
 if t.TYPE_CHECKING:
     from datasets import Dataset
 
 
+@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
 def test_evaluate_e2e():
-    ds = load_dataset("explodinggradients/amnesty_qa", "english_v3")["eval"]  # type: ignore
+    ds = load_amnesty_dataset_safe("english_v3")  # type: ignore
     result = evaluate(
         EvaluationDataset.from_hf_dataset(t.cast("Dataset", ds))[:1],
         metrics=[answer_relevancy, context_precision, faithfulness, harmfulness],
tests/e2e/test_testset_generation.py (5 additions, 0 deletions)

@@ -1,6 +1,11 @@
+import os
+
+import pytest
+
 from ragas.testset import TestsetGenerator
 
 
+@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
 def test_testset_generation_e2e():
     # generate kg
     from langchain_community.document_loaders import DirectoryLoader
tests/unit/test_cache.py (5 additions, 3 deletions)

@@ -43,10 +43,10 @@ def sample_func(a, b):
 def test_generate_cache_key_bound_method():
     """Test that cache keys stay the same, when caching bound methods of different objects."""
 
-    class Clazz():
+    class Clazz:
         def __init__(self, irrelevant):
             self.irrelevant = irrelevant
 
         def sample_func(self, a, b):
             return a + b
 
@@ -55,7 +55,9 @@ def sample_func(self, a, b):
 
     key1 = _generate_cache_key(object.sample_func, (1, 2), {})
     key2 = _generate_cache_key(object2.sample_func, (1, 2), {})
-    assert key1 == key2, "Cache keys should match even if the originating objects the methods are bound to are not the same, as long as the arguments match"
+    assert key1 == key2, (
+        "Cache keys should match even if the originating objects the methods are bound to are not the same, as long as the arguments match"
+    )
 
 
 def test_no_cache_backend():