diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py index 1fb5be56..78145317 100644 --- a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py @@ -24,12 +24,8 @@ ) from .agent import KnowledgeAgentManager, KnowledgeGroundedAgent -from .evaluation import ( - DeepSearchQADataset, - DeepSearchQAEvaluator, - DSQAExample, - EvaluationResult, -) +from .data import DeepSearchQADataset, DSQAExample +from .evaluation import DeepSearchQAEvaluator, EvaluationResult __all__ = [ diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/__init__.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/__init__.py new file mode 100644 index 00000000..2a1e4af7 --- /dev/null +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/__init__.py @@ -0,0 +1,13 @@ +"""Data loading and management for knowledge QA evaluation. + +This module provides tools for loading and managing benchmark datasets +like DeepSearchQA. +""" + +from .deepsearchqa import DeepSearchQADataset, DSQAExample + + +__all__ = [ + "DSQAExample", + "DeepSearchQADataset", +] diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/deepsearchqa.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/deepsearchqa.py new file mode 100644 index 00000000..f84d2ffa --- /dev/null +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/deepsearchqa.py @@ -0,0 +1,239 @@ +"""DeepSearchQA dataset loader. + +This module provides classes for loading and accessing the DeepSearchQA +benchmark dataset from Kaggle. +""" + +import logging +from pathlib import Path + +import kagglehub +import pandas as pd +from pydantic import BaseModel, Field + + +logger = logging.getLogger(__name__) + + +class DSQAExample(BaseModel): + """A single example from the DeepSearchQA dataset.""" + + example_id: int = Field(description="Unique identifier for the example.") + problem: str = Field(description="The research question/problem to solve.") + problem_category: str = Field(description="Category of the problem (e.g., 'Politics & Government').") + answer: str = Field(description="The ground truth answer.") + answer_type: str = Field(description="Type of answer (e.g., 'Single Answer', 'List').") + + +class DeepSearchQADataset: + """Loader and manager for the DeepSearchQA dataset. + + This class handles downloading, loading, and accessing examples from + the DeepSearchQA benchmark dataset. + + Parameters + ---------- + cache_dir : str or Path, optional + Directory to cache the dataset. If not provided, uses kagglehub default. + + Examples + -------- + >>> dataset = DeepSearchQADataset() + >>> print(f"Total examples: {len(dataset)}") + >>> example = dataset[0] + >>> print(example.problem) + """ + + def __init__(self, cache_dir: str | Path | None = None) -> None: + """Initialize the dataset loader. + + Parameters + ---------- + cache_dir : str or Path, optional + Directory to cache the dataset. + """ + self._cache_dir = Path(cache_dir) if cache_dir else None + self._df: pd.DataFrame | None = None + self._examples: list[DSQAExample] | None = None + + def _download_dataset(self) -> Path: + """Download the dataset using kagglehub. + + Returns + ------- + Path + Path to the downloaded dataset directory. + """ + logger.info("Downloading DeepSearchQA dataset...") + path = kagglehub.dataset_download("deepmind/deepsearchqa") + return Path(path) + + def _load_data(self) -> None: + """Load the dataset into memory.""" + if self._df is not None: + return + + dataset_path = self._download_dataset() + csv_path = dataset_path / "DSQA-full.csv" + + if not csv_path.exists(): + raise FileNotFoundError(f"Dataset file not found: {csv_path}") + + self._df = pd.read_csv(csv_path) + + # Filter out rows with missing answers + original_count = len(self._df) + self._df = self._df.dropna(subset=["answer"]) + dropped_count = original_count - len(self._df) + if dropped_count > 0: + logger.info(f"Dropped {dropped_count} examples with missing answers") + + logger.info(f"Loaded {len(self._df)} examples from DeepSearchQA") + + # Convert to examples + self._examples = [ + DSQAExample( + example_id=row["example_id"], + problem=row["problem"], + problem_category=row["problem_category"], + answer=str(row["answer"]), # Ensure string type + answer_type=row["answer_type"], + ) + for _, row in self._df.iterrows() + ] + + @property + def dataframe(self) -> pd.DataFrame: + """Get the raw pandas DataFrame. + + Returns + ------- + pd.DataFrame + The full dataset as a DataFrame. + """ + self._load_data() + assert self._df is not None + return self._df + + @property + def examples(self) -> list[DSQAExample]: + """Get all examples as DSQAExample objects. + + Returns + ------- + list[DSQAExample] + All examples in the dataset. + """ + self._load_data() + assert self._examples is not None + return self._examples + + def __len__(self) -> int: + """Return the number of examples in the dataset.""" + self._load_data() + assert self._examples is not None + return len(self._examples) + + def __getitem__(self, index: int) -> DSQAExample: + """Get an example by index. + + Parameters + ---------- + index : int + The index of the example to retrieve. + + Returns + ------- + DSQAExample + The example at the given index. + """ + self._load_data() + assert self._examples is not None + return self._examples[index] + + def get_by_category(self, category: str) -> list[DSQAExample]: + """Get all examples in a specific category. + + Parameters + ---------- + category : str + The problem category to filter by. + + Returns + ------- + list[DSQAExample] + Examples matching the category. + """ + return [ex for ex in self.examples if ex.problem_category == category] + + def get_by_id(self, example_id: int) -> DSQAExample | None: + """Get a single example by its ID. + + Parameters + ---------- + example_id : int + The unique identifier of the example. + + Returns + ------- + DSQAExample or None + The example with the given ID, or None if not found. + """ + for ex in self.examples: + if ex.example_id == example_id: + return ex + return None + + def get_by_ids(self, example_ids: list[int]) -> list[DSQAExample]: + """Get multiple examples by their IDs. + + Parameters + ---------- + example_ids : list[int] + List of example IDs to retrieve. + + Returns + ------- + list[DSQAExample] + Examples matching the given IDs, in the order requested. + Missing IDs are silently skipped. + """ + id_to_example = {ex.example_id: ex for ex in self.examples} + return [id_to_example[eid] for eid in example_ids if eid in id_to_example] + + def get_categories(self) -> list[str]: + """Get all unique problem categories. + + Returns + ------- + list[str] + List of unique category names. + """ + return list(self.dataframe["problem_category"].unique()) + + def sample(self, n: int = 10, random_state: int | None = None) -> list[DSQAExample]: + """Get a random sample of examples. + + Parameters + ---------- + n : int, optional + Number of examples to sample, by default 10. + random_state : int, optional + Random seed for reproducibility. + + Returns + ------- + list[DSQAExample] + Randomly sampled examples. + """ + sampled_df = self.dataframe.sample(n=min(n, len(self)), random_state=random_state) + return [ + DSQAExample( + example_id=row["example_id"], + problem=row["problem"], + problem_category=row["problem_category"], + answer=row["answer"], + answer_type=row["answer_type"], + ) + for _, row in sampled_df.iterrows() + ] diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py index 3459b6a2..2aaf0543 100644 --- a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py @@ -1,18 +1,18 @@ """Evaluation utilities for DeepSearchQA benchmark. -This module provides tools for loading, running, and evaluating the +This module provides tools for running and evaluating agents on the DeepSearchQA benchmark dataset. """ import asyncio import logging -from pathlib import Path from typing import TYPE_CHECKING -import kagglehub import pandas as pd from pydantic import BaseModel, Field +from .data import DeepSearchQADataset, DSQAExample + if TYPE_CHECKING: from .agent import KnowledgeGroundedAgent @@ -21,16 +21,6 @@ logger = logging.getLogger(__name__) -class DSQAExample(BaseModel): - """A single example from the DeepSearchQA dataset.""" - - example_id: int = Field(description="Unique identifier for the example.") - problem: str = Field(description="The research question/problem to solve.") - problem_category: str = Field(description="Category of the problem (e.g., 'Politics & Government').") - answer: str = Field(description="The ground truth answer.") - answer_type: str = Field(description="Type of answer (e.g., 'Single Answer', 'List').") - - class EvaluationResult(BaseModel): """Result of evaluating a single example.""" @@ -44,185 +34,6 @@ class EvaluationResult(BaseModel): evaluation_notes: str = Field(default="", description="Additional notes about the evaluation.") -class DeepSearchQADataset: - """Loader and manager for the DeepSearchQA dataset. - - This class handles downloading, loading, and accessing examples from - the DeepSearchQA benchmark dataset. - - Parameters - ---------- - cache_dir : str or Path, optional - Directory to cache the dataset. If not provided, uses kagglehub default. - - Examples - -------- - >>> dataset = DeepSearchQADataset() - >>> print(f"Total examples: {len(dataset)}") - >>> example = dataset[0] - >>> print(example.problem) - """ - - def __init__(self, cache_dir: str | Path | None = None) -> None: - """Initialize the dataset loader. - - Parameters - ---------- - cache_dir : str or Path, optional - Directory to cache the dataset. - """ - self._cache_dir = Path(cache_dir) if cache_dir else None - self._df: pd.DataFrame | None = None - self._examples: list[DSQAExample] | None = None - - def _download_dataset(self) -> Path: - """Download the dataset using kagglehub. - - Returns - ------- - Path - Path to the downloaded dataset directory. - """ - logger.info("Downloading DeepSearchQA dataset...") - path = kagglehub.dataset_download("deepmind/deepsearchqa") - return Path(path) - - def _load_data(self) -> None: - """Load the dataset into memory.""" - if self._df is not None: - return - - dataset_path = self._download_dataset() - csv_path = dataset_path / "DSQA-full.csv" - - if not csv_path.exists(): - raise FileNotFoundError(f"Dataset file not found: {csv_path}") - - self._df = pd.read_csv(csv_path) - - # Filter out rows with missing answers - original_count = len(self._df) - self._df = self._df.dropna(subset=["answer"]) - dropped_count = original_count - len(self._df) - if dropped_count > 0: - logger.info(f"Dropped {dropped_count} examples with missing answers") - - logger.info(f"Loaded {len(self._df)} examples from DeepSearchQA") - - # Convert to examples - self._examples = [ - DSQAExample( - example_id=row["example_id"], - problem=row["problem"], - problem_category=row["problem_category"], - answer=str(row["answer"]), # Ensure string type - answer_type=row["answer_type"], - ) - for _, row in self._df.iterrows() - ] - - @property - def dataframe(self) -> pd.DataFrame: - """Get the raw pandas DataFrame. - - Returns - ------- - pd.DataFrame - The full dataset as a DataFrame. - """ - self._load_data() - assert self._df is not None - return self._df - - @property - def examples(self) -> list[DSQAExample]: - """Get all examples as DSQAExample objects. - - Returns - ------- - list[DSQAExample] - All examples in the dataset. - """ - self._load_data() - assert self._examples is not None - return self._examples - - def __len__(self) -> int: - """Return the number of examples in the dataset.""" - self._load_data() - assert self._examples is not None - return len(self._examples) - - def __getitem__(self, index: int) -> DSQAExample: - """Get an example by index. - - Parameters - ---------- - index : int - The index of the example to retrieve. - - Returns - ------- - DSQAExample - The example at the given index. - """ - self._load_data() - assert self._examples is not None - return self._examples[index] - - def get_by_category(self, category: str) -> list[DSQAExample]: - """Get all examples in a specific category. - - Parameters - ---------- - category : str - The problem category to filter by. - - Returns - ------- - list[DSQAExample] - Examples matching the category. - """ - return [ex for ex in self.examples if ex.problem_category == category] - - def get_categories(self) -> list[str]: - """Get all unique problem categories. - - Returns - ------- - list[str] - List of unique category names. - """ - return list(self.dataframe["problem_category"].unique()) - - def sample(self, n: int = 10, random_state: int | None = None) -> list[DSQAExample]: - """Get a random sample of examples. - - Parameters - ---------- - n : int, optional - Number of examples to sample, by default 10. - random_state : int, optional - Random seed for reproducibility. - - Returns - ------- - list[DSQAExample] - Randomly sampled examples. - """ - sampled_df = self.dataframe.sample(n=min(n, len(self)), random_state=random_state) - return [ - DSQAExample( - example_id=row["example_id"], - problem=row["problem"], - problem_category=row["problem_category"], - answer=row["answer"], - answer_type=row["answer_type"], - ) - for _, row in sampled_df.iterrows() - ] - - class DeepSearchQAEvaluator: """Evaluator for running and scoring DeepSearchQA benchmark. diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/__init__.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/__init__.py new file mode 100644 index 00000000..6b8be356 --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/__init__.py @@ -0,0 +1 @@ +"""Tests for knowledge QA data module.""" diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/test_deepsearchqa.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/test_deepsearchqa.py new file mode 100644 index 00000000..d55e7d94 --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/test_deepsearchqa.py @@ -0,0 +1,216 @@ +"""Tests for DeepSearchQA dataset loading and management.""" + +from unittest.mock import patch + +import pandas as pd +import pytest +from aieng.agent_evals.knowledge_qa.data import DeepSearchQADataset, DSQAExample + + +class TestDSQAExample: + """Tests for the DSQAExample model.""" + + def test_example_creation(self): + """Test creating an example.""" + example = DSQAExample( + example_id=0, + problem="What is the capital of France?", + problem_category="Geography", + answer="Paris", + answer_type="Single Answer", + ) + assert example.example_id == 0 + assert example.problem == "What is the capital of France?" + assert example.problem_category == "Geography" + assert example.answer == "Paris" + assert example.answer_type == "Single Answer" + + +class TestDeepSearchQADataset: + """Tests for the DeepSearchQADataset class.""" + + @pytest.fixture + def mock_csv_data(self): + """Create mock CSV data.""" + return { + "example_id": [0, 1, 2], + "problem": ["Q1", "Q2", "Q3"], + "problem_category": ["Cat A", "Cat B", "Cat A"], + "answer": ["A1", "A2", "A3"], + "answer_type": ["Single Answer", "List", "Single Answer"], + } + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_dataset_loading(self, mock_read_csv, mock_download, mock_csv_data): + """Test loading the dataset.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + examples = dataset.examples + + assert len(examples) == 3 + assert examples[0].problem == "Q1" + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_dataset_length(self, mock_read_csv, mock_download, mock_csv_data): + """Test getting dataset length.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + assert len(dataset) == 3 + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_dataset_indexing(self, mock_read_csv, mock_download, mock_csv_data): + """Test indexing into the dataset.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + example = dataset[1] + + assert example.example_id == 1 + assert example.problem == "Q2" + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_get_by_category(self, mock_read_csv, mock_download, mock_csv_data): + """Test filtering by category.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + cat_a_examples = dataset.get_by_category("Cat A") + + assert len(cat_a_examples) == 2 + assert all(ex.problem_category == "Cat A" for ex in cat_a_examples) + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_get_by_id(self, mock_read_csv, mock_download, mock_csv_data): + """Test getting a single example by ID.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + example = dataset.get_by_id(1) + + assert example is not None + assert example.example_id == 1 + assert example.problem == "Q2" + + # Test non-existent ID + assert dataset.get_by_id(999) is None + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_get_by_ids(self, mock_read_csv, mock_download, mock_csv_data): + """Test getting multiple examples by IDs.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + examples = dataset.get_by_ids([0, 2]) + + assert len(examples) == 2 + assert examples[0].example_id == 0 + assert examples[1].example_id == 2 + + # Test with missing IDs (should skip them) + examples = dataset.get_by_ids([0, 999, 1]) + assert len(examples) == 2 + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_get_categories(self, mock_read_csv, mock_download, mock_csv_data): + """Test getting unique categories.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + categories = dataset.get_categories() + + assert "Cat A" in categories + assert "Cat B" in categories + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_sample(self, mock_read_csv, mock_download, mock_csv_data): + """Test random sampling.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + sample = dataset.sample(n=2, random_state=42) + + assert len(sample) == 2 + assert all(isinstance(ex, DSQAExample) for ex in sample) + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_dataframe_property(self, mock_read_csv, mock_download, mock_csv_data): + """Test accessing the raw dataframe.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + df = dataset.dataframe + + assert isinstance(df, pd.DataFrame) + assert len(df) == 3 + assert "problem" in df.columns + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_filter_missing_answers(self, mock_read_csv, mock_download): + """Test that rows with missing answers are filtered out.""" + mock_download.return_value = "/fake/path" + data_with_na = pd.DataFrame( + { + "example_id": [0, 1, 2], + "problem": ["Q1", "Q2", "Q3"], + "problem_category": ["Cat A", "Cat B", "Cat A"], + "answer": ["A1", None, "A3"], # One missing answer + "answer_type": ["Single Answer", "List", "Single Answer"], + } + ) + mock_read_csv.return_value = data_with_na + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + examples = dataset.examples + + # Should only have 2 examples after filtering + assert len(examples) == 2 + assert examples[0].example_id == 0 + assert examples[1].example_id == 2 + + +@pytest.mark.integration_test +class TestDeepSearchQADatasetIntegration: + """Integration tests for DeepSearchQADataset. + + These tests download the actual dataset from Kaggle. + """ + + def test_load_real_dataset(self): + """Test loading the real dataset.""" + dataset = DeepSearchQADataset() + + # Dataset may have fewer than 900 examples after filtering NaN answers + assert len(dataset) > 800 # Should have most examples + assert dataset[0].example_id == 0 + assert len(dataset.get_categories()) > 0 diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py index 782be286..57c06cb9 100644 --- a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py +++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py @@ -1,33 +1,6 @@ """Tests for DeepSearchQA evaluation utilities.""" -from unittest.mock import patch - -import pandas as pd -import pytest -from aieng.agent_evals.knowledge_qa.evaluation import ( - DeepSearchQADataset, - DSQAExample, - EvaluationResult, -) - - -class TestDSQAExample: - """Tests for the DSQAExample model.""" - - def test_example_creation(self): - """Test creating an example.""" - example = DSQAExample( - example_id=0, - problem="What is the capital of France?", - problem_category="Geography", - answer="Paris", - answer_type="Single Answer", - ) - assert example.example_id == 0 - assert example.problem == "What is the capital of France?" - assert example.problem_category == "Geography" - assert example.answer == "Paris" - assert example.answer_type == "Single Answer" +from aieng.agent_evals.knowledge_qa.evaluation import EvaluationResult class TestEvaluationResult: @@ -63,115 +36,24 @@ def test_result_defaults(self): assert result.is_correct is None assert result.evaluation_notes == "" + def test_result_with_correctness(self): + """Test evaluation result with correctness flag.""" + result = EvaluationResult( + example_id=2, + problem="What is 2+2?", + ground_truth="4", + prediction="4", + is_correct=True, + ) + assert result.is_correct is True -class TestDeepSearchQADataset: - """Tests for the DeepSearchQADataset class.""" - - @pytest.fixture - def mock_csv_data(self): - """Create mock CSV data.""" - return { - "example_id": [0, 1, 2], - "problem": ["Q1", "Q2", "Q3"], - "problem_category": ["Cat A", "Cat B", "Cat A"], - "answer": ["A1", "A2", "A3"], - "answer_type": ["Single Answer", "List", "Single Answer"], - } - - @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") - def test_dataset_loading(self, mock_read_csv, mock_download, mock_csv_data): - """Test loading the dataset.""" - mock_download.return_value = "/fake/path" - mock_read_csv.return_value = pd.DataFrame(mock_csv_data) - - with patch("pathlib.Path.exists", return_value=True): - dataset = DeepSearchQADataset() - examples = dataset.examples - - assert len(examples) == 3 - assert examples[0].problem == "Q1" - - @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") - def test_dataset_length(self, mock_read_csv, mock_download, mock_csv_data): - """Test getting dataset length.""" - mock_download.return_value = "/fake/path" - mock_read_csv.return_value = pd.DataFrame(mock_csv_data) - - with patch("pathlib.Path.exists", return_value=True): - dataset = DeepSearchQADataset() - assert len(dataset) == 3 - - @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") - def test_dataset_indexing(self, mock_read_csv, mock_download, mock_csv_data): - """Test indexing into the dataset.""" - mock_download.return_value = "/fake/path" - mock_read_csv.return_value = pd.DataFrame(mock_csv_data) - - with patch("pathlib.Path.exists", return_value=True): - dataset = DeepSearchQADataset() - example = dataset[1] - - assert example.example_id == 1 - assert example.problem == "Q2" - - @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") - def test_get_by_category(self, mock_read_csv, mock_download, mock_csv_data): - """Test filtering by category.""" - mock_download.return_value = "/fake/path" - mock_read_csv.return_value = pd.DataFrame(mock_csv_data) - - with patch("pathlib.Path.exists", return_value=True): - dataset = DeepSearchQADataset() - cat_a_examples = dataset.get_by_category("Cat A") - - assert len(cat_a_examples) == 2 - assert all(ex.problem_category == "Cat A" for ex in cat_a_examples) - - @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") - def test_get_categories(self, mock_read_csv, mock_download, mock_csv_data): - """Test getting unique categories.""" - mock_download.return_value = "/fake/path" - mock_read_csv.return_value = pd.DataFrame(mock_csv_data) - - with patch("pathlib.Path.exists", return_value=True): - dataset = DeepSearchQADataset() - categories = dataset.get_categories() - - assert "Cat A" in categories - assert "Cat B" in categories - - @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") - def test_sample(self, mock_read_csv, mock_download, mock_csv_data): - """Test random sampling.""" - mock_download.return_value = "/fake/path" - mock_read_csv.return_value = pd.DataFrame(mock_csv_data) - - with patch("pathlib.Path.exists", return_value=True): - dataset = DeepSearchQADataset() - sample = dataset.sample(n=2, random_state=42) - - assert len(sample) == 2 - assert all(isinstance(ex, DSQAExample) for ex in sample) - - -@pytest.mark.integration_test -class TestDeepSearchQADatasetIntegration: - """Integration tests for DeepSearchQADataset. - - These tests download the actual dataset from Kaggle. - """ - - def test_load_real_dataset(self): - """Test loading the real dataset.""" - dataset = DeepSearchQADataset() - - # Dataset may have fewer than 900 examples after filtering NaN answers - assert len(dataset) > 800 # Should have most examples - assert dataset[0].example_id == 0 - assert len(dataset.get_categories()) > 0 + def test_result_with_notes(self): + """Test evaluation result with evaluation notes.""" + result = EvaluationResult( + example_id=3, + problem="Complex question", + ground_truth="Complex answer", + prediction="Model's answer", + evaluation_notes="Partial match detected", + ) + assert result.evaluation_notes == "Partial match detected" diff --git a/implementations/knowledge_qa/data/langfuse_upload.py b/implementations/knowledge_qa/data/langfuse_upload.py new file mode 100644 index 00000000..c7f4a8f8 --- /dev/null +++ b/implementations/knowledge_qa/data/langfuse_upload.py @@ -0,0 +1,140 @@ +"""Upload DeepSearchQA dataset subset to Langfuse. + +This script uploads a subset of the DeepSearchQA benchmark to Langfuse +for use with the Langfuse experiment evaluation framework. + +Usage: + python langfuse_upload.py --samples 10 --category "Finance & Economics" + python langfuse_upload.py --ids 123 456 789 +""" + +import asyncio +import json +import logging +import tempfile +from pathlib import Path + +import click +from aieng.agent_evals.knowledge_qa.data import DeepSearchQADataset +from aieng.agent_evals.langfuse import upload_dataset_to_langfuse as upload_file_to_langfuse +from dotenv import load_dotenv + + +load_dotenv(verbose=True) +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s") +logger = logging.getLogger(__name__) + + +DEFAULT_DATASET_NAME = "DeepSearchQA-Subset" + + +async def upload_deepsearch_qa_to_langfuse( + dataset_name: str, + samples: int = 10, + category: str | None = None, + ids: list[int] | None = None, +) -> None: + """Upload DeepSearchQA examples to Langfuse. + + This function converts DeepSearchQA examples to a temporary JSONL file + and uses the shared upload utility for consistent formatting and progress tracking. + + Parameters + ---------- + dataset_name : str + Name for the dataset in Langfuse. + samples : int + Number of samples to upload (ignored if ids provided). + category : str, optional + Filter by category (ignored if ids provided). + ids : list[int], optional + Specific example IDs to upload. + """ + # Load DeepSearchQA dataset + logger.info("Loading DeepSearchQA dataset...") + dataset = DeepSearchQADataset() + logger.info(f"Loaded {len(dataset)} total examples") + + # Select examples based on criteria + if ids: + examples = dataset.get_by_ids(ids) + logger.info(f"Selected {len(examples)} examples by ID") + elif category: + examples = dataset.get_by_category(category)[:samples] + logger.info(f"Selected {len(examples)} examples from category '{category}'") + else: + examples = dataset.examples[:samples] + logger.info(f"Selected first {len(examples)} examples") + + if not examples: + logger.error("No examples found matching criteria") + return + + # Convert examples to JSONL format for the shared upload utility + # Use a temporary file that's automatically cleaned up + with tempfile.NamedTemporaryFile( + mode="w", + encoding="utf-8", + suffix=".jsonl", + prefix=f"deepsearchqa_{dataset_name}_", + delete=False, + ) as temp_file: + temp_path = Path(temp_file.name) + logger.info(f"Writing {len(examples)} examples to temporary file...") + + for example in examples: + record = { + "input": example.problem, + "expected_output": example.answer, + "metadata": { + "example_id": example.example_id, + "category": example.problem_category, + "answer_type": example.answer_type, + }, + } + temp_file.write(json.dumps(record, ensure_ascii=False) + "\n") + + try: + # Use the shared upload utility with progress tracking and deduplication + await upload_file_to_langfuse( + dataset_path=str(temp_path), + dataset_name=dataset_name, + ) + finally: + # Clean up temporary file + if temp_path.exists(): + temp_path.unlink() + logger.debug(f"Removed temporary file: {temp_path}") + + +@click.command() +@click.option( + "--dataset-name", + default=DEFAULT_DATASET_NAME, + help="Name for the dataset in Langfuse.", +) +@click.option( + "--samples", + default=10, + type=int, + help="Number of samples to upload (default: 10).", +) +@click.option( + "--category", + default=None, + help="Filter by category (e.g., 'Finance & Economics').", +) +@click.option( + "--ids", + multiple=True, + type=int, + help="Specific example IDs to upload (can be used multiple times).", +) +def cli(dataset_name: str, samples: int, category: str | None, ids: tuple[int, ...]) -> None: + """Upload DeepSearchQA examples to Langfuse.""" + ids_list = list(ids) if ids else None + asyncio.run(upload_deepsearch_qa_to_langfuse(dataset_name, samples, category, ids_list)) + + +if __name__ == "__main__": + cli()