diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py
index 1fb5be56..78145317 100644
--- a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py
+++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py
@@ -24,12 +24,8 @@
 )
 
 from .agent import KnowledgeAgentManager, KnowledgeGroundedAgent
-from .evaluation import (
-    DeepSearchQADataset,
-    DeepSearchQAEvaluator,
-    DSQAExample,
-    EvaluationResult,
-)
+from .data import DeepSearchQADataset, DSQAExample
+from .evaluation import DeepSearchQAEvaluator, EvaluationResult
 
 
 __all__ = [
diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/__init__.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/__init__.py
new file mode 100644
index 00000000..2a1e4af7
--- /dev/null
+++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/__init__.py
@@ -0,0 +1,13 @@
+"""Data loading and management for knowledge QA evaluation.
+
+This module provides tools for loading and managing benchmark datasets
+like DeepSearchQA.
+"""
+
+from .deepsearchqa import DeepSearchQADataset, DSQAExample
+
+
+__all__ = [
+    "DSQAExample",
+    "DeepSearchQADataset",
+]
diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/deepsearchqa.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/deepsearchqa.py
new file mode 100644
index 00000000..f84d2ffa
--- /dev/null
+++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/deepsearchqa.py
@@ -0,0 +1,239 @@
+"""DeepSearchQA dataset loader.
+
+This module provides classes for loading and accessing the DeepSearchQA
+benchmark dataset from Kaggle.
+"""
+
+import logging
+from pathlib import Path
+
+import kagglehub
+import pandas as pd
+from pydantic import BaseModel, Field
+
+
+logger = logging.getLogger(__name__)
+
+
+class DSQAExample(BaseModel):
+    """A single example from the DeepSearchQA dataset."""
+
+    example_id: int = Field(description="Unique identifier for the example.")
+    problem: str = Field(description="The research question/problem to solve.")
+    problem_category: str = Field(description="Category of the problem (e.g., 'Politics & Government').")
+    answer: str = Field(description="The ground truth answer.")  # _load_data passes str(row["answer"])
+    answer_type: str = Field(description="Type of answer (e.g., 'Single Answer', 'List').")
+
+
+class DeepSearchQADataset:
+    """Loader and manager for the DeepSearchQA dataset.
+
+    This class handles downloading, loading, and accessing examples from
+    the DeepSearchQA benchmark dataset.
+
+    Parameters
+    ----------
+    cache_dir : str or Path, optional
+        Directory to cache the dataset. If not provided, uses kagglehub default.
+
+    Examples
+    --------
+    >>> dataset = DeepSearchQADataset()
+    >>> print(f"Total examples: {len(dataset)}")
+    >>> example = dataset[0]
+    >>> print(example.problem)
+    """
+
+    def __init__(self, cache_dir: str | Path | None = None) -> None:
+        """Create the loader; nothing is downloaded or read until first access.
+
+        Parameters
+        ----------
+        cache_dir : str or Path, optional
+            Stored as ``_cache_dir``. NOTE(review): no method of this class reads it yet.
+        """
+        self._cache_dir = Path(cache_dir) if cache_dir else None  # falsy input ("" or None) -> None
+        self._df: pd.DataFrame | None = None  # set by _load_data
+        self._examples: list[DSQAExample] | None = None  # set by _load_data
+
+    def _download_dataset(self) -> Path:
+        """Fetch the DeepSearchQA files through kagglehub.
+
+        Returns
+        -------
+        Path
+            Directory that kagglehub reports for the downloaded dataset.
+        """
+        logger.info("Downloading DeepSearchQA dataset...")
+        # Wrap kagglehub's return value so callers can use the ``/`` operator.
+        return Path(kagglehub.dataset_download("deepmind/deepsearchqa"))
+
+    def _load_data(self) -> None:
+        """Download, filter and parse the dataset on first use.
+
+        Rows without an answer are dropped. State is published only after
+        every row has parsed, so a failure never leaves a half-loaded object.
+        Raises FileNotFoundError if ``DSQA-full.csv`` is missing.
+        """
+        if self._df is not None:
+            return
+
+        csv_path = self._download_dataset() / "DSQA-full.csv"
+        if not csv_path.exists():
+            raise FileNotFoundError(f"Dataset file not found: {csv_path}")
+
+        raw = pd.read_csv(csv_path)
+        df = raw.dropna(subset=["answer"])  # rows without ground truth
+        if len(df) < len(raw):
+            logger.info(f"Dropped {len(raw) - len(df)} examples with missing answers")
+        logger.info(f"Loaded {len(df)} examples from DeepSearchQA")
+
+        examples = [
+            DSQAExample(
+                example_id=row["example_id"],
+                problem=row["problem"],
+                problem_category=row["problem_category"],
+                answer=str(row["answer"]),  # Ensure string type
+                answer_type=row["answer_type"],
+            )
+            for _, row in df.iterrows()
+        ]
+
+        # Publish both caches together, and only on success.
+        self._df, self._examples = df, examples
+
+    @property
+    def dataframe(self) -> pd.DataFrame:
+        """Return the loaded DataFrame, loading the dataset on first access.
+
+        Returns
+        -------
+        pd.DataFrame
+            The dataset rows that have an answer (rows without one are dropped).
+        """
+        self._load_data()
+        assert self._df is not None  # set by _load_data; narrows the Optional type
+        return self._df
+
+    @property
+    def examples(self) -> list[DSQAExample]:
+        """Return the parsed examples, loading the dataset on first access.
+
+        Returns
+        -------
+        list[DSQAExample]
+            One DSQAExample per retained row, in DataFrame row order.
+        """
+        self._load_data()
+        assert self._examples is not None  # set by _load_data; narrows the Optional type
+        return self._examples
+
+    def __len__(self) -> int:
+        """Return how many examples remain after missing-answer rows are dropped."""
+        # Going through the property triggers the lazy load and the None check.
+        loaded = self.examples
+        return len(loaded)
+
+    def __getitem__(self, index: int) -> DSQAExample:
+        """Look up an example by its position in the loaded list.
+
+        Parameters
+        ----------
+        index : int
+            Position in the list of loaded examples; negative values count
+            from the end, as for any Python list.
+
+        Returns
+        -------
+        DSQAExample
+            The example stored at that position.
+        """
+        loaded = self.examples  # lazy load + None check happen in the property
+        return loaded[index]
+
+    def get_by_category(self, category: str) -> list[DSQAExample]:
+        """Return every example whose category equals ``category`` exactly.
+
+        Parameters
+        ----------
+        category : str
+            Category name to match against ``problem_category``.
+
+        Returns
+        -------
+        list[DSQAExample]
+            Matching examples in dataset order; empty if none match.
+        """
+        return list(filter(lambda ex: ex.problem_category == category, self.examples))
+
+    def get_by_id(self, example_id: int) -> DSQAExample | None:
+        """Find the example carrying a given ID.
+
+        Parameters
+        ----------
+        example_id : int
+            Value to compare against each example's ``example_id``.
+
+        Returns
+        -------
+        DSQAExample or None
+            The first example, in dataset order, whose ID matches; None when
+            no example has that ID.
+        """
+        # Lazy generator, so the scan stops at the first match.
+        matches = (ex for ex in self.examples if ex.example_id == example_id)
+        return next(matches, None)
+
+    def get_by_ids(self, example_ids: list[int]) -> list[DSQAExample]:
+        """Fetch several examples at once, keyed by ID.
+
+        Parameters
+        ----------
+        example_ids : list[int]
+            IDs to look up; the output follows this order.
+
+        Returns
+        -------
+        list[DSQAExample]
+            One example per requested ID that exists in the dataset.
+            IDs with no match are left out without raising.
+        """
+        by_id = {ex.example_id: ex for ex in self.examples}
+        return [hit for eid in example_ids if (hit := by_id.get(eid)) is not None]
+
+    def get_categories(self) -> list[str]:
+        """List the distinct values of the ``problem_category`` column.
+
+        Returns
+        -------
+        list[str]
+            Each category once, as produced by ``Series.unique()``.
+        """
+        return [*self.dataframe["problem_category"].unique()]
+
+    def sample(self, n: int = 10, random_state: int | None = None) -> list[DSQAExample]:
+        """Get a random sample of examples.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of examples to sample, by default 10. Capped at the dataset size.
+        random_state : int, optional
+            Random seed for reproducibility.
+
+        Returns
+        -------
+        list[DSQAExample]
+            Randomly sampled examples.
+        """
+        sampled_df = self.dataframe.sample(n=min(n, len(self)), random_state=random_state)
+        return [
+            DSQAExample(
+                example_id=row["example_id"],
+                problem=row["problem"],
+                problem_category=row["problem_category"],
+                answer=str(row["answer"]),  # same coercion as _load_data; the field is a str
+                answer_type=row["answer_type"],
+            )
+            for _, row in sampled_df.iterrows()
+        ]
diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py
index 3459b6a2..2aaf0543 100644
--- a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py
+++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py
@@ -1,18 +1,18 @@
 """Evaluation utilities for DeepSearchQA benchmark.
 
-This module provides tools for loading, running, and evaluating the
+This module provides tools for running and evaluating agents on the
 DeepSearchQA benchmark dataset.
 """
 
 import asyncio
 import logging
-from pathlib import Path
 from typing import TYPE_CHECKING
 
-import kagglehub
 import pandas as pd
 from pydantic import BaseModel, Field
 
+from .data import DeepSearchQADataset, DSQAExample
+
 
 if TYPE_CHECKING:
     from .agent import KnowledgeGroundedAgent
@@ -21,16 +21,6 @@
 logger = logging.getLogger(__name__)
 
 
-class DSQAExample(BaseModel):
-    """A single example from the DeepSearchQA dataset."""
-
-    example_id: int = Field(description="Unique identifier for the example.")
-    problem: str = Field(description="The research question/problem to solve.")
-    problem_category: str = Field(description="Category of the problem (e.g., 'Politics & Government').")
-    answer: str = Field(description="The ground truth answer.")
-    answer_type: str = Field(description="Type of answer (e.g., 'Single Answer', 'List').")
-
-
 class EvaluationResult(BaseModel):
     """Result of evaluating a single example."""
 
@@ -44,185 +34,6 @@ class EvaluationResult(BaseModel):
     evaluation_notes: str = Field(default="", description="Additional notes about the evaluation.")
 
 
-class DeepSearchQADataset:
-    """Loader and manager for the DeepSearchQA dataset.
-
-    This class handles downloading, loading, and accessing examples from
-    the DeepSearchQA benchmark dataset.
-
-    Parameters
-    ----------
-    cache_dir : str or Path, optional
-        Directory to cache the dataset. If not provided, uses kagglehub default.
-
-    Examples
-    --------
-    >>> dataset = DeepSearchQADataset()
-    >>> print(f"Total examples: {len(dataset)}")
-    >>> example = dataset[0]
-    >>> print(example.problem)
-    """
-
-    def __init__(self, cache_dir: str | Path | None = None) -> None:
-        """Initialize the dataset loader.
-
-        Parameters
-        ----------
-        cache_dir : str or Path, optional
-            Directory to cache the dataset.
-        """
-        self._cache_dir = Path(cache_dir) if cache_dir else None
-        self._df: pd.DataFrame | None = None
-        self._examples: list[DSQAExample] | None = None
-
-    def _download_dataset(self) -> Path:
-        """Download the dataset using kagglehub.
-
-        Returns
-        -------
-        Path
-            Path to the downloaded dataset directory.
-        """
-        logger.info("Downloading DeepSearchQA dataset...")
-        path = kagglehub.dataset_download("deepmind/deepsearchqa")
-        return Path(path)
-
-    def _load_data(self) -> None:
-        """Load the dataset into memory."""
-        if self._df is not None:
-            return
-
-        dataset_path = self._download_dataset()
-        csv_path = dataset_path / "DSQA-full.csv"
-
-        if not csv_path.exists():
-            raise FileNotFoundError(f"Dataset file not found: {csv_path}")
-
-        self._df = pd.read_csv(csv_path)
-
-        # Filter out rows with missing answers
-        original_count = len(self._df)
-        self._df = self._df.dropna(subset=["answer"])
-        dropped_count = original_count - len(self._df)
-        if dropped_count > 0:
-            logger.info(f"Dropped {dropped_count} examples with missing answers")
-
-        logger.info(f"Loaded {len(self._df)} examples from DeepSearchQA")
-
-        # Convert to examples
-        self._examples = [
-            DSQAExample(
-                example_id=row["example_id"],
-                problem=row["problem"],
-                problem_category=row["problem_category"],
-                answer=str(row["answer"]),  # Ensure string type
-                answer_type=row["answer_type"],
-            )
-            for _, row in self._df.iterrows()
-        ]
-
-    @property
-    def dataframe(self) -> pd.DataFrame:
-        """Get the raw pandas DataFrame.
-
-        Returns
-        -------
-        pd.DataFrame
-            The full dataset as a DataFrame.
-        """
-        self._load_data()
-        assert self._df is not None
-        return self._df
-
-    @property
-    def examples(self) -> list[DSQAExample]:
-        """Get all examples as DSQAExample objects.
-
-        Returns
-        -------
-        list[DSQAExample]
-            All examples in the dataset.
-        """
-        self._load_data()
-        assert self._examples is not None
-        return self._examples
-
-    def __len__(self) -> int:
-        """Return the number of examples in the dataset."""
-        self._load_data()
-        assert self._examples is not None
-        return len(self._examples)
-
-    def __getitem__(self, index: int) -> DSQAExample:
-        """Get an example by index.
-
-        Parameters
-        ----------
-        index : int
-            The index of the example to retrieve.
-
-        Returns
-        -------
-        DSQAExample
-            The example at the given index.
-        """
-        self._load_data()
-        assert self._examples is not None
-        return self._examples[index]
-
-    def get_by_category(self, category: str) -> list[DSQAExample]:
-        """Get all examples in a specific category.
-
-        Parameters
-        ----------
-        category : str
-            The problem category to filter by.
-
-        Returns
-        -------
-        list[DSQAExample]
-            Examples matching the category.
-        """
-        return [ex for ex in self.examples if ex.problem_category == category]
-
-    def get_categories(self) -> list[str]:
-        """Get all unique problem categories.
-
-        Returns
-        -------
-        list[str]
-            List of unique category names.
-        """
-        return list(self.dataframe["problem_category"].unique())
-
-    def sample(self, n: int = 10, random_state: int | None = None) -> list[DSQAExample]:
-        """Get a random sample of examples.
-
-        Parameters
-        ----------
-        n : int, optional
-            Number of examples to sample, by default 10.
-        random_state : int, optional
-            Random seed for reproducibility.
-
-        Returns
-        -------
-        list[DSQAExample]
-            Randomly sampled examples.
-        """
-        sampled_df = self.dataframe.sample(n=min(n, len(self)), random_state=random_state)
-        return [
-            DSQAExample(
-                example_id=row["example_id"],
-                problem=row["problem"],
-                problem_category=row["problem_category"],
-                answer=row["answer"],
-                answer_type=row["answer_type"],
-            )
-            for _, row in sampled_df.iterrows()
-        ]
-
-
 class DeepSearchQAEvaluator:
     """Evaluator for running and scoring DeepSearchQA benchmark.
 
diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/__init__.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/__init__.py
new file mode 100644
index 00000000..6b8be356
--- /dev/null
+++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/__init__.py
@@ -0,0 +1 @@
+"""Tests for knowledge QA data module."""
diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/test_deepsearchqa.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/test_deepsearchqa.py
new file mode 100644
index 00000000..d55e7d94
--- /dev/null
+++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/test_deepsearchqa.py
@@ -0,0 +1,216 @@
+"""Tests for DeepSearchQA dataset loading and management."""
+
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+from aieng.agent_evals.knowledge_qa.data import DeepSearchQADataset, DSQAExample
+
+
+class TestDSQAExample:
+    """Tests for the DSQAExample model."""
+
+    def test_example_creation(self):
+        """Every constructor argument is readable back from its attribute."""
+        fields = {
+            "example_id": 0,
+            "problem": "What is the capital of France?",
+            "problem_category": "Geography",
+            "answer": "Paris",
+            "answer_type": "Single Answer",
+        }
+
+        example = DSQAExample(**fields)
+        # Compare field by field so a failure names the offending attribute.
+        for name, expected in fields.items():
+            assert getattr(example, name) == expected, name
+
+
+class TestDeepSearchQADataset:
+    """Unit tests for DeepSearchQADataset with the download, CSV read and file check patched out."""
+
+    @pytest.fixture
+    def mock_csv_data(self):
+        """Three complete rows: two in 'Cat A', one in 'Cat B'."""
+        return {
+            "example_id": [0, 1, 2],
+            "problem": ["Q1", "Q2", "Q3"],
+            "problem_category": ["Cat A", "Cat B", "Cat A"],
+            "answer": ["A1", "A2", "A3"],
+            "answer_type": ["Single Answer", "List", "Single Answer"],
+        }
+
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download")
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv")
+    def test_dataset_loading(self, mock_read_csv, mock_download, mock_csv_data):
+        """Accessing ``examples`` loads one example per fixture row."""
+        mock_download.return_value = "/fake/path"
+        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
+
+        with patch("pathlib.Path.exists", return_value=True):
+            dataset = DeepSearchQADataset()
+            examples = dataset.examples
+
+        assert len(examples) == 3
+        assert examples[0].problem == "Q1"
+
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download")
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv")
+    def test_dataset_length(self, mock_read_csv, mock_download, mock_csv_data):
+        """``len(dataset)`` triggers loading and reports the row count."""
+        mock_download.return_value = "/fake/path"
+        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
+
+        with patch("pathlib.Path.exists", return_value=True):
+            dataset = DeepSearchQADataset()
+            assert len(dataset) == 3
+
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download")
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv")
+    def test_dataset_indexing(self, mock_read_csv, mock_download, mock_csv_data):
+        """``dataset[i]`` returns the example at position ``i``."""
+        mock_download.return_value = "/fake/path"
+        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
+
+        with patch("pathlib.Path.exists", return_value=True):
+            dataset = DeepSearchQADataset()
+            example = dataset[1]
+
+        assert example.example_id == 1
+        assert example.problem == "Q2"
+
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download")
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv")
+    def test_get_by_category(self, mock_read_csv, mock_download, mock_csv_data):
+        """``get_by_category`` returns only the examples of that category."""
+        mock_download.return_value = "/fake/path"
+        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
+
+        with patch("pathlib.Path.exists", return_value=True):
+            dataset = DeepSearchQADataset()
+            cat_a_examples = dataset.get_by_category("Cat A")
+
+        assert len(cat_a_examples) == 2
+        assert all(ex.problem_category == "Cat A" for ex in cat_a_examples)
+
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download")
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv")
+    def test_get_by_id(self, mock_read_csv, mock_download, mock_csv_data):
+        """``get_by_id`` finds an existing ID and returns None for an unknown one."""
+        mock_download.return_value = "/fake/path"
+        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
+
+        with patch("pathlib.Path.exists", return_value=True):
+            dataset = DeepSearchQADataset()
+            example = dataset.get_by_id(1)
+
+        assert example is not None
+        assert example.example_id == 1
+        assert example.problem == "Q2"
+
+        # Test non-existent ID
+        assert dataset.get_by_id(999) is None
+
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download")
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv")
+    def test_get_by_ids(self, mock_read_csv, mock_download, mock_csv_data):
+        """``get_by_ids`` keeps the requested order and skips unknown IDs."""
+        mock_download.return_value = "/fake/path"
+        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
+
+        with patch("pathlib.Path.exists", return_value=True):
+            dataset = DeepSearchQADataset()
+            examples = dataset.get_by_ids([0, 2])
+
+        assert len(examples) == 2
+        assert examples[0].example_id == 0
+        assert examples[1].example_id == 2
+
+        # Missing IDs are skipped; the survivors keep the requested order
+        examples = dataset.get_by_ids([0, 999, 1])
+        assert [ex.example_id for ex in examples] == [0, 1]
+
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download")
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv")
+    def test_get_categories(self, mock_read_csv, mock_download, mock_csv_data):
+        """``get_categories`` lists each category exactly once."""
+        mock_download.return_value = "/fake/path"
+        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
+
+        with patch("pathlib.Path.exists", return_value=True):
+            dataset = DeepSearchQADataset()
+            categories = dataset.get_categories()
+
+        # Exactly the two categories present in the fixture, with no duplicates
+        assert sorted(categories) == ["Cat A", "Cat B"]
+
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download")
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv")
+    def test_sample(self, mock_read_csv, mock_download, mock_csv_data):
+        """``sample`` returns the requested number of DSQAExample objects."""
+        mock_download.return_value = "/fake/path"
+        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
+
+        with patch("pathlib.Path.exists", return_value=True):
+            dataset = DeepSearchQADataset()
+            sample = dataset.sample(n=2, random_state=42)
+
+        assert len(sample) == 2
+        assert all(isinstance(ex, DSQAExample) for ex in sample)
+
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download")
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv")
+    def test_dataframe_property(self, mock_read_csv, mock_download, mock_csv_data):
+        """``dataframe`` exposes the loaded rows as a pandas DataFrame."""
+        mock_download.return_value = "/fake/path"
+        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
+
+        with patch("pathlib.Path.exists", return_value=True):
+            dataset = DeepSearchQADataset()
+            df = dataset.dataframe
+
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == 3
+        assert "problem" in df.columns
+
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download")
+    @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv")
+    def test_filter_missing_answers(self, mock_read_csv, mock_download):
+        """Test that rows with missing answers are filtered out."""
+        mock_download.return_value = "/fake/path"
+        data_with_na = pd.DataFrame(
+            {
+                "example_id": [0, 1, 2],
+                "problem": ["Q1", "Q2", "Q3"],
+                "problem_category": ["Cat A", "Cat B", "Cat A"],
+                "answer": ["A1", None, "A3"],  # One missing answer
+                "answer_type": ["Single Answer", "List", "Single Answer"],
+            }
+        )
+        mock_read_csv.return_value = data_with_na
+
+        with patch("pathlib.Path.exists", return_value=True):
+            dataset = DeepSearchQADataset()
+            examples = dataset.examples
+
+        # Should only have 2 examples after filtering
+        assert len(examples) == 2
+        assert examples[0].example_id == 0
+        assert examples[1].example_id == 2
+
+
+@pytest.mark.integration_test
+class TestDeepSearchQADatasetIntegration:
+    """Integration tests for DeepSearchQADataset.
+
+    These tests download the actual dataset from Kaggle.
+    """
+
+    def test_load_real_dataset(self):
+        """Load the real dataset and sanity-check its size, first ID and categories."""
+        dataset = DeepSearchQADataset()
+
+        # Dataset may have fewer than 900 examples after filtering NaN answers
+        assert len(dataset) > 800  # Should have most examples
+        assert dataset[0].example_id == 0  # assumes the first retained row has ID 0 - TODO confirm
+        assert len(dataset.get_categories()) > 0
diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py
index 782be286..57c06cb9 100644
--- a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py
+++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py
@@ -1,33 +1,6 @@
 """Tests for DeepSearchQA evaluation utilities."""
 
-from unittest.mock import patch
-
-import pandas as pd
-import pytest
-from aieng.agent_evals.knowledge_qa.evaluation import (
-    DeepSearchQADataset,
-    DSQAExample,
-    EvaluationResult,
-)
-
-
-class TestDSQAExample:
-    """Tests for the DSQAExample model."""
-
-    def test_example_creation(self):
-        """Test creating an example."""
-        example = DSQAExample(
-            example_id=0,
-            problem="What is the capital of France?",
-            problem_category="Geography",
-            answer="Paris",
-            answer_type="Single Answer",
-        )
-        assert example.example_id == 0
-        assert example.problem == "What is the capital of France?"
-        assert example.problem_category == "Geography"
-        assert example.answer == "Paris"
-        assert example.answer_type == "Single Answer"
+from aieng.agent_evals.knowledge_qa.evaluation import EvaluationResult
 
 
 class TestEvaluationResult:
@@ -63,115 +36,24 @@ def test_result_defaults(self):
         assert result.is_correct is None
         assert result.evaluation_notes == ""
 
+    def test_result_with_correctness(self):
+        """An explicitly passed ``is_correct=True`` is stored as given."""
+        result = EvaluationResult(
+            example_id=2,
+            problem="What is 2+2?",
+            ground_truth="4",
+            prediction="4",
+            is_correct=True,
+        )
+        assert result.is_correct is True
 
-class TestDeepSearchQADataset:
-    """Tests for the DeepSearchQADataset class."""
-
-    @pytest.fixture
-    def mock_csv_data(self):
-        """Create mock CSV data."""
-        return {
-            "example_id": [0, 1, 2],
-            "problem": ["Q1", "Q2", "Q3"],
-            "problem_category": ["Cat A", "Cat B", "Cat A"],
-            "answer": ["A1", "A2", "A3"],
-            "answer_type": ["Single Answer", "List", "Single Answer"],
-        }
-
-    @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download")
-    @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv")
-    def test_dataset_loading(self, mock_read_csv, mock_download, mock_csv_data):
-        """Test loading the dataset."""
-        mock_download.return_value = "/fake/path"
-        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
-
-        with patch("pathlib.Path.exists", return_value=True):
-            dataset = DeepSearchQADataset()
-            examples = dataset.examples
-
-        assert len(examples) == 3
-        assert examples[0].problem == "Q1"
-
-    @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download")
-    @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv")
-    def test_dataset_length(self, mock_read_csv, mock_download, mock_csv_data):
-        """Test getting dataset length."""
-        mock_download.return_value = "/fake/path"
-        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
-
-        with patch("pathlib.Path.exists", return_value=True):
-            dataset = DeepSearchQADataset()
-            assert len(dataset) == 3
-
-    @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download")
-    @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv")
-    def test_dataset_indexing(self, mock_read_csv, mock_download, mock_csv_data):
-        """Test indexing into the dataset."""
-        mock_download.return_value = "/fake/path"
-        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
-
-        with patch("pathlib.Path.exists", return_value=True):
-            dataset = DeepSearchQADataset()
-            example = dataset[1]
-
-        assert example.example_id == 1
-        assert example.problem == "Q2"
-
-    @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download")
-    @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv")
-    def test_get_by_category(self, mock_read_csv, mock_download, mock_csv_data):
-        """Test filtering by category."""
-        mock_download.return_value = "/fake/path"
-        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
-
-        with patch("pathlib.Path.exists", return_value=True):
-            dataset = DeepSearchQADataset()
-            cat_a_examples = dataset.get_by_category("Cat A")
-
-        assert len(cat_a_examples) == 2
-        assert all(ex.problem_category == "Cat A" for ex in cat_a_examples)
-
-    @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download")
-    @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv")
-    def test_get_categories(self, mock_read_csv, mock_download, mock_csv_data):
-        """Test getting unique categories."""
-        mock_download.return_value = "/fake/path"
-        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
-
-        with patch("pathlib.Path.exists", return_value=True):
-            dataset = DeepSearchQADataset()
-            categories = dataset.get_categories()
-
-        assert "Cat A" in categories
-        assert "Cat B" in categories
-
-    @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download")
-    @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv")
-    def test_sample(self, mock_read_csv, mock_download, mock_csv_data):
-        """Test random sampling."""
-        mock_download.return_value = "/fake/path"
-        mock_read_csv.return_value = pd.DataFrame(mock_csv_data)
-
-        with patch("pathlib.Path.exists", return_value=True):
-            dataset = DeepSearchQADataset()
-            sample = dataset.sample(n=2, random_state=42)
-
-        assert len(sample) == 2
-        assert all(isinstance(ex, DSQAExample) for ex in sample)
-
-
-@pytest.mark.integration_test
-class TestDeepSearchQADatasetIntegration:
-    """Integration tests for DeepSearchQADataset.
-
-    These tests download the actual dataset from Kaggle.
-    """
-
-    def test_load_real_dataset(self):
-        """Test loading the real dataset."""
-        dataset = DeepSearchQADataset()
-
-        # Dataset may have fewer than 900 examples after filtering NaN answers
-        assert len(dataset) > 800  # Should have most examples
-        assert dataset[0].example_id == 0
-        assert len(dataset.get_categories()) > 0
+    def test_result_with_notes(self):
+        """A supplied ``evaluation_notes`` string is stored unchanged."""
+        result = EvaluationResult(
+            example_id=3,
+            problem="Complex question",
+            ground_truth="Complex answer",
+            prediction="Model's answer",
+            evaluation_notes="Partial match detected",
+        )
+        assert result.evaluation_notes == "Partial match detected"
diff --git a/implementations/knowledge_qa/data/langfuse_upload.py b/implementations/knowledge_qa/data/langfuse_upload.py
new file mode 100644
index 00000000..c7f4a8f8
--- /dev/null
+++ b/implementations/knowledge_qa/data/langfuse_upload.py
@@ -0,0 +1,140 @@
+"""Upload DeepSearchQA dataset subset to Langfuse.
+
+This script uploads a subset of the DeepSearchQA benchmark to Langfuse
+for use with the Langfuse experiment evaluation framework.
+
+Usage:
+    python langfuse_upload.py --samples 10 --category "Finance & Economics"
+    python langfuse_upload.py --ids 123 --ids 456 --ids 789
+"""
+
+import asyncio
+import json
+import logging
+import tempfile
+from pathlib import Path
+
+import click
+from aieng.agent_evals.knowledge_qa.data import DeepSearchQADataset
+from aieng.agent_evals.langfuse import upload_dataset_to_langfuse as upload_file_to_langfuse
+from dotenv import load_dotenv
+
+
+load_dotenv(verbose=True)
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_DATASET_NAME = "DeepSearchQA-Subset"
+
+
+async def upload_deepsearch_qa_to_langfuse(
+    dataset_name: str,
+    samples: int = 10,
+    category: str | None = None,
+    ids: list[int] | None = None,
+) -> None:
+    """Upload DeepSearchQA examples to Langfuse.
+
+    This function converts DeepSearchQA examples to a temporary JSONL file
+    and uses the shared upload utility for consistent formatting and progress tracking.
+
+    Parameters
+    ----------
+    dataset_name : str
+        Name for the dataset in Langfuse.
+    samples : int
+        Number of samples to upload (ignored if ids provided).
+    category : str, optional
+        Filter by category (ignored if ids provided).
+    ids : list[int], optional
+        Specific example IDs to upload.
+    """
+    # Load DeepSearchQA dataset
+    logger.info("Loading DeepSearchQA dataset...")
+    dataset = DeepSearchQADataset()
+    logger.info(f"Loaded {len(dataset)} total examples")
+
+    # Select examples based on criteria
+    if ids:
+        examples = dataset.get_by_ids(ids)
+        logger.info(f"Selected {len(examples)} examples by ID")
+    elif category:
+        examples = dataset.get_by_category(category)[:samples]
+        logger.info(f"Selected {len(examples)} examples from category '{category}'")
+    else:
+        examples = dataset.examples[:samples]
+        logger.info(f"Selected first {len(examples)} examples")
+
+    if not examples:
+        logger.error("No examples found matching criteria")
+        return
+
+    # Convert examples to JSONL format for the shared upload utility
+    # The temporary file is created with delete=False and removed explicitly after the upload
+    with tempfile.NamedTemporaryFile(
+        mode="w",
+        encoding="utf-8",
+        suffix=".jsonl",
+        prefix=f"deepsearchqa_{dataset_name}_",
+        delete=False,
+    ) as temp_file:
+        temp_path = Path(temp_file.name)
+        logger.info(f"Writing {len(examples)} examples to temporary file...")
+
+        for example in examples:
+            record = {
+                "input": example.problem,
+                "expected_output": example.answer,
+                "metadata": {
+                    "example_id": example.example_id,
+                    "category": example.problem_category,
+                    "answer_type": example.answer_type,
+                },
+            }
+            temp_file.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+    try:
+        # Use the shared upload utility with progress tracking and deduplication
+        await upload_file_to_langfuse(
+            dataset_path=str(temp_path),
+            dataset_name=dataset_name,
+        )
+    finally:
+        # Clean up temporary file
+        if temp_path.exists():
+            temp_path.unlink()
+            logger.debug(f"Removed temporary file: {temp_path}")
+
+
+@click.command()
+@click.option(
+    "--dataset-name",
+    default=DEFAULT_DATASET_NAME,
+    help="Name for the dataset in Langfuse.",
+)
+@click.option(
+    "--samples",
+    default=10,
+    type=int,
+    help="Number of samples to upload (default: 10).",
+)
+@click.option(
+    "--category",
+    default=None,
+    help="Filter by category (e.g., 'Finance & Economics').",
+)
+@click.option(
+    "--ids",
+    multiple=True,
+    type=int,
+    help="Specific example IDs to upload (can be used multiple times).",
+)
+def cli(dataset_name: str, samples: int, category: str | None, ids: tuple[int, ...]) -> None:
+    """Upload DeepSearchQA examples to Langfuse."""
+    ids_list = list(ids) if ids else None
+    asyncio.run(upload_deepsearch_qa_to_langfuse(dataset_name, samples, category, ids_list))
+
+
+if __name__ == "__main__":
+    cli()