From 2df99071e6136916956f107e66634681cdea744b Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Wed, 4 Feb 2026 14:45:21 -0500 Subject: [PATCH 1/4] Rename modules to knowledge_qa --- .../__init__.py | 2 +- .../agent.py | 6 +- .../evaluation.py | 2 +- aieng-eval-agents/pyproject.toml | 2 +- .../__init__.py | 0 .../test_agent.py | 132 +++++++++--------- .../test_evaluation.py | 26 ++-- .../01_grounding_basics.ipynb | 2 +- .../02_agent_basics.ipynb | 2 +- .../03_multi_turn.ipynb | 2 +- .../README.md | 8 +- .../gradio_app.py | 4 +- 12 files changed, 94 insertions(+), 94 deletions(-) rename aieng-eval-agents/aieng/agent_evals/{knowledge_agent => knowledge_qa}/__init__.py (95%) rename aieng-eval-agents/aieng/agent_evals/{knowledge_agent => knowledge_qa}/agent.py (98%) rename aieng-eval-agents/aieng/agent_evals/{knowledge_agent => knowledge_qa}/evaluation.py (99%) rename aieng-eval-agents/tests/aieng/agent_evals/{knowledge_agent => knowledge_qa}/__init__.py (100%) rename aieng-eval-agents/tests/aieng/agent_evals/{knowledge_agent => knowledge_qa}/test_agent.py (84%) rename aieng-eval-agents/tests/aieng/agent_evals/{knowledge_agent => knowledge_qa}/test_evaluation.py (85%) rename implementations/{knowledge_agent => knowledge_qa}/01_grounding_basics.ipynb (97%) rename implementations/{knowledge_agent => knowledge_qa}/02_agent_basics.ipynb (96%) rename implementations/{knowledge_agent => knowledge_qa}/03_multi_turn.ipynb (99%) rename implementations/{knowledge_agent => knowledge_qa}/README.md (94%) rename implementations/{knowledge_agent => knowledge_qa}/gradio_app.py (97%) diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_agent/__init__.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py similarity index 95% rename from aieng-eval-agents/aieng/agent_evals/knowledge_agent/__init__.py rename to aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py index aeb1ade1..1fb5be56 100644 --- a/aieng-eval-agents/aieng/agent_evals/knowledge_agent/__init__.py +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py @@ -5,7 +5,7 @@ Example ------- ->>> from aieng.agent_evals.knowledge_agent import ( +>>> from aieng.agent_evals.knowledge_qa import ( ... KnowledgeGroundedAgent, ... DeepSearchQADataset, ... DeepSearchQAEvaluator, diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_agent/agent.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py similarity index 98% rename from aieng-eval-agents/aieng/agent_evals/knowledge_agent/agent.py rename to aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py index 9de982c8..43b0ebc5 100644 --- a/aieng-eval-agents/aieng/agent_evals/knowledge_agent/agent.py +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/agent.py @@ -241,7 +241,7 @@ class KnowledgeGroundedAgent: Examples -------- - >>> from aieng.agent_evals.knowledge_agent import KnowledgeGroundedAgent + >>> from aieng.agent_evals.knowledge_qa import KnowledgeGroundedAgent >>> agent = KnowledgeGroundedAgent() >>> response = agent.answer("Who won the 2024 Nobel Prize in Physics?") >>> print(response.text) @@ -283,7 +283,7 @@ def __init__( # Runner orchestrates the ReAct loop self._runner = Runner( - app_name="knowledge_agent", + app_name="knowledge_qa", agent=self._agent, session_service=self._session_service, ) @@ -310,7 +310,7 @@ async def _get_or_create_session_async(self, session_id: str | None = None) -> s if session_id not in self._sessions: # Create a new ADK session through the session service session = await self._session_service.create_session( - app_name="knowledge_agent", + app_name="knowledge_qa", user_id="user", state={}, ) diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_agent/evaluation.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py similarity index 99% rename from aieng-eval-agents/aieng/agent_evals/knowledge_agent/evaluation.py rename to aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py index 8046dbce..3459b6a2 100644 --- a/aieng-eval-agents/aieng/agent_evals/knowledge_agent/evaluation.py +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py @@ -238,7 +238,7 @@ class DeepSearchQAEvaluator: Examples -------- - >>> from aieng.agent_evals.knowledge_agent import ( + >>> from aieng.agent_evals.knowledge_qa import ( ... KnowledgeGroundedAgent, ... DeepSearchQAEvaluator, ... ) diff --git a/aieng-eval-agents/pyproject.toml b/aieng-eval-agents/pyproject.toml index d8a7d718..87537fb2 100644 --- a/aieng-eval-agents/pyproject.toml +++ b/aieng-eval-agents/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ ] [project.scripts] -knowledge-agent = "aieng.agent_evals.knowledge_agent.cli:main" +knowledge-qa = "aieng.agent_evals.knowledge_qa.cli:main" [dependency-groups] dev = [ diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_agent/__init__.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/__init__.py similarity index 100% rename from aieng-eval-agents/tests/aieng/agent_evals/knowledge_agent/__init__.py rename to aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/__init__.py diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_agent/test_agent.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py similarity index 84% rename from aieng-eval-agents/tests/aieng/agent_evals/knowledge_agent/test_agent.py rename to aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py index 02ee1dd2..1e5b5b70 100644 --- a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_agent/test_agent.py +++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_agent.py @@ -3,7 +3,7 @@ from unittest.mock import AsyncMock, MagicMock, patch import pytest -from aieng.agent_evals.knowledge_agent.agent import ( +from aieng.agent_evals.knowledge_qa.agent import ( SYSTEM_INSTRUCTIONS, KnowledgeAgentManager, KnowledgeGroundedAgent, @@ -22,10 +22,10 @@ def mock_config(self): config.default_worker_model = "gemini-2.5-flash" return config - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") def test_agent_initialization( self, mock_create_tool, @@ -55,10 +55,10 @@ def test_agent_initialization( mock_session_service.assert_called_once() mock_runner_class.assert_called_once() - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") def test_agent_with_custom_model( self, mock_create_tool, @@ -74,10 +74,10 @@ def test_agent_with_custom_model( assert call_kwargs["model"] == "gemini-2.5-pro" @pytest.mark.asyncio - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") async def test_get_or_create_session( self, mock_create_tool, @@ -110,10 +110,10 @@ async def test_get_or_create_session( assert session3 != session1 @pytest.mark.asyncio - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") async def test_get_or_create_session_generates_id( self, mock_create_tool, @@ -136,10 +136,10 @@ async def test_get_or_create_session_generates_id( assert session is not None @pytest.mark.asyncio - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") async def test_answer_async( self, mock_create_tool, @@ -176,10 +176,10 @@ async def mock_run_async(*args, **kwargs): assert response.text == "Paris is the capital of France." @pytest.mark.asyncio - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") async def test_answer_async_extracts_function_calls( self, mock_create_tool, @@ -234,10 +234,10 @@ async def mock_run_async(*args, **kwargs): assert "capital of France" in response.search_queries @pytest.mark.asyncio - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") async def test_answer_async_extracts_sources_from_function_responses( self, mock_create_tool, @@ -297,10 +297,10 @@ async def mock_run_async(*args, **kwargs): assert response.sources[1].uri == "https://example.com/paris" @pytest.mark.asyncio - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") async def test_answer_async_extracts_grounding_chunks_from_responses( self, mock_create_tool, @@ -360,10 +360,10 @@ async def mock_run_async(*args, **kwargs): assert response.sources[1].uri == "https://news.com/article" @pytest.mark.asyncio - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") async def test_answer_async_extracts_grounding_metadata( self, mock_create_tool, @@ -426,10 +426,10 @@ async def mock_run_async(*args, **kwargs): assert "grounded query" in response.search_queries @pytest.mark.asyncio - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") async def test_answer_async_extracts_grounding_metadata_from_content( self, mock_create_tool, @@ -495,10 +495,10 @@ async def mock_run_async(*args, **kwargs): assert "content query" in response.search_queries @pytest.mark.asyncio - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") async def test_answer_async_handles_multiple_search_tool_names( self, mock_create_tool, @@ -561,10 +561,10 @@ async def mock_run_async(*args, **kwargs): assert "query three" in response.search_queries @pytest.mark.asyncio - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") async def test_answer_async_handles_empty_events( self, mock_create_tool, @@ -618,10 +618,10 @@ async def mock_run_async(*args, **kwargs): class TestKnowledgeAgentManager: """Tests for the KnowledgeAgentManager class.""" - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") def test_lazy_initialization( self, mock_create_tool, @@ -630,7 +630,7 @@ def test_lazy_initialization( mock_runner_class, ): """Test that clients are lazily initialized.""" - with patch("aieng.agent_evals.knowledge_agent.agent.Configs") as mock_config_class: + with patch("aieng.agent_evals.knowledge_qa.agent.Configs") as mock_config_class: mock_config_class.return_value = MagicMock() manager = KnowledgeAgentManager() @@ -644,10 +644,10 @@ def test_lazy_initialization( # Now should be initialized assert manager.is_initialized() - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") def test_close( self, mock_create_tool, @@ -656,7 +656,7 @@ def test_close( mock_runner_class, ): """Test closing the client manager.""" - with patch("aieng.agent_evals.knowledge_agent.agent.Configs") as mock_config_class: + with patch("aieng.agent_evals.knowledge_qa.agent.Configs") as mock_config_class: mock_config_class.return_value = MagicMock() manager = KnowledgeAgentManager() @@ -666,10 +666,10 @@ def test_close( manager.close() assert not manager.is_initialized() - @patch("aieng.agent_evals.knowledge_agent.agent.Runner") - @patch("aieng.agent_evals.knowledge_agent.agent.InMemorySessionService") - @patch("aieng.agent_evals.knowledge_agent.agent.Agent") - @patch("aieng.agent_evals.knowledge_agent.agent.create_google_search_tool") + @patch("aieng.agent_evals.knowledge_qa.agent.Runner") + @patch("aieng.agent_evals.knowledge_qa.agent.InMemorySessionService") + @patch("aieng.agent_evals.knowledge_qa.agent.Agent") + @patch("aieng.agent_evals.knowledge_qa.agent.create_google_search_tool") def test_agent_reuse( self, mock_create_tool, @@ -678,7 +678,7 @@ def test_agent_reuse( mock_runner_class, ): """Test that agent is reused on multiple accesses.""" - with patch("aieng.agent_evals.knowledge_agent.agent.Configs") as mock_config_class: + with patch("aieng.agent_evals.knowledge_qa.agent.Configs") as mock_config_class: mock_config_class.return_value = MagicMock() manager = KnowledgeAgentManager() @@ -698,7 +698,7 @@ class TestKnowledgeGroundedAgentIntegration: def test_agent_creation_real(self): """Test creating a real agent instance.""" - from aieng.agent_evals.knowledge_agent import ( # noqa: PLC0415 + from aieng.agent_evals.knowledge_qa import ( # noqa: PLC0415 KnowledgeGroundedAgent, ) @@ -709,7 +709,7 @@ def test_agent_creation_real(self): @pytest.mark.asyncio async def test_answer_real_question(self): """Test answering a real question.""" - from aieng.agent_evals.knowledge_agent import ( # noqa: PLC0415 + from aieng.agent_evals.knowledge_qa import ( # noqa: PLC0415 KnowledgeGroundedAgent, ) diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_agent/test_evaluation.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py similarity index 85% rename from aieng-eval-agents/tests/aieng/agent_evals/knowledge_agent/test_evaluation.py rename to aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py index 49f95c2b..782be286 100644 --- a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_agent/test_evaluation.py +++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py @@ -4,7 +4,7 @@ import pandas as pd import pytest -from aieng.agent_evals.knowledge_agent.evaluation import ( +from aieng.agent_evals.knowledge_qa.evaluation import ( DeepSearchQADataset, DSQAExample, EvaluationResult, @@ -78,8 +78,8 @@ def mock_csv_data(self): "answer_type": ["Single Answer", "List", "Single Answer"], } - @patch("aieng.agent_evals.knowledge_agent.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_agent.evaluation.pd.read_csv") + @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") def test_dataset_loading(self, mock_read_csv, mock_download, mock_csv_data): """Test loading the dataset.""" mock_download.return_value = "/fake/path" @@ -92,8 +92,8 @@ def test_dataset_loading(self, mock_read_csv, mock_download, mock_csv_data): assert len(examples) == 3 assert examples[0].problem == "Q1" - @patch("aieng.agent_evals.knowledge_agent.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_agent.evaluation.pd.read_csv") + @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") def test_dataset_length(self, mock_read_csv, mock_download, mock_csv_data): """Test getting dataset length.""" mock_download.return_value = "/fake/path" @@ -103,8 +103,8 @@ def test_dataset_length(self, mock_read_csv, mock_download, mock_csv_data): dataset = DeepSearchQADataset() assert len(dataset) == 3 - @patch("aieng.agent_evals.knowledge_agent.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_agent.evaluation.pd.read_csv") + @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") def test_dataset_indexing(self, mock_read_csv, mock_download, mock_csv_data): """Test indexing into the dataset.""" mock_download.return_value = "/fake/path" @@ -117,8 +117,8 @@ def test_dataset_indexing(self, mock_read_csv, mock_download, mock_csv_data): assert example.example_id == 1 assert example.problem == "Q2" - @patch("aieng.agent_evals.knowledge_agent.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_agent.evaluation.pd.read_csv") + @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") def test_get_by_category(self, mock_read_csv, mock_download, mock_csv_data): """Test filtering by category.""" mock_download.return_value = "/fake/path" @@ -131,8 +131,8 @@ def test_get_by_category(self, mock_read_csv, mock_download, mock_csv_data): assert len(cat_a_examples) == 2 assert all(ex.problem_category == "Cat A" for ex in cat_a_examples) - @patch("aieng.agent_evals.knowledge_agent.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_agent.evaluation.pd.read_csv") + @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") def test_get_categories(self, mock_read_csv, mock_download, mock_csv_data): """Test getting unique categories.""" mock_download.return_value = "/fake/path" @@ -145,8 +145,8 @@ def test_get_categories(self, mock_read_csv, mock_download, mock_csv_data): assert "Cat A" in categories assert "Cat B" in categories - @patch("aieng.agent_evals.knowledge_agent.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_agent.evaluation.pd.read_csv") + @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") def test_sample(self, mock_read_csv, mock_download, mock_csv_data): """Test random sampling.""" mock_download.return_value = "/fake/path" diff --git a/implementations/knowledge_agent/01_grounding_basics.ipynb b/implementations/knowledge_qa/01_grounding_basics.ipynb similarity index 97% rename from implementations/knowledge_agent/01_grounding_basics.ipynb rename to implementations/knowledge_qa/01_grounding_basics.ipynb index 20c8438d..48c9b238 100644 --- a/implementations/knowledge_agent/01_grounding_basics.ipynb +++ b/implementations/knowledge_qa/01_grounding_basics.ipynb @@ -22,7 +22,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Setup: Load environment and configure rich console\nfrom aieng.agent_evals import (\n create_console,\n display_comparison,\n display_response,\n display_source_table,\n)\nfrom aieng.agent_evals.knowledge_agent import KnowledgeAgentConfig, KnowledgeGroundedAgent\nfrom dotenv import load_dotenv\nfrom google import genai\nfrom rich.panel import Panel\n\n\nconsole = create_console()\nload_dotenv(verbose=True)" + "source": "# Setup: Load environment and configure rich console\nfrom aieng.agent_evals import (\n create_console,\n display_comparison,\n display_response,\n display_source_table,\n)\nfrom aieng.agent_evals.knowledge_qa import KnowledgeAgentConfig, KnowledgeGroundedAgent\nfrom dotenv import load_dotenv\nfrom google import genai\nfrom rich.panel import Panel\n\n\nconsole = create_console()\nload_dotenv(verbose=True)" }, { "cell_type": "markdown", diff --git a/implementations/knowledge_agent/02_agent_basics.ipynb b/implementations/knowledge_qa/02_agent_basics.ipynb similarity index 96% rename from implementations/knowledge_agent/02_agent_basics.ipynb rename to implementations/knowledge_qa/02_agent_basics.ipynb index a76bfad5..7f9d0884 100644 --- a/implementations/knowledge_agent/02_agent_basics.ipynb +++ b/implementations/knowledge_qa/02_agent_basics.ipynb @@ -22,7 +22,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Setup: Load environment and configure rich console\nfrom aieng.agent_evals import (\n create_console,\n display_example,\n display_info,\n display_response,\n display_success,\n)\nfrom aieng.agent_evals.knowledge_agent import (\n DeepSearchQADataset,\n KnowledgeAgentManager,\n KnowledgeGroundedAgent,\n)\nfrom aieng.agent_evals.knowledge_agent.agent import SYSTEM_INSTRUCTIONS\nfrom dotenv import load_dotenv\nfrom rich.markdown import Markdown\nfrom rich.panel import Panel\nfrom rich.table import Table\n\n\nconsole = create_console()\nload_dotenv(verbose=True)" + "source": "# Setup: Load environment and configure rich console\nfrom aieng.agent_evals import (\n create_console,\n display_example,\n display_info,\n display_response,\n display_success,\n)\nfrom aieng.agent_evals.knowledge_qa import (\n DeepSearchQADataset,\n KnowledgeAgentManager,\n KnowledgeGroundedAgent,\n)\nfrom aieng.agent_evals.knowledge_qa.agent import SYSTEM_INSTRUCTIONS\nfrom dotenv import load_dotenv\nfrom rich.markdown import Markdown\nfrom rich.panel import Panel\nfrom rich.table import Table\n\n\nconsole = create_console()\nload_dotenv(verbose=True)" }, { "cell_type": "markdown", diff --git a/implementations/knowledge_agent/03_multi_turn.ipynb b/implementations/knowledge_qa/03_multi_turn.ipynb similarity index 99% rename from implementations/knowledge_agent/03_multi_turn.ipynb rename to implementations/knowledge_qa/03_multi_turn.ipynb index 0695b355..a0e39096 100644 --- a/implementations/knowledge_agent/03_multi_turn.ipynb +++ b/implementations/knowledge_qa/03_multi_turn.ipynb @@ -20,7 +20,7 @@ " display_metrics_table,\n", " display_success,\n", ")\n", - "from aieng.agent_evals.knowledge_agent import (\n", + "from aieng.agent_evals.knowledge_qa import (\n", " DeepSearchQADataset,\n", " DeepSearchQAEvaluator,\n", " KnowledgeGroundedAgent,\n", diff --git a/implementations/knowledge_agent/README.md b/implementations/knowledge_qa/README.md similarity index 94% rename from implementations/knowledge_agent/README.md rename to implementations/knowledge_qa/README.md index f7171bc4..d796e37f 100644 --- a/implementations/knowledge_agent/README.md +++ b/implementations/knowledge_qa/README.md @@ -41,13 +41,13 @@ uv sync Run the Gradio app: ```bash -uv run --env-file .env gradio implementations/knowledge_agent/gradio_app.py +uv run --env-file .env gradio implementations/knowledge_qa/gradio_app.py ``` ### Programmatic Usage ```python -from aieng.agent_evals.knowledge_agent import KnowledgeGroundedAgent +from aieng.agent_evals.knowledge_qa import KnowledgeGroundedAgent agent = KnowledgeGroundedAgent() @@ -65,7 +65,7 @@ print(f"Tool calls: {response.tool_calls}") ### Evaluation on DeepSearchQA ```python -from aieng.agent_evals.knowledge_agent import ( +from aieng.agent_evals.knowledge_qa import ( KnowledgeGroundedAgent, DeepSearchQAEvaluator, ) @@ -90,7 +90,7 @@ print(df[["example_id", "ground_truth", "prediction", "sources_used"]]) ## Architecture ``` -aieng.agent_evals.knowledge_agent/ +aieng.agent_evals.knowledge_qa/ ├── config.py # Configuration (Pydantic settings) ├── grounding_tool.py # GoogleSearchTool wrapper and response models ├── agent.py # KnowledgeGroundedAgent (ADK Agent + Runner) diff --git a/implementations/knowledge_agent/gradio_app.py b/implementations/knowledge_qa/gradio_app.py similarity index 97% rename from implementations/knowledge_agent/gradio_app.py rename to implementations/knowledge_qa/gradio_app.py index c11f60ab..734c06e8 100644 --- a/implementations/knowledge_agent/gradio_app.py +++ b/implementations/knowledge_qa/gradio_app.py @@ -4,7 +4,7 @@ knowledge-grounded QA agent with Google ADK and explicit Google Search tool calls. Run with: - uv run --env-file .env gradio implementations/knowledge_agent/gradio_app.py + uv run --env-file .env gradio implementations/knowledge_qa/gradio_app.py """ import asyncio @@ -13,7 +13,7 @@ from typing import Any, Generator import gradio as gr -from aieng.agent_evals.knowledge_agent import ( +from aieng.agent_evals.knowledge_qa import ( DeepSearchQADataset, KnowledgeAgentManager, ) From c5ee97417d8f2cc582f24fbe2d50a8b98831d6f7 Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Wed, 4 Feb 2026 15:05:23 -0500 Subject: [PATCH 2/4] Add langfuse upload script --- .../knowledge_qa/data/langfuse_upload.py | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 implementations/knowledge_qa/data/langfuse_upload.py diff --git a/implementations/knowledge_qa/data/langfuse_upload.py b/implementations/knowledge_qa/data/langfuse_upload.py new file mode 100644 index 00000000..6c204ea3 --- /dev/null +++ b/implementations/knowledge_qa/data/langfuse_upload.py @@ -0,0 +1,140 @@ +"""Upload DeepSearchQA dataset subset to Langfuse. + +This script uploads a subset of the DeepSearchQA benchmark to Langfuse +for use with the Langfuse experiment evaluation framework. + +Usage: + python langfuse_upload.py --samples 10 --category "Finance & Economics" + python langfuse_upload.py --ids 123 456 789 +""" + +import asyncio +import json +import logging +import tempfile +from pathlib import Path + +import click +from aieng.agent_evals.knowledge_qa.evaluation import DeepSearchQADataset +from aieng.agent_evals.langfuse import upload_dataset_to_langfuse as upload_file_to_langfuse +from dotenv import load_dotenv + + +load_dotenv(verbose=True) +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s") +logger = logging.getLogger(__name__) + + +DEFAULT_DATASET_NAME = "DeepSearchQA-Subset" + + +async def upload_deepsearch_qa_to_langfuse( + dataset_name: str, + samples: int = 10, + category: str | None = None, + ids: list[int] | None = None, +) -> None: + """Upload DeepSearchQA examples to Langfuse. + + This function converts DeepSearchQA examples to a temporary JSONL file + and uses the shared upload utility for consistent formatting and progress tracking. + + Parameters + ---------- + dataset_name : str + Name for the dataset in Langfuse. + samples : int + Number of samples to upload (ignored if ids provided). + category : str, optional + Filter by category (ignored if ids provided). + ids : list[int], optional + Specific example IDs to upload. + """ + # Load DeepSearchQA dataset + logger.info("Loading DeepSearchQA dataset...") + dataset = DeepSearchQADataset() + logger.info(f"Loaded {len(dataset)} total examples") + + # Select examples based on criteria + if ids: + examples = dataset.get_by_ids(ids) + logger.info(f"Selected {len(examples)} examples by ID") + elif category: + examples = dataset.get_by_category(category)[:samples] + logger.info(f"Selected {len(examples)} examples from category '{category}'") + else: + examples = dataset.examples[:samples] + logger.info(f"Selected first {len(examples)} examples") + + if not examples: + logger.error("No examples found matching criteria") + return + + # Convert examples to JSONL format for the shared upload utility + # Use a temporary file that's automatically cleaned up + with tempfile.NamedTemporaryFile( + mode="w", + encoding="utf-8", + suffix=".jsonl", + prefix=f"deepsearchqa_{dataset_name}_", + delete=False, + ) as temp_file: + temp_path = Path(temp_file.name) + logger.info(f"Writing {len(examples)} examples to temporary file...") + + for example in examples: + record = { + "input": example.problem, + "expected_output": example.answer, + "metadata": { + "example_id": example.example_id, + "category": example.problem_category, + "answer_type": example.answer_type, + }, + } + temp_file.write(json.dumps(record, ensure_ascii=False) + "\n") + + try: + # Use the shared upload utility with progress tracking and deduplication + await upload_file_to_langfuse( + dataset_path=str(temp_path), + dataset_name=dataset_name, + ) + finally: + # Clean up temporary file + if temp_path.exists(): + temp_path.unlink() + logger.debug(f"Removed temporary file: {temp_path}") + + +@click.command() +@click.option( + "--dataset-name", + default=DEFAULT_DATASET_NAME, + help="Name for the dataset in Langfuse.", +) +@click.option( + "--samples", + default=10, + type=int, + help="Number of samples to upload (default: 10).", +) +@click.option( + "--category", + default=None, + help="Filter by category (e.g., 'Finance & Economics').", +) +@click.option( + "--ids", + multiple=True, + type=int, + help="Specific example IDs to upload (can be used multiple times).", +) +def cli(dataset_name: str, samples: int, category: str | None, ids: tuple[int, ...]) -> None: + """Upload DeepSearchQA examples to Langfuse.""" + ids_list = list(ids) if ids else None + asyncio.run(upload_deepsearch_qa_to_langfuse(dataset_name, samples, category, ids_list)) + + +if __name__ == "__main__": + cli() From fcde923844a258023e073fb41d6d9eb194a0ba10 Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Wed, 4 Feb 2026 15:07:26 -0500 Subject: [PATCH 3/4] Add updates to the evals dataset --- .../agent_evals/knowledge_qa/evaluation.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py index 3459b6a2..0d4feb11 100644 --- a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py @@ -185,6 +185,41 @@ def get_by_category(self, category: str) -> list[DSQAExample]: """ return [ex for ex in self.examples if ex.problem_category == category] + def get_by_id(self, example_id: int) -> DSQAExample | None: + """Get a single example by its ID. + + Parameters + ---------- + example_id : int + The unique identifier of the example. + + Returns + ------- + DSQAExample or None + The example with the given ID, or None if not found. + """ + for ex in self.examples: + if ex.example_id == example_id: + return ex + return None + + def get_by_ids(self, example_ids: list[int]) -> list[DSQAExample]: + """Get multiple examples by their IDs. + + Parameters + ---------- + example_ids : list[int] + List of example IDs to retrieve. + + Returns + ------- + list[DSQAExample] + Examples matching the given IDs, in the order requested. + Missing IDs are silently skipped. + """ + id_to_example = {ex.example_id: ex for ex in self.examples} + return [id_to_example[eid] for eid in example_ids if eid in id_to_example] + def get_categories(self) -> list[str]: """Get all unique problem categories. From 5069705ce71dffb7efb35a850da7f7efef98a518 Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Wed, 4 Feb 2026 16:10:18 -0500 Subject: [PATCH 4/4] Refactor modules to separate evaluation and dataset --- .../agent_evals/knowledge_qa/__init__.py | 7 +- .../agent_evals/knowledge_qa/data/__init__.py | 13 + .../knowledge_qa/data/deepsearchqa.py | 239 ++++++++++++++++++ .../agent_evals/knowledge_qa/evaluation.py | 230 +---------------- .../agent_evals/knowledge_qa/data/__init__.py | 1 + .../knowledge_qa/data/test_deepsearchqa.py | 216 ++++++++++++++++ .../knowledge_qa/test_evaluation.py | 160 ++---------- .../knowledge_qa/data/langfuse_upload.py | 2 +- implementations/knowledge_qa/gradio_app.py | 6 +- 9 files changed, 497 insertions(+), 377 deletions(-) create mode 100644 aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/__init__.py create mode 100644 aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/deepsearchqa.py create mode 100644 aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/__init__.py create mode 100644 aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/test_deepsearchqa.py diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py index 1fb5be56..45b94641 100644 --- a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/__init__.py @@ -24,12 +24,7 @@ ) from .agent import KnowledgeAgentManager, KnowledgeGroundedAgent -from .evaluation import ( - DeepSearchQADataset, - DeepSearchQAEvaluator, - DSQAExample, - EvaluationResult, -) +from .evaluation import DeepSearchQAEvaluator, EvaluationResult __all__ = [ diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/__init__.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/__init__.py new file mode 100644 index 00000000..2a1e4af7 --- /dev/null +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/__init__.py @@ -0,0 +1,13 @@ +"""Data loading and management for knowledge QA evaluation. + +This module provides tools for loading and managing benchmark datasets +like DeepSearchQA. +""" + +from .deepsearchqa import DeepSearchQADataset, DSQAExample + + +__all__ = [ + "DSQAExample", + "DeepSearchQADataset", +] diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/deepsearchqa.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/deepsearchqa.py new file mode 100644 index 00000000..f84d2ffa --- /dev/null +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/data/deepsearchqa.py @@ -0,0 +1,239 @@ +"""DeepSearchQA dataset loader. + +This module provides classes for loading and accessing the DeepSearchQA +benchmark dataset from Kaggle. +""" + +import logging +from pathlib import Path + +import kagglehub +import pandas as pd +from pydantic import BaseModel, Field + + +logger = logging.getLogger(__name__) + + +class DSQAExample(BaseModel): + """A single example from the DeepSearchQA dataset.""" + + example_id: int = Field(description="Unique identifier for the example.") + problem: str = Field(description="The research question/problem to solve.") + problem_category: str = Field(description="Category of the problem (e.g., 'Politics & Government').") + answer: str = Field(description="The ground truth answer.") + answer_type: str = Field(description="Type of answer (e.g., 'Single Answer', 'List').") + + +class DeepSearchQADataset: + """Loader and manager for the DeepSearchQA dataset. + + This class handles downloading, loading, and accessing examples from + the DeepSearchQA benchmark dataset. + + Parameters + ---------- + cache_dir : str or Path, optional + Directory to cache the dataset. If not provided, uses kagglehub default. + + Examples + -------- + >>> dataset = DeepSearchQADataset() + >>> print(f"Total examples: {len(dataset)}") + >>> example = dataset[0] + >>> print(example.problem) + """ + + def __init__(self, cache_dir: str | Path | None = None) -> None: + """Initialize the dataset loader. + + Parameters + ---------- + cache_dir : str or Path, optional + Directory to cache the dataset. + """ + self._cache_dir = Path(cache_dir) if cache_dir else None + self._df: pd.DataFrame | None = None + self._examples: list[DSQAExample] | None = None + + def _download_dataset(self) -> Path: + """Download the dataset using kagglehub. + + Returns + ------- + Path + Path to the downloaded dataset directory. + """ + logger.info("Downloading DeepSearchQA dataset...") + path = kagglehub.dataset_download("deepmind/deepsearchqa") + return Path(path) + + def _load_data(self) -> None: + """Load the dataset into memory.""" + if self._df is not None: + return + + dataset_path = self._download_dataset() + csv_path = dataset_path / "DSQA-full.csv" + + if not csv_path.exists(): + raise FileNotFoundError(f"Dataset file not found: {csv_path}") + + self._df = pd.read_csv(csv_path) + + # Filter out rows with missing answers + original_count = len(self._df) + self._df = self._df.dropna(subset=["answer"]) + dropped_count = original_count - len(self._df) + if dropped_count > 0: + logger.info(f"Dropped {dropped_count} examples with missing answers") + + logger.info(f"Loaded {len(self._df)} examples from DeepSearchQA") + + # Convert to examples + self._examples = [ + DSQAExample( + example_id=row["example_id"], + problem=row["problem"], + problem_category=row["problem_category"], + answer=str(row["answer"]), # Ensure string type + answer_type=row["answer_type"], + ) + for _, row in self._df.iterrows() + ] + + @property + def dataframe(self) -> pd.DataFrame: + """Get the raw pandas DataFrame. + + Returns + ------- + pd.DataFrame + The full dataset as a DataFrame. + """ + self._load_data() + assert self._df is not None + return self._df + + @property + def examples(self) -> list[DSQAExample]: + """Get all examples as DSQAExample objects. + + Returns + ------- + list[DSQAExample] + All examples in the dataset. + """ + self._load_data() + assert self._examples is not None + return self._examples + + def __len__(self) -> int: + """Return the number of examples in the dataset.""" + self._load_data() + assert self._examples is not None + return len(self._examples) + + def __getitem__(self, index: int) -> DSQAExample: + """Get an example by index. + + Parameters + ---------- + index : int + The index of the example to retrieve. + + Returns + ------- + DSQAExample + The example at the given index. + """ + self._load_data() + assert self._examples is not None + return self._examples[index] + + def get_by_category(self, category: str) -> list[DSQAExample]: + """Get all examples in a specific category. + + Parameters + ---------- + category : str + The problem category to filter by. + + Returns + ------- + list[DSQAExample] + Examples matching the category. + """ + return [ex for ex in self.examples if ex.problem_category == category] + + def get_by_id(self, example_id: int) -> DSQAExample | None: + """Get a single example by its ID. + + Parameters + ---------- + example_id : int + The unique identifier of the example. + + Returns + ------- + DSQAExample or None + The example with the given ID, or None if not found. + """ + for ex in self.examples: + if ex.example_id == example_id: + return ex + return None + + def get_by_ids(self, example_ids: list[int]) -> list[DSQAExample]: + """Get multiple examples by their IDs. + + Parameters + ---------- + example_ids : list[int] + List of example IDs to retrieve. + + Returns + ------- + list[DSQAExample] + Examples matching the given IDs, in the order requested. + Missing IDs are silently skipped. + """ + id_to_example = {ex.example_id: ex for ex in self.examples} + return [id_to_example[eid] for eid in example_ids if eid in id_to_example] + + def get_categories(self) -> list[str]: + """Get all unique problem categories. + + Returns + ------- + list[str] + List of unique category names. + """ + return list(self.dataframe["problem_category"].unique()) + + def sample(self, n: int = 10, random_state: int | None = None) -> list[DSQAExample]: + """Get a random sample of examples. + + Parameters + ---------- + n : int, optional + Number of examples to sample, by default 10. + random_state : int, optional + Random seed for reproducibility. + + Returns + ------- + list[DSQAExample] + Randomly sampled examples. + """ + sampled_df = self.dataframe.sample(n=min(n, len(self)), random_state=random_state) + return [ + DSQAExample( + example_id=row["example_id"], + problem=row["problem"], + problem_category=row["problem_category"], + answer=row["answer"], + answer_type=row["answer_type"], + ) + for _, row in sampled_df.iterrows() + ] diff --git a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py index 0d4feb11..2aaf0543 100644 --- a/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py +++ b/aieng-eval-agents/aieng/agent_evals/knowledge_qa/evaluation.py @@ -1,18 +1,18 @@ """Evaluation utilities for DeepSearchQA benchmark. -This module provides tools for loading, running, and evaluating the +This module provides tools for running and evaluating agents on the DeepSearchQA benchmark dataset. """ import asyncio import logging -from pathlib import Path from typing import TYPE_CHECKING -import kagglehub import pandas as pd from pydantic import BaseModel, Field +from .data import DeepSearchQADataset, DSQAExample + if TYPE_CHECKING: from .agent import KnowledgeGroundedAgent @@ -21,16 +21,6 @@ logger = logging.getLogger(__name__) -class DSQAExample(BaseModel): - """A single example from the DeepSearchQA dataset.""" - - example_id: int = Field(description="Unique identifier for the example.") - problem: str = Field(description="The research question/problem to solve.") - problem_category: str = Field(description="Category of the problem (e.g., 'Politics & Government').") - answer: str = Field(description="The ground truth answer.") - answer_type: str = Field(description="Type of answer (e.g., 'Single Answer', 'List').") - - class EvaluationResult(BaseModel): """Result of evaluating a single example.""" @@ -44,220 +34,6 @@ class EvaluationResult(BaseModel): evaluation_notes: str = Field(default="", description="Additional notes about the evaluation.") -class DeepSearchQADataset: - """Loader and manager for the DeepSearchQA dataset. - - This class handles downloading, loading, and accessing examples from - the DeepSearchQA benchmark dataset. - - Parameters - ---------- - cache_dir : str or Path, optional - Directory to cache the dataset. If not provided, uses kagglehub default. - - Examples - -------- - >>> dataset = DeepSearchQADataset() - >>> print(f"Total examples: {len(dataset)}") - >>> example = dataset[0] - >>> print(example.problem) - """ - - def __init__(self, cache_dir: str | Path | None = None) -> None: - """Initialize the dataset loader. - - Parameters - ---------- - cache_dir : str or Path, optional - Directory to cache the dataset. - """ - self._cache_dir = Path(cache_dir) if cache_dir else None - self._df: pd.DataFrame | None = None - self._examples: list[DSQAExample] | None = None - - def _download_dataset(self) -> Path: - """Download the dataset using kagglehub. - - Returns - ------- - Path - Path to the downloaded dataset directory. - """ - logger.info("Downloading DeepSearchQA dataset...") - path = kagglehub.dataset_download("deepmind/deepsearchqa") - return Path(path) - - def _load_data(self) -> None: - """Load the dataset into memory.""" - if self._df is not None: - return - - dataset_path = self._download_dataset() - csv_path = dataset_path / "DSQA-full.csv" - - if not csv_path.exists(): - raise FileNotFoundError(f"Dataset file not found: {csv_path}") - - self._df = pd.read_csv(csv_path) - - # Filter out rows with missing answers - original_count = len(self._df) - self._df = self._df.dropna(subset=["answer"]) - dropped_count = original_count - len(self._df) - if dropped_count > 0: - logger.info(f"Dropped {dropped_count} examples with missing answers") - - logger.info(f"Loaded {len(self._df)} examples from DeepSearchQA") - - # Convert to examples - self._examples = [ - DSQAExample( - example_id=row["example_id"], - problem=row["problem"], - problem_category=row["problem_category"], - answer=str(row["answer"]), # Ensure string type - answer_type=row["answer_type"], - ) - for _, row in self._df.iterrows() - ] - - @property - def dataframe(self) -> pd.DataFrame: - """Get the raw pandas DataFrame. - - Returns - ------- - pd.DataFrame - The full dataset as a DataFrame. - """ - self._load_data() - assert self._df is not None - return self._df - - @property - def examples(self) -> list[DSQAExample]: - """Get all examples as DSQAExample objects. - - Returns - ------- - list[DSQAExample] - All examples in the dataset. - """ - self._load_data() - assert self._examples is not None - return self._examples - - def __len__(self) -> int: - """Return the number of examples in the dataset.""" - self._load_data() - assert self._examples is not None - return len(self._examples) - - def __getitem__(self, index: int) -> DSQAExample: - """Get an example by index. - - Parameters - ---------- - index : int - The index of the example to retrieve. - - Returns - ------- - DSQAExample - The example at the given index. - """ - self._load_data() - assert self._examples is not None - return self._examples[index] - - def get_by_category(self, category: str) -> list[DSQAExample]: - """Get all examples in a specific category. - - Parameters - ---------- - category : str - The problem category to filter by. - - Returns - ------- - list[DSQAExample] - Examples matching the category. - """ - return [ex for ex in self.examples if ex.problem_category == category] - - def get_by_id(self, example_id: int) -> DSQAExample | None: - """Get a single example by its ID. - - Parameters - ---------- - example_id : int - The unique identifier of the example. - - Returns - ------- - DSQAExample or None - The example with the given ID, or None if not found. - """ - for ex in self.examples: - if ex.example_id == example_id: - return ex - return None - - def get_by_ids(self, example_ids: list[int]) -> list[DSQAExample]: - """Get multiple examples by their IDs. - - Parameters - ---------- - example_ids : list[int] - List of example IDs to retrieve. - - Returns - ------- - list[DSQAExample] - Examples matching the given IDs, in the order requested. - Missing IDs are silently skipped. - """ - id_to_example = {ex.example_id: ex for ex in self.examples} - return [id_to_example[eid] for eid in example_ids if eid in id_to_example] - - def get_categories(self) -> list[str]: - """Get all unique problem categories. - - Returns - ------- - list[str] - List of unique category names. - """ - return list(self.dataframe["problem_category"].unique()) - - def sample(self, n: int = 10, random_state: int | None = None) -> list[DSQAExample]: - """Get a random sample of examples. - - Parameters - ---------- - n : int, optional - Number of examples to sample, by default 10. - random_state : int, optional - Random seed for reproducibility. - - Returns - ------- - list[DSQAExample] - Randomly sampled examples. - """ - sampled_df = self.dataframe.sample(n=min(n, len(self)), random_state=random_state) - return [ - DSQAExample( - example_id=row["example_id"], - problem=row["problem"], - problem_category=row["problem_category"], - answer=row["answer"], - answer_type=row["answer_type"], - ) - for _, row in sampled_df.iterrows() - ] - - class DeepSearchQAEvaluator: """Evaluator for running and scoring DeepSearchQA benchmark. diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/__init__.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/__init__.py new file mode 100644 index 00000000..6b8be356 --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/__init__.py @@ -0,0 +1 @@ +"""Tests for knowledge QA data module.""" diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/test_deepsearchqa.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/test_deepsearchqa.py new file mode 100644 index 00000000..d55e7d94 --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/data/test_deepsearchqa.py @@ -0,0 +1,216 @@ +"""Tests for DeepSearchQA dataset loading and management.""" + +from unittest.mock import patch + +import pandas as pd +import pytest +from aieng.agent_evals.knowledge_qa.data import DeepSearchQADataset, DSQAExample + + +class TestDSQAExample: + """Tests for the DSQAExample model.""" + + def test_example_creation(self): + """Test creating an example.""" + example = DSQAExample( + example_id=0, + problem="What is the capital of France?", + problem_category="Geography", + answer="Paris", + answer_type="Single Answer", + ) + assert example.example_id == 0 + assert example.problem == "What is the capital of France?" + assert example.problem_category == "Geography" + assert example.answer == "Paris" + assert example.answer_type == "Single Answer" + + +class TestDeepSearchQADataset: + """Tests for the DeepSearchQADataset class.""" + + @pytest.fixture + def mock_csv_data(self): + """Create mock CSV data.""" + return { + "example_id": [0, 1, 2], + "problem": ["Q1", "Q2", "Q3"], + "problem_category": ["Cat A", "Cat B", "Cat A"], + "answer": ["A1", "A2", "A3"], + "answer_type": ["Single Answer", "List", "Single Answer"], + } + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_dataset_loading(self, mock_read_csv, mock_download, mock_csv_data): + """Test loading the dataset.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + examples = dataset.examples + + assert len(examples) == 3 + assert examples[0].problem == "Q1" + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_dataset_length(self, mock_read_csv, mock_download, mock_csv_data): + """Test getting dataset length.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + assert len(dataset) == 3 + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_dataset_indexing(self, mock_read_csv, mock_download, mock_csv_data): + """Test indexing into the dataset.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + example = dataset[1] + + assert example.example_id == 1 + assert example.problem == "Q2" + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_get_by_category(self, mock_read_csv, mock_download, mock_csv_data): + """Test filtering by category.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + cat_a_examples = dataset.get_by_category("Cat A") + + assert len(cat_a_examples) == 2 + assert all(ex.problem_category == "Cat A" for ex in cat_a_examples) + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_get_by_id(self, mock_read_csv, mock_download, mock_csv_data): + """Test getting a single example by ID.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + example = dataset.get_by_id(1) + + assert example is not None + assert example.example_id == 1 + assert example.problem == "Q2" + + # Test non-existent ID + assert dataset.get_by_id(999) is None + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_get_by_ids(self, mock_read_csv, mock_download, mock_csv_data): + """Test getting multiple examples by IDs.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + examples = dataset.get_by_ids([0, 2]) + + assert len(examples) == 2 + assert examples[0].example_id == 0 + assert examples[1].example_id == 2 + + # Test with missing IDs (should skip them) + examples = dataset.get_by_ids([0, 999, 1]) + assert len(examples) == 2 + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_get_categories(self, mock_read_csv, mock_download, mock_csv_data): + """Test getting unique categories.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + categories = dataset.get_categories() + + assert "Cat A" in categories + assert "Cat B" in categories + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_sample(self, mock_read_csv, mock_download, mock_csv_data): + """Test random sampling.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + sample = dataset.sample(n=2, random_state=42) + + assert len(sample) == 2 + assert all(isinstance(ex, DSQAExample) for ex in sample) + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_dataframe_property(self, mock_read_csv, mock_download, mock_csv_data): + """Test accessing the raw dataframe.""" + mock_download.return_value = "/fake/path" + mock_read_csv.return_value = pd.DataFrame(mock_csv_data) + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + df = dataset.dataframe + + assert isinstance(df, pd.DataFrame) + assert len(df) == 3 + assert "problem" in df.columns + + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.kagglehub.dataset_download") + @patch("aieng.agent_evals.knowledge_qa.data.deepsearchqa.pd.read_csv") + def test_filter_missing_answers(self, mock_read_csv, mock_download): + """Test that rows with missing answers are filtered out.""" + mock_download.return_value = "/fake/path" + data_with_na = pd.DataFrame( + { + "example_id": [0, 1, 2], + "problem": ["Q1", "Q2", "Q3"], + "problem_category": ["Cat A", "Cat B", "Cat A"], + "answer": ["A1", None, "A3"], # One missing answer + "answer_type": ["Single Answer", "List", "Single Answer"], + } + ) + mock_read_csv.return_value = data_with_na + + with patch("pathlib.Path.exists", return_value=True): + dataset = DeepSearchQADataset() + examples = dataset.examples + + # Should only have 2 examples after filtering + assert len(examples) == 2 + assert examples[0].example_id == 0 + assert examples[1].example_id == 2 + + +@pytest.mark.integration_test +class TestDeepSearchQADatasetIntegration: + """Integration tests for DeepSearchQADataset. + + These tests download the actual dataset from Kaggle. + """ + + def test_load_real_dataset(self): + """Test loading the real dataset.""" + dataset = DeepSearchQADataset() + + # Dataset may have fewer than 900 examples after filtering NaN answers + assert len(dataset) > 800 # Should have most examples + assert dataset[0].example_id == 0 + assert len(dataset.get_categories()) > 0 diff --git a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py index 782be286..57c06cb9 100644 --- a/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py +++ b/aieng-eval-agents/tests/aieng/agent_evals/knowledge_qa/test_evaluation.py @@ -1,33 +1,6 @@ """Tests for DeepSearchQA evaluation utilities.""" -from unittest.mock import patch - -import pandas as pd -import pytest -from aieng.agent_evals.knowledge_qa.evaluation import ( - DeepSearchQADataset, - DSQAExample, - EvaluationResult, -) - - -class TestDSQAExample: - """Tests for the DSQAExample model.""" - - def test_example_creation(self): - """Test creating an example.""" - example = DSQAExample( - example_id=0, - problem="What is the capital of France?", - problem_category="Geography", - answer="Paris", - answer_type="Single Answer", - ) - assert example.example_id == 0 - assert example.problem == "What is the capital of France?" - assert example.problem_category == "Geography" - assert example.answer == "Paris" - assert example.answer_type == "Single Answer" +from aieng.agent_evals.knowledge_qa.evaluation import EvaluationResult class TestEvaluationResult: @@ -63,115 +36,24 @@ def test_result_defaults(self): assert result.is_correct is None assert result.evaluation_notes == "" + def test_result_with_correctness(self): + """Test evaluation result with correctness flag.""" + result = EvaluationResult( + example_id=2, + problem="What is 2+2?", + ground_truth="4", + prediction="4", + is_correct=True, + ) + assert result.is_correct is True -class TestDeepSearchQADataset: - """Tests for the DeepSearchQADataset class.""" - - @pytest.fixture - def mock_csv_data(self): - """Create mock CSV data.""" - return { - "example_id": [0, 1, 2], - "problem": ["Q1", "Q2", "Q3"], - "problem_category": ["Cat A", "Cat B", "Cat A"], - "answer": ["A1", "A2", "A3"], - "answer_type": ["Single Answer", "List", "Single Answer"], - } - - @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") - def test_dataset_loading(self, mock_read_csv, mock_download, mock_csv_data): - """Test loading the dataset.""" - mock_download.return_value = "/fake/path" - mock_read_csv.return_value = pd.DataFrame(mock_csv_data) - - with patch("pathlib.Path.exists", return_value=True): - dataset = DeepSearchQADataset() - examples = dataset.examples - - assert len(examples) == 3 - assert examples[0].problem == "Q1" - - @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") - def test_dataset_length(self, mock_read_csv, mock_download, mock_csv_data): - """Test getting dataset length.""" - mock_download.return_value = "/fake/path" - mock_read_csv.return_value = pd.DataFrame(mock_csv_data) - - with patch("pathlib.Path.exists", return_value=True): - dataset = DeepSearchQADataset() - assert len(dataset) == 3 - - @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") - def test_dataset_indexing(self, mock_read_csv, mock_download, mock_csv_data): - """Test indexing into the dataset.""" - mock_download.return_value = "/fake/path" - mock_read_csv.return_value = pd.DataFrame(mock_csv_data) - - with patch("pathlib.Path.exists", return_value=True): - dataset = DeepSearchQADataset() - example = dataset[1] - - assert example.example_id == 1 - assert example.problem == "Q2" - - @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") - def test_get_by_category(self, mock_read_csv, mock_download, mock_csv_data): - """Test filtering by category.""" - mock_download.return_value = "/fake/path" - mock_read_csv.return_value = pd.DataFrame(mock_csv_data) - - with patch("pathlib.Path.exists", return_value=True): - dataset = DeepSearchQADataset() - cat_a_examples = dataset.get_by_category("Cat A") - - assert len(cat_a_examples) == 2 - assert all(ex.problem_category == "Cat A" for ex in cat_a_examples) - - @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") - def test_get_categories(self, mock_read_csv, mock_download, mock_csv_data): - """Test getting unique categories.""" - mock_download.return_value = "/fake/path" - mock_read_csv.return_value = pd.DataFrame(mock_csv_data) - - with patch("pathlib.Path.exists", return_value=True): - dataset = DeepSearchQADataset() - categories = dataset.get_categories() - - assert "Cat A" in categories - assert "Cat B" in categories - - @patch("aieng.agent_evals.knowledge_qa.evaluation.kagglehub.dataset_download") - @patch("aieng.agent_evals.knowledge_qa.evaluation.pd.read_csv") - def test_sample(self, mock_read_csv, mock_download, mock_csv_data): - """Test random sampling.""" - mock_download.return_value = "/fake/path" - mock_read_csv.return_value = pd.DataFrame(mock_csv_data) - - with patch("pathlib.Path.exists", return_value=True): - dataset = DeepSearchQADataset() - sample = dataset.sample(n=2, random_state=42) - - assert len(sample) == 2 - assert all(isinstance(ex, DSQAExample) for ex in sample) - - -@pytest.mark.integration_test -class TestDeepSearchQADatasetIntegration: - """Integration tests for DeepSearchQADataset. - - These tests download the actual dataset from Kaggle. - """ - - def test_load_real_dataset(self): - """Test loading the real dataset.""" - dataset = DeepSearchQADataset() - - # Dataset may have fewer than 900 examples after filtering NaN answers - assert len(dataset) > 800 # Should have most examples - assert dataset[0].example_id == 0 - assert len(dataset.get_categories()) > 0 + def test_result_with_notes(self): + """Test evaluation result with evaluation notes.""" + result = EvaluationResult( + example_id=3, + problem="Complex question", + ground_truth="Complex answer", + prediction="Model's answer", + evaluation_notes="Partial match detected", + ) + assert result.evaluation_notes == "Partial match detected" diff --git a/implementations/knowledge_qa/data/langfuse_upload.py b/implementations/knowledge_qa/data/langfuse_upload.py index 6c204ea3..c7f4a8f8 100644 --- a/implementations/knowledge_qa/data/langfuse_upload.py +++ b/implementations/knowledge_qa/data/langfuse_upload.py @@ -15,7 +15,7 @@ from pathlib import Path import click -from aieng.agent_evals.knowledge_qa.evaluation import DeepSearchQADataset +from aieng.agent_evals.knowledge_qa.data import DeepSearchQADataset from aieng.agent_evals.langfuse import upload_dataset_to_langfuse as upload_file_to_langfuse from dotenv import load_dotenv diff --git a/implementations/knowledge_qa/gradio_app.py b/implementations/knowledge_qa/gradio_app.py index 734c06e8..9e64655d 100644 --- a/implementations/knowledge_qa/gradio_app.py +++ b/implementations/knowledge_qa/gradio_app.py @@ -13,10 +13,8 @@ from typing import Any, Generator import gradio as gr -from aieng.agent_evals.knowledge_qa import ( - DeepSearchQADataset, - KnowledgeAgentManager, -) +from aieng.agent_evals.knowledge_qa import KnowledgeAgentManager +from aieng.agent_evals.knowledge_qa.data import DeepSearchQADataset from dotenv import load_dotenv from gradio.components.chatbot import ChatMessage