diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 65cffaa..fe05773 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -83,7 +83,7 @@ deriva/ │ ├── common/ (Shared utilities) │ ├── types.py - Shared TypedDicts, Protocols, ProgressReporter -│ ├── logging.py - Pipeline logging (JSON Lines) +│ ├── logging.py - Pipeline logging with structlog (JSON Lines output) │ ├── chunking.py - File chunking with overlap support │ └── utils.py - File encoding, helpers │ @@ -719,13 +719,14 @@ def add_node(self, node: GraphNode, node_id: str | None = None) -> str: ### Overview -The LLM adapter (`adapters/llm/`) provides a unified interface for multiple LLM providers with caching and structured output support. +The LLM adapter (`adapters/llm/`) provides a unified interface for multiple LLM providers using **pydantic-ai** for agent-based LLM interactions with automatic retries, caching, and structured output support. **Supported Providers:** - **Azure OpenAI** - Enterprise Azure deployments - **OpenAI** - Direct OpenAI API - **Anthropic** - Claude models +- **Mistral** - Mistral AI models - **Ollama** - Local LLM inference (no API key required) ### Basic Usage @@ -744,9 +745,9 @@ else: print(f"Error: {response.error}") ``` -### Structured Output with Pydantic +### Structured Output with pydantic-ai -Use `response_model` to get validated, type-safe responses: +Use `response_model` to get validated, type-safe responses via pydantic-ai agents: ```python from pydantic import BaseModel, Field @@ -1035,7 +1036,7 @@ def classify_files( - Registry comes from DatabaseManager (passed as data, not manager) #### `logging.py` -**Goal:** JSON Lines logging for pipeline runs with configurable verbosity. +**Goal:** JSON Lines logging for pipeline runs using structlog with configurable verbosity. ```python class LogLevel(int, Enum): @@ -1043,15 +1044,20 @@ class LogLevel(int, Enum): STEP = 2 # Steps: Repository, Directory, File, etc. DETAIL = 3 # Item-level: each file, node, edge -class PipelineLogger: - def log(self, level: int, phase: str, status: str, ...) -> None +class RunLogger: + """Structured logger using structlog for JSON Lines output.""" + def phase_start(self, phase: str, message: str = "") -> None + def phase_end(self, phase: str, message: str = "") -> None + def step(self, step_name: str) -> StepContext # Context manager def get_entries(self, min_level: int = 1) -> List[LogEntry] ``` **Rules:** -- Logs stored in `logs/run_{id}/log_{datetime}.jsonl` + +- Uses structlog for structured logging with JSON Lines output +- Logs stored in `workspace/logs/run_{id}/` - Use level 1 for phase start/end, level 2 for steps, level 3 for details -- Logger instance created in app.py, passed to extraction functions +- Logger instance created in services, passed to extraction/derivation functions #### `utils.py` **Goal:** Shared utility functions for file handling and data processing. diff --git a/deriva/adapters/llm/README.md b/deriva/adapters/llm/README.md index 2c555a8..7f28f1f 100644 --- a/deriva/adapters/llm/README.md +++ b/deriva/adapters/llm/README.md @@ -1,12 +1,12 @@ # LLM Adapter -Multi-provider LLM abstraction with caching and structured output support. +Multi-provider LLM abstraction using pydantic-ai with caching and structured output support. -**Version:** 1.0.0 +**Version:** 2.0.0 ## Purpose -The LLM adapter provides a unified interface for querying multiple LLM providers (Azure OpenAI, OpenAI, Anthropic, Ollama, LM Studio) with automatic caching and Pydantic-based structured output parsing. 
+The LLM adapter provides a unified interface for querying multiple LLM providers (Azure OpenAI, OpenAI, Anthropic, Mistral, Ollama, LM Studio) using **pydantic-ai** for agent-based interactions with automatic retries and Pydantic-based structured output parsing. ## Key Exports @@ -56,7 +56,9 @@ if response.response_type == "live": print(response.content) ``` -## Structured Output with Pydantic +## Structured Output with pydantic-ai + +Uses pydantic-ai agents for type-safe, validated responses: ```python from pydantic import BaseModel, Field @@ -72,7 +74,7 @@ result = llm.query( prompt="Extract the main business concept from this code...", response_model=BusinessConcept ) -# result is a validated BusinessConcept instance +# result is a validated BusinessConcept instance (via pydantic-ai agent) print(result.name) ``` @@ -111,13 +113,16 @@ LLM_LMSTUDIO_LOCAL_URL=http://localhost:1234/v1/chat/completions ## Providers -| Provider | Class | Description | -|----------|-------|-------------| -| Azure OpenAI | `AzureOpenAIProvider` | Azure-hosted OpenAI models | -| OpenAI | `OpenAIProvider` | OpenAI API direct | -| Anthropic | `AnthropicProvider` | Claude models | -| Ollama | `OllamaProvider` | Local Ollama models | -| LM Studio | `LMStudioProvider` | Local LM Studio (OpenAI-compatible) | +All providers are implemented via pydantic-ai's model abstraction: + +| Provider | pydantic-ai Model | Description | +|--------------|--------------------| ------------------------------------| +| Azure OpenAI | `AzureOpenAIModel` | Azure-hosted OpenAI models | +| OpenAI | `OpenAIModel` | OpenAI API direct | +| Anthropic | `AnthropicModel` | Claude models | +| Mistral | `MistralModel` | Mistral AI models | +| Ollama | `OllamaModel` | Local Ollama models | +| LM Studio | `OpenAIModel` | Local LM Studio (OpenAI-compatible) | ## Response Types diff --git a/tests/test_cli/test_cli.py b/tests/test_cli/test_cli.py index dbc0cbd..a801366 100644 --- a/tests/test_cli/test_cli.py +++ b/tests/test_cli/test_cli.py @@ -726,3 +726,498 @@ def test_filetype_stats(self, mock_session_class): assert result.exit_code == 0 assert "FILE TYPE STATISTICS" in result.stdout + + +class TestBenchmarkAnalyzeCommand: + """Tests for benchmark analyze command.""" + + @patch("deriva.cli.commands.benchmark.PipelineSession") + def test_analyze_invalid_format(self, mock_session_class): + """Should reject invalid format.""" + result = runner.invoke(app, ["benchmark", "analyze", "session_123", "-f", "xml"]) + + assert result.exit_code == 1 + assert "format must be" in result.output + + @patch("deriva.cli.commands.benchmark.PipelineSession") + def test_analyze_session_not_found(self, mock_session_class): + """Should handle session not found error.""" + mock_session = MagicMock() + mock_session.analyze_benchmark.side_effect = ValueError("Session not found") + mock_session_class.return_value.__enter__.return_value = mock_session + + result = runner.invoke(app, ["benchmark", "analyze", "nonexistent"]) + + assert result.exit_code == 1 + assert "Session not found" in result.output + + @patch("deriva.cli.commands.benchmark._get_run_stats_from_ocel") + @patch("deriva.cli.commands.benchmark.PipelineSession") + def test_analyze_success(self, mock_session_class, mock_get_stats): + """Should analyze benchmark successfully.""" + mock_session = MagicMock() + mock_analyzer = MagicMock() + + # Mock summary with all required attributes + mock_summary = MagicMock() + mock_summary.intra_model = [] + mock_summary.inter_model = [] + mock_summary.localization.hotspots = 
[] + + mock_analyzer.compute_full_analysis.return_value = mock_summary + mock_analyzer.export_summary.return_value = "output.json" + mock_session.analyze_benchmark.return_value = mock_analyzer + mock_session_class.return_value.__enter__.return_value = mock_session + mock_get_stats.return_value = {} + + result = runner.invoke(app, ["benchmark", "analyze", "session_123"]) + + assert result.exit_code == 0 + assert "ANALYZING BENCHMARK" in result.stdout + + @patch("deriva.cli.commands.benchmark._get_run_stats_from_ocel") + @patch("deriva.cli.commands.benchmark.PipelineSession") + def test_analyze_with_intra_model_data(self, mock_session_class, mock_get_stats): + """Should display intra-model consistency data.""" + mock_session = MagicMock() + mock_analyzer = MagicMock() + + # Mock intra-model data + mock_intra = MagicMock() + mock_intra.model = "gpt4" + mock_intra.stable_edges = ["e1", "e2"] + mock_intra.unstable_edges = ["e3"] + + mock_summary = MagicMock() + mock_summary.intra_model = [mock_intra] + mock_summary.inter_model = [] + mock_summary.localization.hotspots = [] + + mock_analyzer.compute_full_analysis.return_value = mock_summary + mock_analyzer.export_summary.return_value = "output.json" + mock_session.analyze_benchmark.return_value = mock_analyzer + mock_session_class.return_value.__enter__.return_value = mock_session + mock_get_stats.return_value = {"gpt4": [(100, 50), (110, 55)]} + + result = runner.invoke(app, ["benchmark", "analyze", "session_123"]) + + assert result.exit_code == 0 + assert "INTRA-MODEL CONSISTENCY" in result.stdout + + +class TestBenchmarkDeviationsCommand: + """Tests for benchmark deviations command.""" + + def test_deviations_invalid_sort(self): + """Should reject invalid sort-by value.""" + result = runner.invoke(app, ["benchmark", "deviations", "session_123", "--sort-by", "invalid"]) + + assert result.exit_code == 1 + assert "sort-by must be" in result.output + + @patch("deriva.cli.commands.benchmark.PipelineSession") + def test_deviations_session_not_found(self, mock_session_class): + """Should handle session not found error.""" + mock_session = MagicMock() + mock_session.analyze_config_deviations.side_effect = ValueError("Not found") + mock_session_class.return_value.__enter__.return_value = mock_session + + result = runner.invoke(app, ["benchmark", "deviations", "nonexistent"]) + + assert result.exit_code == 1 + assert "Not found" in result.output + + @patch("deriva.modules.analysis.generate_recommendations") + @patch("deriva.cli.commands.benchmark.PipelineSession") + def test_deviations_success(self, mock_session_class, mock_recommendations): + """Should analyze deviations successfully.""" + mock_session = MagicMock() + mock_analyzer = MagicMock() + + mock_report = MagicMock() + mock_report.total_runs = 5 + mock_report.total_deviations = 10 + mock_report.overall_consistency = 0.85 + mock_report.config_deviations = [] + + mock_analyzer.analyze.return_value = mock_report + mock_analyzer.export_json.return_value = "deviations.json" + mock_session.analyze_config_deviations.return_value = mock_analyzer + mock_session_class.return_value.__enter__.return_value = mock_session + mock_recommendations.return_value = [] + + result = runner.invoke(app, ["benchmark", "deviations", "session_123"]) + + assert result.exit_code == 0 + assert "Total runs analyzed: 5" in result.stdout + + +class TestBenchmarkComprehensiveCommand: + """Tests for benchmark comprehensive-analysis command.""" + + def test_comprehensive_invalid_format(self): + """Should reject invalid 
format.""" + result = runner.invoke(app, ["benchmark", "comprehensive-analysis", "session_1", "-f", "csv"]) + + assert result.exit_code == 1 + assert "format must be" in result.output + + @patch("deriva.services.analysis.BenchmarkAnalyzer") + @patch("deriva.cli.commands.benchmark.PipelineSession") + def test_comprehensive_session_not_found(self, mock_session_class, mock_analyzer_class): + """Should handle session not found error.""" + mock_session = MagicMock() + mock_session_class.return_value.__enter__.return_value = mock_session + mock_analyzer_class.side_effect = ValueError("Session not found") + + result = runner.invoke(app, ["benchmark", "comprehensive-analysis", "bad_session"]) + + assert result.exit_code == 1 + assert "Session not found" in result.output + + @patch("deriva.services.analysis.BenchmarkAnalyzer") + @patch("deriva.cli.commands.benchmark.PipelineSession") + def test_comprehensive_success(self, mock_session_class, mock_analyzer_class): + """Should run comprehensive analysis successfully.""" + mock_session = MagicMock() + mock_session_class.return_value.__enter__.return_value = mock_session + + mock_analyzer = MagicMock() + mock_report = MagicMock() + mock_report.repositories = ["repo1"] + mock_report.models = ["gpt4"] + mock_report.overall_consistency = 0.9 + mock_report.overall_precision = 0.85 + mock_report.overall_recall = 0.88 + mock_report.stability_reports = {} + mock_report.semantic_reports = {} + mock_report.cross_repo = None + mock_report.recommendations = ["Improve consistency"] + + mock_analyzer.generate_report.return_value = mock_report + mock_analyzer.export_all.return_value = {"json": "out.json", "markdown": "out.md"} + mock_analyzer_class.return_value = mock_analyzer + + result = runner.invoke(app, ["benchmark", "comprehensive-analysis", "session_1"]) + + assert result.exit_code == 0 + assert "BENCHMARK ANALYSIS" in result.stdout + assert "Consistency:" in result.stdout + + +class TestBenchmarkModelsCommand: + """Tests for benchmark models command.""" + + @patch("deriva.cli.commands.benchmark.PipelineSession") + def test_benchmark_models_with_data(self, mock_session_class): + """Should list available benchmark models.""" + mock_session = MagicMock() + + mock_config = MagicMock() + mock_config.provider = "openai" + mock_config.model = "gpt-4" + mock_config.api_url = "https://api.openai.com/v1/very-long-url-that-should-be-truncated" + + mock_session.list_benchmark_models.return_value = {"gpt4": mock_config} + mock_session_class.return_value.__enter__.return_value = mock_session + + result = runner.invoke(app, ["benchmark", "models"]) + + assert result.exit_code == 0 + assert "AVAILABLE BENCHMARK MODELS" in result.stdout + assert "gpt4" in result.stdout + assert "Provider: openai" in result.stdout + + +class TestBenchmarkRunWithErrors: + """Tests for benchmark run command error scenarios.""" + + @patch("deriva.cli.commands.benchmark.create_benchmark_progress_reporter") + @patch("deriva.cli.commands.benchmark.PipelineSession") + def test_benchmark_run_with_errors(self, mock_session_class, mock_progress): + """Should display errors from benchmark run.""" + mock_session = MagicMock() + mock_result = MagicMock() + mock_result.session_id = "bench_123" + mock_result.runs_completed = 2 + mock_result.runs_failed = 1 + mock_result.duration_seconds = 60.0 + mock_result.ocel_path = "ocel.json" + mock_result.success = False + mock_result.errors = ["Error 1", "Error 2", "Error 3", "Error 4", "Error 5", "Error 6"] + mock_session.run_benchmark.return_value = mock_result + 
mock_session_class.return_value.__enter__.return_value = mock_session + + mock_reporter = MagicMock() + mock_progress.return_value = mock_reporter + mock_reporter.__enter__ = MagicMock(return_value=mock_reporter) + mock_reporter.__exit__ = MagicMock(return_value=False) + + result = runner.invoke(app, ["benchmark", "run", "--repos", "repo1", "--models", "gpt4"]) + + assert result.exit_code == 1 + assert "Errors (6)" in result.stdout + assert "... and 1 more" in result.stdout + + +class TestConfigShowCommand: + """Tests for config show command.""" + + def test_show_invalid_step_type(self): + """Should reject invalid step type.""" + result = runner.invoke(app, ["config", "show", "invalid", "Test"]) + + assert result.exit_code == 1 + assert "Error" in result.output + + @patch("deriva.cli.commands.config.config") + @patch("deriva.cli.commands.config.PipelineSession") + def test_show_extraction_config(self, mock_session_class, mock_config): + """Should show extraction config details.""" + mock_session = MagicMock() + mock_session_class.return_value.__enter__.return_value = mock_session + + mock_cfg = MagicMock() + mock_cfg.node_type = "BusinessConcept" + mock_cfg.sequence = 1 + mock_cfg.enabled = True + mock_cfg.input_sources = None + mock_cfg.instruction = "Extract business concepts" + mock_cfg.example = '{"concepts": []}' + mock_config.get_extraction_config.return_value = mock_cfg + + result = runner.invoke(app, ["config", "show", "extraction", "BusinessConcept"]) + + assert result.exit_code == 0 + assert "EXTRACTION CONFIG: BusinessConcept" in result.stdout + assert "Sequence: 1" in result.stdout + + @patch("deriva.cli.commands.config.config") + @patch("deriva.cli.commands.config.PipelineSession") + def test_show_extraction_not_found(self, mock_session_class, mock_config): + """Should handle extraction config not found.""" + mock_session = MagicMock() + mock_session_class.return_value.__enter__.return_value = mock_session + mock_config.get_extraction_config.return_value = None + + result = runner.invoke(app, ["config", "show", "extraction", "NotFound"]) + + assert result.exit_code == 1 + assert "not found" in result.stdout + + @patch("deriva.cli.commands.config.config") + @patch("deriva.cli.commands.config.PipelineSession") + def test_show_derivation_config(self, mock_session_class, mock_config): + """Should show derivation config details.""" + mock_session = MagicMock() + mock_session_class.return_value.__enter__.return_value = mock_session + + mock_cfg = MagicMock() + mock_cfg.element_type = "ApplicationComponent" + mock_cfg.sequence = 1 + mock_cfg.enabled = True + mock_cfg.input_graph_query = "MATCH (n) RETURN n" + mock_cfg.instruction = "Derive application components" + mock_config.get_derivation_config.return_value = mock_cfg + + result = runner.invoke(app, ["config", "show", "derivation", "ApplicationComponent"]) + + assert result.exit_code == 0 + assert "DERIVATION CONFIG: ApplicationComponent" in result.stdout + + @patch("deriva.cli.commands.config.config") + @patch("deriva.cli.commands.config.PipelineSession") + def test_show_derivation_not_found(self, mock_session_class, mock_config): + """Should handle derivation config not found.""" + mock_session = MagicMock() + mock_session_class.return_value.__enter__.return_value = mock_session + mock_config.get_derivation_config.return_value = None + + result = runner.invoke(app, ["config", "show", "derivation", "NotFound"]) + + assert result.exit_code == 1 + assert "not found" in result.stdout + + +class TestConfigUpdateCommand: + """Tests 
for config update command.""" + + def test_update_invalid_step_type(self): + """Should reject invalid step type.""" + result = runner.invoke(app, ["config", "update", "invalid", "Test"]) + + assert result.exit_code == 1 + assert "Error" in result.output + + @patch("deriva.cli.commands.config.config") + @patch("deriva.cli.commands.config.PipelineSession") + def test_update_derivation_success(self, mock_session_class, mock_config): + """Should update derivation config successfully.""" + mock_session = MagicMock() + mock_session_class.return_value.__enter__.return_value = mock_session + + mock_config.create_derivation_config_version.return_value = { + "success": True, + "old_version": 1, + "new_version": 2, + } + + result = runner.invoke( + app, + ["config", "update", "derivation", "AppComp", "-i", "New instruction"], + ) + + assert result.exit_code == 0 + assert "Updated derivation config" in result.stdout + assert "Version: 1 -> 2" in result.stdout + + @patch("deriva.cli.commands.config.config") + @patch("deriva.cli.commands.config.PipelineSession") + def test_update_extraction_success(self, mock_session_class, mock_config): + """Should update extraction config successfully.""" + mock_session = MagicMock() + mock_session_class.return_value.__enter__.return_value = mock_session + + mock_config.create_extraction_config_version.return_value = { + "success": True, + "old_version": 1, + "new_version": 2, + } + + result = runner.invoke( + app, + ["config", "update", "extraction", "Concept", "-i", "New instruction"], + ) + + assert result.exit_code == 0 + assert "Updated extraction config" in result.stdout + + @patch("deriva.cli.commands.config.config") + @patch("deriva.cli.commands.config.PipelineSession") + def test_update_with_params(self, mock_session_class, mock_config): + """Should update config with params successfully.""" + mock_session = MagicMock() + mock_session_class.return_value.__enter__.return_value = mock_session + + mock_config.create_derivation_config_version.return_value = { + "success": True, + "old_version": 1, + "new_version": 2, + } + + result = runner.invoke( + app, + ["config", "update", "derivation", "AppComp", "-p", '{"key": "value"}'], + ) + + assert result.exit_code == 0 + assert "Params: updated" in result.stdout + + def test_update_invalid_params_json(self): + """Should reject invalid params JSON.""" + result = runner.invoke( + app, + ["config", "update", "derivation", "AppComp", "-p", "not valid json"], + ) + + assert result.exit_code == 1 + assert "params must be valid JSON" in result.output + + @patch("deriva.cli.commands.config.config") + @patch("deriva.cli.commands.config.PipelineSession") + def test_update_failure(self, mock_session_class, mock_config): + """Should handle update failure.""" + mock_session = MagicMock() + mock_session_class.return_value.__enter__.return_value = mock_session + + mock_config.create_derivation_config_version.return_value = { + "success": False, + "error": "Config not found", + } + + result = runner.invoke( + app, + ["config", "update", "derivation", "NotFound", "-i", "test"], + ) + + assert result.exit_code == 1 + assert "Config not found" in result.output + + +class TestConfigVersionsCommand: + """Tests for config versions command.""" + + @patch("deriva.cli.commands.config.config") + @patch("deriva.cli.commands.config.PipelineSession") + def test_versions_success(self, mock_session_class, mock_config): + """Should show config versions.""" + mock_session = MagicMock() + mock_session_class.return_value.__enter__.return_value = 
mock_session + + mock_config.get_active_config_versions.return_value = { + "extraction": {"BusinessConcept": 3, "TypeDefinition": 1}, + "derivation": {"ApplicationComponent": 2}, + } + + result = runner.invoke(app, ["config", "versions"]) + + assert result.exit_code == 0 + assert "ACTIVE CONFIG VERSIONS" in result.stdout + assert "BusinessConcept" in result.stdout + assert "v3" in result.stdout + + +class TestFiletypeListEmpty: + """Tests for filetype list when empty.""" + + @patch("deriva.cli.commands.config.PipelineSession") + def test_filetype_list_empty(self, mock_session_class): + """Should show message when no file types.""" + mock_session = MagicMock() + mock_session.get_file_types.return_value = [] + mock_session_class.return_value.__enter__.return_value = mock_session + + result = runner.invoke(app, ["config", "filetype", "list"]) + + assert result.exit_code == 0 + assert "No file types registered" in result.stdout + + @patch("deriva.cli.commands.config.PipelineSession") + def test_filetype_add_failure(self, mock_session_class): + """Should handle add file type failure.""" + mock_session = MagicMock() + mock_session.add_file_type.return_value = False + mock_session_class.return_value.__enter__.return_value = mock_session + + result = runner.invoke(app, ["config", "filetype", "add", ".py", "code", "python"]) + + assert result.exit_code == 1 + assert "Failed to add" in result.output + + @patch("deriva.cli.commands.config.PipelineSession") + def test_filetype_delete_failure(self, mock_session_class): + """Should handle delete file type failure.""" + mock_session = MagicMock() + mock_session.delete_file_type.return_value = False + mock_session_class.return_value.__enter__.return_value = mock_session + + result = runner.invoke(app, ["config", "filetype", "delete", ".unknown"]) + + assert result.exit_code == 1 + assert "not found" in result.output + + +class TestBenchmarkListEmpty: + """Tests for benchmark list when empty.""" + + @patch("deriva.cli.commands.benchmark.PipelineSession") + def test_benchmark_list_empty(self, mock_session_class): + """Should show message when no sessions.""" + mock_session = MagicMock() + mock_session.list_benchmarks.return_value = [] + mock_session_class.return_value.__enter__.return_value = mock_session + + result = runner.invoke(app, ["benchmark", "list"]) + + assert result.exit_code == 0 + assert "No benchmark sessions found" in result.stdout
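
For context on the `RunLogger` API that the CONTRIBUTING.md hunk above documents, here is a minimal usage sketch (not part of the patch itself). Only `phase_start`, `phase_end`, `step`, and `get_entries` come from the documented interface; the import path, the constructor arguments, and the `StepContext` behavior are assumptions.

```python
# Illustrative sketch only -- the constructor and module path are assumed,
# not taken from the actual implementation.
from deriva.common.logging import RunLogger  # assumed path (deriva/common/logging.py)

logger = RunLogger(run_id="run_001")  # assumed constructor signature

logger.phase_start("extraction", "Starting extraction phase")   # level 1: phase start
with logger.step("classify_files"):                             # level 2: step context manager
    pass  # item-level work would be logged here at DETAIL (level 3)
logger.phase_end("extraction", "Extraction complete")           # level 1: phase end

# Retrieve everything logged at STEP level or above (PHASE=1, STEP=2, DETAIL=3)
entries = logger.get_entries(min_level=2)
```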
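
Similarly, a rough sketch of the pydantic-ai agent pattern that the adapter's `response_model` support is described as building on. The model string is a placeholder supplied by the adapter's provider configuration in practice, and depending on the installed pydantic-ai version the keyword may be `result_type` (older releases) rather than `output_type`, with the value read from `result.data` instead of `result.output`.

```python
from pydantic import BaseModel, Field
from pydantic_ai import Agent

class BusinessConcept(BaseModel):
    name: str = Field(description="Name of the business concept")
    description: str = Field(description="What the concept represents")

# The adapter supplies the configured provider/model; "openai:gpt-4o" is only an example.
agent = Agent("openai:gpt-4o", output_type=BusinessConcept)

result = agent.run_sync("Extract the main business concept from this code: ...")
print(result.output.name)  # a validated BusinessConcept instance
```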