diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000000..08808eae8f --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,34 @@ +name: Tests + +on: + push: + branches: [ main, fix/tests-ci-security ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e core/ + # Also install dev dependencies if needed for tests + pip install -e "core/[dev]" + + - name: Run tests + run: | + python -m pytest core/tests/ -v diff --git a/core/framework/credentials/storage.py b/core/framework/credentials/storage.py index bee7f8dfd8..2ca8f7b604 100644 --- a/core/framework/credentials/storage.py +++ b/core/framework/credentials/storage.py @@ -224,7 +224,7 @@ def list_all(self) -> list[str]: index_path = self.base_path / "metadata" / "index.json" if not index_path.exists(): return [] - with open(index_path) as f: + with open(index_path, encoding="utf-8") as f: index = json.load(f) return list(index.get("credentials", {}).keys()) @@ -265,7 +265,7 @@ def _update_index( index_path = self.base_path / "metadata" / "index.json" if index_path.exists(): - with open(index_path) as f: + with open(index_path, encoding="utf-8") as f: index = json.load(f) else: index = {"credentials": {}, "version": "1.0"} @@ -280,7 +280,7 @@ def _update_index( index["last_modified"] = datetime.now(UTC).isoformat() - with open(index_path, "w") as f: + with open(index_path, "w", encoding="utf-8") as f: json.dump(index, f, indent=2) diff --git a/core/framework/graph/edge.py b/core/framework/graph/edge.py index 886daa3075..2cb08322e3 100644 --- a/core/framework/graph/edge.py +++ b/core/framework/graph/edge.py @@ -98,7 +98,7 @@ class EdgeSpec(BaseModel): model_config = {"extra": "allow"} - def should_traverse( + async def should_traverse( self, source_success: bool, source_output: dict[str, Any], @@ -139,7 +139,7 @@ def should_traverse( if llm is None or goal is None: # Fallback to ON_SUCCESS if LLM not available return source_success - return self._llm_decide( + return await self._llm_decide( llm=llm, goal=goal, source_success=source_success, @@ -184,7 +184,7 @@ def _evaluate_condition( logger.warning(f" Available context keys: {list(context.keys())}") return False - def _llm_decide( + async def _llm_decide( self, llm: Any, goal: Any, @@ -230,7 +230,7 @@ def _llm_decide( {{"proceed": true/false, "reasoning": "brief explanation"}}""" try: - response = llm.complete( + response = await llm.complete( messages=[{"role": "user", "content": prompt}], system="You are a routing agent. 
Respond with JSON only.", max_tokens=150, diff --git a/core/framework/graph/executor.py b/core/framework/graph/executor.py index 4da971af51..cad9da8575 100644 --- a/core/framework/graph/executor.py +++ b/core/framework/graph/executor.py @@ -453,7 +453,7 @@ async def execute( current_node_id = result.next_node else: # Get all traversable edges for fan-out detection - traversable_edges = self._get_all_traversable_edges( + traversable_edges = await self._get_all_traversable_edges( graph=graph, goal=goal, current_node_id=current_node_id, @@ -500,7 +500,7 @@ async def execute( break else: # Sequential: follow single edge (existing logic via _follow_edges) - next_node = self._follow_edges( + next_node = await self._follow_edges( graph=graph, goal=goal, current_node_id=current_node_id, @@ -650,7 +650,7 @@ def _get_node_implementation( # Should never reach here due to validation above raise RuntimeError(f"Unhandled node type: {node_spec.node_type}") - def _follow_edges( + async def _follow_edges( self, graph: GraphSpec, goal: Goal, @@ -665,7 +665,7 @@ def _follow_edges( for edge in edges: target_node_spec = graph.get_node(edge.target) - if edge.should_traverse( + if await edge.should_traverse( source_success=result.success, source_output=result.output, memory=memory.read_all(), @@ -688,7 +688,7 @@ def _follow_edges( self.logger.warning(f"โš  Output validation failed: {validation.errors}") # Clean the output - cleaned_output = self.output_cleaner.clean_output( + cleaned_output = await self.output_cleaner.clean_output( output=output_to_validate, source_node_id=current_node_id, target_node_spec=target_node_spec, @@ -726,7 +726,7 @@ def _follow_edges( return None - def _get_all_traversable_edges( + async def _get_all_traversable_edges( self, graph: GraphSpec, goal: Goal, @@ -746,7 +746,7 @@ def _get_all_traversable_edges( for edge in edges: target_node_spec = graph.get_node(edge.target) - if edge.should_traverse( + if await edge.should_traverse( source_success=result.success, source_output=result.output, memory=memory.read_all(), @@ -859,7 +859,7 @@ async def execute_single_branch( f"โš  Output validation failed for branch " f"{branch.node_id}: {validation.errors}" ) - cleaned_output = self.output_cleaner.clean_output( + cleaned_output = await self.output_cleaner.clean_output( output=source_result.output, source_node_id=source_node_spec.id if source_node_spec else "unknown", target_node_spec=node_spec, diff --git a/core/framework/graph/node.py b/core/framework/graph/node.py index a9763bb512..492ba0e59c 100644 --- a/core/framework/graph/node.py +++ b/core/framework/graph/node.py @@ -795,7 +795,7 @@ def executor(tool_use: ToolUse) -> ToolResult: # Retry the call with compaction instruction if ctx.available_tools and self.tool_executor: - response = ctx.llm.complete_with_tools( + response = await ctx.llm.complete_with_tools( messages=compaction_messages, system=system, tools=ctx.available_tools, @@ -803,7 +803,7 @@ def executor(tool_use: ToolUse) -> ToolResult: max_tokens=ctx.max_tokens, ) else: - response = ctx.llm.complete( + response = await ctx.llm.complete( messages=compaction_messages, system=system, json_mode=use_json_mode, @@ -884,7 +884,7 @@ def executor(tool_use: ToolUse) -> ToolResult: # Re-call LLM with feedback if ctx.available_tools and self.tool_executor: - response = ctx.llm.complete_with_tools( + response = await ctx.llm.complete_with_tools( messages=current_messages, system=system, tools=ctx.available_tools, @@ -892,7 +892,7 @@ def executor(tool_use: ToolUse) -> ToolResult: 
max_tokens=ctx.max_tokens, ) else: - response = ctx.llm.complete( + response = await ctx.llm.complete( messages=current_messages, system=system, json_mode=use_json_mode, @@ -1514,7 +1514,7 @@ async def _llm_route( logger.info(" ๐Ÿค” Router using LLM to choose path...") try: - response = ctx.llm.complete( + response = await ctx.llm.complete( messages=[{"role": "user", "content": prompt}], system=ctx.node_spec.system_prompt or "You are a routing agent. Respond with JSON only.", diff --git a/core/framework/graph/output_cleaner.py b/core/framework/graph/output_cleaner.py index b51f0af1b1..2889c3ac45 100644 --- a/core/framework/graph/output_cleaner.py +++ b/core/framework/graph/output_cleaner.py @@ -204,7 +204,7 @@ def validate_output( warnings=warnings, ) - def clean_output( + async def clean_output( self, output: dict[str, Any], source_node_id: str, @@ -286,7 +286,7 @@ def clean_output( f"๐Ÿงน Cleaning output from '{source_node_id}' using {self.config.fast_model}" ) - response = self.llm.complete( + response = await self.llm.complete( messages=[{"role": "user", "content": prompt}], system=( "You clean malformed agent outputs. Return only valid JSON matching the schema." diff --git a/core/framework/graph/safe_eval.py b/core/framework/graph/safe_eval.py index 83e1fdd833..3ffd2bcd3f 100644 --- a/core/framework/graph/safe_eval.py +++ b/core/framework/graph/safe_eval.py @@ -73,17 +73,23 @@ def visit_Expr(self, node: ast.Expr) -> Any: return self.visit(node.value) def visit_Constant(self, node: ast.Constant) -> Any: - return node.value + # Strictly allow only basic types: int, float, str, bool, bytes, NoneType + if isinstance(node.value, (int, float, str, bool, bytes, type(None))): + return node.value + raise ValueError(f"Constant of type {type(node.value).__name__} is not allowed") - # --- Number/String/Bytes/NameConstant (Python < 3.8 compat if needed) --- + # --- Number/String/Bytes/NameConstant (Python < 3.8 compat) --- def visit_Num(self, node: ast.Num) -> Any: - return node.n + return self.visit_Constant(ast.Constant(value=node.n)) def visit_Str(self, node: ast.Str) -> Any: - return node.s + return self.visit_Constant(ast.Constant(value=node.s)) + + def visit_Bytes(self, node: ast.Bytes) -> Any: + return self.visit_Constant(ast.Constant(value=node.s)) def visit_NameConstant(self, node: ast.NameConstant) -> Any: - return node.value + return self.visit_Constant(ast.Constant(value=node.value)) # --- Data Structures --- def visit_List(self, node: ast.List) -> list: diff --git a/core/framework/llm/anthropic.py b/core/framework/llm/anthropic.py index a07643c0a7..ba33e7cf00 100644 --- a/core/framework/llm/anthropic.py +++ b/core/framework/llm/anthropic.py @@ -6,6 +6,7 @@ from framework.llm.litellm import LiteLLMProvider from framework.llm.provider import LLMProvider, LLMResponse, Tool, ToolResult, ToolUse +from framework.llm.resilience import ResilienceConfig def _get_api_key_from_credential_manager() -> str | None: @@ -39,6 +40,7 @@ def __init__( self, api_key: str | None = None, model: str = "claude-haiku-4-5-20251001", + resilience_config: ResilienceConfig | None = None, ): """ Initialize the Anthropic provider. @@ -47,7 +49,9 @@ def __init__( api_key: Anthropic API key. If not provided, uses CredentialManager or ANTHROPIC_API_KEY env var. model: Model to use (default: claude-haiku-4-5-20251001) + resilience_config: Optional resilience configuration. """ + super().__init__(resilience_config) # Delegate to LiteLLMProvider internally. 
self.api_key = api_key or _get_api_key_from_credential_manager() if not self.api_key: @@ -60,9 +64,10 @@ def __init__( self._provider = LiteLLMProvider( model=model, api_key=self.api_key, + resilience_config=self.resilience_config, ) - def complete( + async def complete( self, messages: list[dict[str, Any]], system: str = "", @@ -72,7 +77,7 @@ def complete( json_mode: bool = False, ) -> LLMResponse: """Generate a completion from Claude (via LiteLLM).""" - return self._provider.complete( + return await self._provider.complete( messages=messages, system=system, tools=tools, @@ -81,7 +86,7 @@ def complete( json_mode=json_mode, ) - def complete_with_tools( + async def complete_with_tools( self, messages: list[dict[str, Any]], system: str, @@ -90,7 +95,7 @@ def complete_with_tools( max_iterations: int = 10, ) -> LLMResponse: """Run a tool-use loop until Claude produces a final response (via LiteLLM).""" - return self._provider.complete_with_tools( + return await self._provider.complete_with_tools( messages=messages, system=system, tools=tools, diff --git a/core/framework/llm/litellm.py b/core/framework/llm/litellm.py index 1b993be02f..10c1d8741d 100644 --- a/core/framework/llm/litellm.py +++ b/core/framework/llm/litellm.py @@ -17,6 +17,7 @@ litellm = None # type: ignore[assignment] from framework.llm.provider import LLMProvider, LLMResponse, Tool, ToolResult, ToolUse +from framework.llm.resilience import ResilienceConfig class LiteLLMProvider(LLMProvider): @@ -61,6 +62,7 @@ def __init__( model: str = "gpt-4o-mini", api_key: str | None = None, api_base: str | None = None, + resilience_config: ResilienceConfig | None = None, **kwargs: Any, ): """ @@ -73,8 +75,10 @@ def __init__( look for the appropriate env var (OPENAI_API_KEY, ANTHROPIC_API_KEY, etc.) api_base: Custom API base URL (for proxies or local deployments) + resilience_config: Optional resilience configuration. **kwargs: Additional arguments passed to litellm.completion() """ + super().__init__(resilience_config) self.model = model self.api_key = api_key self.api_base = api_base @@ -85,7 +89,7 @@ def __init__( "LiteLLM is not installed. 
Please install it with: pip install litellm" ) - def complete( + async def complete( self, messages: list[dict[str, Any]], system: str = "", @@ -94,7 +98,27 @@ def complete( response_format: dict[str, Any] | None = None, json_mode: bool = False, ) -> LLMResponse: - """Generate a completion using LiteLLM.""" + """Resiliently generate a completion using LiteLLM.""" + return await self._execute_with_resilience( + self._complete, + messages=messages, + system=system, + tools=tools, + max_tokens=max_tokens, + response_format=response_format, + json_mode=json_mode + ) + + async def _complete( + self, + messages: list[dict[str, Any]], + system: str = "", + tools: list[Tool] | None = None, + max_tokens: int = 1024, + response_format: dict[str, Any] | None = None, + json_mode: bool = False, + ) -> LLMResponse: + """Internal completion implementation.""" # Prepare messages with system prompt full_messages = [] if system: @@ -133,7 +157,7 @@ def complete( kwargs["response_format"] = response_format # Make the call - response = litellm.completion(**kwargs) # type: ignore[union-attr] + response = await litellm.acompletion(**kwargs) # type: ignore[union-attr] # Extract content content = response.choices[0].message.content or "" @@ -152,7 +176,27 @@ def complete( raw_response=response, ) - def complete_with_tools( + async def complete_with_tools( + self, + messages: list[dict[str, Any]], + system: str, + tools: list[Tool], + tool_executor: Callable[[ToolUse], ToolResult], + max_iterations: int = 10, + max_tokens: int = 4096, + ) -> LLMResponse: + """Resiliently run a tool-use loop.""" + return await self._execute_with_resilience( + self._complete_with_tools, + messages=messages, + system=system, + tools=tools, + tool_executor=tool_executor, + max_iterations=max_iterations, + max_tokens=max_tokens + ) + + async def _complete_with_tools( self, messages: list[dict[str, Any]], system: str, @@ -161,7 +205,7 @@ def complete_with_tools( max_iterations: int = 10, max_tokens: int = 4096, ) -> LLMResponse: - """Run a tool-use loop until the LLM produces a final response.""" + """Internal tool-use loop implementation.""" # Prepare messages with system prompt current_messages = [] if system: @@ -189,7 +233,7 @@ def complete_with_tools( if self.api_base: kwargs["api_base"] = self.api_base - response = litellm.completion(**kwargs) # type: ignore[union-attr] + response = await litellm.acompletion(**kwargs) # type: ignore[union-attr] # Track tokens usage = response.usage diff --git a/core/framework/llm/mock.py b/core/framework/llm/mock.py index 0f17004526..4d00317cc1 100644 --- a/core/framework/llm/mock.py +++ b/core/framework/llm/mock.py @@ -6,6 +6,7 @@ from typing import Any from framework.llm.provider import LLMProvider, LLMResponse, Tool, ToolResult, ToolUse +from framework.llm.resilience import ResilienceConfig class MockLLMProvider(LLMProvider): @@ -26,13 +27,19 @@ class MockLLMProvider(LLMProvider): # Returns: {"name": "mock_value", "age": "mock_value"} """ - def __init__(self, model: str = "mock-model"): + def __init__( + self, + model: str = "mock-model", + resilience_config: ResilienceConfig | None = None, + ): """ Initialize the mock LLM provider. Args: model: Model name to report in responses (default: "mock-model") + resilience_config: Optional resilience configuration. 
""" + super().__init__(resilience_config) self.model = model def _extract_output_keys(self, system: str) -> list[str]: @@ -106,7 +113,7 @@ def _generate_mock_response( # Plain text mock response return "This is a mock response for testing purposes." - def complete( + async def complete( self, messages: list[dict[str, Any]], system: str = "", @@ -139,7 +146,7 @@ def complete( stop_reason="mock_complete", ) - def complete_with_tools( + async def complete_with_tools( self, messages: list[dict[str, Any]], system: str, diff --git a/core/framework/llm/provider.py b/core/framework/llm/provider.py index f8fd13ebfe..0347bc559d 100644 --- a/core/framework/llm/provider.py +++ b/core/framework/llm/provider.py @@ -6,6 +6,8 @@ from typing import Any +from framework.llm.resilience import ResilienceConfig, CircuitBreaker, RetryHandler + @dataclass class LLMResponse: """Response from an LLM call.""" @@ -56,8 +58,23 @@ class LLMProvider(ABC): - Error handling """ + def __init__(self, resilience_config: ResilienceConfig | None = None): + self.resilience_config = resilience_config or ResilienceConfig() + self._circuit_breaker = CircuitBreaker(self.resilience_config) + self._retry_handler = RetryHandler(self.resilience_config) + + async def _execute_with_resilience(self, func: Callable[..., Any], *args: Any, **kwargs: Any) -> Any: + """ + Helper to run a provider method with both retry and circuit breaker. + """ + # Inner function that the circuit breaker will call, which in turn calls retry + async def _call_with_retry(): + return await self._retry_handler.execute_with_retry(func, *args, **kwargs) + + return await self._circuit_breaker.call(_call_with_retry) + @abstractmethod - def complete( + async def complete( self, messages: list[dict[str, Any]], system: str = "", @@ -86,7 +103,7 @@ def complete( pass @abstractmethod - def complete_with_tools( + async def complete_with_tools( self, messages: list[dict[str, Any]], system: str, diff --git a/core/framework/llm/resilience.py b/core/framework/llm/resilience.py new file mode 100644 index 0000000000..8aa7e03570 --- /dev/null +++ b/core/framework/llm/resilience.py @@ -0,0 +1,130 @@ +import time +import asyncio +import random +import logging +from enum import Enum +from dataclasses import dataclass, field +from typing import Any, Callable, TypeVar, Generic + +logger = logging.getLogger(__name__) + +T = TypeVar("T") + +class CircuitState(Enum): + CLOSED = "closed" # Normal operation + OPEN = "open" # Failure detected, requests blocked + HALF_OPEN = "half_open" # Testing if service has recovered + +@dataclass +class ResilienceConfig: + """Configuration for LLM resilience features.""" + # Retry settings + max_retries: int = 3 + initial_delay: float = 1.0 + exponential_base: float = 2.0 + jitter: bool = True + + # Circuit Breaker settings + failure_threshold: int = 5 # Failures before opening circuit + recovery_timeout: float = 30.0 # Seconds to stay open before half-open + min_requests: int = 10 # Min requests before breaker can trip + +class CircuitBreaker: + """ + Implements the Circuit Breaker pattern to protect against provider outages. 
+ """ + def __init__(self, config: ResilienceConfig): + self.config = config + self.state = CircuitState.CLOSED + self.failure_count = 0 + self.last_failure_time = 0.0 + self.lock = asyncio.Lock() + + async def call(self, func: Callable[..., T], *args: Any, **kwargs: Any) -> T: + """Execute a function with circuit breaker protection.""" + async with self.lock: + await self._before_call() + + try: + result = await func(*args, **kwargs) + async with self.lock: + self._on_success() + return result + except Exception as e: + async with self.lock: + self._on_failure() + raise e + + async def _before_call(self) -> None: + """Check if call is allowed based on current state.""" + if self.state == CircuitState.OPEN: + elapsed = time.time() - self.last_failure_time + if elapsed >= self.config.recovery_timeout: + logger.info("๐Ÿ”Œ Circuit Breaker: Transitioning to HALF-OPEN") + self.state = CircuitState.HALF_OPEN + else: + raise RuntimeError( + f"Circuit is OPEN. Requests blocked for another {self.config.recovery_timeout - elapsed:.1f}s" + ) + + def _on_success(self) -> None: + """Handle successful call.""" + if self.state == CircuitState.HALF_OPEN: + logger.info("๐Ÿ”Œ Circuit Breaker: Recovery detected! Transitioning to CLOSED") + self.state = CircuitState.CLOSED + self.failure_count = 0 + elif self.state == CircuitState.CLOSED: + # Optionally reset failure count on success if we want "consecutive" failure logic + # For now, let's keep it simple: success in closed state doesn't reset but stays closed. + # actually resetting is better to prevent "leaky" failures over long time. + self.failure_count = 0 + + def _on_failure(self) -> None: + """Handle failed call.""" + self.failure_count += 1 + self.last_failure_time = time.time() + + if self.state == CircuitState.CLOSED: + if self.failure_count >= self.config.failure_threshold: + logger.warning( + f"๐Ÿ”Œ Circuit Breaker: Failure threshold ({self.config.failure_threshold}) " + f"reached. Opening circuit for {self.config.recovery_timeout}s" + ) + self.state = CircuitState.OPEN + elif self.state == CircuitState.HALF_OPEN: + logger.warning("๐Ÿ”Œ Circuit Breaker: Failed in HALF-OPEN state. Re-opening circuit.") + self.state = CircuitState.OPEN + +class RetryHandler: + """ + Handles exponential backoff retry logic. + """ + def __init__(self, config: ResilienceConfig): + self.config = config + + async def execute_with_retry(self, func: Callable[..., T], *args: Any, **kwargs: Any) -> T: + """Execute a function with exponential backoff retries.""" + last_error = None + + for attempt in range(self.config.max_retries + 1): + try: + return await func(*args, **kwargs) + except Exception as e: + last_error = e + if attempt == self.config.max_retries: + break + + # Calculate delay: base * (factor^attempt) + delay = self.config.initial_delay * (self.config.exponential_base ** attempt) + + # Add jitter to prevent thundering herd + if self.config.jitter: + delay *= (0.5 + random.random()) + + logger.warning( + f"โš ๏ธ Call failed: {str(e)}. " + f"Retrying in {delay:.2f}s (attempt {attempt + 1}/{self.config.max_retries})..." 
+ ) + await asyncio.sleep(delay) + + raise last_error if last_error else RuntimeError("Retry loop exhausted without error recorded") diff --git a/core/framework/llm/test_resilience.py b/core/framework/llm/test_resilience.py new file mode 100644 index 0000000000..2b7f9c2b22 --- /dev/null +++ b/core/framework/llm/test_resilience.py @@ -0,0 +1,109 @@ +import pytest +import asyncio +import time +from unittest.mock import AsyncMock, MagicMock +from framework.llm.resilience import ResilienceConfig, CircuitBreaker, RetryHandler, CircuitState + +@pytest.mark.asyncio +async def test_retry_handler_success(): + config = ResilienceConfig(max_retries=3, initial_delay=0.01) + handler = RetryHandler(config) + + func = AsyncMock(return_value="success") + result = await handler.execute_with_retry(func) + + assert result == "success" + assert func.call_count == 1 + +@pytest.mark.asyncio +async def test_retry_handler_fail_then_success(): + config = ResilienceConfig(max_retries=3, initial_delay=0.01) + handler = RetryHandler(config) + + func = AsyncMock() + func.side_effect = [ValueError("fail1"), ValueError("fail2"), "success"] + + result = await handler.execute_with_retry(func) + + assert result == "success" + assert func.call_count == 3 + +@pytest.mark.asyncio +async def test_retry_handler_exhaust_retries(): + config = ResilienceConfig(max_retries=2, initial_delay=0.01) + handler = RetryHandler(config) + + func = AsyncMock(side_effect=ValueError("constant fail")) + + with pytest.raises(ValueError, match="constant fail"): + await handler.execute_with_retry(func) + + assert func.call_count == 3 # Initial + 2 retries + +@pytest.mark.asyncio +async def test_circuit_breaker_tripping(): + config = ResilienceConfig(failure_threshold=2, recovery_timeout=0.1) + breaker = CircuitBreaker(config) + + func = AsyncMock(side_effect=ValueError("fail")) + + # First failure + with pytest.raises(ValueError): + await breaker.call(func) + assert breaker.state == CircuitState.CLOSED + + # Second failure - should trip + with pytest.raises(ValueError): + await breaker.call(func) + assert breaker.state == CircuitState.OPEN + + # Third call - should fail immediately without calling func + with pytest.raises(RuntimeError, match="Circuit is OPEN"): + await breaker.call(func) + assert func.call_count == 2 + +@pytest.mark.asyncio +async def test_circuit_breaker_recovery(): + config = ResilienceConfig(failure_threshold=1, recovery_timeout=0.05) + breaker = CircuitBreaker(config) + + func = AsyncMock(side_effect=ValueError("fail")) + + # Trip the circuit + with pytest.raises(ValueError): + await breaker.call(func) + assert breaker.state == CircuitState.OPEN + + # Wait for recovery timeout + await asyncio.sleep(0.1) + + # Next call should be HALF-OPEN + func.side_effect = None + func.return_value = "recovered" + + result = await breaker.call(func) + assert result == "recovered" + assert breaker.state == CircuitState.CLOSED + assert breaker.failure_count == 0 + +@pytest.mark.asyncio +async def test_circuit_breaker_half_open_failure(): + config = ResilienceConfig(failure_threshold=1, recovery_timeout=0.05) + breaker = CircuitBreaker(config) + + # Trip it + try: + await breaker.call(AsyncMock(side_effect=ValueError("fail"))) + except ValueError: + pass + assert breaker.state == CircuitState.OPEN + + await asyncio.sleep(0.1) + + # Call fails in HALF-OPEN + func = AsyncMock(side_effect=ValueError("still failing")) + with pytest.raises(ValueError): + await breaker.call(func) + + assert breaker.state == CircuitState.OPEN + assert func.call_count 
== 1 diff --git a/core/framework/mcp/agent_builder_server.py b/core/framework/mcp/agent_builder_server.py index 99275315f8..fd573ba686 100644 --- a/core/framework/mcp/agent_builder_server.py +++ b/core/framework/mcp/agent_builder_server.py @@ -122,11 +122,11 @@ def _save_session(session: BuildSession): # Save session file session_file = SESSIONS_DIR / f"{session.id}.json" - with open(session_file, "w") as f: + with open(session_file, "w", encoding="utf-8") as f: json.dump(session.to_dict(), f, indent=2, default=str) # Update active session pointer - with open(ACTIVE_SESSION_FILE, "w") as f: + with open(ACTIVE_SESSION_FILE, "w", encoding="utf-8") as f: f.write(session.id) @@ -136,7 +136,7 @@ def _load_session(session_id: str) -> BuildSession: if not session_file.exists(): raise ValueError(f"Session '{session_id}' not found") - with open(session_file) as f: + with open(session_file, encoding="utf-8") as f: data = json.load(f) return BuildSession.from_dict(data) @@ -148,7 +148,7 @@ def _load_active_session() -> BuildSession | None: return None try: - with open(ACTIVE_SESSION_FILE) as f: + with open(ACTIVE_SESSION_FILE, encoding="utf-8") as f: session_id = f.read().strip() if session_id: @@ -202,7 +202,7 @@ def list_sessions() -> str: if SESSIONS_DIR.exists(): for session_file in SESSIONS_DIR.glob("*.json"): try: - with open(session_file) as f: + with open(session_file, encoding="utf-8") as f: data = json.load(f) sessions.append( { @@ -222,7 +222,7 @@ def list_sessions() -> str: active_id = None if ACTIVE_SESSION_FILE.exists(): try: - with open(ACTIVE_SESSION_FILE) as f: + with open(ACTIVE_SESSION_FILE, encoding="utf-8") as f: active_id = f.read().strip() except Exception: pass @@ -246,7 +246,7 @@ def load_session_by_id(session_id: Annotated[str, "ID of the session to load"]) _session = _load_session(session_id) # Update active session pointer - with open(ACTIVE_SESSION_FILE, "w") as f: + with open(ACTIVE_SESSION_FILE, "w", encoding="utf-8") as f: f.write(session_id) return json.dumps( @@ -284,7 +284,7 @@ def delete_session(session_id: Annotated[str, "ID of the session to delete"]) -> _session = None if ACTIVE_SESSION_FILE.exists(): - with open(ACTIVE_SESSION_FILE) as f: + with open(ACTIVE_SESSION_FILE, encoding="utf-8") as f: active_id = f.read().strip() if active_id == session_id: ACTIVE_SESSION_FILE.unlink() diff --git a/core/framework/runner/cli.py b/core/framework/runner/cli.py index 9f9b789e1a..bc92d06de1 100644 --- a/core/framework/runner/cli.py +++ b/core/framework/runner/cli.py @@ -199,7 +199,7 @@ def cmd_run(args: argparse.Namespace) -> int: return 1 elif args.input_file: try: - with open(args.input_file) as f: + with open(args.input_file, encoding="utf-8") as f: context = json.load(f) except (FileNotFoundError, json.JSONDecodeError) as e: print(f"Error reading input file: {e}", file=sys.stderr) @@ -251,7 +251,7 @@ def cmd_run(args: argparse.Namespace) -> int: # Output results if args.output: - with open(args.output, "w") as f: + with open(args.output, "w", encoding="utf-8") as f: json.dump(output, f, indent=2, default=str) if not args.quiet: print(f"Results written to {args.output}") diff --git a/core/framework/storage/backend.py b/core/framework/storage/backend.py index 9cb94ac31b..26bef57065 100644 --- a/core/framework/storage/backend.py +++ b/core/framework/storage/backend.py @@ -46,19 +46,53 @@ def _ensure_dirs(self) -> None: for d in dirs: d.mkdir(parents=True, exist_ok=True) + def _validate_key(self, key: str) -> None: + """ + Validate key to prevent path traversal attacks. 
+ + Args: + key: The key to validate + + Raises: + ValueError: If key contains path traversal or dangerous patterns + """ + if not key or key.strip() == "": + raise ValueError("Key cannot be empty") + + # Block path separators + if "/" in key or "\\" in key: + raise ValueError(f"Invalid key format: path separators not allowed in '{key}'") + + # Block parent directory references + if ".." in key or key.startswith("."): + raise ValueError(f"Invalid key format: path traversal detected in '{key}'") + + # Block absolute paths + if key.startswith("/") or (len(key) > 1 and key[1] == ":"): + raise ValueError(f"Invalid key format: absolute paths not allowed in '{key}'") + + # Block null bytes (Unix path injection) + if "\x00" in key: + raise ValueError("Invalid key format: null bytes not allowed") + + # Block other dangerous special characters + dangerous_chars = {"<", ">", "|", "&", "$", "`", "'", '"'} + if any(char in key for char in dangerous_chars): + raise ValueError(f"Invalid key format: contains dangerous characters in '{key}'") + # === RUN OPERATIONS === def save_run(self, run: Run) -> None: """Save a run to storage.""" # Save full run using Pydantic's model_dump_json run_path = self.base_path / "runs" / f"{run.id}.json" - with open(run_path, "w") as f: + with open(run_path, "w", encoding="utf-8") as f: f.write(run.model_dump_json(indent=2)) # Save summary summary = RunSummary.from_run(run) summary_path = self.base_path / "summaries" / f"{run.id}.json" - with open(summary_path, "w") as f: + with open(summary_path, "w", encoding="utf-8") as f: f.write(summary.model_dump_json(indent=2)) # Update indexes @@ -72,7 +106,7 @@ def load_run(self, run_id: str) -> Run | None: run_path = self.base_path / "runs" / f"{run_id}.json" if not run_path.exists(): return None - with open(run_path) as f: + with open(run_path, encoding="utf-8") as f: return Run.model_validate_json(f.read()) def load_summary(self, run_id: str) -> RunSummary | None: @@ -85,7 +119,7 @@ def load_summary(self, run_id: str) -> RunSummary | None: return RunSummary.from_run(run) return None - with open(summary_path) as f: + with open(summary_path, encoding="utf-8") as f: return RunSummary.model_validate_json(f.read()) def delete_run(self, run_id: str) -> bool: @@ -140,10 +174,11 @@ def list_all_goals(self) -> list[str]: def _get_index(self, index_type: str, key: str) -> list[str]: """Get values from an index.""" + self._validate_key(key) # Prevent path traversal index_path = self.base_path / "indexes" / index_type / f"{key}.json" if not index_path.exists(): return [] - with open(index_path) as f: + with open(index_path, encoding="utf-8") as f: return json.load(f) def _add_to_index(self, index_type: str, key: str, value: str) -> None: @@ -152,7 +187,7 @@ def _add_to_index(self, index_type: str, key: str, value: str) -> None: values = self._get_index(index_type, key) if value not in values: values.append(value) - with open(index_path, "w") as f: + with open(index_path, "w", encoding="utf-8") as f: json.dump(values, f) def _remove_from_index(self, index_type: str, key: str, value: str) -> None: @@ -161,7 +196,7 @@ def _remove_from_index(self, index_type: str, key: str, value: str) -> None: values = self._get_index(index_type, key) if value in values: values.remove(value) - with open(index_path, "w") as f: + with open(index_path, "w", encoding="utf-8") as f: json.dump(values, f) # === UTILITY === diff --git a/core/framework/testing/test_storage.py b/core/framework/testing/test_storage.py index b7462d201d..f0ac396f0f 100644 --- 
a/core/framework/testing/test_storage.py +++ b/core/framework/testing/test_storage.py @@ -55,17 +55,53 @@ def _ensure_dirs(self) -> None: for d in dirs: d.mkdir(parents=True, exist_ok=True) + def _validate_key(self, key: str) -> None: + """ + Validate key to prevent path traversal attacks. + + Args: + key: The key to validate + + Raises: + ValueError: If key contains path traversal or dangerous patterns + """ + if not key or key.strip() == "": + raise ValueError("Key cannot be empty") + + # Block path separators + if "/" in key or "\\" in key: + raise ValueError(f"Invalid key format: path separators not allowed in '{key}'") + + # Block parent directory references + if ".." in key or key.startswith("."): + raise ValueError(f"Invalid key format: path traversal detected in '{key}'") + + # Block absolute paths + if key.startswith("/") or (len(key) > 1 and key[1] == ":"): + raise ValueError(f"Invalid key format: absolute paths not allowed in '{key}'") + + # Block null bytes (Unix path injection) + if "\x00" in key: + raise ValueError("Invalid key format: null bytes not allowed") + + # Block other dangerous special characters + dangerous_chars = {"<", ">", "|", "&", "$", "`", "'", '"'} + if any(char in key for char in dangerous_chars): + raise ValueError(f"Invalid key format: contains dangerous characters in '{key}'") + # === TEST OPERATIONS === def save_test(self, test: Test) -> None: """Save a test to storage.""" + self._validate_key(test.id) + self._validate_key(test.goal_id) # Ensure goal directory exists goal_dir = self.base_path / "tests" / test.goal_id goal_dir.mkdir(parents=True, exist_ok=True) # Save full test test_path = goal_dir / f"{test.id}.json" - with open(test_path, "w") as f: + with open(test_path, "w", encoding="utf-8") as f: f.write(test.model_dump_json(indent=2)) # Update indexes @@ -79,7 +115,7 @@ def load_test(self, goal_id: str, test_id: str) -> Test | None: test_path = self.base_path / "tests" / goal_id / f"{test_id}.json" if not test_path.exists(): return None - with open(test_path) as f: + with open(test_path, encoding="utf-8") as f: return Test.model_validate_json(f.read()) def delete_test(self, goal_id: str, test_id: str) -> bool: @@ -175,12 +211,12 @@ def save_result(self, test_id: str, result: TestResult) -> None: # Save with timestamp timestamp = result.timestamp.strftime("%Y%m%d_%H%M%S") result_path = results_dir / f"{timestamp}.json" - with open(result_path, "w") as f: + with open(result_path, "w", encoding="utf-8") as f: f.write(result.model_dump_json(indent=2)) # Update latest latest_path = results_dir / "latest.json" - with open(latest_path, "w") as f: + with open(latest_path, "w", encoding="utf-8") as f: f.write(result.model_dump_json(indent=2)) def get_latest_result(self, test_id: str) -> TestResult | None: @@ -188,7 +224,7 @@ def get_latest_result(self, test_id: str) -> TestResult | None: latest_path = self.base_path / "results" / test_id / "latest.json" if not latest_path.exists(): return None - with open(latest_path) as f: + with open(latest_path, encoding="utf-8") as f: return TestResult.model_validate_json(f.read()) def get_result_history(self, test_id: str, limit: int = 10) -> list[TestResult]: @@ -204,7 +240,7 @@ def get_result_history(self, test_id: str, limit: int = 10) -> list[TestResult]: results = [] for f in result_files: - with open(f) as file: + with open(f, encoding="utf-8") as file: results.append(TestResult.model_validate_json(file.read())) return results @@ -213,10 +249,11 @@ def get_result_history(self, test_id: str, limit: int = 10) -> 
list[TestResult]: def _get_index(self, index_type: str, key: str) -> list[str]: """Get values from an index.""" + self._validate_key(key) # Prevent path traversal index_path = self.base_path / "indexes" / index_type / f"{key}.json" if not index_path.exists(): return [] - with open(index_path) as f: + with open(index_path, encoding="utf-8") as f: return json.load(f) def _add_to_index(self, index_type: str, key: str, value: str) -> None: @@ -225,7 +262,7 @@ def _add_to_index(self, index_type: str, key: str, value: str) -> None: values = self._get_index(index_type, key) if value not in values: values.append(value) - with open(index_path, "w") as f: + with open(index_path, "w", encoding="utf-8") as f: json.dump(values, f) def _remove_from_index(self, index_type: str, key: str, value: str) -> None: @@ -234,7 +271,7 @@ def _remove_from_index(self, index_type: str, key: str, value: str) -> None: values = self._get_index(index_type, key) if value in values: values.remove(value) - with open(index_path, "w") as f: + with open(index_path, "w", encoding="utf-8") as f: json.dump(values, f) # === UTILITY === diff --git a/core/setup_mcp.py b/core/setup_mcp.py index d7b4dfff0b..237aae9bc9 100755 --- a/core/setup_mcp.py +++ b/core/setup_mcp.py @@ -97,7 +97,7 @@ def main(): if mcp_config_path.exists(): log_success("MCP configuration found at .mcp.json") logger.info("Configuration:") - with open(mcp_config_path) as f: + with open(mcp_config_path, encoding="utf-8") as f: config = json.load(f) logger.info(json.dumps(config, indent=2)) else: @@ -114,7 +114,7 @@ def main(): } } - with open(mcp_config_path, "w") as f: + with open(mcp_config_path, "w", encoding="utf-8") as f: json.dump(config, f, indent=2) log_success("Created .mcp.json") diff --git a/core/tests/test_litellm_provider.py b/core/tests/test_litellm_provider.py index e2b67683a2..0a02f687d3 100644 --- a/core/tests/test_litellm_provider.py +++ b/core/tests/test_litellm_provider.py @@ -10,11 +10,13 @@ """ import os -from unittest.mock import MagicMock, patch +import pytest +from unittest.mock import MagicMock, patch, AsyncMock from framework.llm.anthropic import AnthropicProvider from framework.llm.litellm import LiteLLMProvider from framework.llm.provider import LLMProvider, Tool, ToolResult, ToolUse +from framework.llm.resilience import ResilienceConfig class TestLiteLLMProviderInit: @@ -63,8 +65,9 @@ def test_init_ollama_no_key_needed(self): class TestLiteLLMProviderComplete: """Test LiteLLMProvider.complete() method.""" - @patch("litellm.completion") - def test_complete_basic(self, mock_completion): + @pytest.mark.asyncio + @patch("litellm.acompletion", new_callable=AsyncMock) + async def test_complete_basic(self, mock_acompletion): """Test basic completion call.""" # Mock response mock_response = MagicMock() @@ -74,10 +77,10 @@ def test_complete_basic(self, mock_completion): mock_response.model = "gpt-4o-mini" mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 20 - mock_completion.return_value = mock_response + mock_acompletion.return_value = mock_response provider = LiteLLMProvider(model="gpt-4o-mini", api_key="test-key") - result = provider.complete(messages=[{"role": "user", "content": "Hello"}]) + result = await provider.complete(messages=[{"role": "user", "content": "Hello"}]) assert result.content == "Hello! I'm an AI assistant." 
assert result.model == "gpt-4o-mini" @@ -85,14 +88,15 @@ def test_complete_basic(self, mock_completion): assert result.output_tokens == 20 assert result.stop_reason == "stop" - # Verify litellm.completion was called correctly - mock_completion.assert_called_once() - call_kwargs = mock_completion.call_args[1] + # Verify litellm.acompletion was called correctly + mock_acompletion.assert_called_once() + call_kwargs = mock_acompletion.call_args[1] assert call_kwargs["model"] == "gpt-4o-mini" assert call_kwargs["api_key"] == "test-key" - @patch("litellm.completion") - def test_complete_with_system_prompt(self, mock_completion): + @pytest.mark.asyncio + @patch("litellm.acompletion", new_callable=AsyncMock) + async def test_complete_with_system_prompt(self, mock_acompletion): """Test completion with system prompt.""" mock_response = MagicMock() mock_response.choices = [MagicMock()] @@ -101,20 +105,21 @@ def test_complete_with_system_prompt(self, mock_completion): mock_response.model = "gpt-4o-mini" mock_response.usage.prompt_tokens = 15 mock_response.usage.completion_tokens = 5 - mock_completion.return_value = mock_response + mock_acompletion.return_value = mock_response provider = LiteLLMProvider(model="gpt-4o-mini", api_key="test-key") - provider.complete( + await provider.complete( messages=[{"role": "user", "content": "Hello"}], system="You are a helpful assistant." ) - call_kwargs = mock_completion.call_args[1] + call_kwargs = mock_acompletion.call_args[1] messages = call_kwargs["messages"] assert messages[0]["role"] == "system" assert messages[0]["content"] == "You are a helpful assistant." - @patch("litellm.completion") - def test_complete_with_tools(self, mock_completion): + @pytest.mark.asyncio + @patch("litellm.acompletion", new_callable=AsyncMock) + async def test_complete_with_tools(self, mock_acompletion): """Test completion with tools.""" mock_response = MagicMock() mock_response.choices = [MagicMock()] @@ -123,7 +128,7 @@ def test_complete_with_tools(self, mock_completion): mock_response.model = "gpt-4o-mini" mock_response.usage.prompt_tokens = 20 mock_response.usage.completion_tokens = 10 - mock_completion.return_value = mock_response + mock_acompletion.return_value = mock_response provider = LiteLLMProvider(model="gpt-4o-mini", api_key="test-key") @@ -138,11 +143,11 @@ def test_complete_with_tools(self, mock_completion): ) ] - provider.complete( + await provider.complete( messages=[{"role": "user", "content": "What's the weather?"}], tools=tools ) - call_kwargs = mock_completion.call_args[1] + call_kwargs = mock_acompletion.call_args[1] assert "tools" in call_kwargs assert call_kwargs["tools"][0]["type"] == "function" assert call_kwargs["tools"][0]["function"]["name"] == "get_weather" @@ -151,8 +156,9 @@ def test_complete_with_tools(self, mock_completion): class TestLiteLLMProviderToolUse: """Test LiteLLMProvider.complete_with_tools() method.""" - @patch("litellm.completion") - def test_complete_with_tools_single_iteration(self, mock_completion): + @pytest.mark.asyncio + @patch("litellm.acompletion", new_callable=AsyncMock) + async def test_complete_with_tools_single_iteration(self, mock_acompletion): """Test tool use with single iteration.""" # First response: tool call tool_call_response = MagicMock() @@ -179,7 +185,7 @@ def test_complete_with_tools_single_iteration(self, mock_completion): final_response.usage.prompt_tokens = 30 final_response.usage.completion_tokens = 10 - mock_completion.side_effect = [tool_call_response, final_response] + mock_acompletion.side_effect = 
[tool_call_response, final_response] provider = LiteLLMProvider(model="gpt-4o-mini", api_key="test-key") @@ -197,7 +203,7 @@ def test_complete_with_tools_single_iteration(self, mock_completion): def tool_executor(tool_use: ToolUse) -> ToolResult: return ToolResult(tool_use_id=tool_use.id, content="Sunny, 22C", is_error=False) - result = provider.complete_with_tools( + result = await provider.complete_with_tools( messages=[{"role": "user", "content": "What's the weather in London?"}], system="You are a weather assistant.", tools=tools, @@ -207,7 +213,7 @@ def tool_executor(tool_use: ToolUse) -> ToolResult: assert result.content == "The weather in London is sunny." assert result.input_tokens == 50 # 20 + 30 assert result.output_tokens == 25 # 15 + 10 - assert mock_completion.call_count == 2 + assert mock_acompletion.call_count == 2 class TestToolConversion: @@ -261,8 +267,9 @@ def test_anthropic_provider_uses_litellm_internally(self): assert provider._provider.model == "claude-3-haiku-20240307" assert provider._provider.api_key == "test-key" - @patch("litellm.completion") - def test_anthropic_provider_complete(self, mock_completion): + @pytest.mark.asyncio + @patch("litellm.acompletion", new_callable=AsyncMock) + async def test_anthropic_provider_complete(self, mock_acompletion): """Test AnthropicProvider.complete() delegates to LiteLLM.""" mock_response = MagicMock() mock_response.choices = [MagicMock()] @@ -271,10 +278,10 @@ def test_anthropic_provider_complete(self, mock_completion): mock_response.model = "claude-3-haiku-20240307" mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 5 - mock_completion.return_value = mock_response + mock_acompletion.return_value = mock_response provider = AnthropicProvider(api_key="test-key", model="claude-3-haiku-20240307") - result = provider.complete( + result = await provider.complete( messages=[{"role": "user", "content": "Hello"}], system="You are helpful.", max_tokens=100, @@ -285,13 +292,14 @@ def test_anthropic_provider_complete(self, mock_completion): assert result.input_tokens == 10 assert result.output_tokens == 5 - mock_completion.assert_called_once() - call_kwargs = mock_completion.call_args[1] + mock_acompletion.assert_called_once() + call_kwargs = mock_acompletion.call_args[1] assert call_kwargs["model"] == "claude-3-haiku-20240307" assert call_kwargs["api_key"] == "test-key" - @patch("litellm.completion") - def test_anthropic_provider_complete_with_tools(self, mock_completion): + @pytest.mark.asyncio + @patch("litellm.acompletion", new_callable=AsyncMock) + async def test_anthropic_provider_complete_with_tools(self, mock_acompletion): """Test AnthropicProvider.complete_with_tools() delegates to LiteLLM.""" # Mock a simple response (no tool calls) mock_response = MagicMock() @@ -302,7 +310,7 @@ def test_anthropic_provider_complete_with_tools(self, mock_completion): mock_response.model = "claude-3-haiku-20240307" mock_response.usage.prompt_tokens = 20 mock_response.usage.completion_tokens = 10 - mock_completion.return_value = mock_response + mock_acompletion.return_value = mock_response provider = AnthropicProvider(api_key="test-key", model="claude-3-haiku-20240307") @@ -317,7 +325,7 @@ def test_anthropic_provider_complete_with_tools(self, mock_completion): def tool_executor(tool_use: ToolUse) -> ToolResult: return ToolResult(tool_use_id=tool_use.id, content="3:00 PM", is_error=False) - result = provider.complete_with_tools( + result = await provider.complete_with_tools( messages=[{"role": "user", "content": "What 
time is it?"}], system="You are a time assistant.", tools=tools, @@ -325,10 +333,11 @@ def tool_executor(tool_use: ToolUse) -> ToolResult: ) assert result.content == "The time is 3:00 PM." - mock_completion.assert_called_once() + mock_acompletion.assert_called_once() - @patch("litellm.completion") - def test_anthropic_provider_passes_response_format(self, mock_completion): + @pytest.mark.asyncio + @patch("litellm.acompletion", new_callable=AsyncMock) + async def test_anthropic_provider_passes_response_format(self, mock_acompletion): """Test that AnthropicProvider accepts and forwards response_format.""" # Setup mock mock_response = MagicMock() @@ -338,23 +347,24 @@ def test_anthropic_provider_passes_response_format(self, mock_completion): mock_response.model = "claude-3-haiku-20240307" mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 5 - mock_completion.return_value = mock_response + mock_acompletion.return_value = mock_response provider = AnthropicProvider(api_key="test-key") fmt = {"type": "json_object"} - provider.complete(messages=[{"role": "user", "content": "hi"}], response_format=fmt) + await provider.complete(messages=[{"role": "user", "content": "hi"}], response_format=fmt) # Verify it was passed to litellm - call_kwargs = mock_completion.call_args[1] + call_kwargs = mock_acompletion.call_args[1] assert call_kwargs["response_format"] == fmt class TestJsonMode: """Test json_mode parameter for structured JSON output via prompt engineering.""" - @patch("litellm.completion") - def test_json_mode_adds_instruction_to_system_prompt(self, mock_completion): + @pytest.mark.asyncio + @patch("litellm.acompletion", new_callable=AsyncMock) + async def test_json_mode_adds_instruction_to_system_prompt(self, mock_acompletion): """Test that json_mode=True adds JSON instruction to system prompt.""" mock_response = MagicMock() mock_response.choices = [MagicMock()] @@ -363,16 +373,16 @@ def test_json_mode_adds_instruction_to_system_prompt(self, mock_completion): mock_response.model = "gpt-4o-mini" mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 5 - mock_completion.return_value = mock_response + mock_acompletion.return_value = mock_response provider = LiteLLMProvider(model="gpt-4o-mini", api_key="test-key") - provider.complete( + await provider.complete( messages=[{"role": "user", "content": "Return JSON"}], system="You are helpful.", json_mode=True, ) - call_kwargs = mock_completion.call_args[1] + call_kwargs = mock_acompletion.call_args[1] # Should NOT use response_format (prompt engineering instead) assert "response_format" not in call_kwargs # Should have JSON instruction appended to system message @@ -381,8 +391,9 @@ def test_json_mode_adds_instruction_to_system_prompt(self, mock_completion): assert "You are helpful." 
in messages[0]["content"] assert "Please respond with a valid JSON object" in messages[0]["content"] - @patch("litellm.completion") - def test_json_mode_creates_system_prompt_if_none(self, mock_completion): + @pytest.mark.asyncio + @patch("litellm.acompletion", new_callable=AsyncMock) + async def test_json_mode_creates_system_prompt_if_none(self, mock_acompletion): """Test that json_mode=True creates system prompt if none provided.""" mock_response = MagicMock() mock_response.choices = [MagicMock()] @@ -391,19 +402,20 @@ def test_json_mode_creates_system_prompt_if_none(self, mock_completion): mock_response.model = "gpt-4o-mini" mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 5 - mock_completion.return_value = mock_response + mock_acompletion.return_value = mock_response provider = LiteLLMProvider(model="gpt-4o-mini", api_key="test-key") - provider.complete(messages=[{"role": "user", "content": "Return JSON"}], json_mode=True) + await provider.complete(messages=[{"role": "user", "content": "Return JSON"}], json_mode=True) - call_kwargs = mock_completion.call_args[1] + call_kwargs = mock_acompletion.call_args[1] messages = call_kwargs["messages"] # Should insert a system message with JSON instruction assert messages[0]["role"] == "system" assert "Please respond with a valid JSON object" in messages[0]["content"] - @patch("litellm.completion") - def test_json_mode_false_no_instruction(self, mock_completion): + @pytest.mark.asyncio + @patch("litellm.acompletion", new_callable=AsyncMock) + async def test_json_mode_false_no_instruction(self, mock_acompletion): """Test that json_mode=False does not add JSON instruction.""" mock_response = MagicMock() mock_response.choices = [MagicMock()] @@ -412,23 +424,24 @@ def test_json_mode_false_no_instruction(self, mock_completion): mock_response.model = "gpt-4o-mini" mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 5 - mock_completion.return_value = mock_response + mock_acompletion.return_value = mock_response provider = LiteLLMProvider(model="gpt-4o-mini", api_key="test-key") - provider.complete( + await provider.complete( messages=[{"role": "user", "content": "Hello"}], system="You are helpful.", json_mode=False, ) - call_kwargs = mock_completion.call_args[1] + call_kwargs = mock_acompletion.call_args[1] assert "response_format" not in call_kwargs messages = call_kwargs["messages"] assert messages[0]["role"] == "system" assert "Please respond with a valid JSON object" not in messages[0]["content"] - @patch("litellm.completion") - def test_json_mode_default_is_false(self, mock_completion): + @pytest.mark.asyncio + @patch("litellm.acompletion", new_callable=AsyncMock) + async def test_json_mode_default_is_false(self, mock_acompletion): """Test that json_mode defaults to False (no JSON instruction).""" mock_response = MagicMock() mock_response.choices = [MagicMock()] @@ -437,21 +450,22 @@ def test_json_mode_default_is_false(self, mock_completion): mock_response.model = "gpt-4o-mini" mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 5 - mock_completion.return_value = mock_response + mock_acompletion.return_value = mock_response provider = LiteLLMProvider(model="gpt-4o-mini", api_key="test-key") - provider.complete( + await provider.complete( messages=[{"role": "user", "content": "Hello"}], system="You are helpful." 
) - call_kwargs = mock_completion.call_args[1] + call_kwargs = mock_acompletion.call_args[1] assert "response_format" not in call_kwargs messages = call_kwargs["messages"] # System prompt should be unchanged assert messages[0]["content"] == "You are helpful." - @patch("litellm.completion") - def test_anthropic_provider_passes_json_mode(self, mock_completion): + @pytest.mark.asyncio + @patch("litellm.acompletion", new_callable=AsyncMock) + async def test_anthropic_provider_passes_json_mode(self, mock_acompletion): """Test that AnthropicProvider passes json_mode through (prompt engineering).""" mock_response = MagicMock() mock_response.choices = [MagicMock()] @@ -460,16 +474,16 @@ def test_anthropic_provider_passes_json_mode(self, mock_completion): mock_response.model = "claude-haiku-4-5-20251001" mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 5 - mock_completion.return_value = mock_response + mock_acompletion.return_value = mock_response provider = AnthropicProvider(api_key="test-key") - provider.complete( + await provider.complete( messages=[{"role": "user", "content": "Return JSON"}], system="You are helpful.", json_mode=True, ) - call_kwargs = mock_completion.call_args[1] + call_kwargs = mock_acompletion.call_args[1] # Should NOT use response_format assert "response_format" not in call_kwargs # Should have JSON instruction in system prompt diff --git a/single_test_error.txt b/single_test_error.txt new file mode 100644 index 0000000000..26acfa601f --- /dev/null +++ b/single_test_error.txt @@ -0,0 +1,60 @@ +============================= test session starts ============================= +platform win32 -- Python 3.12.6, pytest-9.0.2, pluggy-1.6.0 -- C:\Python312\python.exe +cachedir: .pytest_cache +rootdir: C:\Users\Advika Nagool\Desktop\Hiv\core +configfile: pyproject.toml +plugins: anyio-4.10.0, langsmith-0.4.31, asyncio-1.3.0, xdist-3.8.0, typeguard-4.4.4 +asyncio: mode=Mode.STRICT, debug=False, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function +collecting ... 
collected 1 item + +core\tests\test_builder.py::TestBuilderQueryBasics::test_get_recent_failures FAILED [100%] + +================================== FAILURES =================================== +_______________ TestBuilderQueryBasics.test_get_recent_failures _______________ + +self = +tmp_path = WindowsPath('C:/Users/Advika Nagool/AppData/Local/Temp/pytest-of-Advika Nagool/pytest-9/test_get_recent_failures0') + + def test_get_recent_failures(self, tmp_path: Path): + """Test getting recent failed runs.""" + runtime = Runtime(tmp_path) + create_successful_run(runtime) +> create_failed_run(runtime) + +core\tests\test_builder.py:120: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +core\tests\test_builder.py:67: in create_failed_run + runtime.end_run(success=False, narrative="Failed to process - no data") +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +self = +input = '{\r\n "run_id": "run_20260129_001523_5d095b01",\r\n "goal_id": "test_goal",\r\n "status": "failed",\r\n "duration...ical_problems": [\r\n "Processing failed due to empty input"\r\n ],\r\n "warnings": [],\r\n "successes": []\r\n}' +final = False + + def encode(self, input, final=False): +> return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 264: character maps to + +C:\Python312\Lib\encodings\cp1252.py:19: UnicodeEncodeError +============================== warnings summary =============================== +core\framework\graph\safe_eval.py:79 + C:\Users\Advika Nagool\Desktop\Hiv\core\framework\graph\safe_eval.py:79: DeprecationWarning: ast.Num is deprecated and will be removed in Python 3.14; use ast.Constant instead + def visit_Num(self, node: ast.Num) -> Any: + +core\framework\graph\safe_eval.py:82 + C:\Users\Advika Nagool\Desktop\Hiv\core\framework\graph\safe_eval.py:82: DeprecationWarning: ast.Str is deprecated and will be removed in Python 3.14; use ast.Constant instead + def visit_Str(self, node: ast.Str) -> Any: + +core\framework\graph\safe_eval.py:85 + C:\Users\Advika Nagool\Desktop\Hiv\core\framework\graph\safe_eval.py:85: DeprecationWarning: ast.NameConstant is deprecated and will be removed in Python 3.14; use ast.Constant instead + def visit_NameConstant(self, node: ast.NameConstant) -> Any: + +-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html +=========================== short test summary info =========================== +FAILED core\tests\test_builder.py::TestBuilderQueryBasics::test_get_recent_failures - UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 264: character maps to +======================== 1 failed, 3 warnings in 4.15s ======================== diff --git a/test_failures.txt b/test_failures.txt new file mode 100644 index 0000000000..9c876343cf Binary files /dev/null and b/test_failures.txt differ diff --git a/test_failures_short.txt b/test_failures_short.txt new file mode 100644 index 0000000000..a3ad3ae4fd --- /dev/null +++ b/test_failures_short.txt @@ -0,0 +1,551 @@ +============================= test session starts ============================= +platform win32 -- Python 3.12.6, pytest-9.0.2, pluggy-1.6.0 -- C:\Python312\python.exe +cachedir: 
.pytest_cache +rootdir: C:\Users\Advika Nagool\Desktop\Hiv\core +configfile: pyproject.toml +plugins: anyio-4.10.0, langsmith-0.4.31, asyncio-1.3.0, xdist-3.8.0, typeguard-4.4.4 +asyncio: mode=Mode.STRICT, debug=False, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function +collecting ... collected 273 items + +core\tests\test_builder.py::TestBuilderQueryBasics::test_get_run_summary PASSED [ 0%] +core\tests\test_builder.py::TestBuilderQueryBasics::test_get_full_run PASSED [ 0%] +core\tests\test_builder.py::TestBuilderQueryBasics::test_list_runs_for_goal PASSED [ 1%] +core\tests\test_builder.py::TestBuilderQueryBasics::test_get_recent_failures FAILED [ 1%] +core\tests\test_builder.py::TestFailureAnalysis::test_analyze_failure FAILED [ 1%] +core\tests\test_builder.py::TestFailureAnalysis::test_analyze_failure_returns_none_for_success PASSED [ 2%] +core\tests\test_builder.py::TestFailureAnalysis::test_failure_analysis_has_suggestions FAILED [ 2%] +core\tests\test_builder.py::TestFailureAnalysis::test_get_decision_trace PASSED [ 2%] +core\tests\test_builder.py::TestPatternAnalysis::test_find_patterns_basic FAILED [ 3%] +core\tests\test_builder.py::TestPatternAnalysis::test_find_patterns_common_failures FAILED [ 3%] +core\tests\test_builder.py::TestPatternAnalysis::test_find_patterns_problematic_nodes FAILED [ 4%] +core\tests\test_builder.py::TestPatternAnalysis::test_compare_runs FAILED [ 4%] +core\tests\test_builder.py::TestImprovementSuggestions::test_suggest_improvements FAILED [ 4%] +core\tests\test_builder.py::TestImprovementSuggestions::test_suggest_improvements_for_low_success_rate FAILED [ 5%] +core\tests\test_builder.py::TestImprovementSuggestions::test_get_node_performance PASSED [ 5%] +core\tests\test_builder.py::TestBuilderWorkflow::test_builder_investigation_workflow FAILED [ 5%] +core\tests\test_execution_stream.py::test_execution_stream_retention PASSED [ 6%] +core\tests\test_executor_max_retries.py::test_executor_respects_custom_max_retries_high PASSED [ 6%] +core\tests\test_executor_max_retries.py::test_executor_respects_custom_max_retries_low PASSED [ 6%] +core\tests\test_executor_max_retries.py::test_executor_respects_default_max_retries PASSED [ 7%] +core\tests\test_executor_max_retries.py::test_executor_max_retries_two_succeeds_on_second PASSED [ 7%] +core\tests\test_executor_max_retries.py::test_executor_different_nodes_different_max_retries PASSED [ 8%] +core\tests\test_fanout.py::test_fanout_triggers_on_multiple_success_edges PASSED [ 8%] +core\tests\test_fanout.py::test_branches_execute_concurrently PASSED [ 8%] +core\tests\test_fanout.py::test_convergence_at_fan_in_node PASSED [ 9%] +core\tests\test_fanout.py::test_fail_all_strategy_raises_on_branch_failure PASSED [ 9%] +core\tests\test_fanout.py::test_continue_others_strategy_allows_partial_success PASSED [ 9%] +core\tests\test_fanout.py::test_wait_all_strategy_collects_all_results PASSED [ 10%] +core\tests\test_fanout.py::test_per_branch_retry PASSED [ 10%] +core\tests\test_fanout.py::test_single_edge_no_parallel_overhead PASSED [ 10%] +core\tests\test_fanout.py::test_detect_fan_out_nodes PASSED [ 11%] +core\tests\test_fanout.py::test_detect_fan_in_nodes PASSED [ 11%] +core\tests\test_fanout.py::test_parallel_disabled_uses_sequential PASSED [ 12%] +core\tests\test_flexible_executor.py::TestPlanDataStructures::test_plan_step_creation PASSED [ 12%] +core\tests\test_flexible_executor.py::TestPlanDataStructures::test_plan_step_is_ready PASSED [ 12%] 
+core\tests\test_flexible_executor.py::TestPlanDataStructures::test_plan_get_ready_steps PASSED [ 13%] +core\tests\test_flexible_executor.py::TestPlanDataStructures::test_plan_is_complete PASSED [ 13%] +core\tests\test_flexible_executor.py::TestPlanDataStructures::test_plan_to_feedback_context PASSED [ 13%] +core\tests\test_flexible_executor.py::TestCodeSandbox::test_simple_execution PASSED [ 14%] +core\tests\test_flexible_executor.py::TestCodeSandbox::test_input_injection PASSED [ 14%] +core\tests\test_flexible_executor.py::TestCodeSandbox::test_blocked_import PASSED [ 15%] +core\tests\test_flexible_executor.py::TestCodeSandbox::test_blocked_private_access PASSED [ 15%] +core\tests\test_flexible_executor.py::TestCodeSandbox::test_blocked_exec_eval PASSED [ 15%] +core\tests\test_flexible_executor.py::TestCodeSandbox::test_safe_eval_expression PASSED [ 16%] +core\tests\test_flexible_executor.py::TestCodeSandbox::test_allowed_modules PASSED [ 16%] +core\tests\test_flexible_executor.py::TestHybridJudge::test_rule_based_accept PASSED [ 16%] +core\tests\test_flexible_executor.py::TestHybridJudge::test_rule_based_retry PASSED [ 17%] +core\tests\test_flexible_executor.py::TestHybridJudge::test_rule_priority PASSED [ 17%] +core\tests\test_flexible_executor.py::TestHybridJudge::test_default_judge_rules PASSED [ 17%] +core\tests\test_flexible_executor.py::TestJudgment::test_judgment_creation PASSED [ 18%] +core\tests\test_flexible_executor.py::TestJudgment::test_judgment_with_feedback PASSED [ 18%] +core\tests\test_flexible_executor.py::TestPlanExecutionResult::test_completed_result PASSED [ 19%] +core\tests\test_flexible_executor.py::TestPlanExecutionResult::test_needs_replan_result PASSED [ 19%] +core\tests\test_flexible_executor.py::TestFlexibleExecutorIntegration::test_executor_creation PASSED [ 19%] +core\tests\test_flexible_executor.py::TestFlexibleExecutorIntegration::test_executor_with_custom_judge PASSED [ 20%] +core\tests\test_graph_executor.py::test_executor_single_node_success PASSED [ 20%] +core\tests\test_graph_executor.py::test_executor_single_node_failure PASSED [ 20%] +core\tests\test_hallucination_detection.py::TestSharedMemoryHallucinationDetection::test_detects_code_at_start PASSED [ 21%] +core\tests\test_hallucination_detection.py::TestSharedMemoryHallucinationDetection::test_detects_code_in_middle PASSED [ 21%] +core\tests\test_hallucination_detection.py::TestSharedMemoryHallucinationDetection::test_detects_code_at_end PASSED [ 21%] +core\tests\test_hallucination_detection.py::TestSharedMemoryHallucinationDetection::test_detects_javascript_code PASSED [ 22%] +core\tests\test_hallucination_detection.py::TestSharedMemoryHallucinationDetection::test_detects_sql_injection PASSED [ 22%] +core\tests\test_hallucination_detection.py::TestSharedMemoryHallucinationDetection::test_detects_script_injection PASSED [ 23%] +core\tests\test_hallucination_detection.py::TestSharedMemoryHallucinationDetection::test_allows_short_strings_without_validation PASSED [ 23%] +core\tests\test_hallucination_detection.py::TestSharedMemoryHallucinationDetection::test_allows_long_strings_without_code PASSED [ 23%] +core\tests\test_hallucination_detection.py::TestSharedMemoryHallucinationDetection::test_validate_false_bypasses_check PASSED [ 24%] +core\tests\test_hallucination_detection.py::TestSharedMemoryHallucinationDetection::test_sampling_for_very_long_strings PASSED [ 24%] +core\tests\test_hallucination_detection.py::TestOutputValidatorHallucinationDetection::test_detects_code_anywhere_in_output PASSED 
[ 24%] +core\tests\test_hallucination_detection.py::TestOutputValidatorHallucinationDetection::test_contains_code_indicators_full_check PASSED [ 25%] +core\tests\test_hallucination_detection.py::TestOutputValidatorHallucinationDetection::test_contains_code_indicators_sampling PASSED [ 25%] +core\tests\test_hallucination_detection.py::TestOutputValidatorHallucinationDetection::test_no_false_positive_for_clean_text PASSED [ 26%] +core\tests\test_hallucination_detection.py::TestOutputValidatorHallucinationDetection::test_detects_multiple_languages PASSED [ 26%] +core\tests\test_hallucination_detection.py::TestEdgeCases::test_empty_string PASSED [ 26%] +core\tests\test_hallucination_detection.py::TestEdgeCases::test_non_string_values PASSED [ 27%] +core\tests\test_hallucination_detection.py::TestEdgeCases::test_exactly_5000_chars PASSED [ 27%] +core\tests\test_hallucination_detection.py::TestEdgeCases::test_5001_chars_triggers_validation PASSED [ 27%] +core\tests\test_litellm_provider.py::TestLiteLLMProviderInit::test_init_with_defaults PASSED [ 28%] +core\tests\test_litellm_provider.py::TestLiteLLMProviderInit::test_init_with_custom_model PASSED [ 28%] +core\tests\test_litellm_provider.py::TestLiteLLMProviderInit::test_init_deepseek_model PASSED [ 28%] +core\tests\test_litellm_provider.py::TestLiteLLMProviderInit::test_init_with_api_key PASSED [ 29%] +core\tests\test_litellm_provider.py::TestLiteLLMProviderInit::test_init_with_api_base PASSED [ 29%] +core\tests\test_litellm_provider.py::TestLiteLLMProviderInit::test_init_ollama_no_key_needed PASSED [ 30%] +core\tests\test_litellm_provider.py::TestLiteLLMProviderComplete::test_complete_basic PASSED [ 30%] +core\tests\test_litellm_provider.py::TestLiteLLMProviderComplete::test_complete_with_system_prompt PASSED [ 30%] +core\tests\test_litellm_provider.py::TestLiteLLMProviderComplete::test_complete_with_tools PASSED [ 31%] +core\tests\test_litellm_provider.py::TestLiteLLMProviderToolUse::test_complete_with_tools_single_iteration PASSED [ 31%] +core\tests\test_litellm_provider.py::TestToolConversion::test_tool_to_openai_format PASSED [ 31%] +core\tests\test_litellm_provider.py::TestAnthropicProviderBackwardCompatibility::test_anthropic_provider_is_llm_provider PASSED [ 32%] +core\tests\test_litellm_provider.py::TestAnthropicProviderBackwardCompatibility::test_anthropic_provider_init_defaults PASSED [ 32%] +core\tests\test_litellm_provider.py::TestAnthropicProviderBackwardCompatibility::test_anthropic_provider_init_custom_model PASSED [ 32%] +core\tests\test_litellm_provider.py::TestAnthropicProviderBackwardCompatibility::test_anthropic_provider_uses_litellm_internally PASSED [ 33%] +core\tests\test_litellm_provider.py::TestAnthropicProviderBackwardCompatibility::test_anthropic_provider_complete PASSED [ 33%] +core\tests\test_litellm_provider.py::TestAnthropicProviderBackwardCompatibility::test_anthropic_provider_complete_with_tools PASSED [ 34%] +core\tests\test_litellm_provider.py::TestAnthropicProviderBackwardCompatibility::test_anthropic_provider_passes_response_format PASSED [ 34%] +core\tests\test_litellm_provider.py::TestJsonMode::test_json_mode_adds_instruction_to_system_prompt PASSED [ 34%] +core\tests\test_litellm_provider.py::TestJsonMode::test_json_mode_creates_system_prompt_if_none PASSED [ 35%] +core\tests\test_litellm_provider.py::TestJsonMode::test_json_mode_false_no_instruction PASSED [ 35%] +core\tests\test_litellm_provider.py::TestJsonMode::test_json_mode_default_is_false PASSED [ 35%] 
+core\tests\test_litellm_provider.py::TestJsonMode::test_anthropic_provider_passes_json_mode PASSED [ 36%] +core\tests\test_llm_judge.py::TestLLMJudgeWithProvider::test_init_with_provider PASSED [ 36%] +core\tests\test_llm_judge.py::TestLLMJudgeWithProvider::test_evaluate_uses_provider PASSED [ 36%] +core\tests\test_llm_judge.py::TestLLMJudgeWithProvider::test_evaluate_passes_correct_arguments PASSED [ 37%] +core\tests\test_llm_judge.py::TestLLMJudgeWithProvider::test_evaluate_failing_result PASSED [ 37%] +core\tests\test_llm_judge.py::TestLLMJudgeResponseParsing::test_parse_plain_json PASSED [ 38%] +core\tests\test_llm_judge.py::TestLLMJudgeResponseParsing::test_parse_json_in_markdown_code_block PASSED [ 38%] +core\tests\test_llm_judge.py::TestLLMJudgeResponseParsing::test_parse_json_in_plain_code_block PASSED [ 38%] +core\tests\test_llm_judge.py::TestLLMJudgeResponseParsing::test_parse_response_with_whitespace PASSED [ 39%] +core\tests\test_llm_judge.py::TestLLMJudgeResponseParsing::test_default_explanation_when_missing PASSED [ 39%] +core\tests\test_llm_judge.py::TestLLMJudgeResponseParsing::test_passes_coerced_to_bool PASSED [ 39%] +core\tests\test_llm_judge.py::TestLLMJudgeResponseParsing::test_passes_false_when_missing PASSED [ 40%] +core\tests\test_llm_judge.py::TestLLMJudgeErrorHandling::test_invalid_json_response PASSED [ 40%] +core\tests\test_llm_judge.py::TestLLMJudgeErrorHandling::test_provider_raises_exception PASSED [ 41%] +core\tests\test_llm_judge.py::TestLLMJudgeBackwardCompatibility::test_init_without_provider PASSED [ 41%] +core\tests\test_llm_judge.py::TestLLMJudgeBackwardCompatibility::test_evaluate_without_provider_uses_anthropic PASSED [ 41%] +core\tests\test_llm_judge.py::TestLLMJudgeBackwardCompatibility::test_anthropic_client_lazy_loaded PASSED [ 42%] +core\tests\test_llm_judge.py::TestLLMJudgeBackwardCompatibility::test_anthropic_import_error_handling PASSED [ 42%] +core\tests\test_llm_judge.py::TestLLMJudgeBackwardCompatibility::test_anthropic_client_uses_correct_model PASSED [ 42%] +core\tests\test_llm_judge.py::TestLLMJudgeIntegrationPatterns::test_with_anthropic_provider PASSED [ 43%] +core\tests\test_llm_judge.py::TestLLMJudgeIntegrationPatterns::test_with_multiple_evaluations PASSED [ 43%] +core\tests\test_llm_judge.py::TestLLMJudgeIntegrationPatterns::test_provider_reuse_across_judges PASSED [ 43%] +core\tests\test_mcp_server.py::TestMCPDependencies::test_mcp_package_available PASSED [ 44%] +core\tests\test_mcp_server.py::TestMCPDependencies::test_fastmcp_available PASSED [ 44%] +core\tests\test_mcp_server.py::TestAgentBuilderServerModule::test_module_importable PASSED [ 45%] +core\tests\test_mcp_server.py::TestAgentBuilderServerModule::test_mcp_object_exported PASSED [ 45%] +core\tests\test_mcp_server.py::TestAgentBuilderServerModule::test_mcp_server_name PASSED [ 45%] +core\tests\test_mcp_server.py::TestMCPPackageExports::test_package_importable PASSED [ 46%] +core\tests\test_mcp_server.py::TestMCPPackageExports::test_agent_builder_server_exported PASSED [ 46%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_clean_json PASSED [ 46%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_json_with_whitespace PASSED [ 47%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_markdown_code_block_at_start PASSED [ 47%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_markdown_code_block_without_json_label PASSED [ 47%] 
+core\tests\test_node_json_extraction.py::TestJsonExtraction::test_prose_around_markdown_block PASSED [ 48%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_json_embedded_in_prose PASSED [ 48%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_nested_json PASSED [ 49%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_deeply_nested_json PASSED [ 49%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_json_with_array PASSED [ 49%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_json_with_string_containing_braces PASSED [ 50%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_json_with_escaped_quotes PASSED [ 50%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_multiple_json_objects_takes_first PASSED [ 50%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_json_with_boolean_and_null PASSED [ 51%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_json_with_numbers PASSED [ 51%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_invalid_json_raises_error PASSED [ 52%] +core\tests\test_node_json_extraction.py::TestJsonExtraction::test_empty_string_raises_error PASSED [ 52%] +core\tests\test_orchestrator.py::TestOrchestratorLLMInitialization::test_auto_creates_litellm_provider_when_no_llm_passed PASSED [ 52%] +core\tests\test_orchestrator.py::TestOrchestratorLLMInitialization::test_uses_custom_model_parameter PASSED [ 53%] +core\tests\test_orchestrator.py::TestOrchestratorLLMInitialization::test_supports_openai_model_names PASSED [ 53%] +core\tests\test_orchestrator.py::TestOrchestratorLLMInitialization::test_supports_anthropic_model_names PASSED [ 53%] +core\tests\test_orchestrator.py::TestOrchestratorLLMInitialization::test_skips_auto_creation_when_llm_passed PASSED [ 54%] +core\tests\test_orchestrator.py::TestOrchestratorLLMInitialization::test_model_attribute_stored_correctly PASSED [ 54%] +core\tests\test_orchestrator.py::TestOrchestratorLLMProviderType::test_llm_is_litellm_provider_instance PASSED [ 54%] +core\tests\test_orchestrator.py::TestOrchestratorLLMProviderType::test_llm_implements_llm_provider_interface PASSED [ 55%] +core\tests\test_plan.py::TestActionTypeEnum::test_action_type_values_exist PASSED [ 55%] +core\tests\test_plan.py::TestActionTypeEnum::test_action_type_count PASSED [ 56%] +core\tests\test_plan.py::TestActionTypeEnum::test_action_type_string_enum PASSED [ 56%] +core\tests\test_plan.py::TestStepStatusEnum::test_step_status_values_exist PASSED [ 56%] +core\tests\test_plan.py::TestStepStatusEnum::test_step_status_count PASSED [ 57%] +core\tests\test_plan.py::TestStepStatusEnum::test_step_status_transition_pending_to_in_progress PASSED [ 57%] +core\tests\test_plan.py::TestStepStatusEnum::test_step_status_transition_in_progress_to_completed PASSED [ 57%] +core\tests\test_plan.py::TestStepStatusEnum::test_step_status_transition_in_progress_to_failed PASSED [ 58%] +core\tests\test_plan.py::TestApprovalDecisionEnum::test_approval_decision_values_exist PASSED [ 58%] +core\tests\test_plan.py::TestApprovalDecisionEnum::test_approval_decision_count PASSED [ 58%] +core\tests\test_plan.py::TestJudgmentActionEnum::test_judgment_action_values_exist PASSED [ 59%] +core\tests\test_plan.py::TestJudgmentActionEnum::test_judgment_action_count PASSED [ 59%] +core\tests\test_plan.py::TestExecutionStatusEnum::test_execution_status_values_exist PASSED [ 60%] 
+core\tests\test_plan.py::TestExecutionStatusEnum::test_execution_status_count PASSED [ 60%] +core\tests\test_plan.py::TestPlanStepIsReady::test_plan_step_is_ready_no_deps PASSED [ 60%] +core\tests\test_plan.py::TestPlanStepIsReady::test_plan_step_is_ready_deps_met PASSED [ 61%] +core\tests\test_plan.py::TestPlanStepIsReady::test_plan_step_not_ready_deps_missing PASSED [ 61%] +core\tests\test_plan.py::TestPlanStepIsReady::test_plan_step_not_ready_wrong_status PASSED [ 61%] +core\tests\test_plan.py::TestPlanStepIsReady::test_plan_step_not_ready_completed_status PASSED [ 62%] +core\tests\test_plan.py::TestPlanStepIsReady::test_plan_step_is_ready_multiple_deps_all_met PASSED [ 62%] +core\tests\test_plan.py::TestPlanFromJson::test_plan_from_json_string PASSED [ 63%] +core\tests\test_plan.py::TestPlanFromJson::test_plan_from_json_dict PASSED [ 63%] +core\tests\test_plan.py::TestPlanFromJson::test_plan_from_json_nested_plan_key PASSED [ 63%] +core\tests\test_plan.py::TestPlanFromJson::test_plan_from_json_action_type_conversion PASSED [ 64%] +core\tests\test_plan.py::TestPlanFromJson::test_plan_from_json_all_action_types PASSED [ 64%] +core\tests\test_plan.py::TestPlanFromJson::test_from_json_invalid_action_type PASSED [ 64%] +core\tests\test_plan.py::TestPlanFromJson::test_from_json_malformed_json_string PASSED [ 65%] +core\tests\test_plan.py::TestPlanFromJson::test_from_json_missing_step_id PASSED [ 65%] +core\tests\test_plan.py::TestPlanFromJson::test_from_json_wrong_type_for_steps PASSED [ 65%] +core\tests\test_plan.py::TestPlanFromJson::test_from_json_empty_data PASSED [ 66%] +core\tests\test_plan.py::TestPlanMethods::test_plan_get_step PASSED [ 66%] +core\tests\test_plan.py::TestPlanMethods::test_plan_get_step_not_found PASSED [ 67%] +core\tests\test_plan.py::TestPlanMethods::test_plan_get_ready_steps PASSED [ 67%] +core\tests\test_plan.py::TestPlanMethods::test_plan_get_completed_steps PASSED [ 67%] +core\tests\test_plan.py::TestPlanMethods::test_plan_is_complete_false PASSED [ 68%] +core\tests\test_plan.py::TestPlanMethods::test_plan_is_complete_true PASSED [ 68%] +core\tests\test_plan.py::TestPlanMethods::test_plan_is_complete_empty PASSED [ 68%] +core\tests\test_plan.py::TestPlanMethods::test_plan_to_feedback_context PASSED [ 69%] +core\tests\test_plan.py::TestPlanRoundTrip::test_plan_round_trip_model_dump PASSED [ 69%] +core\tests\test_plan.py::TestPlanRoundTrip::test_plan_round_trip_json_string PASSED [ 69%] +core\tests\test_plan.py::TestPlanRoundTrip::test_plan_step_serialization PASSED [ 70%] +core\tests\test_pydantic_validation.py::TestNodeSpecOutputModel::test_nodespec_accepts_output_model PASSED [ 70%] +core\tests\test_pydantic_validation.py::TestNodeSpecOutputModel::test_nodespec_output_model_optional PASSED [ 71%] +core\tests\test_pydantic_validation.py::TestNodeSpecOutputModel::test_nodespec_custom_validation_retries PASSED [ 71%] +core\tests\test_pydantic_validation.py::TestOutputValidatorPydantic::test_validate_valid_output PASSED [ 71%] +core\tests\test_pydantic_validation.py::TestOutputValidatorPydantic::test_validate_missing_required_field PASSED [ 72%] +core\tests\test_pydantic_validation.py::TestOutputValidatorPydantic::test_validate_wrong_type PASSED [ 72%] +core\tests\test_pydantic_validation.py::TestOutputValidatorPydantic::test_validate_complex_model PASSED [ 72%] +core\tests\test_pydantic_validation.py::TestOutputValidatorPydantic::test_validate_field_constraints PASSED [ 73%] 
+core\tests\test_pydantic_validation.py::TestOutputValidatorPydantic::test_validate_range_constraints PASSED [ 73%] +core\tests\test_pydantic_validation.py::TestOutputValidatorPydantic::test_validate_realistic_model PASSED [ 73%] +core\tests\test_pydantic_validation.py::TestValidationFeedback::test_format_feedback_includes_errors PASSED [ 74%] +core\tests\test_pydantic_validation.py::TestValidationFeedback::test_format_feedback_includes_schema PASSED [ 74%] +core\tests\test_pydantic_validation.py::TestNodeResultValidationErrors::test_noderesult_includes_validation_errors PASSED [ 75%] +core\tests\test_pydantic_validation.py::TestNodeResultValidationErrors::test_noderesult_empty_validation_errors_by_default PASSED [ 75%] +core\tests\test_pydantic_validation.py::TestPydanticValidationIntegration::test_nodespec_serialization_with_output_model PASSED [ 75%] +core\tests\test_pydantic_validation.py::TestJSONSchemaGeneration::test_simple_model_schema_generation PASSED [ 76%] +core\tests\test_pydantic_validation.py::TestJSONSchemaGeneration::test_complex_model_schema_generation PASSED [ 76%] +core\tests\test_pydantic_validation.py::TestJSONSchemaGeneration::test_schema_includes_required_fields PASSED [ 76%] +core\tests\test_pydantic_validation.py::TestJSONSchemaGeneration::test_schema_can_be_used_in_response_format PASSED [ 77%] +core\tests\test_pydantic_validation.py::TestRetryWithFeedback::test_validation_feedback_format PASSED [ 77%] +core\tests\test_pydantic_validation.py::TestRetryWithFeedback::test_feedback_mentions_fix_instruction PASSED [ 78%] +core\tests\test_pydantic_validation.py::TestRetryWithFeedback::test_max_validation_retries_default PASSED [ 78%] +core\tests\test_pydantic_validation.py::TestRetryWithFeedback::test_max_validation_retries_customizable PASSED [ 78%] +core\tests\test_pydantic_validation.py::TestRetryWithFeedback::test_zero_retries_allowed PASSED [ 79%] +core\tests\test_pydantic_validation.py::TestRetryWithFeedback::test_feedback_includes_all_error_types PASSED [ 79%] +core\tests\test_pydantic_validation.py::TestPydanticValidationIntegrationExtended::test_nodespec_with_all_validation_options PASSED [ 79%] +core\tests\test_pydantic_validation.py::TestPydanticValidationIntegrationExtended::test_validator_preserves_model_defaults PASSED [ 80%] +core\tests\test_pydantic_validation.py::TestPydanticValidationIntegrationExtended::test_validation_result_error_property PASSED [ 80%] +core\tests\test_run.py::TestRuntimeMetrics::test_success_rate PASSED [ 80%] +core\tests\test_run.py::TestRuntimeMetrics::test_success_rate_zero_decisions PASSED [ 81%] +core\tests\test_run.py::TestRun::test_duration_ms PASSED [ 81%] +core\tests\test_run.py::TestRun::test_add_decision PASSED [ 82%] +core\tests\test_run.py::TestRun::test_record_outcome PASSED [ 82%] +core\tests\test_run.py::TestRun::test_add_problem PASSED [ 82%] +core\tests\test_run.py::TestRun::test_complete PASSED [ 83%] +core\tests\test_run.py::TestRunSummary::test_from_run_basic PASSED [ 83%] +core\tests\test_run.py::TestRunSummary::test_from_run_with_decisions PASSED [ 83%] +core\tests\test_run.py::TestRunSummary::test_from_run_with_problems PASSED [ 84%] +core\tests\test_runtime.py::TestRuntimeBasics::test_start_and_end_run PASSED [ 84%] +core\tests\test_runtime.py::TestRuntimeBasics::test_end_without_start_is_graceful PASSED [ 84%] +core\tests\test_runtime.py::TestRuntimeBasics::test_run_saved_on_end PASSED [ 85%] +core\tests\test_runtime.py::TestDecisionRecording::test_basic_decision FAILED [ 85%] 
+core\tests\test_runtime.py::TestDecisionRecording::test_decision_without_run_is_graceful PASSED [ 86%] +core\tests\test_runtime.py::TestDecisionRecording::test_decision_with_node_context FAILED [ 86%] +core\tests\test_runtime.py::TestDecisionRecording::test_decision_type FAILED [ 86%] +core\tests\test_runtime.py::TestOutcomeRecording::test_record_successful_outcome PASSED [ 87%] +core\tests\test_runtime.py::TestOutcomeRecording::test_record_failed_outcome FAILED [ 87%] +core\tests\test_runtime.py::TestOutcomeRecording::test_metrics_updated_on_outcome FAILED [ 87%] +core\tests\test_runtime.py::TestProblemReporting::test_report_problem PASSED [ 88%] +core\tests\test_runtime.py::TestProblemReporting::test_problem_linked_to_decision FAILED [ 88%] +core\tests\test_runtime.py::TestConvenienceMethods::test_quick_decision FAILED [ 89%] +core\tests\test_runtime.py::TestConvenienceMethods::test_decide_and_execute_success PASSED [ 89%] +core\tests\test_runtime.py::TestConvenienceMethods::test_decide_and_execute_failure FAILED [ 89%] +core\tests\test_runtime.py::TestNarrativeGeneration::test_default_narrative_success PASSED [ 90%] +core\tests\test_runtime.py::TestNarrativeGeneration::test_default_narrative_failure FAILED [ 90%] +core\tests\test_testing_framework.py::TestTestCaseSchema::test_create_test PASSED [ 90%] +core\tests\test_testing_framework.py::TestTestCaseSchema::test_approve_test PASSED [ 91%] +core\tests\test_testing_framework.py::TestTestCaseSchema::test_modify_test PASSED [ 91%] +core\tests\test_testing_framework.py::TestTestCaseSchema::test_reject_test PASSED [ 91%] +core\tests\test_testing_framework.py::TestTestCaseSchema::test_record_result PASSED [ 92%] +core\tests\test_testing_framework.py::TestTestResultSchema::test_create_passed_result PASSED [ 92%] +core\tests\test_testing_framework.py::TestTestResultSchema::test_create_failed_result PASSED [ 93%] +core\tests\test_testing_framework.py::TestTestResultSchema::test_summary_dict PASSED [ 93%] +core\tests\test_testing_framework.py::TestTestSuiteResult::test_suite_result_properties PASSED [ 93%] +core\tests\test_testing_framework.py::TestTestSuiteResult::test_get_results_by_category PASSED [ 94%] +core\tests\test_testing_framework.py::TestTestStorage::test_save_and_load_test PASSED [ 94%] +core\tests\test_testing_framework.py::TestTestStorage::test_delete_test PASSED [ 94%] +core\tests\test_testing_framework.py::TestTestStorage::test_get_tests_by_goal PASSED [ 95%] +core\tests\test_testing_framework.py::TestTestStorage::test_get_approved_tests PASSED [ 95%] +core\tests\test_testing_framework.py::TestTestStorage::test_save_and_load_result PASSED [ 95%] +core\tests\test_testing_framework.py::TestTestStorage::test_result_history PASSED [ 96%] +core\tests\test_testing_framework.py::TestTestStorage::test_get_stats PASSED [ 96%] +core\tests\test_testing_framework.py::TestErrorCategorizer::test_categorize_passed PASSED [ 97%] +core\tests\test_testing_framework.py::TestErrorCategorizer::test_categorize_logic_error PASSED [ 97%] +core\tests\test_testing_framework.py::TestErrorCategorizer::test_categorize_implementation_error PASSED [ 97%] +core\tests\test_testing_framework.py::TestErrorCategorizer::test_categorize_edge_case PASSED [ 98%] +core\tests\test_testing_framework.py::TestErrorCategorizer::test_categorize_from_stack_trace PASSED [ 98%] +core\tests\test_testing_framework.py::TestErrorCategorizer::test_get_fix_suggestion PASSED [ 98%] +core\tests\test_testing_framework.py::TestErrorCategorizer::test_get_iteration_guidance PASSED [ 99%] 
+core\tests\test_testing_framework.py::TestDebugTool::test_analyze_missing_test PASSED [ 99%] +core\tests\test_testing_framework.py::TestDebugTool::test_analyze_with_result PASSED [100%] + +================================== FAILURES ===================================
+_______________ TestBuilderQueryBasics.test_get_recent_failures _______________ +core\tests\test_builder.py:120: in test_get_recent_failures + create_failed_run(runtime) +core\tests\test_builder.py:67: in create_failed_run + runtime.end_run(success=False, narrative="Failed to process - no data") +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 264: character maps to <undefined>
+__________________ TestFailureAnalysis.test_analyze_failure ___________________ +core\tests\test_builder.py:137: in test_analyze_failure + run_id = create_failed_run(runtime) + ^^^^^^^^^^^^^^^^^^^^^^^^^^ +core\tests\test_builder.py:67: in create_failed_run + runtime.end_run(success=False, narrative="Failed to process - no data") +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 264: character maps to <undefined>
+__________ TestFailureAnalysis.test_failure_analysis_has_suggestions __________ +core\tests\test_builder.py:162: in test_failure_analysis_has_suggestions + run_id = create_failed_run(runtime) + ^^^^^^^^^^^^^^^^^^^^^^^^^^ +core\tests\test_builder.py:67: in create_failed_run + runtime.end_run(success=False, narrative="Failed to process - no data") +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 264: character maps to <undefined>
+________________ TestPatternAnalysis.test_find_patterns_basic _________________ +core\tests\test_builder.py:192: in test_find_patterns_basic + create_failed_run(runtime, "goal_x") +core\tests\test_builder.py:67: in create_failed_run + runtime.end_run(success=False, narrative="Failed to process - no data") +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 261: character maps to <undefined>
+___________ TestPatternAnalysis.test_find_patterns_common_failures ____________ +core\tests\test_builder.py:207: in test_find_patterns_common_failures + create_failed_run(runtime, "failing_goal") +core\tests\test_builder.py:67: in create_failed_run + runtime.end_run(success=False, narrative="Failed to process - no data") +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 267: character maps to <undefined>
+__________ TestPatternAnalysis.test_find_patterns_problematic_nodes ___________ +core\tests\test_builder.py:222: in test_find_patterns_problematic_nodes + create_failed_run(runtime, "node_test") +core\tests\test_builder.py:67: in create_failed_run + runtime.end_run(success=False, narrative="Failed to process - no data") +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 264: character maps to <undefined>
+____________________ TestPatternAnalysis.test_compare_runs ____________________ +core\tests\test_builder.py:235: in test_compare_runs + run2 = create_failed_run(runtime) + ^^^^^^^^^^^^^^^^^^^^^^^^^^ +core\tests\test_builder.py:67: in create_failed_run + runtime.end_run(success=False, narrative="Failed to process - no data") +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 264: character maps to <undefined>
+____________ TestImprovementSuggestions.test_suggest_improvements _____________ +core\tests\test_builder.py:253: in test_suggest_improvements + create_failed_run(runtime, "improve_goal") +core\tests\test_builder.py:67: in create_failed_run + runtime.end_run(success=False, narrative="Failed to process - no data") +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 267: character maps to <undefined>
+__ TestImprovementSuggestions.test_suggest_improvements_for_low_success_rate __ +core\tests\test_builder.py:268: in test_suggest_improvements_for_low_success_rate + create_failed_run(runtime, "low_success") +core\tests\test_builder.py:67: in create_failed_run + runtime.end_run(success=False, narrative="Failed to process - no data") +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 266: character maps to <undefined>
+___________ TestBuilderWorkflow.test_builder_investigation_workflow ___________ +core\tests\test_builder.py:304: in test_builder_investigation_workflow + create_failed_run(runtime, "customer_goal") +core\tests\test_builder.py:67: in create_failed_run + runtime.end_run(success=False, narrative="Failed to process - no data") +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 268: character maps to <undefined>
+__________________ TestDecisionRecording.test_basic_decision __________________ +core\tests\test_runtime.py:78: in test_basic_decision + runtime.end_run(success=True) +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 337: character maps to <undefined>
+____________ TestDecisionRecording.test_decision_with_node_context ____________ +core\tests\test_runtime.py:111: in test_decision_with_node_context + runtime.end_run(success=True) +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 332: character maps to <undefined>
+__________________ TestDecisionRecording.test_decision_type ___________________ +core\tests\test_runtime.py:132: in test_decision_type + runtime.end_run(success=True) +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 337: character maps to <undefined>
+_______________ TestOutcomeRecording.test_record_failed_outcome _______________ +core\tests\test_runtime.py:191: in test_record_failed_outcome + runtime.end_run(success=False) +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 312: character maps to <undefined>
+____________ TestOutcomeRecording.test_metrics_updated_on_outcome _____________ +core\tests\test_runtime.py:222: in test_metrics_updated_on_outcome + runtime.end_run(success=False) +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 309: character maps to <undefined>
+____________ TestProblemReporting.test_problem_linked_to_decision _____________ +core\tests\test_runtime.py:270: in test_problem_linked_to_decision + runtime.end_run(success=True) +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 347: character maps to <undefined>
+_________________ TestConvenienceMethods.test_quick_decision __________________ +core\tests\test_runtime.py:292: in test_quick_decision + runtime.end_run(success=True) +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 331: character maps to <undefined>
+___________ TestConvenienceMethods.test_decide_and_execute_failure ____________ +core\tests\test_runtime.py:338: in test_decide_and_execute_failure + runtime.end_run(success=False) +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 315: character maps to <undefined>
+___________ TestNarrativeGeneration.test_default_narrative_failure ____________ +core\tests\test_runtime.py:381: in test_default_narrative_failure + runtime.end_run(success=False) +core\framework\runtime\core.py:117: in end_run + self.storage.save_run(self._current_run) +core\framework\storage\backend.py:62: in save_run + f.write(summary.model_dump_json(indent=2)) +C:\Python312\Lib\encodings\cp1252.py:19: in encode + return codecs.charmap_encode(input,self.errors,encoding_table)[0] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E UnicodeEncodeError: 'charmap' codec can't encode character '\u2717' in position 352: character maps to <undefined>
+============================== warnings summary =============================== +core\framework\graph\safe_eval.py:79 + C:\Users\Advika
Nagool\Desktop\Hiv\core\framework\graph\safe_eval.py:79: DeprecationWarning: ast.Num is deprecated and will be removed in Python 3.14; use ast.Constant instead + def visit_Num(self, node: ast.Num) -> Any: + +core\framework\graph\safe_eval.py:82 + C:\Users\Advika Nagool\Desktop\Hiv\core\framework\graph\safe_eval.py:82: DeprecationWarning: ast.Str is deprecated and will be removed in Python 3.14; use ast.Constant instead + def visit_Str(self, node: ast.Str) -> Any: + +core\framework\graph\safe_eval.py:85 + C:\Users\Advika Nagool\Desktop\Hiv\core\framework\graph\safe_eval.py:85: DeprecationWarning: ast.NameConstant is deprecated and will be removed in Python 3.14; use ast.Constant instead + def visit_NameConstant(self, node: ast.NameConstant) -> Any: + +-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html +=========================== short test summary info =========================== +FAILED core\tests\test_builder.py::TestBuilderQueryBasics::test_get_recent_failures +FAILED core\tests\test_builder.py::TestFailureAnalysis::test_analyze_failure +FAILED core\tests\test_builder.py::TestFailureAnalysis::test_failure_analysis_has_suggestions +FAILED core\tests\test_builder.py::TestPatternAnalysis::test_find_patterns_basic +FAILED core\tests\test_builder.py::TestPatternAnalysis::test_find_patterns_common_failures +FAILED core\tests\test_builder.py::TestPatternAnalysis::test_find_patterns_problematic_nodes +FAILED core\tests\test_builder.py::TestPatternAnalysis::test_compare_runs - U... +FAILED core\tests\test_builder.py::TestImprovementSuggestions::test_suggest_improvements +FAILED core\tests\test_builder.py::TestImprovementSuggestions::test_suggest_improvements_for_low_success_rate +FAILED core\tests\test_builder.py::TestBuilderWorkflow::test_builder_investigation_workflow +FAILED core\tests\test_runtime.py::TestDecisionRecording::test_basic_decision +FAILED core\tests\test_runtime.py::TestDecisionRecording::test_decision_with_node_context +FAILED core\tests\test_runtime.py::TestDecisionRecording::test_decision_type +FAILED core\tests\test_runtime.py::TestOutcomeRecording::test_record_failed_outcome +FAILED core\tests\test_runtime.py::TestOutcomeRecording::test_metrics_updated_on_outcome +FAILED core\tests\test_runtime.py::TestProblemReporting::test_problem_linked_to_decision +FAILED core\tests\test_runtime.py::TestConvenienceMethods::test_quick_decision +FAILED core\tests\test_runtime.py::TestConvenienceMethods::test_decide_and_execute_failure +FAILED core\tests\test_runtime.py::TestNarrativeGeneration::test_default_narrative_failure +============ 19 failed, 254 passed, 3 warnings in 63.22s (0:01:03) ============
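Note on the failures above: every traceback bottoms out at core\framework\storage\backend.py:62, where save_run() writes the run summary through a text-mode file handle that falls back to the platform-default encoding (cp1252 on this Windows machine), so the '\u2717' character in the summary cannot be encoded. A minimal sketch of the likely follow-up fix is below; it assumes save_run() obtains its handle from a plain built-in open() call, and write_run_summary / summary_json are illustrative names, not identifiers taken from backend.py.

    from pathlib import Path

    def write_run_summary(path: Path, summary_json: str) -> None:
        """Sketch only: persist a run summary with an explicit encoding."""
        # encoding="utf-8" keeps the write independent of the OS code page, so
        # characters such as '\u2717' serialize the same on Windows (cp1252
        # default) and on Linux.
        with open(path, "w", encoding="utf-8") as f:
            f.write(summary_json)

Applied inside save_run(), the equivalent change is presumably a one-liner: pass encoding="utf-8" to the open() call that produces the f seen in the traceback. Any code that reads these summaries back would want the same treatment so the round trip stays symmetric.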