diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4b2411d5..3138b54f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,6 +47,52 @@ jobs: fail_ci_if_error: false token: ${{ secrets.CODECOV_TOKEN }} + # Python opt-in integration install + focused tests + test-python-optional-integrations: + name: Python Optional Integrations (${{ matrix.integration }} / py${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - integration: openai-agents + python-version: '3.9' + extras: ".[dev,openai,openai-agents]" + tests: "tests/test_openai_agents_integration.py" + - integration: openai-agents + python-version: '3.11' + extras: ".[dev,openai,openai-agents]" + tests: "tests/test_openai_agents_integration.py" + - integration: crewai + python-version: '3.11' + extras: ".[dev,crewai,openai]" + tests: "tests/test_crewai_integration.py" + - integration: google-adk + python-version: '3.11' + extras: ".[dev,google-adk]" + tests: "tests/test_google_adk_integration.py" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + + - name: Install integration dependencies + run: | + python -m pip install --upgrade pip + pip install -e "${{ matrix.extras }}" + + - name: Run focused integration tests + run: | + pytest ${{ matrix.tests }} -v + env: + PYTHONPATH: ${{ github.workspace }} + # TypeScript Core Tests test-typescript-core: name: TypeScript Core Tests diff --git a/README.md b/README.md index 63e9af87..51de5118 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ cascadeflow Logo -# Smart AI model cascading for cost optimization +# Agent Runtime Intelligence Layer [![PyPI version](https://img.shields.io/pypi/v/cascadeflow?color=blue&label=Python)](https://pypi.org/project/cascadeflow/) [![npm 
version](https://img.shields.io/npm/v/@cascadeflow/core?color=red&label=TypeScript)](https://www.npmjs.com/package/@cascadeflow/core) @@ -17,10 +17,11 @@ [![PyPI Downloads](https://static.pepy.tech/badge/cascadeflow)](https://pepy.tech/project/cascadeflow) [![npm Downloads](https://img.shields.io/npm/dt/@cascadeflow/n8n-nodes-cascadeflow?label=npm%20downloads&color=orange)](https://www.npmjs.com/search?q=%40cascadeflow) [![Tests](https://github.com/lemony-ai/cascadeflow/actions/workflows/test.yml/badge.svg)](https://github.com/lemony-ai/cascadeflow/actions/workflows/test.yml) +[![Docs](https://img.shields.io/badge/docs-cascadeflow.dev-blue)](https://docs.cascadeflow.dev) [![Python Docs](https://img.shields.io/badge/docs-Python-blue)](./docs/) [![TypeScript Docs](https://img.shields.io/badge/docs-TypeScript-red)](./docs/) [![X Follow](https://img.shields.io/twitter/follow/saschabuehrle?style=social)](https://x.com/saschabuehrle) -[![GitHub Stars](https://img.shields.io/github/stars/lemony-ai/cascadeflow?style=social)](https://github.com/lemony-ai/cascadeflow) +[![GitHub Stars](https://img.shields.io/github/stars/lemony-ai/cascadeflow?style=flat&color=yellow&label=Stars)](https://github.com/lemony-ai/cascadeflow/stargazers)
@@ -28,17 +29,15 @@
-**[Python Python](#-python) • [TypeScript TypeScript](#-typescript) • [LangChain LangChain](#-langchain-integration) • [n8n n8n](#-n8n-integration) • [Vercel AI Vercel AI](./packages/integrations/vercel-ai/) • [OpenClaw OpenClaw](https://clawhub.ai/saschabuehrle/cascadeflow) • [📖 Docs](./docs/) • [💡 Examples](#examples)** +**[Python Python](#-python) • [TypeScript TypeScript](#-typescript) • [LangChain LangChain](#-langchain-integration) • [n8n n8n](#-n8n-integration) • [Vercel AI Vercel AI](./packages/integrations/vercel-ai/) • [OpenClaw OpenClaw](https://clawhub.ai/saschabuehrle/cascadeflow) • [Full Docs](https://docs.cascadeflow.dev) • [📖 Docs](./docs/) • [💡 Examples](#examples)** --- -**Stop Bleeding Money on AI Calls. Cut Costs 30-65% in 3 Lines of Code.** +**The in-process intelligence layer for AI agents.** Optimize cost, latency, quality, budget, compliance, and energy — inside the execution loop, not at the HTTP boundary. -40-70% of text prompts and 20-60% of agent calls don't need expensive flagship models. You're overpaying every single day. - -*cascadeflow fixes this with intelligent model cascading, available in Python and TypeScript.* +cascadeflow works where external proxies can't: per-step model decisions based on agent state, per-tool-call budget gating, runtime stop/continue/escalate actions, and business KPI injection during agent loops. Sub-1ms overhead. Works with LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, and Vercel AI SDK. ```python pip install cascadeflow @@ -52,6 +51,17 @@ npm install @cascadeflow/core ## Why cascadeflow? 
+### Proxy vs In-Process Harness + +| Dimension | External Proxy | cascadeflow Harness | +|---|---|---| +| **Scope** | HTTP request boundary | Inside agent execution loop | +| **Dimensions** | Cost only | Cost + quality + latency + budget + compliance + energy | +| **Latency overhead** | 10-50ms network RTT | <1ms in-process | +| **Business logic** | None | KPI weights and targets | +| **Enforcement** | None (observe only) | stop, deny_tool, switch_model | +| **Auditability** | Request logs | Per-step decision traces | + cascadeflow is an intelligent AI model cascading library that dynamically selects the optimal model for each query or tool call through speculative execution. It's based on the research that 40-70% of queries don't require slow, expensive flagship models, and domain-specific smaller models often outperform large general-purpose models on specialized tasks. For the remaining queries that need advanced reasoning, cascadeflow automatically escalates to flagship models if needed. ### Use Cases @@ -140,6 +150,34 @@ In practice, 60-70% of queries are handled by small, efficient models (8-20x cos --- +## Harness API + +Three tiers of integration — zero-change observability to full policy control: + +**Tier 1: Zero-change observability** +```python +import cascadeflow +cascadeflow.init(mode="observe") +# All OpenAI/Anthropic SDK calls are now tracked. No code changes needed. 
+``` + +**Tier 2: Scoped runs with budget** +```python +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + result = await agent.run("Analyze this dataset") + print(session.summary()) # cost, latency, energy, steps, tool calls + print(session.trace()) # full decision audit trail +``` + +**Tier 3: Decorated agents with policy** +```python +@cascadeflow.agent(budget=0.20, compliance="gdpr", kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}) +async def my_agent(query: str): + return await llm.complete(query) +``` + +--- + ## Quick Start ### Drop-In Gateway (Existing Apps) @@ -724,6 +762,12 @@ console.log(`Warnings: ${validation.warnings}`); | 📋 **Message & Tool Call Lists** | Full conversation history with tool_calls and tool_call_id preservation across turns | | 🪝 **Hooks & Callbacks** | Telemetry callbacks, cost events, and streaming hooks for observability | | 🏭 **Production Ready** | Streaming, batch processing, tool handling, reasoning model support, caching, error recovery, anomaly detection | +| 💳 **Budget Enforcement** | Per-run and per-user budget caps with automatic stop actions when limits are exceeded | +| 🔒 **Compliance Gating** | GDPR, HIPAA, PCI, and strict model allowlists — block non-compliant models before execution | +| 📊 **KPI-Weighted Routing** | Inject business priorities (quality, cost, latency, energy) as weights into every model decision | +| 🌱 **Energy Tracking** | Deterministic compute-intensity coefficients for carbon-aware AI operations | +| 🔍 **Decision Traces** | Full per-step audit trail: action, reason, model, cost, budget state, enforcement status | +| ⚙️ **Harness Modes** | off / observe / enforce — roll out safely with observe, then switch to enforce when ready | --- @@ -774,7 +818,7 @@ If you use cascadeflow in your research or project, please cite: ```bibtex @software{cascadeflow2025, author = {Lemony Inc., Sascha Buehrle and Contributors}, - title = {cascadeflow: Smart AI model cascading for cost 
optimization}, + title = {cascadeflow: Agent runtime intelligence layer for AI agent workflows}, year = {2025}, publisher = {GitHub}, url = {https://github.com/lemony-ai/cascadeflow} diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py index 1b61a9f3..af4c429a 100644 --- a/cascadeflow/__init__.py +++ b/cascadeflow/__init__.py @@ -1,30 +1,23 @@ """ -cascadeflow - Smart AI model cascading for cost optimization. - -Route queries intelligently across multiple AI models from tiny SLMs -to frontier LLMs based on complexity, domain, and budget. - -Features: -- 🚀 Speculative cascades (2-3x faster) -- 💰 60-95% cost savings -- 🎯 Per-prompt domain detection -- 🎨 2.0x domain boost for specialists -- 🔍 Multi-factor optimization -- 🆓 Free tier (Ollama + Groq) -- ⚡ 3 lines of code - -Example: - >>> from cascadeflow import CascadeAgent, CascadePresets - >>> - >>> # Auto-detect available models - >>> models = CascadePresets.auto_detect_models() - >>> - >>> # Create agent with intelligence layer - >>> agent = CascadeAgent(models, enable_caching=True) - >>> - >>> # Run query (automatically optimized!) - >>> result = await agent.run("Fix this Python bug") - >>> print(f"Used {result.model_used} - Cost: ${result.cost:.6f}") +cascadeflow - Agent runtime intelligence layer. + +In-process harness that optimizes cost, latency, quality, budget, compliance, +and energy across AI agent workflows. Works inside agent execution loops with +full state awareness -- not an external proxy. + +Quick start: + import cascadeflow + cascadeflow.init(mode="observe") + # All OpenAI/Anthropic SDK calls are now tracked and traced. 
+ +Key APIs: + cascadeflow.init(mode) -- activate harness (off | observe | enforce) + cascadeflow.run(budget) -- scoped run with budget/trace + @cascadeflow.agent(budget) -- policy metadata on agent functions + session.summary() -- structured metrics + session.trace() -- full decision audit trail + +Integrations: LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, Vercel AI SDK """ __version__ = "1.0.0" @@ -240,6 +233,10 @@ ) # NEW: Harness API scaffold (V2 core branch) +# NOTE: harness.agent is NOT re-exported here — it would shadow the +# cascadeflow.agent *module* and break dotted-path resolution +# (e.g. patch("cascadeflow.agent.PROVIDER_REGISTRY")). +# Use ``from cascadeflow.harness import agent`` instead. from .harness import ( HarnessConfig, HarnessInitReport, @@ -247,7 +244,6 @@ init, reset, run, - agent as harness_agent, get_harness_config, get_current_run, ) @@ -401,7 +397,6 @@ "init", "reset", "run", - "harness_agent", "get_harness_config", "get_current_run", # ===== PROVIDERS ===== diff --git a/cascadeflow/harness/__init__.py b/cascadeflow/harness/__init__.py index 43a03662..74c07219 100644 --- a/cascadeflow/harness/__init__.py +++ b/cascadeflow/harness/__init__.py @@ -14,11 +14,13 @@ HarnessInitReport, HarnessRunContext, agent, + get_harness_callback_manager, get_current_run, get_harness_config, init, reset, run, + set_harness_callback_manager, ) __all__ = [ @@ -29,6 +31,8 @@ "run", "agent", "get_current_run", + "get_harness_callback_manager", "get_harness_config", + "set_harness_callback_manager", "reset", ] diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index a71d5f5a..95ff4245 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -4,8 +4,10 @@ import json import logging import os +import time from contextvars import ContextVar, Token from dataclasses import dataclass, field +from functools import wraps from importlib.util import find_spec from pathlib import Path from typing import Any, Callable, 
Literal, Optional, TypeVar, cast @@ -39,7 +41,21 @@ class HarnessInitReport: @dataclass class HarnessRunContext: + """Scoped run context for tracking harness metrics across LLM calls. + + Thread safety: the context is stored in a ``ContextVar`` and is safe for + asyncio (each task gets its own copy of the token). However, the context + object itself uses plain attribute mutation (``+=``) for counters. If + multiple OS threads share the *same* ``HarnessRunContext`` instance, + concurrent updates may race. Each ``with run(...)`` scope should be + confined to a single thread or asyncio task. + """ + run_id: str = field(default_factory=lambda: uuid4().hex[:12]) + _started_monotonic: float = field(default_factory=time.monotonic, init=False, repr=False) + started_at_ms: float = field(default_factory=lambda: time.time() * 1000) + ended_at_ms: Optional[float] = None + duration_ms: Optional[float] = None mode: HarnessMode = "off" budget_max: Optional[float] = None tool_calls_max: Optional[int] = None @@ -73,6 +89,9 @@ def __enter__(self) -> HarnessRunContext: return self def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None: + self.ended_at_ms = time.time() * 1000 + self.duration_ms = max(0.0, (time.monotonic() - self._started_monotonic) * 1000.0) + self._log_summary() if self._token is not None: _current_run.reset(self._token) self._token = None @@ -86,6 +105,44 @@ async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None: def trace(self) -> list[dict[str, Any]]: return list(self._trace) + def summary(self) -> dict[str, Any]: + return { + "run_id": self.run_id, + "mode": self.mode, + "step_count": self.step_count, + "tool_calls": self.tool_calls, + "cost": self.cost, + "savings": self.savings, + "latency_used_ms": self.latency_used_ms, + "energy_used": self.energy_used, + "budget_max": self.budget_max, + "budget_remaining": self.budget_remaining, + "last_action": self.last_action, + "model_used": self.model_used, + "duration_ms": self.duration_ms, + } + + 
def _log_summary(self) -> None: + if self.mode == "off" or self.step_count <= 0: + return + logger.info( + ( + "harness run summary run_id=%s mode=%s steps=%d tool_calls=%d " + "cost=%.6f latency_ms=%.2f energy=%.4f last_action=%s model=%s " + "budget_remaining=%s" + ), + self.run_id, + self.mode, + self.step_count, + self.tool_calls, + self.cost, + self.latency_used_ms, + self.energy_used, + self.last_action, + self.model_used, + self.budget_remaining, + ) + def record( self, action: str, @@ -95,19 +152,42 @@ def record( applied: Optional[bool] = None, decision_mode: Optional[str] = None, ) -> None: - self.last_action = action - self.model_used = model + safe_action = _sanitize_trace_value(action, max_length=_MAX_ACTION_LEN) + if not safe_action: + logger.warning("record() called with empty action, defaulting to 'allow'") + safe_action = "allow" + safe_reason = _sanitize_trace_value(reason, max_length=_MAX_REASON_LEN) or "unspecified" + safe_model = ( + _sanitize_trace_value(model, max_length=_MAX_MODEL_LEN) if model is not None else None + ) + + self.last_action = safe_action + self.model_used = safe_model entry: dict[str, Any] = { - "action": action, - "reason": reason, - "model": model, + "action": safe_action, + "reason": safe_reason, + "model": safe_model, "run_id": self.run_id, + "mode": self.mode, + "step": self.step_count, + "timestamp_ms": time.time() * 1000, + "tool_calls_total": self.tool_calls, + "cost_total": self.cost, + "latency_used_ms": self.latency_used_ms, + "energy_used": self.energy_used, + "budget_state": { + "max": self.budget_max, + "remaining": self.budget_remaining, + }, } if applied is not None: entry["applied"] = applied if decision_mode is not None: entry["decision_mode"] = decision_mode self._trace.append(entry) + if len(self._trace) > _MAX_TRACE_ENTRIES: + self._trace = self._trace[-_MAX_TRACE_ENTRIES:] + _emit_harness_decision(entry) _harness_config: HarnessConfig = HarnessConfig() @@ -115,6 +195,7 @@ def record( 
"cascadeflow_harness_run", default=None ) _is_instrumented: bool = False +_harness_callback_manager: Any = None _UNSET = object() @@ -124,6 +205,32 @@ def _validate_mode(mode: str) -> HarnessMode: return cast(HarnessMode, mode) +_VALID_COMPLIANCE_VALUES = {"gdpr", "hipaa", "pci", "strict"} + + +def _validate_harness_params( + *, + budget: Optional[float], + max_tool_calls: Optional[int], + max_latency_ms: Optional[float], + max_energy: Optional[float], + compliance: Optional[str], +) -> None: + """Validate harness parameters, raising ValueError for invalid inputs.""" + if budget is not None and budget < 0: + raise ValueError(f"budget must be non-negative, got {budget}") + if max_tool_calls is not None and max_tool_calls < 0: + raise ValueError(f"max_tool_calls must be non-negative, got {max_tool_calls}") + if max_latency_ms is not None and max_latency_ms < 0: + raise ValueError(f"max_latency_ms must be non-negative, got {max_latency_ms}") + if max_energy is not None and max_energy < 0: + raise ValueError(f"max_energy must be non-negative, got {max_energy}") + if compliance is not None and compliance.strip().lower() not in _VALID_COMPLIANCE_VALUES: + raise ValueError( + f"compliance must be one of {sorted(_VALID_COMPLIANCE_VALUES)}, got {compliance!r}" + ) + + def _detect_sdks() -> dict[str, bool]: return { "openai": find_spec("openai") is not None, @@ -139,6 +246,15 @@ def get_current_run() -> Optional[HarnessRunContext]: return _current_run.get() +def get_harness_callback_manager() -> Any: + return _harness_callback_manager + + +def set_harness_callback_manager(callback_manager: Any) -> None: + global _harness_callback_manager + _harness_callback_manager = callback_manager + + def reset() -> None: """ Reset harness global state and unpatch instrumented clients. 
@@ -148,15 +264,72 @@ def reset() -> None: global _harness_config global _is_instrumented + global _harness_callback_manager + global _cached_cascade_decision_event - from cascadeflow.harness.instrument import unpatch_openai + from cascadeflow.harness.instrument import unpatch_anthropic, unpatch_openai unpatch_openai() + unpatch_anthropic() _harness_config = HarnessConfig() _is_instrumented = False + _harness_callback_manager = None + _cached_cascade_decision_event = None _current_run.set(None) +_MAX_ACTION_LEN = 64 +_MAX_REASON_LEN = 160 +_MAX_MODEL_LEN = 128 +_MAX_ENV_JSON_LEN = 4096 +_MAX_TRACE_ENTRIES = 1000 + + +def _sanitize_trace_value(value: Any, *, max_length: int) -> Optional[str]: + if value is None: + return None + text = str(value).replace("\n", " ").replace("\r", " ").strip() + text = "".join(c for c in text if c.isprintable()) + if len(text) > max_length: + text = text[: max_length - 3] + "..." + return text or None + + +_cached_cascade_decision_event: Any = None + + +def _emit_harness_decision(entry: dict[str, Any]) -> None: + global _cached_cascade_decision_event + + manager = get_harness_callback_manager() + if manager is None: + return + + trigger = getattr(manager, "trigger", None) + if not callable(trigger): + logger.debug("harness callback manager has no trigger() method") + return + + if _cached_cascade_decision_event is None: + try: + from cascadeflow.telemetry.callbacks import CallbackEvent + + _cached_cascade_decision_event = CallbackEvent.CASCADE_DECISION + except Exception: + logger.debug("telemetry callbacks unavailable for harness decision emit", exc_info=True) + return + + try: + trigger( + _cached_cascade_decision_event, + query="[harness]", + data=dict(entry), + workflow="harness", + ) + except Exception: + logger.debug("failed to emit harness decision callback", exc_info=True) + + def _parse_bool(raw: str) -> bool: normalized = raw.strip().lower() return normalized in {"1", "true", "yes", "on"} @@ -171,6 +344,8 @@ def 
_parse_int(raw: str) -> int: def _parse_json_dict(raw: str) -> dict[str, float]: + if len(raw) > _MAX_ENV_JSON_LEN: + raise ValueError(f"JSON config exceeds {_MAX_ENV_JSON_LEN} characters for harness env var") value = json.loads(raw) if not isinstance(value, dict): raise ValueError("expected JSON object") @@ -305,9 +480,12 @@ def init( kpi_targets: Optional[dict[str, float]] | object = _UNSET, kpi_weights: Optional[dict[str, float]] | object = _UNSET, compliance: Optional[str] | object = _UNSET, + callback_manager: Any | object = _UNSET, ) -> HarnessInitReport: """ - Initialize global harness settings and instrument detected SDK clients. + Initialize global harness settings. + + This is a scaffold API for V2 work and intentionally performs no request patching yet. """ global _harness_config @@ -338,8 +516,18 @@ def init( resolved_compliance = _resolve_value( "compliance", compliance, env_config, file_config, None, sources ) + if callback_manager is not _UNSET: + set_harness_callback_manager(callback_manager) + sources["callback_manager"] = "code" validated_mode = _validate_mode(str(resolved_mode)) + _validate_harness_params( + budget=cast(Optional[float], resolved_budget), + max_tool_calls=cast(Optional[int], resolved_max_tool_calls), + max_latency_ms=cast(Optional[float], resolved_max_latency_ms), + max_energy=cast(Optional[float], resolved_max_energy), + compliance=cast(Optional[str], resolved_compliance), + ) _harness_config = HarnessConfig( mode=validated_mode, verbose=bool(resolved_verbose), @@ -361,13 +549,29 @@ def init( if patch_openai(): instrumented.append("openai") - elif validated_mode == "off": - from cascadeflow.harness.instrument import is_patched, unpatch_openai + else: + detected_but_not_instrumented.append("openai") + + if validated_mode != "off" and sdk_presence["anthropic"]: + from cascadeflow.harness.instrument import patch_anthropic - if is_patched(): + if patch_anthropic(): + instrumented.append("anthropic") + else: + 
detected_but_not_instrumented.append("anthropic") + + if validated_mode == "off": + from cascadeflow.harness.instrument import ( + is_anthropic_patched, + is_openai_patched, + unpatch_anthropic, + unpatch_openai, + ) + + if is_openai_patched(): unpatch_openai() - if sdk_presence["anthropic"]: - detected_but_not_instrumented.append("anthropic") + if is_anthropic_patched(): + unpatch_anthropic() if _is_instrumented: logger.debug("harness init called again; instrumentation remains idempotent") @@ -415,6 +619,14 @@ def run( resolved_kpi_weights = kpi_weights if kpi_weights is not None else config.kpi_weights resolved_compliance = compliance if compliance is not None else config.compliance + _validate_harness_params( + budget=resolved_budget, + max_tool_calls=resolved_tool_calls, + max_latency_ms=resolved_latency, + max_energy=resolved_energy, + compliance=resolved_compliance, + ) + return HarnessRunContext( mode=config.mode, budget_max=resolved_budget, @@ -453,18 +665,18 @@ def decorator(func: F) -> F: if inspect.iscoroutinefunction(func): + @wraps(func) async def async_wrapper(*args: Any, **kwargs: Any) -> Any: return await func(*args, **kwargs) async_wrapper.__cascadeflow_agent_policy__ = metadata # type: ignore[attr-defined] - async_wrapper.__name__ = getattr(func, "__name__", "wrapped_agent") return cast(F, async_wrapper) + @wraps(func) def sync_wrapper(*args: Any, **kwargs: Any) -> Any: return func(*args, **kwargs) sync_wrapper.__cascadeflow_agent_policy__ = metadata # type: ignore[attr-defined] - sync_wrapper.__name__ = getattr(func, "__name__", "wrapped_agent") return cast(F, sync_wrapper) return decorator diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index c2fbd7ab..4b08b9f6 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -1,11 +1,10 @@ -"""OpenAI Python client auto-instrumentation for cascadeflow harness. +"""Python SDK auto-instrumentation for cascadeflow harness. 
-Patches ``openai.resources.chat.completions.Completions.create`` (sync) and -``AsyncCompletions.create`` (async) to intercept LLM calls for observe/enforce -modes. +Patches OpenAI and Anthropic SDK request methods to intercept LLM calls for +observe/enforce modes. -This module is called internally by ``cascadeflow.harness.init()``. Users -should not call ``patch_openai`` / ``unpatch_openai`` directly. +This module is called internally by ``cascadeflow.harness.init()``. Users +should not call patch/unpatch helpers directly. Implementation notes: - Patching is class-level (all current and future client instances). @@ -51,6 +50,9 @@ _openai_patched: bool = False _original_sync_create: Any = None _original_async_create: Any = None +_anthropic_patched: bool = False +_original_anthropic_sync_create: Any = None +_original_anthropic_async_create: Any = None _MODEL_TOTAL_COSTS: dict[str, float] = { name: _model_total_price_shared(name) for name in _PRICING_MODELS @@ -140,7 +142,7 @@ def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> return _estimate_energy_shared(model, prompt_tokens, completion_tokens) -def _count_tool_calls_in_response(response: Any) -> int: +def _count_tool_calls_in_openai_response(response: Any) -> int: """Count tool calls in a non-streaming ChatCompletion response.""" choices = getattr(response, "choices", None) if not choices: @@ -154,7 +156,7 @@ def _count_tool_calls_in_response(response: Any) -> int: return len(tool_calls) -def _extract_usage(response: Any) -> tuple[int, int]: +def _extract_openai_usage(response: Any) -> tuple[int, int]: """Extract (prompt_tokens, completion_tokens) from a response.""" usage = getattr(response, "usage", None) if usage is None: @@ -165,6 +167,29 @@ def _extract_usage(response: Any) -> tuple[int, int]: ) +def _extract_anthropic_usage(response: Any) -> tuple[int, int]: + """Extract (input_tokens, output_tokens) from an Anthropic response.""" + usage = getattr(response, "usage", None) + if 
usage is None: + return 0, 0 + return ( + getattr(usage, "input_tokens", 0) or 0, + getattr(usage, "output_tokens", 0) or 0, + ) + + +def _count_tool_calls_in_anthropic_response(response: Any) -> int: + """Count Anthropic ``tool_use`` blocks in a non-streaming response.""" + content = getattr(response, "content", None) + if not content: + return 0 + count = 0 + for block in content: + if getattr(block, "type", None) == "tool_use": + count += 1 + return count + + def _model_total_cost(model: str) -> float: return _MODEL_TOTAL_COSTS.get(model, _model_total_price_shared(model)) @@ -596,6 +621,9 @@ def __next__(self) -> Any: except StopIteration: self._finalize() raise + except Exception: + self._finalize() + raise def __enter__(self) -> _InstrumentedStream: if hasattr(self._stream, "__enter__"): @@ -625,6 +653,9 @@ async def __anext__(self) -> Any: except StopAsyncIteration: self._finalize() raise + except Exception: + self._finalize() + raise async def __aenter__(self) -> _InstrumentedAsyncStream: if hasattr(self._stream, "__aenter__"): @@ -638,6 +669,174 @@ async def __aexit__(self, *args: Any) -> bool: return False +class _InstrumentedAnthropicStreamBase: + """Shared stream-wrapper logic for sync and async Anthropic streams.""" + + __slots__ = ( + "_stream", + "_ctx", + "_model", + "_start_time", + "_pre_action", + "_pre_reason", + "_pre_model", + "_pre_applied", + "_decision_mode", + "_input_tokens", + "_output_tokens", + "_tool_call_count", + "_finalized", + ) + + def __init__( + self, + stream: Any, + ctx: Any, + model: str, + start_time: float, + pre_action: str = "allow", + pre_reason: str = "observe", + pre_model: str | None = None, + pre_applied: bool = True, + decision_mode: str = "observe", + ) -> None: + self._stream = stream + self._ctx = ctx + self._model = model + self._start_time = start_time + self._pre_action = pre_action + self._pre_reason = pre_reason + self._pre_model = pre_model or model + self._pre_applied = pre_applied + self._decision_mode = 
decision_mode + self._input_tokens: int = 0 + self._output_tokens: int = 0 + self._tool_call_count: int = 0 + self._finalized: bool = False + + def close(self) -> None: + self._finalize() + if hasattr(self._stream, "close"): + self._stream.close() + + def _inspect_event(self, event: Any) -> None: + event_type = getattr(event, "type", None) + + if event_type == "message_start": + message = getattr(event, "message", None) + usage = getattr(message, "usage", None) + if usage is not None: + input_tokens = getattr(usage, "input_tokens", None) + output_tokens = getattr(usage, "output_tokens", None) + if isinstance(input_tokens, (int, float)): + self._input_tokens = int(input_tokens) if input_tokens > 0 else 0 + if isinstance(output_tokens, (int, float)): + self._output_tokens = int(output_tokens) if output_tokens > 0 else 0 + return + + usage = getattr(event, "usage", None) + if usage is not None: + input_tokens = getattr(usage, "input_tokens", None) + output_tokens = getattr(usage, "output_tokens", None) + if isinstance(input_tokens, (int, float)) and input_tokens > 0: + self._input_tokens = int(input_tokens) + if isinstance(output_tokens, (int, float)): + self._output_tokens = int(output_tokens) if output_tokens > 0 else 0 + + if event_type == "content_block_start": + content_block = getattr(event, "content_block", None) + block_type = getattr(content_block, "type", None) + if block_type in {"tool_use", "server_tool_use"}: + self._tool_call_count += 1 + + def _finalize(self) -> None: + if self._finalized: + return + self._finalized = True + + if self._ctx is None: + return + + elapsed_ms = (time.monotonic() - self._start_time) * 1000 + _update_context( + self._ctx, + self._model, + self._input_tokens, + self._output_tokens, + self._tool_call_count, + elapsed_ms, + action=self._pre_action, + action_reason=self._pre_reason, + action_model=self._pre_model, + applied=self._pre_applied, + decision_mode=self._decision_mode, + ) + + +class 
_InstrumentedAnthropicStream(_InstrumentedAnthropicStreamBase): + """Wraps an Anthropic sync stream and tracks usage at stream end.""" + + __slots__ = () + + def __iter__(self) -> _InstrumentedAnthropicStream: + return self + + def __next__(self) -> Any: + try: + event = next(self._stream) + self._inspect_event(event) + return event + except StopIteration: + self._finalize() + raise + except Exception: + self._finalize() + raise + + def __enter__(self) -> _InstrumentedAnthropicStream: + if hasattr(self._stream, "__enter__"): + self._stream.__enter__() + return self + + def __exit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__exit__"): + return self._stream.__exit__(*args) # type: ignore[no-any-return] + return False + + +class _InstrumentedAnthropicAsyncStream(_InstrumentedAnthropicStreamBase): + """Wraps an Anthropic async stream and tracks usage at stream end.""" + + __slots__ = () + + def __aiter__(self) -> _InstrumentedAnthropicAsyncStream: + return self + + async def __anext__(self) -> Any: + try: + event = await self._stream.__anext__() + self._inspect_event(event) + return event + except StopAsyncIteration: + self._finalize() + raise + except Exception: + self._finalize() + raise + + async def __aenter__(self) -> _InstrumentedAnthropicAsyncStream: + if hasattr(self._stream, "__aenter__"): + await self._stream.__aenter__() + return self + + async def __aexit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__aexit__"): + return await self._stream.__aexit__(*args) # type: ignore[no-any-return] + return False + + # --------------------------------------------------------------------------- # Wrapper factories # --------------------------------------------------------------------------- @@ -713,8 +912,8 @@ def _finalize_interception( if (not state.is_stream) and ctx: elapsed_ms = (time.monotonic() - state.start_time) * 1000 - prompt_tokens, completion_tokens = _extract_usage(response) - tool_call_count = 
_count_tool_calls_in_response(response) + prompt_tokens, completion_tokens = _extract_openai_usage(response) + tool_call_count = _count_tool_calls_in_openai_response(response) _update_context( ctx, state.model, @@ -810,6 +1009,158 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: return wrapper +def _make_patched_anthropic_create(original_fn: Any) -> Any: + """Create a patched version of ``anthropic.Messages.create``.""" + + @functools.wraps(original_fn) + def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model + pre_applied = True + + if ctx: + kwargs, model, pre_action, pre_reason, pre_model, pre_applied = ( + _resolve_pre_call_decision( + ctx, + mode, + model, + kwargs, + ) + ) + + is_stream = bool(kwargs.get("stream", False)) + start_time = time.monotonic() + response = original_fn(self, *args, **kwargs) + + if not ctx: + logger.debug( + "harness %s (anthropic): model=%s (no active run scope, metrics not tracked)", + mode, + model, + ) + return response + + if is_stream: + return _InstrumentedAnthropicStream( + response, + ctx, + model, + start_time, + pre_action, + pre_reason, + pre_model, + pre_applied, + mode, + ) + + elapsed_ms = (time.monotonic() - start_time) * 1000 + input_tokens, output_tokens = _extract_anthropic_usage(response) + tool_call_count = _count_tool_calls_in_anthropic_response(response) + _update_context( + ctx, + model, + input_tokens, + output_tokens, + tool_call_count, + elapsed_ms, + action=pre_action, + action_reason=pre_reason, + action_model=pre_model, + applied=pre_applied, + decision_mode=mode, + ) + return response + + return wrapper + + +def 
_make_patched_anthropic_async_create(original_fn: Any) -> Any: + """Create a patched version of ``anthropic.AsyncMessages.create``.""" + + @functools.wraps(original_fn) + async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return await original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model + pre_applied = True + + if ctx: + kwargs, model, pre_action, pre_reason, pre_model, pre_applied = ( + _resolve_pre_call_decision( + ctx, + mode, + model, + kwargs, + ) + ) + + is_stream = bool(kwargs.get("stream", False)) + start_time = time.monotonic() + response = await original_fn(self, *args, **kwargs) + + if not ctx: + logger.debug( + "harness %s async (anthropic): model=%s (no active run scope, metrics not tracked)", + mode, + model, + ) + return response + + if is_stream: + return _InstrumentedAnthropicAsyncStream( + response, + ctx, + model, + start_time, + pre_action, + pre_reason, + pre_model, + pre_applied, + mode, + ) + + elapsed_ms = (time.monotonic() - start_time) * 1000 + input_tokens, output_tokens = _extract_anthropic_usage(response) + tool_call_count = _count_tool_calls_in_anthropic_response(response) + _update_context( + ctx, + model, + input_tokens, + output_tokens, + tool_call_count, + elapsed_ms, + action=pre_action, + action_reason=pre_reason, + action_model=pre_model, + applied=pre_applied, + decision_mode=mode, + ) + return response + + return wrapper + + # --------------------------------------------------------------------------- # Public API (called by cascadeflow.harness.api) # --------------------------------------------------------------------------- @@ -846,6 +1197,37 @@ def patch_openai() -> bool: return True +def patch_anthropic() -> bool: + 
"""Patch the Anthropic Python client for harness instrumentation. + + Returns ``True`` if patching succeeded, ``False`` if anthropic is not + installed. Idempotent: safe to call multiple times. + """ + global _anthropic_patched, _original_anthropic_sync_create, _original_anthropic_async_create + + if _anthropic_patched: + logger.debug("anthropic already patched, skipping") + return True + + try: + from anthropic.resources.messages import AsyncMessages, Messages + except ImportError: + logger.debug("anthropic package not available, skipping instrumentation") + return False + + _original_anthropic_sync_create = Messages.create + _original_anthropic_async_create = AsyncMessages.create + + Messages.create = _make_patched_anthropic_create(_original_anthropic_sync_create) # type: ignore[assignment] + AsyncMessages.create = _make_patched_anthropic_async_create( # type: ignore[assignment] + _original_anthropic_async_create, + ) + + _anthropic_patched = True + logger.info("anthropic client instrumented (sync + async)") + return True + + def unpatch_openai() -> None: """Restore original OpenAI client methods. @@ -873,6 +1255,43 @@ def unpatch_openai() -> None: logger.info("openai client unpatched") -def is_patched() -> bool: +def unpatch_anthropic() -> None: + """Restore original Anthropic client methods. + + Safe to call even if not patched. Used by ``reset()`` and tests. 
+ """ + global _anthropic_patched, _original_anthropic_sync_create, _original_anthropic_async_create + + if not _anthropic_patched: + return + + try: + from anthropic.resources.messages import AsyncMessages, Messages + except ImportError: + _anthropic_patched = False + return + + if _original_anthropic_sync_create is not None: + Messages.create = _original_anthropic_sync_create # type: ignore[assignment] + if _original_anthropic_async_create is not None: + AsyncMessages.create = _original_anthropic_async_create # type: ignore[assignment] + + _original_anthropic_sync_create = None + _original_anthropic_async_create = None + _anthropic_patched = False + logger.info("anthropic client unpatched") + + +def is_openai_patched() -> bool: """Return whether the OpenAI client is currently patched.""" return _openai_patched + + +def is_anthropic_patched() -> bool: + """Return whether the Anthropic client is currently patched.""" + return _anthropic_patched + + +def is_patched() -> bool: + """Return whether any supported Python SDK is currently patched.""" + return _openai_patched or _anthropic_patched diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py index bd86323e..81a1de06 100644 --- a/cascadeflow/harness/pricing.py +++ b/cascadeflow/harness/pricing.py @@ -1,11 +1,17 @@ """Shared harness pricing and energy profiles. This module centralizes model-cost and energy-estimation defaults used by -harness integrations (OpenAI auto-instrumentation, OpenAI Agents SDK, CrewAI). +harness integrations (OpenAI auto-instrumentation, OpenAI Agents SDK, CrewAI, +Google ADK). + +A future pricing registry will consolidate with ``cascadeflow.pricing`` +and LiteLLM live data. Until then this module is the canonical source +for harness-level cost/energy estimation. """ from __future__ import annotations +import re as _re from typing import Final # USD per 1M tokens (input, output). 
@@ -21,15 +27,22 @@ "o1": (15.00, 60.00), "o1-mini": (3.00, 12.00), "o3-mini": (1.10, 4.40), - # Anthropic aliases used by CrewAI model names. + # Anthropic "claude-sonnet-4": (3.00, 15.00), "claude-haiku-3.5": (1.00, 5.00), "claude-opus-4.5": (5.00, 25.00), + # Google Gemini + "gemini-2.5-flash": (0.15, 0.60), + "gemini-2.5-pro": (1.25, 10.00), + "gemini-2.0-flash": (0.10, 0.40), + "gemini-1.5-flash": (0.075, 0.30), + "gemini-1.5-pro": (1.25, 5.00), } DEFAULT_PRICING_USD_PER_M: Final[tuple[float, float]] = (2.50, 10.00) # Deterministic proxy coefficients for energy tracking. ENERGY_COEFFICIENTS: Final[dict[str, float]] = { + # OpenAI "gpt-4o": 1.0, "gpt-4o-mini": 0.3, "gpt-5": 1.2, @@ -40,6 +53,16 @@ "o1": 2.0, "o1-mini": 0.8, "o3-mini": 0.5, + # Anthropic + "claude-sonnet-4": 1.0, + "claude-haiku-3.5": 0.3, + "claude-opus-4.5": 1.8, + # Google Gemini + "gemini-2.5-flash": 0.3, + "gemini-2.5-pro": 1.2, + "gemini-2.0-flash": 0.25, + "gemini-1.5-flash": 0.2, + "gemini-1.5-pro": 1.0, } DEFAULT_ENERGY_COEFFICIENT: Final[float] = 1.0 ENERGY_OUTPUT_WEIGHT: Final[float] = 1.5 @@ -60,19 +83,85 @@ ) +# --------------------------------------------------------------------------- +# Fuzzy model-name resolution +# --------------------------------------------------------------------------- + +# Pre-compiled pattern for stripping version/preview/date suffixes. +# Matches: -preview, -preview-05-20, -20250120, -latest, -exp-0827, etc. +_VERSION_SUFFIX_RE = _re.compile( + r"(-preview(?:-\d{2,4}-\d{2})?|-\d{8,}|-latest|-exp(?:-\d+)?|-it)$" +) + +# Cache for resolved model → pricing key lookups. +_pricing_key_cache: dict[str, str | None] = {} + + +def _resolve_pricing_key(model: str) -> str | None: + """Resolve a model name to a known pricing table key. + + Tries exact match first, then strips version/preview/date suffixes, + then tries longest-prefix match against known model names. + Returns ``None`` when no match is found (caller should use defaults). 
+ """ + if model in _pricing_key_cache: + return _pricing_key_cache[model] + + # Exact match + if model in PRICING_USD_PER_M: + _pricing_key_cache[model] = model + return model + + # Strip version suffixes and retry + stripped = _VERSION_SUFFIX_RE.sub("", model) + if stripped != model and stripped in PRICING_USD_PER_M: + _pricing_key_cache[model] = stripped + return stripped + + # Longest-prefix match (e.g. "gemini-2.5-flash-8b" → "gemini-2.5-flash") + best: str | None = None + best_len = 0 + for known in PRICING_USD_PER_M: + if model.startswith(known) and len(known) > best_len: + best = known + best_len = len(known) + if best is not None: + _pricing_key_cache[model] = best + return best + + _pricing_key_cache[model] = None + return None + + +# --------------------------------------------------------------------------- +# Public estimation helpers +# --------------------------------------------------------------------------- + + def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: """Estimate USD cost from token usage.""" - in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) + key = _resolve_pricing_key(model) + in_price, out_price = ( + PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M + ) return (input_tokens / 1_000_000.0) * in_price + (output_tokens / 1_000_000.0) * out_price def estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: """Estimate deterministic proxy energy units.""" - coefficient = ENERGY_COEFFICIENTS.get(model, DEFAULT_ENERGY_COEFFICIENT) - return coefficient * (input_tokens + (output_tokens * ENERGY_OUTPUT_WEIGHT)) + key = _resolve_pricing_key(model) + coeff = ( + ENERGY_COEFFICIENTS.get(key, DEFAULT_ENERGY_COEFFICIENT) + if key + else DEFAULT_ENERGY_COEFFICIENT + ) + return coeff * (input_tokens + (output_tokens * ENERGY_OUTPUT_WEIGHT)) def model_total_price(model: str) -> float: """Return total (input + output) price per 1M 
tokens.""" - in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) + key = _resolve_pricing_key(model) + in_price, out_price = ( + PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M + ) return in_price + out_price diff --git a/cascadeflow/integrations/__init__.py b/cascadeflow/integrations/__init__.py index 33552773..61c3ebbd 100644 --- a/cascadeflow/integrations/__init__.py +++ b/cascadeflow/integrations/__init__.py @@ -185,6 +185,28 @@ crewai_is_enabled = None crewai_get_config = None +# Try to import Google ADK integration +try: + from .google_adk import ( + GOOGLE_ADK_AVAILABLE, + GoogleADKHarnessConfig, + CascadeFlowADKPlugin, + enable as google_adk_enable, + disable as google_adk_disable, + is_available as google_adk_is_available, + is_enabled as google_adk_is_enabled, + get_config as google_adk_get_config, + ) +except ImportError: + GOOGLE_ADK_AVAILABLE = False + GoogleADKHarnessConfig = None + CascadeFlowADKPlugin = None + google_adk_enable = None + google_adk_disable = None + google_adk_is_available = None + google_adk_is_enabled = None + google_adk_get_config = None + __all__ = [] if LITELLM_AVAILABLE: @@ -285,6 +307,20 @@ ] ) +if GOOGLE_ADK_AVAILABLE: + __all__.extend( + [ + "GOOGLE_ADK_AVAILABLE", + "GoogleADKHarnessConfig", + "CascadeFlowADKPlugin", + "google_adk_enable", + "google_adk_disable", + "google_adk_is_available", + "google_adk_is_enabled", + "google_adk_get_config", + ] + ) + # Integration capabilities INTEGRATION_CAPABILITIES = { "litellm": LITELLM_AVAILABLE, @@ -294,6 +330,7 @@ "openclaw": OPENCLAW_AVAILABLE, "paygentic": PAYGENTIC_AVAILABLE, "crewai": CREWAI_AVAILABLE, + "google_adk": GOOGLE_ADK_AVAILABLE, } @@ -319,4 +356,5 @@ def get_integration_info(): "openclaw_available": OPENCLAW_AVAILABLE, "paygentic_available": PAYGENTIC_AVAILABLE, "crewai_available": CREWAI_AVAILABLE, + "google_adk_available": GOOGLE_ADK_AVAILABLE, } diff --git 
a/cascadeflow/integrations/google_adk.py b/cascadeflow/integrations/google_adk.py new file mode 100644 index 00000000..325d21b2 --- /dev/null +++ b/cascadeflow/integrations/google_adk.py @@ -0,0 +1,486 @@ +"""Google ADK (Agent Development Kit) harness integration for cascadeflow. + +Uses ADK's ``BasePlugin`` system to intercept all LLM calls across all agents +in a Runner, feeding metrics into ``cascadeflow.harness`` run contexts. + +This module is optional — ``pip install cascadeflow[google-adk]`` pulls in the +google-adk dependency. When google-adk is not installed the public helpers +return gracefully and ``GOOGLE_ADK_AVAILABLE`` is ``False``. + +Integration surface: + - ``enable()``: create and return a plugin instance + - ``disable()``: deactivate the plugin and clean up + - ``CascadeFlowADKPlugin``: BasePlugin subclass for Runner(plugins=[...]) + +Unlike CrewAI (global hooks), ADK plugins are registered per-Runner. +``enable()`` returns the plugin instance; the user passes it to +``Runner(plugins=[plugin])``. + +Design note — no tool gating: + ADK's ``tools_dict`` is part of agent definition, not per-call. + Budget gate via ``before_model_callback`` provides sufficient cost control. +""" + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass +from importlib.util import find_spec +from typing import Any, Optional + +from cascadeflow.harness.api import get_current_run +from cascadeflow.harness.pricing import estimate_cost, estimate_energy + +logger = logging.getLogger("cascadeflow.integrations.google_adk") + +GOOGLE_ADK_AVAILABLE = find_spec("google.adk") is not None + +# Resolve the base class: use ADK's BasePlugin when available, else object. 
+_ADKBasePlugin: type +if GOOGLE_ADK_AVAILABLE: + try: + from google.adk.plugins import BasePlugin as _ADKBasePlugin # type: ignore[assignment] + except ImportError: + _ADKBasePlugin = object # type: ignore[assignment,misc] + GOOGLE_ADK_AVAILABLE = False +else: + _ADKBasePlugin = object # type: ignore[assignment,misc] + + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + + +@dataclass +class GoogleADKHarnessConfig: + """Runtime configuration for the Google ADK harness integration. + + fail_open: + If ``True`` (default), errors inside callbacks never break ADK + execution — they are logged and swallowed. + enable_budget_gate: + If ``True`` (default), ``before_model_callback`` blocks calls when + the harness run budget is exhausted (enforce mode only). + """ + + fail_open: bool = True + enable_budget_gate: bool = True + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _normalize_model_name(model: str) -> str: + """Strip LiteLlm-style provider prefix (``openai/gpt-4o`` → ``gpt-4o``). + + Also handles ``models/gemini-2.5-flash`` → ``gemini-2.5-flash``. 
+ """ + if "/" in model: + return model.rsplit("/", 1)[-1] + return model + + +def _count_function_calls(content: Any) -> int: + """Count ``function_call`` parts in an ADK LlmResponse content.""" + if content is None: + return 0 + parts = getattr(content, "parts", None) + if not parts: + return 0 + count = 0 + for part in parts: + if getattr(part, "function_call", None) is not None: + count += 1 + return count + + +# --------------------------------------------------------------------------- +# Plugin +# --------------------------------------------------------------------------- + + +class CascadeFlowADKPlugin(_ADKBasePlugin): # type: ignore[misc] + """Google ADK BasePlugin with cascadeflow harness awareness. + + Intercepts every LLM call across all agents in a Runner to provide: + - Budget enforcement (enforce mode: short-circuits with error response) + - Cost, latency, and energy tracking + - Tool call counting + - Full trace recording into HarnessRunContext + """ + + def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: + # google-adk BasePlugin requires a stable plugin name. + try: + super().__init__(name="cascadeflow_harness") + except TypeError: + # Fallback for local test environments where BasePlugin is ``object``. + super().__init__() + self.name = "cascadeflow_harness" + self._config = config or GoogleADKHarnessConfig() + self._active = True + self._call_seq: int = 0 + # Track call metadata between before/after callbacks. + # Keyed by id(callback_context) to guarantee uniqueness even when + # two concurrent calls share (invocation_id, agent_name). + self._call_start_times: dict[int, float] = {} + self._call_models: dict[int, str] = {} + # Fallback mapping for runtimes that provide distinct callback_context + # objects between before/after callbacks. 
+ self._call_fallback_keys: dict[tuple[str, str], list[int]] = {} + + @staticmethod + def _callback_key(callback_context: Any) -> int: + """Return a unique key for a callback_context object. + + Uses ``id()`` which is guaranteed unique for the lifetime of the + object — ADK keeps the same CallbackContext alive across the + before/after/error callback sequence for a single LLM call. + """ + return id(callback_context) + + @staticmethod + def _fallback_key(callback_context: Any) -> tuple[str, str]: + """Return a stable fallback key for correlation across callbacks.""" + invocation_id = str(getattr(callback_context, "invocation_id", "") or "") + agent_name = str(getattr(callback_context, "agent_name", "") or "") + return (invocation_id, agent_name) + + def _track_call_key(self, callback_context: Any, key: int) -> None: + """Register key in fallback queue for cross-object callback matching.""" + fallback_key = self._fallback_key(callback_context) + if not fallback_key[0] and not fallback_key[1]: + return + self._call_fallback_keys.setdefault(fallback_key, []).append(key) + + def _resolve_call_key(self, callback_context: Any) -> int | None: + """Resolve stored key for callback context across runtime variants.""" + key = self._callback_key(callback_context) + if key in self._call_models or key in self._call_start_times: + return key + + fallback_key = self._fallback_key(callback_context) + keys = self._call_fallback_keys.get(fallback_key) + if not keys: + return None + + resolved = keys.pop(0) + if not keys: + self._call_fallback_keys.pop(fallback_key, None) + return resolved + + async def before_model_callback( + self, + callback_context: Any, + llm_request: Any, + ) -> Any: + """Budget gate and timing setup. + + Returns ``None`` to proceed normally, or an ``LlmResponse`` with + an error to short-circuit the call when budget is exhausted. 
+ """ + if not self._active: + return None + + try: + ctx = get_current_run() + if ctx is None: + return None + if ctx.mode == "off": + return None + + # Extract model name from request + model_raw = getattr(llm_request, "model", None) or "unknown" + model = _normalize_model_name(str(model_raw)) + + key = self._callback_key(callback_context) + + # Budget gate in enforce mode + if ( + self._config.enable_budget_gate + and ctx.mode == "enforce" + and ctx.budget_max is not None + and ctx.cost >= ctx.budget_max + ): + logger.warning( + "google-adk: blocking LLM call — budget exhausted " + "(spent $%.4f of $%.4f max)", + ctx.cost, + ctx.budget_max, + ) + ctx.record(action="stop", reason="budget_exhausted", model=model) + return self._make_budget_error_response(ctx) + + # Record start time and model for after_model_callback + self._call_start_times[key] = time.monotonic() + self._call_models[key] = model + self._track_call_key(callback_context, key) + + return None + except Exception: + if self._config.fail_open: + logger.debug("google-adk before_model_callback error (fail_open)", exc_info=True) + return None + raise + + async def after_model_callback( + self, + callback_context: Any, + llm_response: Any, + ) -> Any: + """Extract tokens, count tool calls, estimate cost/energy, update run context.""" + if not self._active: + return None + + try: + ctx = get_current_run() + if ctx is None: + return None + if ctx.mode == "off": + return None + + key = self._resolve_call_key(callback_context) + + # Recover model name stored during before_model_callback + model = self._call_models.pop(key, "unknown") if key is not None else "unknown" + + # Extract token counts from usage_metadata + input_tokens, output_tokens = self._extract_tokens(llm_response) + + # Count function_call parts in response content + content = getattr(llm_response, "content", None) + tool_calls = _count_function_calls(content) + + # Cost and energy estimation + cost = estimate_cost(model, input_tokens, 
output_tokens) + energy = estimate_energy(model, input_tokens, output_tokens) + + # Latency + start_time = self._call_start_times.pop(key, None) if key is not None else None + elapsed_ms = (time.monotonic() - start_time) * 1000 if start_time else 0.0 + + # Update run context + ctx.cost += cost + ctx.step_count += 1 + ctx.latency_used_ms += elapsed_ms + ctx.energy_used += energy + ctx.tool_calls += tool_calls + + if ctx.budget_max is not None: + ctx.budget_remaining = ctx.budget_max - ctx.cost + + ctx.model_used = model + ctx.record(action="allow", reason=ctx.mode, model=model) + + logger.debug( + "google-adk: tracked call model=%s cost=$%.6f latency=%.0fms tools=%d", + model, + cost, + elapsed_ms, + tool_calls, + ) + + return None + except Exception: + if self._config.fail_open: + logger.debug("google-adk after_model_callback error (fail_open)", exc_info=True) + return None + raise + + async def on_model_error_callback( + self, + callback_context: Any, + llm_request: Any = None, + error: Exception | None = None, + ) -> Any: + """Record error in trace and clean up timing state.""" + if not self._active: + return None + + try: + # Backward-compatible calling form used in existing tests: + # on_model_error_callback(callback_context, error) + if error is None and isinstance(llm_request, Exception): + error = llm_request + + key = self._resolve_call_key(callback_context) + model = self._call_models.pop(key, "unknown") if key is not None else "unknown" + if key is not None: + self._call_start_times.pop(key, None) + + ctx = get_current_run() + if ctx is not None and error is not None: + error_type = type(error).__name__ + ctx.record( + action="error", + reason=f"model_error:{error_type}", + model=model, + ) + + return None + except Exception: + if self._config.fail_open: + logger.debug("google-adk on_model_error_callback error (fail_open)", exc_info=True) + return None + raise + + def deactivate(self) -> None: + """Make all callbacks no-ops without unregistering from 
Runner.""" + self._active = False + self._call_seq = 0 + self._call_start_times.clear() + self._call_models.clear() + self._call_fallback_keys.clear() + + @staticmethod + def _extract_tokens(llm_response: Any) -> tuple[int, int]: + """Extract input/output token counts from an ADK LlmResponse. + + ADK responses carry ``usage_metadata`` with ``prompt_token_count`` + and ``candidates_token_count``. Falls back to estimating from + content text (4 chars ≈ 1 token). + """ + usage = getattr(llm_response, "usage_metadata", None) + if usage is not None: + input_tokens = getattr(usage, "prompt_token_count", 0) or 0 + output_tokens = getattr(usage, "candidates_token_count", 0) or 0 + if input_tokens > 0 or output_tokens > 0: + return int(input_tokens), int(output_tokens) + + # Fallback: estimate from content text + content = getattr(llm_response, "content", None) + if content is not None: + parts = getattr(content, "parts", None) + if parts: + text_chars = sum(len(getattr(p, "text", "") or "") for p in parts) + return 0, max(text_chars // 4, 1) + + return 0, 0 + + @staticmethod + def _make_budget_error_response(ctx: Any) -> Any: + """Build an LlmResponse that short-circuits the LLM call. + + When ADK is available we return a real ``LlmResponse``. When not + (shouldn't happen in practice), we return a sentinel dict. + + The user-facing message is intentionally generic to avoid leaking + internal spend/limit numbers. Exact figures are logged separately. + """ + # Generic message safe for end-user exposure. + msg = "cascadeflow harness budget exceeded" + # Detailed figures for operators only. 
+ logger.warning( + "google-adk: budget exceeded — spent $%.4f of $%.4f max", + ctx.cost, + ctx.budget_max, + ) + if GOOGLE_ADK_AVAILABLE: + try: + from google.adk.models import LlmResponse # type: ignore[import-untyped] + from google.genai.types import Content, Part # type: ignore[import-untyped] + + return LlmResponse( + content=Content(parts=[Part(text=msg)]), + error_code="BUDGET_EXCEEDED", + error_message=msg, + ) + except ImportError: + pass + + return {"error_code": "BUDGET_EXCEEDED", "error_message": msg} + + +# --------------------------------------------------------------------------- +# Module-level state +# --------------------------------------------------------------------------- + +_config: GoogleADKHarnessConfig = GoogleADKHarnessConfig() +_plugin_instance: Optional[CascadeFlowADKPlugin] = None +_enabled: bool = False + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def is_available() -> bool: + """Return whether the google-adk package is installed.""" + return GOOGLE_ADK_AVAILABLE + + +def is_enabled() -> bool: + """Return whether a plugin instance has been created via ``enable()``.""" + return _enabled + + +def get_config() -> GoogleADKHarnessConfig: + """Return a copy of the current configuration.""" + return GoogleADKHarnessConfig( + fail_open=_config.fail_open, + enable_budget_gate=_config.enable_budget_gate, + ) + + +def enable( + config: Optional[GoogleADKHarnessConfig] = None, +) -> CascadeFlowADKPlugin: + """Create a cascadeflow-instrumented ADK plugin instance. + + Unlike CrewAI (global hooks), ADK plugins are per-Runner. Pass the + returned plugin to ``Runner(plugins=[plugin])``. + + Idempotent: returns the same instance on repeated calls unless + ``disable()`` was called in between. + + Args: + config: Optional configuration overrides. 
+ + Returns: + ``CascadeFlowADKPlugin`` instance ready for ``Runner(plugins=[...])``. + """ + global _config, _plugin_instance, _enabled + + if _enabled and _plugin_instance is not None: + logger.debug("google-adk plugin already enabled; returning existing instance") + return _plugin_instance + + if config is not None: + _config = config + + _plugin_instance = CascadeFlowADKPlugin(config=_config) + _enabled = True + logger.info("google-adk harness plugin created") + return _plugin_instance + + +def disable() -> None: + """Deactivate the plugin and clear module state. + + Safe to call even if not enabled. + """ + global _plugin_instance, _enabled + + if _plugin_instance is not None: + _plugin_instance.deactivate() + + _plugin_instance = None + _enabled = False + logger.info("google-adk harness plugin disabled") + + +__all__ = [ + "GOOGLE_ADK_AVAILABLE", + "GoogleADKHarnessConfig", + "CascadeFlowADKPlugin", + "enable", + "disable", + "is_available", + "is_enabled", + "get_config", +] diff --git a/cascadeflow/integrations/langchain/__init__.py b/cascadeflow/integrations/langchain/__init__.py index 45c6ea2f..7b3f9551 100644 --- a/cascadeflow/integrations/langchain/__init__.py +++ b/cascadeflow/integrations/langchain/__init__.py @@ -54,6 +54,14 @@ CascadeFlowCallbackHandler, get_cascade_callback, ) +from .harness_callback import ( + HarnessAwareCascadeFlowCallbackHandler, + get_harness_callback, +) +from .harness_state import ( + apply_langgraph_state, + extract_langgraph_state, +) __all__ = [ # Main classes @@ -93,4 +101,8 @@ # LangChain callback handlers "CascadeFlowCallbackHandler", "get_cascade_callback", + "HarnessAwareCascadeFlowCallbackHandler", + "get_harness_callback", + "extract_langgraph_state", + "apply_langgraph_state", ] diff --git a/cascadeflow/integrations/langchain/harness_callback.py b/cascadeflow/integrations/langchain/harness_callback.py new file mode 100644 index 00000000..01f08d8c --- /dev/null +++ 
b/cascadeflow/integrations/langchain/harness_callback.py @@ -0,0 +1,248 @@ +"""Harness-aware callbacks for LangChain/LangGraph integration. + +Enforce-mode limitations (LangChain callback architecture): + - ``stop`` (budget/latency/energy exceeded): fully enforced — raises + BudgetExceededError or HarnessStopError from ``on_llm_start``. + - ``deny_tool`` (tool-call cap): fully enforced at the tool level via + ``on_tool_start`` — raises HarnessStopError before tool execution. + - ``switch_model``: **observe-only** — LangChain dispatches the LLM call + before ``on_llm_start`` returns, so the callback cannot redirect to a + different model. The decision is recorded with ``applied=False``. + - ``deny_tool`` at LLM level (pre-call decision): **observe-only** — the + callback cannot strip tools from an already-dispatched LLM request. + The decision is recorded with ``applied=False``. +""" + +from __future__ import annotations + +import logging +import time +from contextlib import contextmanager +from typing import Any, Optional + +from cascadeflow.harness import get_current_run +from cascadeflow.harness.pricing import estimate_cost, estimate_energy +from cascadeflow.schema.exceptions import HarnessStopError + +from .harness_state import apply_langgraph_state, extract_langgraph_state +from .langchain_callbacks import CascadeFlowCallbackHandler +from .utils import extract_token_usage + +logger = logging.getLogger("cascadeflow.harness.langchain") + + +class HarnessAwareCascadeFlowCallbackHandler(CascadeFlowCallbackHandler): + """LangChain callback that bridges native lifecycle events into HarnessRunContext. + + See module docstring for enforce-mode limitations on ``switch_model`` + and LLM-level ``deny_tool``. 
+ """ + + def __init__(self, *, fail_open: bool = True): + super().__init__() + self.fail_open = fail_open + self._llm_started_at: Optional[float] = None + self._pre_action: str = "allow" + self._pre_reason: str = "allow" + self._pre_model: Optional[str] = None + self._pre_recorded: bool = False + + def _handle_harness_error(self, error: Exception) -> None: + if self.fail_open: + logger.exception("langchain harness callback failed (fail-open)", exc_info=error) + return + raise error + + def _sync_state(self, payload: dict[str, Any]) -> None: + run_ctx = get_current_run() + if run_ctx is None: + return + state = extract_langgraph_state(payload) + if state: + apply_langgraph_state(run_ctx, state) + + def on_llm_start(self, serialized: dict[str, Any], prompts: list[str], **kwargs: Any) -> None: + super().on_llm_start(serialized=serialized, prompts=prompts, **kwargs) + self._llm_started_at = time.monotonic() + self._pre_action = "allow" + self._pre_reason = "allow" + self._pre_model = self.current_model + self._pre_recorded = False + + try: + self._sync_state(kwargs) + + run_ctx = get_current_run() + if run_ctx is None: + return + + model_name = self.current_model or "unknown" + invocation_params = kwargs.get("invocation_params") + has_tools = False + if isinstance(invocation_params, dict): + has_tools = bool(invocation_params.get("tools")) + if not has_tools: + has_tools = bool(kwargs.get("tools")) + + from cascadeflow.harness.instrument import ( + _evaluate_pre_call_decision, + _raise_stop_error, + ) # noqa: I001 + + decision = _evaluate_pre_call_decision(run_ctx, model_name, has_tools=has_tools) + self._pre_action = decision.action + self._pre_reason = decision.reason + self._pre_model = decision.target_model + + if run_ctx.mode == "observe": + if decision.action != "allow": + run_ctx.record( + action=decision.action, + reason=decision.reason, + model=decision.target_model, + applied=False, + decision_mode="observe", + ) + self._pre_recorded = True + return + + if 
run_ctx.mode != "enforce": + return + + if decision.action == "stop": + run_ctx.record( + action="stop", + reason=decision.reason, + model=model_name, + applied=True, + decision_mode="enforce", + ) + self._pre_recorded = True + _raise_stop_error(run_ctx, decision.reason) + + if decision.action == "switch_model": + run_ctx.record( + action="switch_model", + reason=decision.reason, + model=decision.target_model, + applied=False, + decision_mode="enforce", + ) + self._pre_recorded = True + + if decision.action == "deny_tool" and has_tools: + run_ctx.record( + action="deny_tool", + reason=decision.reason, + model=model_name, + applied=False, + decision_mode="enforce", + ) + self._pre_recorded = True + + except Exception as exc: + self._handle_harness_error(exc) + + def on_llm_end(self, response: Any, **kwargs: Any) -> None: + super().on_llm_end(response=response, **kwargs) + + try: + self._sync_state(kwargs) + run_ctx = get_current_run() + if run_ctx is None: + return + + model_name = self.current_model + if not model_name and getattr(response, "llm_output", None): + model_name = response.llm_output.get("model_name") + model_name = model_name or "unknown" + + token_usage = extract_token_usage(response) + prompt_tokens = int(token_usage["input"]) + completion_tokens = int(token_usage["output"]) + elapsed_ms = 0.0 + if self._llm_started_at is not None: + elapsed_ms = (time.monotonic() - self._llm_started_at) * 1000.0 + + run_ctx.step_count += 1 + run_ctx.cost += estimate_cost(model_name, prompt_tokens, completion_tokens) + run_ctx.energy_used += estimate_energy(model_name, prompt_tokens, completion_tokens) + run_ctx.latency_used_ms += elapsed_ms + + if run_ctx.budget_max is not None: + run_ctx.budget_remaining = run_ctx.budget_max - run_ctx.cost + + if self._pre_action == "allow": + run_ctx.record( + action="allow", + reason="langchain_step", + model=model_name, + applied=True, + decision_mode=run_ctx.mode, + ) + elif not self._pre_recorded: + run_ctx.record( + 
action=self._pre_action, + reason=self._pre_reason, + model=self._pre_model or model_name, + applied=False, + decision_mode=run_ctx.mode, + ) + + except Exception as exc: + self._handle_harness_error(exc) + finally: + self._llm_started_at = None + self._pre_action = "allow" + self._pre_reason = "allow" + self._pre_model = None + self._pre_recorded = False + + def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: Any) -> Any: + try: + self._sync_state(kwargs) + run_ctx = get_current_run() + if run_ctx is None: + return None + if run_ctx.tool_calls_max is None: + return None + + if run_ctx.tool_calls >= run_ctx.tool_calls_max: + if run_ctx.mode == "observe": + run_ctx.record( + action="deny_tool", + reason="max_tool_calls_reached", + model=self.current_model, + applied=False, + decision_mode="observe", + ) + return None + if run_ctx.mode == "enforce": + run_ctx.record( + action="deny_tool", + reason="max_tool_calls_reached", + model=self.current_model, + applied=True, + decision_mode="enforce", + ) + raise HarnessStopError( + "cascadeflow harness deny_tool: max tool calls reached", + reason="max_tool_calls_reached", + ) + + # Track executed tools (not predicted tool calls in LLM output). 
+ run_ctx.tool_calls += 1 + return None + except Exception as exc: + self._handle_harness_error(exc) + return None + + +@contextmanager +def get_harness_callback(*, fail_open: bool = True): + """Context manager that yields a harness-aware LangChain callback handler.""" + callback = HarnessAwareCascadeFlowCallbackHandler(fail_open=fail_open) + yield callback + + +__all__ = ["HarnessAwareCascadeFlowCallbackHandler", "get_harness_callback"] diff --git a/cascadeflow/integrations/langchain/harness_state.py b/cascadeflow/integrations/langchain/harness_state.py new file mode 100644 index 00000000..b4b40da5 --- /dev/null +++ b/cascadeflow/integrations/langchain/harness_state.py @@ -0,0 +1,124 @@ +"""LangGraph/LangChain state extraction helpers for harness integration.""" + +from __future__ import annotations + +from typing import Any, Mapping, Optional + + +def _as_int(value: Any) -> Optional[int]: + try: + if value is None: + return None + return int(value) + except (TypeError, ValueError): + return None + + +def _as_float(value: Any) -> Optional[float]: + try: + if value is None: + return None + return float(value) + except (TypeError, ValueError): + return None + + +def _extract_candidate_state(source: Any) -> Optional[Mapping[str, Any]]: + """Extract a named state container from a mapping. + + Only returns state from explicitly named keys (langgraph_state, graph_state, + state). Returns None when no named key matches — avoids treating arbitrary + kwargs as harness state. 
+ """ + if not isinstance(source, Mapping): + return None + + for key in ("langgraph_state", "graph_state", "state"): + candidate = source.get(key) + if isinstance(candidate, Mapping): + return candidate + + return None + + +def extract_langgraph_state(payload: Any) -> dict[str, Any]: + """Extract normalized harness-relevant fields from LangGraph-style state payloads.""" + + candidates: list[Mapping[str, Any]] = [] + root = _extract_candidate_state(payload) + if root is not None: + candidates.append(root) + + if isinstance(payload, Mapping): + metadata = payload.get("metadata") + if isinstance(metadata, Mapping): + state_from_metadata = _extract_candidate_state(metadata) + if state_from_metadata is not None: + candidates.append(state_from_metadata) + + configurable = payload.get("configurable") + if isinstance(configurable, Mapping): + state_from_configurable = _extract_candidate_state(configurable) + if state_from_configurable is not None: + candidates.append(state_from_configurable) + + merged: dict[str, Any] = {} + for source in candidates: + if "agent_id" in source and isinstance(source.get("agent_id"), str): + merged["agent_id"] = source["agent_id"] + if "model" in source and isinstance(source.get("model"), str): + merged["model_used"] = source["model"] + if "model_used" in source and isinstance(source.get("model_used"), str): + merged["model_used"] = source["model_used"] + + step_count = _as_int(source.get("step_count", source.get("step"))) + if step_count is not None: + merged["step_count"] = step_count + + tool_calls = _as_int(source.get("tool_calls")) + if tool_calls is not None: + merged["tool_calls"] = tool_calls + + budget_remaining = _as_float(source.get("budget_remaining")) + if budget_remaining is not None: + merged["budget_remaining"] = budget_remaining + + latency_used_ms = _as_float(source.get("latency_used_ms", source.get("latency_ms"))) + if latency_used_ms is not None: + merged["latency_used_ms"] = latency_used_ms + + energy_used = 
_as_float(source.get("energy_used", source.get("energy"))) + if energy_used is not None: + merged["energy_used"] = energy_used + + return merged + + +def apply_langgraph_state(run_ctx: Any, state: Mapping[str, Any]) -> None: + """Apply extracted state fields onto an active HarnessRunContext.""" + if run_ctx is None or not isinstance(state, Mapping): + return + + step_count = _as_int(state.get("step_count")) + if step_count is not None and step_count > getattr(run_ctx, "step_count", 0): + run_ctx.step_count = step_count + + tool_calls = _as_int(state.get("tool_calls")) + if tool_calls is not None and tool_calls > getattr(run_ctx, "tool_calls", 0): + run_ctx.tool_calls = tool_calls + + latency_used_ms = _as_float(state.get("latency_used_ms")) + if latency_used_ms is not None and latency_used_ms > getattr(run_ctx, "latency_used_ms", 0.0): + run_ctx.latency_used_ms = latency_used_ms + + energy_used = _as_float(state.get("energy_used")) + if energy_used is not None and energy_used > getattr(run_ctx, "energy_used", 0.0): + run_ctx.energy_used = energy_used + + budget_remaining = _as_float(state.get("budget_remaining")) + if budget_remaining is not None: + run_ctx.budget_remaining = budget_remaining + + model_used = state.get("model_used") + if isinstance(model_used, str) and model_used: + run_ctx.model_used = model_used diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py new file mode 100644 index 00000000..9ba062e5 --- /dev/null +++ b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py @@ -0,0 +1,213 @@ +"""Tests for harness-aware LangChain callback integration.""" + +from __future__ import annotations + +import pytest +from langchain_core.messages import AIMessage +from langchain_core.outputs import ChatGeneration, LLMResult + +from cascadeflow.harness import init, reset, run +from cascadeflow.integrations.langchain.harness_callback 
import ( + HarnessAwareCascadeFlowCallbackHandler, +) +from cascadeflow.integrations.langchain.harness_state import ( + apply_langgraph_state, + extract_langgraph_state, +) +from cascadeflow.integrations.langchain.utils import extract_tool_calls +from cascadeflow.schema.exceptions import BudgetExceededError, HarnessStopError + + +@pytest.fixture(autouse=True) +def _reset_harness_state() -> None: + reset() + + +def _llm_result(model_name: str, prompt_tokens: int, completion_tokens: int) -> LLMResult: + generation = ChatGeneration(message=AIMessage(content="ok"), generation_info={}) + return LLMResult( + generations=[[generation]], + llm_output={ + "model_name": model_name, + "token_usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + }, + ) + + +def test_harness_callback_updates_active_run_metrics() -> None: + init(mode="observe", budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler() + + with run(budget=1.0) as ctx: + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o-mini"}, + ) + handler.on_llm_end(_llm_result("gpt-4o-mini", 120, 80)) + + assert ctx.step_count == 1 + assert ctx.cost > 0 + assert ctx.energy_used > 0 + assert ctx.budget_remaining is not None + assert ctx.budget_remaining < 1.0 + assert ctx.last_action == "allow" + assert ctx.model_used == "gpt-4o-mini" + + +def test_harness_callback_enforce_raises_when_budget_exhausted() -> None: + init(mode="enforce", budget=0.1) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(budget=0.1) as ctx: + ctx.cost = 0.1 + ctx.budget_remaining = 0.0 + + with pytest.raises(BudgetExceededError): + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o-mini"}, + ) + + trace = ctx.trace() + assert trace + assert trace[-1]["action"] == "stop" + assert trace[-1]["reason"] == "budget_exceeded" + assert 
trace[-1]["applied"] is True + + +def test_harness_callback_observe_records_non_applied_decisions() -> None: + init(mode="observe", budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler() + + with run(budget=1.0) as ctx: + ctx.cost = 0.9 + ctx.budget_remaining = 0.1 + + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o", "tools": [{"name": "lookup"}]}, + ) + + trace = ctx.trace() + assert trace + assert trace[-1]["action"] in {"switch_model", "deny_tool"} + assert trace[-1]["applied"] is False + assert trace[-1]["decision_mode"] == "observe" + + +def test_harness_callback_enforce_denies_tool_when_limit_reached() -> None: + init(mode="enforce", max_tool_calls=0, budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(max_tool_calls=0, budget=1.0) as ctx: + with pytest.raises(HarnessStopError, match="max tool calls"): + handler.on_tool_start(serialized={"name": "search"}, input_str="query") + + trace = ctx.trace() + assert trace + assert trace[-1]["action"] == "deny_tool" + assert trace[-1]["applied"] is True + assert trace[-1]["decision_mode"] == "enforce" + + +def test_on_llm_end_no_run_context_is_safe() -> None: + handler = HarnessAwareCascadeFlowCallbackHandler() + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o-mini"}, + ) + handler.on_llm_end(_llm_result("gpt-4o-mini", 10, 5)) + + +def test_on_tool_start_no_run_context_is_safe() -> None: + handler = HarnessAwareCascadeFlowCallbackHandler() + handler.on_tool_start(serialized={"name": "search"}, input_str="query") + + +def test_extract_state_ignores_plain_kwargs() -> None: + """Kwargs without a named state key should not leak into state.""" + state = extract_langgraph_state({"model": "gpt-4o", "invocation_params": {"tools": []}}) + assert state == {} + + +def test_tool_deny_uses_run_ctx_tool_calls() -> None: + """Tool gating should use run_ctx.tool_calls, not a local 
counter.""" + init(mode="enforce", max_tool_calls=2, budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(max_tool_calls=2, budget=1.0) as ctx: + # Simulate tool calls already counted by on_llm_end or other integrations + ctx.tool_calls = 2 + + with pytest.raises(HarnessStopError, match="max tool calls"): + handler.on_tool_start(serialized={"name": "search"}, input_str="query") + + +def test_tool_start_counts_executions_and_blocks_after_limit() -> None: + init(mode="enforce", max_tool_calls=1, budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(max_tool_calls=1, budget=1.0) as ctx: + assert ctx.tool_calls == 0 + assert handler.on_tool_start(serialized={"name": "search"}, input_str="first") is None + assert ctx.tool_calls == 1 + + with pytest.raises(HarnessStopError, match="max tool calls"): + handler.on_tool_start(serialized={"name": "search"}, input_str="second") + + assert ctx.tool_calls == 1 + trace = ctx.trace() + assert trace[-1]["action"] == "deny_tool" + assert trace[-1]["applied"] is True + + +def test_extract_tool_calls_supports_llm_result_nested_generations() -> None: + generation = ChatGeneration( + message=AIMessage( + content="", tool_calls=[{"name": "search", "args": {"q": "x"}, "id": "t1"}] + ), + generation_info={}, + ) + llm_result = LLMResult(generations=[[generation]], llm_output={"model_name": "gpt-4o-mini"}) + tool_calls = extract_tool_calls(llm_result) + assert len(tool_calls) == 1 + assert tool_calls[0]["name"] == "search" + + +def test_extract_and_apply_langgraph_state() -> None: + state = extract_langgraph_state( + { + "metadata": { + "langgraph_state": { + "step": 4, + "tool_calls": 3, + "budget_remaining": 0.42, + "latency_ms": 130.0, + "energy": 77.0, + "model": "gpt-4o-mini", + } + } + } + ) + + assert state["step_count"] == 4 + assert state["tool_calls"] == 3 + assert state["model_used"] == "gpt-4o-mini" + + init(mode="observe", budget=1.0) + with 
run(budget=1.0) as ctx: + apply_langgraph_state(ctx, state) + assert ctx.step_count == 4 + assert ctx.tool_calls == 3 + assert ctx.budget_remaining == pytest.approx(0.42) + assert ctx.latency_used_ms == pytest.approx(130.0) + assert ctx.energy_used == pytest.approx(77.0) + assert ctx.model_used == "gpt-4o-mini" diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py index fdbcff1d..0f051519 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py @@ -4,7 +4,11 @@ from langchain_core.messages import AIMessage, BaseMessage, HumanMessage from langchain_core.outputs import ChatGeneration, ChatResult +from cascadeflow.harness import init, reset, run from cascadeflow.integrations.langchain import CascadeFlow +from cascadeflow.integrations.langchain.harness_callback import ( + HarnessAwareCascadeFlowCallbackHandler, +) class MockSequenceChatModel(BaseChatModel): @@ -116,3 +120,38 @@ def test_domain_policy_direct_to_verifier_skips_drafter() -> None: assert drafter.calls == 0 assert verifier.calls == 1 assert result.llm_output["cascade"]["routing_reason"] == "domain_policy_direct" + + +def test_wrapper_only_auto_adds_harness_callback_inside_active_run_scope() -> None: + reset() + init(mode="observe") + drafter = MockSequenceChatModel("draft") + verifier = MockSequenceChatModel("verify") + cascade = CascadeFlow(drafter=drafter, verifier=verifier, enable_pre_router=False) + + outside_callbacks = cascade._resolve_callbacks([]) + assert not any( + isinstance(cb, HarnessAwareCascadeFlowCallbackHandler) for cb in outside_callbacks + ) + + with run(): + inside_callbacks = cascade._resolve_callbacks([]) + assert any( + isinstance(cb, HarnessAwareCascadeFlowCallbackHandler) for cb in inside_callbacks + ) + + +def 
test_wrapper_does_not_duplicate_harness_callback() -> None: + reset() + init(mode="observe") + drafter = MockSequenceChatModel("draft") + verifier = MockSequenceChatModel("verify") + cascade = CascadeFlow(drafter=drafter, verifier=verifier, enable_pre_router=False) + existing = HarnessAwareCascadeFlowCallbackHandler() + + with run(): + callbacks = cascade._resolve_callbacks([existing]) + assert ( + len([cb for cb in callbacks if isinstance(cb, HarnessAwareCascadeFlowCallbackHandler)]) + == 1 + ) diff --git a/cascadeflow/integrations/langchain/utils.py b/cascadeflow/integrations/langchain/utils.py index fe47a353..04f3e4a5 100644 --- a/cascadeflow/integrations/langchain/utils.py +++ b/cascadeflow/integrations/langchain/utils.py @@ -195,6 +195,10 @@ def extract_tool_calls(response: Any) -> list[dict[str, Any]]: msg = None if hasattr(response, "generations") and response.generations: generation = response.generations[0] + # LLMResult.generations is often list[list[Generation]], while ChatResult + # uses list[Generation]. Support both shapes. 
+ if isinstance(generation, list) and generation: + generation = generation[0] msg = getattr(generation, "message", None) else: msg = getattr(response, "message", None) or response diff --git a/cascadeflow/integrations/langchain/wrapper.py b/cascadeflow/integrations/langchain/wrapper.py index ed6d554b..f108d60f 100644 --- a/cascadeflow/integrations/langchain/wrapper.py +++ b/cascadeflow/integrations/langchain/wrapper.py @@ -169,6 +169,35 @@ def _split_runnable_config( model_kwargs[key] = value return model_kwargs, config + def _resolve_callbacks(self, raw_callbacks: Any) -> list[Any]: + if raw_callbacks is None: + callbacks: list[Any] = [] + elif isinstance(raw_callbacks, list): + callbacks = list(raw_callbacks) + elif isinstance(raw_callbacks, tuple): + callbacks = list(raw_callbacks) + else: + callbacks = [raw_callbacks] + + try: + from cascadeflow.harness import get_current_run, get_harness_config + + harness_config = get_harness_config() + run_ctx = get_current_run() + if harness_config.mode == "off" or run_ctx is None or run_ctx.mode == "off": + return callbacks + + from .harness_callback import HarnessAwareCascadeFlowCallbackHandler + + if any(isinstance(cb, HarnessAwareCascadeFlowCallbackHandler) for cb in callbacks): + return callbacks + + callbacks.append(HarnessAwareCascadeFlowCallbackHandler()) + return callbacks + except Exception: + # Preserve existing behavior for users who do not enable harness flows. 
+ return callbacks + def _generate( self, messages: list[BaseMessage], @@ -202,7 +231,7 @@ def _generate( merged_kwargs["stop"] = stop # Extract callbacks before filtering (need to pass them explicitly to nested models) - callbacks = merged_kwargs.get("callbacks", []) + callbacks = self._resolve_callbacks(merged_kwargs.get("callbacks", [])) existing_tags = merged_kwargs.get("tags", []) or [] base_tags = existing_tags + ["cascadeflow"] if existing_tags else ["cascadeflow"] @@ -599,7 +628,7 @@ async def _agenerate( merged_kwargs["stop"] = stop # Extract callbacks before filtering (need to pass them explicitly to nested models) - callbacks = merged_kwargs.get("callbacks", []) + callbacks = self._resolve_callbacks(merged_kwargs.get("callbacks", [])) existing_tags = merged_kwargs.get("tags", []) or [] base_tags = existing_tags + ["cascadeflow"] if existing_tags else ["cascadeflow"] @@ -1001,7 +1030,7 @@ def _stream( stream_kwargs, base_config = self._split_runnable_config(merged_kwargs) base_tags = (base_config.get("tags") or []) + ["cascadeflow"] existing_metadata = base_config.get("metadata", {}) or {} - callbacks = base_config.get("callbacks", []) + callbacks = self._resolve_callbacks(base_config.get("callbacks", [])) resolved_domain = self._resolve_domain(messages, existing_metadata) effective_quality_threshold = self._effective_quality_threshold(resolved_domain) force_verifier_for_domain = self._domain_forces_verifier(resolved_domain) @@ -1324,7 +1353,7 @@ async def _astream( stream_kwargs, base_config = self._split_runnable_config(merged_kwargs) base_tags = (base_config.get("tags") or []) + ["cascadeflow"] existing_metadata = base_config.get("metadata", {}) or {} - callbacks = base_config.get("callbacks", []) + callbacks = self._resolve_callbacks(base_config.get("callbacks", [])) safe_kwargs = { k: v for k, v in stream_kwargs.items() diff --git a/docs-site/api-reference/python/agent-decorator.mdx b/docs-site/api-reference/python/agent-decorator.mdx new file mode 
100644 index 00000000..912a03fd --- /dev/null +++ b/docs-site/api-reference/python/agent-decorator.mdx @@ -0,0 +1,79 @@ +--- +title: "@cascadeflow.agent()" +description: Decorate agent functions with policy metadata including budget, compliance, and KPI weights. +--- + +# @cascadeflow.agent() + +Annotate agent functions with policy metadata. The decorator attaches budget, compliance, and KPI configuration to the function for the harness to use at runtime. + +## Signature + +```python +def agent( + budget: Optional[float] = None, + compliance: Optional[str] = None, + kpi_weights: Optional[dict[str, float]] = None, + kpi_targets: Optional[dict[str, float]] = None, + max_tool_calls: Optional[int] = None, +) +``` + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `budget` | `float \| None` | `None` | Max USD for this agent | +| `compliance` | `str \| None` | `None` | Compliance mode | +| `kpi_weights` | `dict \| None` | `None` | KPI dimension weights | +| `kpi_targets` | `dict \| None` | `None` | KPI dimension targets | +| `max_tool_calls` | `int \| None` | `None` | Max tool/function calls | + +## Usage + +### Basic + +```python +@cascadeflow.agent(budget=0.20) +async def my_agent(query: str): + return await llm.complete(query) +``` + +### With compliance + +```python +@cascadeflow.agent(budget=0.50, compliance="gdpr") +async def eu_agent(query: str): + return await llm.complete(query) +``` + +### With KPI weights + +```python +@cascadeflow.agent( + budget=1.00, + kpi_weights={"quality": 0.8, "cost": 0.2}, + kpi_targets={"quality": 0.9}, +) +async def premium_agent(query: str): + return await llm.complete(query) +``` + +### Multiple agents with different policies + +```python +@cascadeflow.agent(budget=0.10, kpi_weights={"cost": 0.9, "quality": 0.1}) +async def triage_agent(query: str): + return await llm.complete(query) + +@cascadeflow.agent(budget=2.00, kpi_weights={"quality": 0.9, "cost": 0.1}) +async def analysis_agent(query: 
str): + return await llm.complete(query) +``` + +## Notes + +- The decorator does not wrap or modify the function's execution. It attaches metadata that the harness reads at runtime. +- Works with both sync and async functions. +- Requires `init()` to have been called for the metadata to take effect. +- Can be combined with `run()` — the run's constraints are checked in addition to the decorator's. diff --git a/docs-site/api-reference/python/harness-config.mdx b/docs-site/api-reference/python/harness-config.mdx new file mode 100644 index 00000000..42ae7a6d --- /dev/null +++ b/docs-site/api-reference/python/harness-config.mdx @@ -0,0 +1,73 @@ +--- +title: HarnessConfig +description: Full configuration dataclass for the cascadeflow harness with all fields, types, and defaults. +--- + +# HarnessConfig + +Configuration dataclass for the cascadeflow harness. Pass to `cascadeflow.init(config=...)` for full control. + +## Definition + +```python +from dataclasses import dataclass +from typing import Optional + +@dataclass +class HarnessConfig: + mode: HarnessMode = "off" + verbose: bool = False + budget: Optional[float] = None + max_tool_calls: Optional[int] = None + max_latency_ms: Optional[float] = None + max_energy: Optional[float] = None + kpi_targets: Optional[dict[str, float]] = None + kpi_weights: Optional[dict[str, float]] = None + compliance: Optional[str] = None +``` + +## Fields + +| Field | Type | Default | Description | +|---|---|---|---| +| `mode` | `"off" \| "observe" \| "enforce"` | `"off"` | Harness mode | +| `verbose` | `bool` | `False` | Print decisions to stderr | +| `budget` | `float \| None` | `None` | Max USD for the run (None = unlimited) | +| `max_tool_calls` | `int \| None` | `None` | Max tool/function calls (None = unlimited) | +| `max_latency_ms` | `float \| None` | `None` | Max wall-clock ms per call (None = unlimited) | +| `max_energy` | `float \| None` | `None` | Max energy units (None = unlimited) | +| `kpi_targets` | `dict \| None` | 
`None` | Target values per KPI dimension | +| `kpi_weights` | `dict \| None` | `None` | Relative weights per KPI dimension | +| `compliance` | `str \| None` | `None` | Compliance mode: `"gdpr"`, `"hipaa"`, `"pci"`, `"strict"` | + +## HarnessMode + +```python +HarnessMode = Literal["off", "observe", "enforce"] +``` + +## Usage + +```python +from cascadeflow import HarnessConfig +import cascadeflow + +config = HarnessConfig( + mode="enforce", + budget=1.00, + max_tool_calls=20, + max_energy=200.0, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.85}, + verbose=True, +) + +cascadeflow.init(config=config) +``` + +## Import + +```python +from cascadeflow import HarnessConfig +``` diff --git a/docs-site/api-reference/python/init.mdx b/docs-site/api-reference/python/init.mdx new file mode 100644 index 00000000..b07a0e00 --- /dev/null +++ b/docs-site/api-reference/python/init.mdx @@ -0,0 +1,68 @@ +--- +title: cascadeflow.init() +description: Activate the cascadeflow harness globally with a mode and optional configuration. +--- + +# cascadeflow.init() + +Activate the harness globally. All subsequent LLM calls (OpenAI, Anthropic) are automatically tracked. + +## Signature + +```python +def init( + mode: HarnessMode = "off", + *, + config: Optional[HarnessConfig] = None, + verbose: bool = False, +) -> HarnessInitReport +``` + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `mode` | `"off" \| "observe" \| "enforce"` | `"off"` | Harness mode | +| `config` | `HarnessConfig \| None` | `None` | Full configuration (overrides mode) | +| `verbose` | `bool` | `False` | Print decisions to stderr | + +## Returns + +`HarnessInitReport` — confirmation of harness activation with mode and configuration summary. 
+ +## Usage + +### Minimal + +```python +import cascadeflow +cascadeflow.init(mode="observe") +``` + +### With config + +```python +from cascadeflow import HarnessConfig + +config = HarnessConfig( + mode="enforce", + budget=1.00, + compliance="gdpr", + verbose=True, +) +cascadeflow.init(config=config) +``` + +### Environment-driven + +```python +import os +cascadeflow.init(mode=os.getenv("CASCADEFLOW_MODE", "observe")) +``` + +## Notes + +- Call `init()` once at application startup, before any LLM calls +- Calling `init()` again replaces the previous configuration +- Use `cascadeflow.reset()` to deactivate the harness +- `init(mode="off")` is equivalent to not calling `init()` at all diff --git a/docs-site/api-reference/python/run-context.mdx b/docs-site/api-reference/python/run-context.mdx new file mode 100644 index 00000000..be9377a4 --- /dev/null +++ b/docs-site/api-reference/python/run-context.mdx @@ -0,0 +1,76 @@ +--- +title: HarnessRunContext +description: Run context object yielded by cascadeflow.run() with summary(), trace(), and budget tracking methods. +--- + +# HarnessRunContext + +The context object yielded by `cascadeflow.run()`. Provides access to run metrics, decision traces, and budget state. + +## Methods + +### summary() + +Returns aggregate metrics for the run. + +```python +summary = session.summary() +``` + +Returns a dict with: + +| Key | Type | Description | +|---|---|---| +| `cost_total` | `float` | Cumulative cost in USD | +| `steps` | `int` | Number of LLM calls | +| `tool_calls` | `int` | Number of tool/function calls | +| `latency_total_ms` | `float` | Total wall-clock latency in ms | +| `energy_used` | `float` | Total energy units consumed | +| `budget_remaining` | `float \| None` | USD remaining (None if no budget set) | + +### trace() + +Returns the list of decision records for the run. 
+ +```python +records = session.trace() +``` + +Each record is a dict with: + +| Key | Type | Description | +|---|---|---| +| `action` | `str` | `"allow"`, `"switch_model"`, `"deny_tool"`, or `"stop"` | +| `reason` | `str` | Human-readable explanation | +| `model` | `str` | Model name | +| `step` | `int` | Step number (1-indexed) | +| `cost_total` | `float` | Cumulative cost at this step | +| `budget_state` | `str` | `"ok"`, `"warning"`, or `"exceeded"` | +| `applied` | `bool` | Whether the action was enforced | + +## Usage + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Analyze this dataset") + + # Aggregate metrics + summary = session.summary() + print(f"Cost: ${summary['cost_total']:.4f}") + print(f"Steps: {summary['steps']}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") + + # Decision trace + for record in session.trace(): + print(f"Step {record['step']}: {record['action']} — {record['reason']}") +``` + +## Import + +```python +from cascadeflow import HarnessRunContext +``` diff --git a/docs-site/api-reference/python/run.mdx b/docs-site/api-reference/python/run.mdx new file mode 100644 index 00000000..72202a74 --- /dev/null +++ b/docs-site/api-reference/python/run.mdx @@ -0,0 +1,83 @@ +--- +title: cascadeflow.run() +description: Create a scoped run context with budget caps, tool call limits, and metrics tracking. +--- + +# cascadeflow.run() + +Create a scoped run context manager that tracks metrics and optionally enforces constraints for a block of agent execution. 
+ +## Signature + +```python +def run( + budget: Optional[float] = None, + max_tool_calls: Optional[int] = None, + max_latency_ms: Optional[float] = None, + max_energy: Optional[float] = None, + compliance: Optional[str] = None, + kpi_weights: Optional[dict[str, float]] = None, + kpi_targets: Optional[dict[str, float]] = None, +) -> ContextManager[HarnessRunContext] +``` + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `budget` | `float \| None` | `None` | Max USD for this run | +| `max_tool_calls` | `int \| None` | `None` | Max tool/function calls | +| `max_latency_ms` | `float \| None` | `None` | Max wall-clock ms per call | +| `max_energy` | `float \| None` | `None` | Max energy units | +| `compliance` | `str \| None` | `None` | `"gdpr"`, `"hipaa"`, `"pci"`, or `"strict"` | +| `kpi_weights` | `dict \| None` | `None` | KPI dimension weights | +| `kpi_targets` | `dict \| None` | `None` | KPI dimension targets | + +## Returns + +Context manager yielding `HarnessRunContext`. See [HarnessRunContext](/api-reference/python/run-context). + +## Usage + +### Basic budget + +```python +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Analyze this data") + print(session.summary()) +``` + +### Full configuration + +```python +with cascadeflow.run( + budget=1.00, + max_tool_calls=10, + max_energy=100.0, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.9}, +) as session: + result = await agent.run("Process EU customer data") + print(session.summary()) + for record in session.trace(): + print(f"Step {record['step']}: {record['action']}") +``` + +### Nested runs + +Runs can be nested. 
Inner runs inherit the parent's remaining budget: + +```python +with cascadeflow.run(budget=1.00) as outer: + with cascadeflow.run(budget=0.30) as inner: + await agent.run("Sub-task") + # outer.summary() includes inner costs +``` + +## Notes + +- `run()` requires `init()` to have been called first +- Parameters override the global config for the duration of the block +- Use `session.summary()` for aggregate metrics +- Use `session.trace()` for per-step decision records diff --git a/docs-site/api-reference/typescript/core.mdx b/docs-site/api-reference/typescript/core.mdx new file mode 100644 index 00000000..ae8f8311 --- /dev/null +++ b/docs-site/api-reference/typescript/core.mdx @@ -0,0 +1,77 @@ +--- +title: "@cascadeflow/core" +description: TypeScript core package with CascadeAgent for model routing, cost tracking, and quality validation. +--- + +# @cascadeflow/core + +The core TypeScript package for cascadeflow. Provides `CascadeAgent` for speculative model cascading with quality validation. + +## Install + +```bash +npm install @cascadeflow/core +``` + +## CascadeAgent + +```typescript +import { CascadeAgent, ModelConfig } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const result = await agent.run('What is TypeScript?'); +console.log(`Model: ${result.modelUsed}`); +console.log(`Cost: $${result.totalCost}`); +console.log(`Saved: ${result.savingsPercentage}%`); +``` + +## ModelConfig + +```typescript +interface ModelConfig { + name: string; // Model name (e.g. 'gpt-4o-mini') + provider: string; // Provider name (e.g. 
'openai') + cost: number; // Cost per token (approximate) +} +``` + +## CascadeAgentOptions + +```typescript +interface CascadeAgentOptions { + models: ModelConfig[]; + quality?: { + threshold?: number; // Confidence threshold (0-1) + requireMinimumTokens?: number; // Min response length + useSemanticValidation?: boolean; // Enable ML validation + semanticThreshold?: number; // Semantic similarity threshold + }; +} +``` + +## Result + +```typescript +interface CascadeResult { + content: string; + modelUsed: string; + totalCost: number; + savingsPercentage: number; + cascadeDecision: string; +} +``` + +## Features + +- Speculative execution with quality validation +- Multi-provider support (OpenAI, Anthropic, Groq, Ollama, vLLM) +- Streaming responses +- Tool calling and structured output +- Cost tracking and analytics +- Works in Node.js, Browser, and Edge Functions diff --git a/docs-site/api-reference/typescript/langchain.mdx b/docs-site/api-reference/typescript/langchain.mdx new file mode 100644 index 00000000..9a9e3050 --- /dev/null +++ b/docs-site/api-reference/typescript/langchain.mdx @@ -0,0 +1,77 @@ +--- +title: "@cascadeflow/langchain" +description: TypeScript LangChain integration with withCascade() for drop-in cascade routing and model discovery helpers. +--- + +# @cascadeflow/langchain + +LangChain integration for TypeScript. Provides `withCascade()` for drop-in cascade routing with any LangChain chat model. + +## Install + +```bash +npm install @cascadeflow/langchain @langchain/core @langchain/openai +``` + +## withCascade + +Creates a cascade-enabled chat model from a drafter and verifier. 
+ +```typescript +import { ChatOpenAI } from '@langchain/openai'; +import { ChatAnthropic } from '@langchain/anthropic'; +import { withCascade } from '@cascadeflow/langchain'; + +const cascade = withCascade({ + drafter: new ChatOpenAI({ model: 'gpt-4o-mini' }), + verifier: new ChatAnthropic({ model: 'claude-sonnet-4' }), + qualityThreshold: 0.8, +}); + +// Use like any LangChain chat model +const result = await cascade.invoke('Explain quantum computing'); + +// With LCEL chains +const chain = prompt.pipe(cascade).pipe(new StringOutputParser()); +``` + +## Options + +```typescript +interface CascadeOptions { + drafter: BaseChatModel; // Cheap, fast model + verifier: BaseChatModel; // Powerful fallback model + qualityThreshold?: number; // 0-1, default 0.4 +} +``` + +## Model Discovery + +```typescript +import { + discoverCascadePairs, + findBestCascadePair, + analyzeModel, + validateCascadePair, +} from '@cascadeflow/langchain'; + +const models = [ + new ChatOpenAI({ model: 'gpt-4o-mini' }), + new ChatOpenAI({ model: 'gpt-4o' }), + new ChatAnthropic({ model: 'claude-sonnet-4' }), +]; + +const best = findBestCascadePair(models); +const cascade = withCascade({ + drafter: best.drafter, + verifier: best.verifier, +}); +``` + +## Features + +- Full LCEL support (pipes, sequences, batch) +- Streaming with pre-routing +- Tool calling and structured output +- LangSmith cost tracking metadata +- Model discovery and pair validation diff --git a/docs-site/api-reference/typescript/vercel-ai.mdx b/docs-site/api-reference/typescript/vercel-ai.mdx new file mode 100644 index 00000000..ae9af949 --- /dev/null +++ b/docs-site/api-reference/typescript/vercel-ai.mdx @@ -0,0 +1,63 @@ +--- +title: "@cascadeflow/vercel-ai" +description: Vercel AI SDK middleware integration for cascade routing with streaming, multi-turn chat, and tool execution. +--- + +# @cascadeflow/vercel-ai + +Middleware integration for the Vercel AI SDK. 
Adds cascade routing to AI SDK applications with streaming support.
+
+## Install
+
+```bash
+npm install @cascadeflow/vercel-ai
+```
+
+## createChatHandler
+
+Creates a request handler for AI SDK chat endpoints.
+
+```typescript
+import { createChatHandler } from '@cascadeflow/vercel-ai';
+import { CascadeAgent } from '@cascadeflow/core';
+
+const agent = new CascadeAgent({
+  models: [
+    { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 },
+    { name: 'gpt-4o', provider: 'openai', cost: 0.00625 },
+  ],
+});
+
+const handler = createChatHandler(agent, {
+  protocol: 'data',
+  tools,
+  toolHandlers,
+  maxSteps: 5,
+});
+```
+
+## Options
+
+```typescript
+interface ChatHandlerOptions {
+  protocol: 'data' | 'ui';                  // AI SDK stream protocol
+  tools?: ToolDefinition[];                 // Tool definitions
+  toolHandlers?: Record<string, Function>;  // Server-side tool execution
+  toolExecutor?: Function;                  // Universal tool executor
+  maxSteps?: number;                        // Multi-step tool loop limit
+  forceDirect?: boolean;                    // Skip cascade, use verifier
+  allowOverrides?: string[];                // Request-level override keys
+  overrideSecret?: string;                  // Shared secret for overrides
+}
+```
+
+## Features
+
+- AI SDK v4 `data` stream and v5/v6 UI streams
+- `useChat` multi-turn support
+- `parts` message format (AI SDK v6)
+- Tool call streaming visibility
+- Server-side tool execution loops
+- Multi-step controls
+- Cascade decision stream parts
+- Request-level overrides with allowlist
diff --git a/docs-site/changelog.mdx b/docs-site/changelog.mdx
new file mode 100644
index 00000000..2cda1c2f
--- /dev/null
+++ b/docs-site/changelog.mdx
@@ -0,0 +1,28 @@
+---
+title: Changelog
+description: Release history and changelog for cascadeflow.
+---
+
+# Changelog
+
+For the full release history, see [GitHub Releases](https://github.com/lemony-ai/cascadeflow/releases).
+ +## Recent Highlights + +- **v1.0.0** — Agent runtime intelligence layer with harness API, 6 framework integrations, compliance gating, KPI-weighted routing, energy tracking, decision traces +- Agent loops and multi-agent orchestration +- Tool execution engine with parallel execution and risk gating +- Hooks and callbacks for telemetry and observability +- Vercel AI SDK integration (17+ additional providers) +- OpenClaw provider for custom deployments +- Gateway server (drop-in OpenAI/Anthropic-compatible endpoint) +- User tier management with per-user budgets +- Semantic quality validators via FastEmbed +- Domain-aware cascading with 16 domain classifications +- Benchmark reports (MMLU, GSM8K, MT-Bench, HumanEval, TruthfulQA) + +## Links + +- [GitHub Releases](https://github.com/lemony-ai/cascadeflow/releases) +- [PyPI](https://pypi.org/project/cascadeflow/) +- [npm](https://www.npmjs.com/package/@cascadeflow/core) diff --git a/docs-site/contributing.mdx b/docs-site/contributing.mdx new file mode 100644 index 00000000..ff45625e --- /dev/null +++ b/docs-site/contributing.mdx @@ -0,0 +1,96 @@ +--- +title: Contributing +description: How to contribute to cascadeflow — development setup, code style, testing, and pull request process. +--- + +# Contributing + +We welcome contributions to cascadeflow. This guide covers development setup for both Python and TypeScript. 
+ +## Monorepo Structure + +``` +cascadeflow/ + cascadeflow/ # Python package + packages/ + core/ # TypeScript core + langchain-cascadeflow/ # LangChain TypeScript + integrations/ + vercel-ai/ # Vercel AI SDK + n8n/ # n8n community nodes + tests/ # Python tests + examples/ # Python examples + docs/ # Documentation + docs-site/ # Mintlify docs site +``` + +## Python Development + +### Setup + +```bash +git clone https://github.com/lemony-ai/cascadeflow.git +cd cascadeflow +python -m venv .venv +source .venv/bin/activate +pip install -e ".[dev]" +pre-commit install +``` + +### Code Style + +- **Formatter**: Black (line length 100) +- **Linter**: Ruff +- **Type checker**: mypy +- **Import sorting**: isort + +```bash +black cascadeflow/ tests/ +ruff check cascadeflow/ tests/ +mypy cascadeflow/ +``` + +### Testing + +```bash +pytest tests/ -x -q # Run all tests +pytest tests/ -m "not integration" # Skip integration tests +pytest tests/ --cov=cascadeflow # With coverage +``` + +## TypeScript Development + +### Setup + +```bash +cd packages/core +pnpm install +pnpm build +pnpm test +``` + +### Code Style + +- **Linter**: ESLint +- **Language**: TypeScript (strict mode) +- **Indentation**: 2 spaces + +## Making Changes + +1. Create a branch from `main` +2. Make changes with clear, descriptive commits +3. Follow commit conventions: `feat:`, `fix:`, `docs:`, `test:`, `refactor:`, `chore:` +4. Add tests for new functionality +5. 
Ensure all tests pass + +## Pull Requests + +- All PRs require review approval +- Linear history enforced (no merge commits) +- CI must pass before merge + +## Links + +- [GitHub Issues](https://github.com/lemony-ai/cascadeflow/issues) — Bug reports and feature requests +- [GitHub Discussions](https://github.com/lemony-ai/cascadeflow/discussions) — Questions and community +- [Email](mailto:hello@lemony.ai) — Direct support diff --git a/docs-site/docs.json b/docs-site/docs.json new file mode 100644 index 00000000..1e441f37 --- /dev/null +++ b/docs-site/docs.json @@ -0,0 +1,130 @@ +{ + "$schema": "https://mintlify.com/docs.json", + "theme": "palm", + "name": "cascadeflow", + "colors": { + "primary": "#0E7490", + "light": "#22D3EE", + "dark": "#0E7490" + }, + "logo": { + "light": "/logo/cascadeflow-light.svg", + "dark": "/logo/cascadeflow-dark.svg" + }, + "favicon": "/favicon.svg", + "tabs": [ + { "id": "get-started", "name": "Get Started" }, + { "id": "harness", "name": "Harness" }, + { "id": "integrations", "name": "Integrations" }, + { "id": "api-reference", "name": "API Reference" }, + { "id": "examples", "name": "Examples" } + ], + "navigation": { + "get-started": [ + { + "group": "Get Started", + "pages": [ + "get-started/introduction", + "get-started/quickstart", + "get-started/installation", + "get-started/how-it-works" + ] + }, + { + "group": "Resources", + "pages": [ + "changelog", + "contributing" + ] + } + ], + "harness": [ + { + "group": "Harness", + "pages": [ + "harness/overview", + "harness/modes", + "harness/budget-enforcement", + "harness/compliance", + "harness/kpi-optimization", + "harness/energy-tracking", + "harness/decision-trace", + "harness/actions" + ] + } + ], + "integrations": [ + { + "group": "Integrations", + "pages": [ + "integrations/overview", + "integrations/langchain", + "integrations/openai-agents", + "integrations/crewai", + "integrations/google-adk", + "integrations/n8n", + "integrations/vercel-ai" + ] + } + ], + "api-reference": 
[ + { + "group": "Python", + "pages": [ + "api-reference/python/init", + "api-reference/python/run", + "api-reference/python/agent-decorator", + "api-reference/python/harness-config", + "api-reference/python/run-context" + ] + }, + { + "group": "TypeScript", + "pages": [ + "api-reference/typescript/core", + "api-reference/typescript/vercel-ai", + "api-reference/typescript/langchain" + ] + } + ], + "examples": [ + { + "group": "Examples", + "pages": [ + "examples/basic-usage", + "examples/budget-enforcement", + "examples/compliance-gating", + "examples/kpi-weighted-routing", + "examples/multi-agent", + "examples/enterprise-patterns" + ] + } + ] + }, + "topbarLinks": [ + { + "name": "GitHub", + "url": "https://github.com/lemony-ai/cascadeflow" + } + ], + "topbarCtaButton": { + "name": "Get Started", + "url": "/get-started/quickstart" + }, + "footerSocials": { + "github": "https://github.com/lemony-ai/cascadeflow", + "x": "https://x.com/saschabuehrle" + }, + "anchors": [ + { + "name": "GitHub", + "icon": "github", + "url": "https://github.com/lemony-ai/cascadeflow" + }, + { + "name": "PyPI", + "icon": "python", + "url": "https://pypi.org/project/cascadeflow/" + } + ] +} diff --git a/docs-site/examples/basic-usage.mdx b/docs-site/examples/basic-usage.mdx new file mode 100644 index 00000000..9cf838d0 --- /dev/null +++ b/docs-site/examples/basic-usage.mdx @@ -0,0 +1,81 @@ +--- +title: Basic Usage +description: Simple cascade setup with OpenAI models showing speculative execution, cost tracking, and savings calculation. +--- + +# Basic Usage + +A minimal example showing cascadeflow's speculative cascade with two OpenAI models. + +## Setup + +```bash +pip install "cascadeflow[openai]" +export OPENAI_API_KEY="sk-..." 
+``` + +## Code + +```python +import asyncio +from cascadeflow import CascadeAgent, ModelConfig + +agent = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +queries = [ + "What's the capital of France?", # Simple — draft model handles + "Explain quantum computing", # Medium — may escalate + "Write a Python function to sort a list", # Code — domain routing +] + +async def main(): + total_cost = 0 + baseline_cost = 0 + + for query in queries: + result = await agent.run(query) + total_cost += result.total_cost + baseline_cost += result.total_cost if result.model_used == "gpt-4o" else result.total_cost * (0.00625 / 0.000375) + + print(f"Query: {query[:40]}...") + print(f" Model: {result.model_used}") + print(f" Cost: ${result.total_cost:.6f}") + print() + + savings = (1 - total_cost / baseline_cost) * 100 if baseline_cost > 0 else 0 + print(f"Total cost: ${total_cost:.6f}") + print(f"Savings: {savings:.0f}%") + +asyncio.run(main()) +``` + +## How It Works + +1. `gpt-4o-mini` (draft model) handles the query first +2. Quality validation checks the response +3. If quality passes, the draft response is returned (60-70% of queries) +4. If quality fails, `gpt-4o` (verifier model) handles the query +5. 
Cost tracking reports per-query and aggregate metrics + +## TypeScript + +```typescript +import { CascadeAgent } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const result = await agent.run('What is TypeScript?'); +console.log(`Model: ${result.modelUsed}, Cost: $${result.totalCost}`); +``` + +## Source + +[examples/basic_usage.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/basic_usage.py) diff --git a/docs-site/examples/budget-enforcement.mdx b/docs-site/examples/budget-enforcement.mdx new file mode 100644 index 00000000..dab52ed9 --- /dev/null +++ b/docs-site/examples/budget-enforcement.mdx @@ -0,0 +1,84 @@ +--- +title: Budget Enforcement +description: Per-run and per-user budget caps with enforcement callbacks, cost tracking, and automatic stop actions. +--- + +# Budget Enforcement + +Enforce spending limits on agent runs with automatic stop actions when budget is exceeded. 
+ +## Basic Budget Cap + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Research and summarize this topic") + + summary = session.summary() + print(f"Cost: ${summary['cost_total']:.4f}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") + print(f"Steps completed: {summary['steps']}") +``` + +## Budget with Tool Call Limit + +```python +with cascadeflow.run(budget=1.00, max_tool_calls=5) as session: + result = await agent.run("Search and analyze this dataset") + # Stops when either budget or tool call limit is hit +``` + +## Per-Agent Budgets + +```python +@cascadeflow.agent(budget=0.10) +async def triage_agent(query: str): + """Cheap triage — $0.10 max.""" + return await llm.complete(query) + +@cascadeflow.agent(budget=2.00) +async def research_agent(query: str): + """Deep research — $2.00 max.""" + return await llm.complete(query) +``` + +## Cost Tracking (Legacy API) + +For pre-harness budget enforcement using the telemetry API: + +```python +from cascadeflow.telemetry import BudgetConfig, CostTracker, strict_budget_enforcement + +tracker = CostTracker( + budget_config=BudgetConfig( + daily_limit=10.0, + per_query_limit=0.50, + alert_threshold=0.8, + ), + enforcement_callback=strict_budget_enforcement, +) + +# Track costs manually +tracker.track(model="gpt-4o", cost=0.003) +print(f"Daily spend: ${tracker.daily_spend:.4f}") +``` + +## Decision Trace + +```python +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Multi-step analysis") + + for record in session.trace(): + if record['action'] == 'stop': + print(f"Stopped at step {record['step']}: {record['reason']}") + else: + print(f"Step {record['step']}: {record['action']} (${record['cost_total']:.4f})") +``` + +## Source + +[examples/enforcement/basic_enforcement.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/enforcement/basic_enforcement.py) diff --git 
a/docs-site/examples/compliance-gating.mdx b/docs-site/examples/compliance-gating.mdx new file mode 100644 index 00000000..19f9fbd3 --- /dev/null +++ b/docs-site/examples/compliance-gating.mdx @@ -0,0 +1,89 @@ +--- +title: Compliance Gating +description: GDPR, HIPAA, PCI, and strict model allowlists with enforcement examples for regulated agent workflows. +--- + +# Compliance Gating + +Restrict which models can be used based on compliance requirements. + +## GDPR Compliance + +Only allow models approved for EU data processing: + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(compliance="gdpr") as session: + # Only gpt-4o, gpt-4o-mini, gpt-3.5-turbo are allowed + result = await agent.run("Process this EU customer feedback") + + for record in session.trace(): + if record['action'] == 'switch_model': + print(f"Model switched: {record['reason']}") +``` + +## HIPAA Compliance + +For healthcare data — stricter allowlist: + +```python +with cascadeflow.run(compliance="hipaa") as session: + # Only gpt-4o, gpt-4o-mini are allowed + result = await agent.run("Summarize this patient record") +``` + +## PCI Compliance + +For payment card data: + +```python +with cascadeflow.run(compliance="pci") as session: + # Only gpt-4o-mini, gpt-3.5-turbo are allowed + result = await agent.run("Analyze this transaction") +``` + +## Strict Mode + +Maximum restriction — single model only: + +```python +with cascadeflow.run(compliance="strict") as session: + # Only gpt-4o is allowed + result = await agent.run("Classify this sensitive document") +``` + +## Compliance Allowlists + +| Mode | Allowed Models | +|---|---| +| `gdpr` | gpt-4o, gpt-4o-mini, gpt-3.5-turbo | +| `hipaa` | gpt-4o, gpt-4o-mini | +| `pci` | gpt-4o-mini, gpt-3.5-turbo | +| `strict` | gpt-4o | + +## Combining with Budget + +```python +@cascadeflow.agent(budget=1.00, compliance="gdpr") +async def eu_data_agent(query: str): + """Process EU data within budget using only GDPR-approved 
models.""" + return await llm.complete(query) +``` + +## Observe Mode for Audit + +Use `observe` mode to audit which models would be blocked without affecting production: + +```python +cascadeflow.init(mode="observe") + +with cascadeflow.run(compliance="hipaa") as session: + result = await agent.run("Process health data") + + # Check which calls would have been blocked + violations = [r for r in session.trace() if r['action'] == 'switch_model'] + print(f"Compliance violations detected: {len(violations)}") +``` diff --git a/docs-site/examples/enterprise-patterns.mdx b/docs-site/examples/enterprise-patterns.mdx new file mode 100644 index 00000000..5949972c --- /dev/null +++ b/docs-site/examples/enterprise-patterns.mdx @@ -0,0 +1,127 @@ +--- +title: Enterprise Patterns +description: Production-ready patterns including retry logic, rate limiting, budget management, circuit breakers, caching, and health monitoring. +--- + +# Enterprise Patterns + +Production patterns for deploying cascadeflow at scale. 
+ +## Retry with Exponential Backoff + +```python +import asyncio +from cascadeflow import CascadeAgent + +async def execute_with_retry(agent, query, max_retries=3, base_delay=1.0): + for attempt in range(max_retries): + try: + return await agent.run(query) + except Exception as e: + if attempt == max_retries - 1: + raise + delay = base_delay * (2 ** attempt) + await asyncio.sleep(delay) +``` + +## Rate Limiting + +```python +import time +from collections import deque + +class RateLimiter: + def __init__(self, max_requests: int, window_seconds: float): + self.max_requests = max_requests + self.window = window_seconds + self.requests = deque() + + async def acquire(self): + now = time.monotonic() + while self.requests and self.requests[0] < now - self.window: + self.requests.popleft() + if len(self.requests) >= self.max_requests: + wait = self.requests[0] + self.window - now + await asyncio.sleep(wait) + self.requests.append(time.monotonic()) +``` + +## Budget Management + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +# Per-user daily budget +async def handle_user_request(user_id: str, query: str): + user_budget = get_user_remaining_budget(user_id) + + with cascadeflow.run(budget=min(user_budget, 0.50)) as session: + result = await agent.run(query) + + spent = session.summary()['cost_total'] + update_user_budget(user_id, spent) + return result +``` + +## Circuit Breaker + +```python +from cascadeflow import CircuitBreaker, CircuitBreakerConfig + +config = CircuitBreakerConfig( + failure_threshold=5, + recovery_timeout=30.0, + half_open_max_calls=2, +) + +breaker = CircuitBreaker(config=config) + +async def safe_call(agent, query): + if not breaker.allow_request(): + return fallback_response(query) + try: + result = await agent.run(query) + breaker.record_success() + return result + except Exception as e: + breaker.record_failure() + raise +``` + +## Response Caching + +```python +from cascadeflow import ResponseCache + +cache = 
ResponseCache(max_size=1000, ttl_seconds=300) + +async def cached_run(agent, query): + cached = cache.get(query) + if cached: + return cached + result = await agent.run(query) + cache.set(query, result) + return result +``` + +## Health Monitoring + +```python +with cascadeflow.run(budget=10.00) as session: + for query in production_queries: + result = await agent.run(query) + + summary = session.summary() + + # Alert on anomalies + if summary['cost_total'] > 8.0: + alert("Budget 80% consumed") + if summary['steps'] > 100: + alert("High step count") +``` + +## Source + +[examples/production_patterns.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/production_patterns.py) diff --git a/docs-site/examples/kpi-weighted-routing.mdx b/docs-site/examples/kpi-weighted-routing.mdx new file mode 100644 index 00000000..5bab7689 --- /dev/null +++ b/docs-site/examples/kpi-weighted-routing.mdx @@ -0,0 +1,95 @@ +--- +title: KPI-Weighted Routing +description: Configure quality, cost, latency, and energy weights to encode business priorities into model routing decisions. +--- + +# KPI-Weighted Routing + +Inject business priorities into every model decision using KPI weights. 
+ +## Quality-First (Premium Workload) + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + budget=2.00, + kpi_weights={"quality": 0.8, "cost": 0.1, "latency": 0.1}, + kpi_targets={"quality": 0.9} +) as session: + # Routes to highest-quality models within budget + result = await agent.run("Draft a legal contract clause") + print(session.summary()) +``` + +## Cost-First (High-Volume Batch) + +```python +with cascadeflow.run( + budget=5.00, + kpi_weights={"cost": 0.7, "quality": 0.2, "latency": 0.1} +) as session: + # Routes to cheapest models that meet quality floor + for query in batch_queries: + result = await agent.run(query) + print(f"Total cost: ${session.summary()['cost_total']:.4f}") +``` + +## Latency-First (Real-Time) + +```python +with cascadeflow.run( + kpi_weights={"latency": 0.7, "quality": 0.2, "cost": 0.1}, + max_latency_ms=2000.0 +) as session: + # Routes to fastest models, hard cap at 2 seconds + result = await agent.run("Quick classification task") +``` + +## Energy-Aware (Carbon-Conscious) + +```python +with cascadeflow.run( + kpi_weights={"quality": 0.4, "energy": 0.3, "cost": 0.3}, + max_energy=100.0 +) as session: + # Balances quality with energy efficiency + result = await agent.run("Summarize this report") + print(f"Energy used: {session.summary()['energy_used']:.1f} units") +``` + +## Per-Agent Profiles + +```python +@cascadeflow.agent( + budget=0.10, + kpi_weights={"cost": 0.9, "quality": 0.1} +) +async def triage_agent(query: str): + """Quick classification — prioritize cost.""" + return await llm.complete(query) + +@cascadeflow.agent( + budget=2.00, + kpi_weights={"quality": 0.9, "cost": 0.1}, + kpi_targets={"quality": 0.95} +) +async def analysis_agent(query: str): + """Deep analysis — prioritize quality.""" + return await llm.complete(query) +``` + +## Quality Priors + +The harness uses built-in quality priors for scoring: + +| Model | Quality Prior | Latency Prior | +|---|---|---| +| o1 | 0.95 
| 0.40 | +| gpt-4o | 0.90 | 0.72 | +| gpt-4-turbo | 0.88 | 0.66 | +| gpt-5-mini | 0.86 | 0.84 | +| gpt-4o-mini | 0.75 | 0.93 | +| gpt-3.5-turbo | 0.65 | 1.00 | diff --git a/docs-site/examples/multi-agent.mdx b/docs-site/examples/multi-agent.mdx new file mode 100644 index 00000000..06b9598b --- /dev/null +++ b/docs-site/examples/multi-agent.mdx @@ -0,0 +1,103 @@ +--- +title: Multi-Agent Orchestration +description: Multi-turn tool execution with agent-as-a-tool delegation and budget tracking across agent boundaries. +--- + +# Multi-Agent Orchestration + +cascadeflow supports multi-agent patterns with tool execution, delegation, and budget tracking across agent boundaries. + +## Tool Execution Loop + +```python +import asyncio +from cascadeflow import CascadeAgent, ModelConfig +from cascadeflow.tools import ToolConfig, ToolExecutor + +# Define tools +tools = [ + ToolConfig( + name="calculator", + description="Evaluate a math expression", + parameters={"expression": {"type": "string"}}, + handler=lambda expression: str(eval(expression)), + ), + ToolConfig( + name="search", + description="Search the web", + parameters={"query": {"type": "string"}}, + handler=lambda query: f"Results for: {query}", + ), +] + +agent = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +executor = ToolExecutor(tools=tools) + +async def main(): + result = await agent.run( + "Calculate 15% of 250 and search for tax rates", + tools=tools, + tool_executor=executor, + max_steps=5, + ) + print(result.content) + +asyncio.run(main()) +``` + +## With Harness Budget Tracking + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=1.00, max_tool_calls=10) as session: + result = await agent.run( + "Research this topic using multiple tools", + tools=tools, + tool_executor=executor, + max_steps=10, + ) + + summary = session.summary() + print(f"Cost: 
${summary['cost_total']:.4f}") + print(f"Tool calls: {summary['tool_calls']}") + print(f"Steps: {summary['steps']}") +``` + +## Agent-as-a-Tool Delegation + +```python +# Define a researcher agent as a tool +researcher = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +async def research_handler(query: str) -> str: + result = await researcher.run(query) + return result.content + +# Main agent can delegate to researcher +tools = [ + ToolConfig( + name="research", + description="Delegate research to a specialist agent", + parameters={"query": {"type": "string"}}, + handler=research_handler, + ), +] + +# Budget tracks across both agents +with cascadeflow.run(budget=2.00) as session: + result = await main_agent.run("Analyze and research this topic", tools=tools) +``` + +## Source + +[examples/agentic_multi_agent.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/agentic_multi_agent.py) diff --git a/docs-site/favicon.svg b/docs-site/favicon.svg new file mode 100644 index 00000000..496df9f5 --- /dev/null +++ b/docs-site/favicon.svg @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/docs-site/get-started/how-it-works.mdx b/docs-site/get-started/how-it-works.mdx new file mode 100644 index 00000000..721feef6 --- /dev/null +++ b/docs-site/get-started/how-it-works.mdx @@ -0,0 +1,112 @@ +--- +title: How It Works +description: Architecture of cascadeflow's two engines — Cascade for speculative model routing and Harness for agent runtime intelligence. +--- + +# How It Works + +cascadeflow ships two complementary engines that can be used independently or together. + +## Cascade Engine + +The Cascade Engine optimizes model selection through **speculative execution with quality validation**: + +1. **Speculatively executes** small, fast models first — optimistic execution ($0.15-0.30/1M tokens) +2. 
**Validates quality** of responses using configurable thresholds (completeness, confidence, correctness) +3. **Dynamically escalates** to larger models only when quality validation fails ($1.25-3.00/1M tokens) +4. **Learns patterns** to optimize future cascading decisions and domain-specific routing + +In practice, 60-70% of queries are handled by small, efficient models without escalation. + +**Result:** 40-85% cost reduction, 2-10x faster responses, zero quality loss. + +``` +Query → Domain Detection → Try Draft Model → Quality Check + │ + Pass ───┘─── Fail + │ │ + Return Escalate to + Result Verifier Model +``` + +## Harness Engine + +The Harness Engine provides **agent runtime intelligence** — budget enforcement, compliance gating, KPI-weighted routing, energy tracking, and decision traces. + +Unlike the Cascade Engine which routes between models, the Harness Engine wraps existing agent execution and makes decisions at every step: + +``` +Agent Step → Harness Decision → allow / switch_model / deny_tool / stop + │ + ├── Check budget remaining + ├── Check compliance allowlist + ├── Score KPI dimensions + ├── Check tool call cap + ├── Check latency cap + └── Check energy cap +``` + +### Decision Flow + +For each LLM call or tool execution inside an agent loop, the harness: + +1. **Records** the model, step number, and cumulative metrics +2. **Evaluates** all configured constraints (budget, compliance, tool calls, latency, energy) +3. **Scores** the call against KPI weights if configured +4. **Decides** an action: `allow`, `switch_model`, `deny_tool`, or `stop` +5. **Enforces** the action if in `enforce` mode (logs only in `observe` mode) +6. 
**Appends** a trace record for auditability + +### HarnessConfig + +All harness behavior is configured through a single dataclass: + +```python +HarnessConfig( + mode="enforce", # off | observe | enforce + budget=0.50, # Max USD for the run + max_tool_calls=10, # Max tool/function calls + max_latency_ms=5000.0, # Max wall-clock ms per call + max_energy=100.0, # Max energy units + compliance="gdpr", # gdpr | hipaa | pci | strict + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.9}, +) +``` + +## Combined Usage + +When both engines are active, the Cascade Engine handles model selection while the Harness Engine enforces constraints: + +```python +import cascadeflow +from cascadeflow import CascadeAgent, ModelConfig + +# Harness: enforce budget and compliance +cascadeflow.init(mode="enforce") + +# Cascade: speculative model routing +agent = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +with cascadeflow.run(budget=1.00) as session: + result = await agent.run("Analyze this contract for GDPR compliance") + print(session.summary()) +``` + +## Provider Abstraction + +cascadeflow supports 17+ providers through a unified interface: + +| Provider | Type | Package | +|---|---|---| +| OpenAI | API | `cascadeflow[openai]` | +| Anthropic | API | `cascadeflow[anthropic]` | +| Groq | API | `cascadeflow[groq]` | +| Together | API | `cascadeflow[together]` | +| Hugging Face | API | `cascadeflow[huggingface]` | +| Ollama | Local | Built-in (HTTP) | +| vLLM | Local | `cascadeflow[vllm]` | +| Vercel AI SDK | TypeScript | `@cascadeflow/vercel-ai` | diff --git a/docs-site/get-started/installation.mdx b/docs-site/get-started/installation.mdx new file mode 100644 index 00000000..ff6b8583 --- /dev/null +++ b/docs-site/get-started/installation.mdx @@ -0,0 +1,101 @@ +--- +title: Installation +description: Install cascadeflow with pip extras 
for Python or npm packages for TypeScript, including provider-specific setup. +--- + +# Installation + +## Python + +### Minimal install + +```bash +pip install cascadeflow +``` + +Core dependencies: `pydantic>=2.0.0`, `httpx>=0.25.0`, `tiktoken>=0.5.0`, `rich>=13.0.0`. + +### With providers + +```bash +pip install "cascadeflow[providers]" # OpenAI + Anthropic + Groq +``` + +Individual providers: + +```bash +pip install "cascadeflow[openai]" # OpenAI +pip install "cascadeflow[anthropic]" # Anthropic +pip install "cascadeflow[groq]" # Groq +pip install "cascadeflow[huggingface]" # Hugging Face +pip install "cascadeflow[together]" # Together AI +``` + +### With framework integrations + +```bash +pip install "cascadeflow[langchain]" # LangChain/LangGraph +pip install "cascadeflow[openai-agents]" # OpenAI Agents SDK +pip install "cascadeflow[crewai]" # CrewAI (Python 3.10+) +pip install "cascadeflow[google-adk]" # Google ADK (Python 3.10+) +``` + +### Local inference + +```bash +pip install "cascadeflow[vllm]" # vLLM (Python 3.10-3.13) +``` + +Ollama does not need a Python package — cascadeflow communicates with Ollama via HTTP at `localhost:11434`. Install Ollama separately from [ollama.ai](https://ollama.ai). + +### Everything + +```bash +pip install "cascadeflow[all]" # All providers + semantic routing +``` + +### Development + +```bash +git clone https://github.com/lemony-ai/cascadeflow.git +cd cascadeflow +pip install -e ".[dev]" +``` + +## TypeScript + +### Core + +```bash +npm install @cascadeflow/core +``` + +### Framework packages + +```bash +npm install @cascadeflow/langchain # LangChain integration +npm install @cascadeflow/vercel-ai # Vercel AI SDK middleware +npm install @cascadeflow/n8n-nodes-cascadeflow # n8n community node +``` + +## Provider Setup + +Set API keys as environment variables: + +```bash +export OPENAI_API_KEY="sk-..." +export ANTHROPIC_API_KEY="sk-ant-..." +export GROQ_API_KEY="gsk_..." 
+``` + +cascadeflow auto-detects available providers based on which API keys are set. + +## Verify Installation + +```bash +python -c "import cascadeflow; print(cascadeflow.__version__)" +``` + +```bash +python -c "from cascadeflow import init, run, HarnessConfig, HarnessRunContext; print('OK')" +``` diff --git a/docs-site/get-started/introduction.mdx b/docs-site/get-started/introduction.mdx new file mode 100644 index 00000000..39c2f74c --- /dev/null +++ b/docs-site/get-started/introduction.mdx @@ -0,0 +1,62 @@ +--- +title: Introduction +description: What cascadeflow is, how it differs from external proxies, and when to use it for agent runtime intelligence. +--- + +# Introduction + +cascadeflow is an in-process intelligence layer that sits inside AI agent execution loops. Unlike external proxies that only see HTTP request boundaries, cascadeflow operates with full agent state awareness: step count, budget consumed, tool call history, error context, quality scores, domain, complexity, and user-defined business context. + +## What makes cascadeflow different + +**1. Inside-the-loop control.** Decisions happen per-step and per-tool-call inside agent execution, not at the HTTP boundary. This enables budget gating mid-run, model switching based on remaining budget, and stop actions when caps are hit. + +**2. Multi-dimensional optimization.** Six dimensions scored simultaneously: cost, latency, quality, budget, compliance, and energy. Not just cost routing. + +**3. Business logic injection.** KPI weights and targets let teams encode business priorities (e.g. 60% quality, 30% cost, 10% latency) into every model decision. + +**4. Actionable decisions.** Four actions: `allow`, `switch_model`, `deny_tool`, `stop`. The harness does not just observe — it controls execution flow. + +**5. Full transparency.** Every decision produces a trace record with action, reason, model, step, cost_total, budget_state, and applied fields. Audit-ready. + +**6. 
Measurable value.** Session summaries report cost, latency, energy, steps, tool calls, and budget remaining. Before/after comparison is built in. + +**7. Cross-framework policy layer.** Unified KPI semantics across LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, and Vercel AI SDK. + +**8. Latency advantage.** In-process instrumentation adds less than 1ms overhead per call. External proxies add 10-50ms of network round-trip latency per LLM call. + +## Proxy vs In-Process Harness + +| Dimension | External Proxy | cascadeflow Harness | +|---|---|---| +| **Scope** | HTTP request boundary | Inside agent execution loop | +| **Dimensions** | Cost only | Cost + quality + latency + budget + compliance + energy | +| **Latency overhead** | 10-50ms network RTT | <1ms in-process | +| **Business logic** | None | KPI weights and targets | +| **Enforcement** | None (observe only) | stop, deny_tool, switch_model | +| **Auditability** | Request logs | Per-step decision traces | + +## When to use cascadeflow + +- You run AI agents (LangChain, LangGraph, CrewAI, OpenAI Agents SDK, Google ADK, or custom) +- You want to reduce LLM costs without changing agent code +- You need budget enforcement across multi-step agent runs +- You need to inject business KPIs (quality, cost, latency, energy) into agent decisions +- You need compliance-aware model gating (GDPR, HIPAA, PCI, strict) +- You want full trace recording for auditability and tuning + +## When NOT to use cascadeflow + +- Single one-off LLM calls (overhead not justified) +- You only use one model and don't want routing +- You need a hosted proxy service (cascadeflow is a library, not a SaaS) + +## Two Engines + +cascadeflow ships two complementary engines: + +**Cascade Engine** — Speculative execution with quality validation. Tries cheap models first, validates quality, escalates only when needed. Achieves 40-85% cost savings on typical workloads. + +**Harness Engine** — Agent runtime intelligence. 
Budget enforcement, compliance gating, KPI-weighted routing, energy tracking, and decision traces. Works inside agent loops with full state awareness. + +Both engines can be used independently or together. diff --git a/docs-site/get-started/quickstart.mdx b/docs-site/get-started/quickstart.mdx new file mode 100644 index 00000000..64189077 --- /dev/null +++ b/docs-site/get-started/quickstart.mdx @@ -0,0 +1,118 @@ +--- +title: Quickstart +description: Get cascadeflow running in 3 minutes with zero code changes using the harness API. +--- + +# Quickstart + +Three tiers of integration — pick the one that matches your needs. + +## Install + + + +```bash pip +pip install "cascadeflow[openai]" +``` + +```bash With integrations +pip install "cascadeflow[langchain]" # LangChain/LangGraph +pip install "cascadeflow[openai-agents]" # OpenAI Agents SDK +pip install "cascadeflow[crewai]" # CrewAI +pip install "cascadeflow[google-adk]" # Google ADK +``` + +```bash npm +npm install @cascadeflow/core +``` + + + +## Tier 1: Zero-Change Observability + +Add two lines. All OpenAI and Anthropic SDK calls are automatically tracked. + +```python +import cascadeflow + +cascadeflow.init(mode="observe") + +# Your existing code — no changes needed +import openai +client = openai.OpenAI() +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "What is cascadeflow?"}] +) +# cascadeflow is now tracking cost, latency, energy, and model usage. +``` + +## Tier 2: Scoped Runs with Budget + +Wrap agent execution in a `run()` context manager for budget tracking and enforcement. 
+ +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + # Your agent code here + result = await agent.run("Analyze this dataset and create a report") + + # After execution, inspect metrics + summary = session.summary() + print(f"Cost: ${summary['cost_total']:.4f}") + print(f"Steps: {summary['steps']}") + print(f"Tool calls: {summary['tool_calls']}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") + + # Full decision audit trail + for decision in session.trace(): + print(f" Step {decision['step']}: {decision['action']} — {decision['reason']}") +``` + +## Tier 3: Decorated Agents with Policy + +Annotate agent functions with budget, compliance, and KPI metadata. + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +@cascadeflow.agent( + budget=0.20, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1} +) +async def research_agent(query: str): + return await llm.complete(query) +``` + +## Harness Modes + +| Mode | Tracking | Enforcement | Use Case | +|---|---|---|---| +| `off` | No | No | Disabled | +| `observe` | Yes | No | Safe production rollout, metrics collection | +| `enforce` | Yes | Yes | Budget caps, compliance gating, stop actions | + +Start with `observe` in production. Switch to `enforce` once you've validated the metrics. + +## Next Steps + + + + All pip extras, npm packages, and provider setup. + + + Architecture of the Cascade and Harness engines. + + + Per-run and per-user budget caps. + + + LangChain, OpenAI Agents, CrewAI, Google ADK, n8n, Vercel AI. + + diff --git a/docs-site/harness/actions.mdx b/docs-site/harness/actions.mdx new file mode 100644 index 00000000..a904eed8 --- /dev/null +++ b/docs-site/harness/actions.mdx @@ -0,0 +1,99 @@ +--- +title: Decision Actions +description: Four harness actions — allow, switch_model, deny_tool, and stop — and when each is triggered. 
+--- + +# Decision Actions + +The harness makes one of four decisions at every step. Actions are computed in both `observe` and `enforce` modes, but only applied in `enforce` mode. + +## Actions + +### `allow` + +Proceed normally. No constraints are violated. + +``` +Step 1: allow — budget ok, model compliant +``` + +This is the most common action. It means all hard caps (budget, tool calls, latency, energy) are within limits and compliance is satisfied. + +### `switch_model` + +Route to a different model. Triggered when: +- The current model is not in the compliance allowlist +- KPI scoring indicates a better model choice +- Budget pressure suggests a cheaper alternative + +``` +Step 3: switch_model — compliance violation, switching to gpt-4o-mini (gdpr allowlist) +``` + +In `enforce` mode, the harness substitutes the model. In `observe` mode, the original model is used and the trace records what would have happened. + +### `deny_tool` + +Block a tool/function call. Triggered when `max_tool_calls` is reached. + +``` +Step 5: deny_tool — tool call cap reached (10/10) +``` + +In `enforce` mode, the tool call is blocked. The agent receives a signal that the tool was denied. + +### `stop` + +Halt agent execution. Triggered when: +- Budget is exceeded +- Latency cap is exceeded +- Energy cap is exceeded + +``` +Step 7: stop — budget exceeded ($0.52 > $0.50 cap) +``` + +In `enforce` mode, the agent loop is stopped. In `observe` mode, execution continues and the trace records the violation. + +## Decision Priority + +When multiple constraints are violated simultaneously, the harness applies this priority: + +1. **Compliance** — check first (switch_model or stop) +2. **Budget** — check second (stop) +3. **Tool calls** — check third (deny_tool) +4. **Latency** — check fourth (stop) +5. **Energy** — check fifth (stop) +6. 
**Hard controls** trigger `stop`, `deny_tool`, or `switch_model` when a limit is exceeded or a violation is detected:
+ +## Per-Run Budget + +Set a budget cap on a scoped run: + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + # Agent executes multiple LLM calls + result = await agent.run("Research and summarize this topic") + + summary = session.summary() + print(f"Total cost: ${summary['cost_total']:.4f}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") +``` + +When cumulative cost exceeds the budget: +- In `observe` mode: the trace records `action: "stop"` with `applied: false` +- In `enforce` mode: the harness stops execution with `action: "stop"` and `applied: true` + +## Per-Agent Budget + +Attach budget metadata to agent functions: + +```python +@cascadeflow.agent(budget=0.20) +async def cheap_agent(query: str): + return await llm.complete(query) + +@cascadeflow.agent(budget=2.00) +async def premium_agent(query: str): + return await llm.complete(query) +``` + +## Budget Pressure Routing + +When budget is partially consumed, the harness can route to cheaper models. This happens automatically when KPI weights include a cost dimension: + +```python +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + budget=1.00, + kpi_weights={"quality": 0.5, "cost": 0.5} +) as session: + # Early calls may use gpt-4o (high quality) + # As budget pressure increases, routing shifts toward gpt-4o-mini (lower cost) + for query in queries: + result = await agent.run(query) +``` + +## Cost Calculation + +Cost is estimated from the built-in pricing table: + +``` +cost = (input_tokens / 1_000_000) * input_price + (output_tokens / 1_000_000) * output_price +``` + +The pricing table covers 18 models across OpenAI, Anthropic, and Google. Unknown models are resolved via fuzzy matching. 
+ +## Combining with Tool Call Caps + +Budget and tool call caps work together: + +```python +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + # Stops when either limit is hit + result = await agent.run("Analyze this data") +``` + +The harness checks all constraints at every step. The first constraint that is violated triggers the corresponding action (`stop` for budget, `deny_tool` for tool calls). diff --git a/docs-site/harness/compliance.mdx b/docs-site/harness/compliance.mdx new file mode 100644 index 00000000..febb0de5 --- /dev/null +++ b/docs-site/harness/compliance.mdx @@ -0,0 +1,66 @@ +--- +title: Compliance Gating +description: GDPR, HIPAA, PCI, and strict model allowlists for compliance-aware model gating in agent workflows. +--- + +# Compliance Gating + +The harness enforces model allowlists based on compliance requirements. When a compliance mode is set, only models in the corresponding allowlist are permitted. + +## Compliance Modes + +| Mode | Allowed Models | Use Case | +|---|---|---| +| `gdpr` | gpt-4o, gpt-4o-mini, gpt-3.5-turbo | EU data protection | +| `hipaa` | gpt-4o, gpt-4o-mini | Healthcare data | +| `pci` | gpt-4o-mini, gpt-3.5-turbo | Payment card data | +| `strict` | gpt-4o | Maximum restriction | + +## Usage + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +# GDPR compliance — only gpt-4o, gpt-4o-mini, gpt-3.5-turbo allowed +with cascadeflow.run(compliance="gdpr") as session: + result = await agent.run("Process this EU customer data") +``` + +Or as agent metadata: + +```python +@cascadeflow.agent(compliance="hipaa") +async def medical_agent(query: str): + return await llm.complete(query) +``` + +## Enforcement Behavior + +When a model outside the allowlist is requested: + +- In `observe` mode: the trace records `action: "switch_model"` with the suggested compliant alternative, but execution continues with the original model +- In `enforce` mode: the harness blocks the non-compliant model and either 
```python
from cascadeflow import HarnessConfig

config = HarnessConfig(
    mode="enforce",
    compliance="strict",  # Only gpt-4o
)
cascadeflow.init(config=config)
```
+ +## Trace Format + +Each trace record contains: + +| Field | Type | Description | +|---|---|---| +| `action` | string | `"allow"`, `"switch_model"`, `"deny_tool"`, or `"stop"` | +| `reason` | string | Human-readable explanation of the decision | +| `model` | string | Model name used for the call | +| `step` | int | Step number in the run (1-indexed) | +| `cost_total` | float | Cumulative cost in USD at this step | +| `budget_state` | string | `"ok"`, `"warning"`, or `"exceeded"` | +| `applied` | bool | `true` if the action was enforced, `false` in observe mode | + +## Accessing Traces + +```python +import cascadeflow + +cascadeflow.init(mode="observe") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Research this topic") + + # Full decision trace + for record in session.trace(): + print(f"Step {record['step']}: {record['action']} — {record['reason']}") + print(f" Model: {record['model']}, Cost: ${record['cost_total']:.4f}") + print(f" Budget: {record['budget_state']}, Applied: {record['applied']}") +``` + +Example output: + +``` +Step 1: allow — budget ok, model compliant + Model: gpt-4o-mini, Cost: $0.0003 + Budget: ok, Applied: false +Step 2: allow — budget ok, model compliant + Model: gpt-4o-mini, Cost: $0.0007 + Budget: ok, Applied: false +Step 3: switch_model — budget pressure, routing to cheaper model + Model: gpt-4o, Cost: $0.0032 + Budget: warning, Applied: false +``` + +## Observe vs Enforce + +In `observe` mode, traces record what the harness *would* do: +- `applied` is always `false` +- Agent execution continues regardless of the action + +In `enforce` mode, traces record what the harness *did*: +- `applied` is `true` when the action was enforced +- `stop` actions halt execution +- `deny_tool` actions block tool calls + +## Privacy + +Decision traces do not contain prompt content, response content, or user data. 
They only contain: +- Model names and step numbers +- Cost and budget metrics +- Action decisions and reasons + +This makes traces safe for logging, external storage, and compliance reporting without data classification concerns. + +## Callbacks + +Register callbacks to receive trace records in real time: + +```python +from cascadeflow import get_harness_callback_manager, set_harness_callback_manager + +cb_manager = get_harness_callback_manager() + +# Traces are emitted through the callback system +# Use framework-specific integrations for structured access +``` + +## Session Summary + +In addition to per-step traces, `session.summary()` provides aggregate metrics: + +```python +summary = session.summary() +# { +# "cost_total": 0.0032, +# "steps": 3, +# "tool_calls": 1, +# "latency_total_ms": 1250.0, +# "energy_used": 45.2, +# "budget_remaining": 0.4968, +# } +``` diff --git a/docs-site/harness/energy-tracking.mdx b/docs-site/harness/energy-tracking.mdx new file mode 100644 index 00000000..a3d292ee --- /dev/null +++ b/docs-site/harness/energy-tracking.mdx @@ -0,0 +1,99 @@ +--- +title: Energy Tracking +description: Deterministic compute-intensity coefficients for carbon-aware AI operations, with energy caps and per-model coefficients. +--- + +# Energy Tracking + +The harness tracks energy consumption using deterministic compute-intensity coefficients. This provides a proxy for carbon impact without requiring real-time power measurement. + +## Energy Formula + +``` +energy_units = coefficient * (input_tokens + output_tokens * 1.5) +``` + +Output tokens are weighted 1.5x because generation is more compute-intensive than prompt processing. 
| Model | Coefficient | Relative Intensity |
claude-haiku-3.5 | $1.00 | $5.00 | +| claude-opus-4.5 | $5.00 | $25.00 | +| **Google** | | | +| gemini-2.5-flash | $0.15 | $0.60 | +| gemini-2.5-pro | $1.25 | $10.00 | +| gemini-2.0-flash | $0.10 | $0.40 | +| gemini-1.5-flash | $0.075 | $0.30 | +| gemini-1.5-pro | $1.25 | $5.00 | diff --git a/docs-site/harness/kpi-optimization.mdx b/docs-site/harness/kpi-optimization.mdx new file mode 100644 index 00000000..e07e1023 --- /dev/null +++ b/docs-site/harness/kpi-optimization.mdx @@ -0,0 +1,103 @@ +--- +title: KPI-Weighted Routing +description: Inject business priorities as quality, cost, latency, and energy weights into every model routing decision. +--- + +# KPI-Weighted Routing + +The harness scores each model decision against configurable KPI weights. This lets teams encode business priorities into agent behavior without changing agent code. + +## KPI Dimensions + +| Dimension | Score Source | Range | What it means | +|---|---|---|---| +| `quality` | Model quality priors | 0.0-1.0 | Higher = better output quality | +| `cost` | Inverse of model cost | 0.0-1.0 | Higher = cheaper model | +| `latency` | Model latency priors | 0.0-1.0 | Higher = faster response | +| `energy` | Inverse of energy coefficient | 0.0-1.0 | Higher = lower compute intensity | + +## Configuration + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.9} +) as session: + result = await agent.run("Analyze this legal document") +``` + +### Weights + +Weights are relative — they don't need to sum to 1.0 (they are normalized internally). They control the relative importance of each dimension in the composite score. 
+ +```python +# Quality-first (premium workload) +kpi_weights = {"quality": 0.8, "cost": 0.1, "latency": 0.1} + +# Cost-first (high-volume batch) +kpi_weights = {"quality": 0.2, "cost": 0.7, "latency": 0.1} + +# Balanced +kpi_weights = {"quality": 0.4, "cost": 0.3, "latency": 0.2, "energy": 0.1} +``` + +### Targets + +Targets set minimum acceptable values. If a model's score for a dimension falls below the target, it is penalized in the composite score. + +```python +kpi_targets = { + "quality": 0.9, # Require high quality + "latency": 0.7, # Require reasonable speed +} +``` + +## Scoring Formula + +The composite score for a model is: + +``` +score = quality_prior * w_quality + cost_utility * w_cost + latency_prior * w_latency + energy_utility * w_energy +``` + +Where `w_*` are the normalized weights and utility values are computed from model priors. + +## Quality Priors + +Built-in quality priors for common models (OpenAI): + +| Model | Quality | Latency | +|---|---|---| +| o1 | 0.95 | 0.40 | +| gpt-4o | 0.90 | 0.72 | +| gpt-4-turbo | 0.88 | 0.66 | +| gpt-4 | 0.87 | 0.52 | +| gpt-5-mini | 0.86 | 0.84 | +| o1-mini | 0.82 | 0.60 | +| o3-mini | 0.80 | 0.78 | +| gpt-4o-mini | 0.75 | 0.93 | +| gpt-3.5-turbo | 0.65 | 1.00 | + +## Per-Agent KPI Weights + +Different agents can have different priorities: + +```python +@cascadeflow.agent( + budget=0.50, + kpi_weights={"quality": 0.8, "cost": 0.2} +) +async def quality_agent(query: str): + return await llm.complete(query) + +@cascadeflow.agent( + budget=0.10, + kpi_weights={"cost": 0.8, "quality": 0.2} +) +async def budget_agent(query: str): + return await llm.complete(query) +``` diff --git a/docs-site/harness/modes.mdx b/docs-site/harness/modes.mdx new file mode 100644 index 00000000..46a86840 --- /dev/null +++ b/docs-site/harness/modes.mdx @@ -0,0 +1,78 @@ +--- +title: Harness Modes +description: Three harness modes — off, observe, and enforce — with rollout guidance for production deployments. 
+--- + +# Harness Modes + +cascadeflow operates in one of three modes, set at initialization. + +## Modes + +### `off` + +No tracking, no enforcement. The harness is completely disabled. This is the default. + +```python +cascadeflow.init(mode="off") +``` + +### `observe` + +Track all metrics and decisions, but never block execution. Every LLM call and tool execution is recorded with full decision traces. Actions are computed but not enforced — `applied` is always `false` in trace records. + +```python +cascadeflow.init(mode="observe") +``` + +Use `observe` for: +- Initial production rollout to validate metrics before enforcing +- Shadow-mode testing to understand what the harness would do +- Cost and usage analytics without affecting agent behavior + +### `enforce` + +Track all metrics and enforce constraints. When a hard cap is hit (budget, tool calls, latency, energy) or a compliance violation is detected, the harness takes action: `stop`, `deny_tool`, or `switch_model`. + +```python +cascadeflow.init(mode="enforce") +``` + +Use `enforce` when: +- You have validated metrics in `observe` mode +- You need hard budget caps to prevent runaway costs +- Compliance requirements mandate model gating + +## Rollout Guidance + +Recommended rollout sequence for production: + +1. **Deploy with `observe`** — No risk to agent behavior. Collect metrics, review decision traces, validate that the harness sees what you expect. + +2. **Review traces** — Check that compliance allowlists, budget calculations, and KPI scoring match your expectations. + +3. **Switch to `enforce`** — Once validated, change the mode. The harness will now enforce constraints. + +4. **Monitor** — Use `session.summary()` and `session.trace()` to monitor enforcement in production. 
+ +```python +import os + +# Environment-driven mode selection +mode = os.getenv("CASCADEFLOW_MODE", "observe") +cascadeflow.init(mode=mode) +``` + +## Mode Behavior Matrix + +| Behavior | `off` | `observe` | `enforce` | +|---|---|---|---| +| Cost tracking | No | Yes | Yes | +| Latency tracking | No | Yes | Yes | +| Energy tracking | No | Yes | Yes | +| Decision traces | No | Yes | Yes | +| Budget enforcement | No | No | Yes | +| Tool call gating | No | No | Yes | +| Compliance gating | No | No | Yes | +| `session.summary()` | Empty | Full metrics | Full metrics | +| `session.trace()` | Empty | Decisions (applied=false) | Decisions (applied=true) | diff --git a/docs-site/harness/overview.mdx b/docs-site/harness/overview.mdx new file mode 100644 index 00000000..8486c8c4 --- /dev/null +++ b/docs-site/harness/overview.mdx @@ -0,0 +1,80 @@ +--- +title: Harness Overview +description: Overview of the cascadeflow harness — six optimization dimensions, HarnessConfig surface, and high-level decision flow. +--- + +# Harness Overview + +The cascadeflow harness is an in-process intelligence layer that wraps AI agent execution. It tracks, scores, and optionally enforces constraints across six dimensions for every LLM call and tool execution inside agent loops. 
+ +## Six Dimensions + +| Dimension | What it measures | Hard cap | Soft scoring | +|---|---|---|---| +| **Cost** | Estimated USD from the pricing table | `budget` | `kpi_weights.cost` | +| **Latency** | Wall-clock milliseconds per LLM call | `max_latency_ms` | `kpi_weights.latency` | +| **Quality** | Model quality priors (0-1 score) | -- | `kpi_weights.quality` | +| **Tool calls** | Count of tool/function calls | `max_tool_calls` | -- | +| **Energy** | Compute-intensity coefficient | `max_energy` | `kpi_weights.energy` | +| **Compliance** | Model allowlist per regulation | `compliance` | -- | + +## HarnessConfig + +All harness behavior is configured through a single dataclass: + +```python +from cascadeflow import HarnessConfig + +config = HarnessConfig( + mode="enforce", # "off" | "observe" | "enforce" + verbose=False, # Print decisions to stderr + budget=0.50, # Max USD for the run (None = unlimited) + max_tool_calls=10, # Max tool/function calls (None = unlimited) + max_latency_ms=5000.0, # Max wall-clock ms per call (None = unlimited) + max_energy=100.0, # Max energy units (None = unlimited) + kpi_targets={"quality": 0.9}, # Target values for KPI dimensions + kpi_weights={ # Relative importance of each dimension + "quality": 0.6, + "cost": 0.3, + "latency": 0.1, + }, + compliance="gdpr", # "gdpr" | "hipaa" | "pci" | "strict" | None +) +``` + +## Activation + +```python +import cascadeflow + +# Global activation +cascadeflow.init(mode="observe") + +# Scoped run with overrides +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + # agent code + pass + +# Decorated agent function +@cascadeflow.agent(budget=0.20, compliance="gdpr") +async def my_agent(query: str): + pass +``` + +## Decision Flow + +For each LLM call or tool execution: + +1. **Record** model, step number, cumulative cost, latency, energy +2. **Check compliance** — is the model in the allowlist for the configured regulation? +3. 
Unknown models are resolved via fuzzy matching (e.g. a dated snapshot such as `gpt-4o-2024-08-06` resolves to the `gpt-4o` pricing entry).
+``` + +```python Scoped Run +import cascadeflow +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Analyze this dataset") + print(session.summary()) +``` + +```python Decorated Agent +import cascadeflow +cascadeflow.init(mode="enforce") + +@cascadeflow.agent(budget=0.20, compliance="gdpr") +async def my_agent(query: str): + return await llm.complete(query) +``` + + + +## Supported Frameworks + +| Framework | Python | TypeScript | Integration Type | +|---|---|---|---| +| LangChain / LangGraph | `cascadeflow[langchain]` | `@cascadeflow/langchain` | Callback handler | +| OpenAI Agents SDK | `cascadeflow[openai-agents]` | -- | ModelProvider | +| CrewAI | `cascadeflow[crewai]` | -- | llm_hooks | +| Google ADK | `cascadeflow[google-adk]` | -- | BasePlugin | +| n8n | -- | `@cascadeflow/n8n-nodes-cascadeflow` | Community node | +| Vercel AI SDK | -- | `@cascadeflow/vercel-ai` | Middleware | + +## Six Dimensions + +cascadeflow optimizes across six dimensions simultaneously: + +| Dimension | What it controls | Example | +|---|---|---| +| **Cost** | USD per LLM call from pricing table | Budget cap of $0.50 per run | +| **Latency** | Wall-clock milliseconds per call | Max 2000ms per call | +| **Quality** | Model quality priors for routing | 60% weight on quality KPI | +| **Budget** | Cumulative spend tracking and caps | Per-user daily limits | +| **Compliance** | Model allowlists per regulation | GDPR: only gpt-4o, gpt-4o-mini | +| **Energy** | Compute-intensity coefficients | Carbon-aware model selection | diff --git a/docs-site/integrations/crewai.mdx b/docs-site/integrations/crewai.mdx new file mode 100644 index 00000000..1fae1fde --- /dev/null +++ b/docs-site/integrations/crewai.mdx @@ -0,0 +1,78 @@ +--- +title: CrewAI +description: Hook-based harness integration for CrewAI with budget gating, metrics tracking, and decision traces across crew steps. 
--- + +# CrewAI Integration + +cascadeflow integrates with CrewAI through the native `llm_hooks` system. Call `enable()` to register global hooks that track all crew steps, enforce budget caps, and record decision traces. + +## Install + +```bash +pip install "cascadeflow[crewai]" +``` + +## Quick Start + +```python +from crewai import Agent, Crew, Process, Task +import cascadeflow +from cascadeflow.integrations.crewai import CrewAIHarnessConfig, enable + +cascadeflow.init(mode="observe") + +# Enable harness hooks +config = CrewAIHarnessConfig( +    fail_open=True, +    enable_budget_gate=True, +) +enable(config=config) + +# Define agents and tasks as usual +researcher = Agent( +    role="Researcher", +    goal="Find relevant information", +    llm="gpt-4o-mini", +) + +task = Task( +    description="Research the topic of AI agent frameworks", +    agent=researcher, +) + +crew = Crew( +    agents=[researcher], +    tasks=[task], +    process=Process.sequential, +) + +# Run with budget tracking +with cascadeflow.run(budget=1.00) as session: +    result = crew.kickoff() +    print(session.summary()) +    for record in session.trace(): +        print(f"Step {record['step']}: {record['action']} — {record['reason']}") +``` + +## Configuration + +```python +config = CrewAIHarnessConfig( +    fail_open=True,           # Continue on harness errors +    enable_budget_gate=True,  # Enforce budget caps +) +``` + +## Features + +- Tracks all crew steps automatically via `llm_hooks` +- Budget gating stops crew execution when budget is exceeded +- Full decision trace across all agents in the crew +- Fail-open mode for production safety +- No changes to existing CrewAI agent or task definitions + +## Limitations + +- Tool-level gating is not currently applied (CrewAI hooks operate at the LLM call level) +- Model switching depends on CrewAI's model configuration diff --git a/docs-site/integrations/google-adk.mdx b/docs-site/integrations/google-adk.mdx new file mode 100644 index 00000000..8b6f3403 --- /dev/null +++ b/docs-site/integrations/google-adk.mdx @@
-0,0 +1,91 @@ +--- +title: Google ADK +description: Plugin-based harness integration for Google Agent Development Kit with budget enforcement and metrics tracking. +--- + +# Google ADK Integration + +cascadeflow integrates with Google's Agent Development Kit (ADK) through the `BasePlugin` system. Call `enable()` to get a plugin that plugs into `Runner(plugins=[...])`. + +## Install + +```bash +pip install "cascadeflow[google-adk]" +``` + +Requires Python 3.10+. + +## Quick Start + +```python +import asyncio +from google.adk.agents import Agent +from google.adk.runners import Runner +from google.adk.sessions import InMemorySessionService +from google.genai.types import Content, Part + +import cascadeflow +from cascadeflow.integrations.google_adk import GoogleADKHarnessConfig, enable + +cascadeflow.init(mode="observe") + +# Enable harness plugin +config = GoogleADKHarnessConfig( +    fail_open=True, +    enable_budget_gate=True, +) +plugin = enable(config=config) + +# Create ADK agent +agent = Agent( +    name="research_agent", +    model="gemini-2.5-flash", +    instruction="You are a helpful research assistant.", +) + +# Run with plugin +session_service = InMemorySessionService() +runner = Runner( +    agent=agent, +    app_name="test", +    session_service=session_service, +    plugins=[plugin], +) + +async def main(): +    with cascadeflow.run(budget=0.50) as session: +        user_content = Content(parts=[Part(text="Explain cascadeflow")]) +        async for event in runner.run_async( +            session_id="test", +            user_id="user-1", +            new_message=user_content, +        ): +            pass  # Process streaming events + +        print(session.summary()) + +asyncio.run(main()) +``` + +## Configuration + +```python +config = GoogleADKHarnessConfig( +    fail_open=True,           # Continue on harness errors +    enable_budget_gate=True,  # Enforce budget caps +) +``` + +## Supported Gemini Models + +| Model | Input $/1M | Output $/1M | Energy Coeff | +|---|---|---|---| +| gemini-2.5-flash | $0.15 | $0.60 | 0.30 | +| gemini-2.5-pro | $1.25 | $10.00 | 1.20 | +| gemini-2.0-flash | $0.10 | $0.40 | 0.25 | +| 
gemini-1.5-flash | $0.075 | $0.30 | 0.20 | +| gemini-1.5-pro | $1.25 | $5.00 | 1.00 | + +## Budget Enforcement + +When budget is exceeded in `enforce` mode, the plugin returns an `LlmResponse` with `error_code="BUDGET_EXCEEDED"`. The ADK runner handles this as a graceful stop. + +## Limitations + +- Tool gating is not applied (intentional design choice — ADK manages tool execution internally) +- Model switching depends on ADK's model configuration diff --git a/docs-site/integrations/langchain.mdx b/docs-site/integrations/langchain.mdx new file mode 100644 index 00000000..2f29062f --- /dev/null +++ b/docs-site/integrations/langchain.mdx @@ -0,0 +1,106 @@ +--- +title: LangChain +description: Harness-aware callback handler for LangChain and LangGraph with budget tracking, cost analytics, and decision traces. +--- + +# LangChain Integration + +cascadeflow integrates with LangChain through a callback handler that wraps any `BaseChatModel`. Works with LCEL chains, streaming, tool calling, structured output, and LangGraph agents. 
+ +## Install + + + +```bash Python +pip install "cascadeflow[langchain]" +``` + +```bash TypeScript +npm install @cascadeflow/langchain @langchain/core @langchain/openai +``` + + + +## Quick Start + + + +```python Python — Harness callback +import cascadeflow +from cascadeflow.integrations.langchain import get_harness_callback +from langchain_openai import ChatOpenAI + +cascadeflow.init(mode="observe") + +model = ChatOpenAI(model="gpt-4o") +cb = get_harness_callback() + +with cascadeflow.run(budget=0.50) as session: + result = await model.ainvoke("Explain quantum computing", config={"callbacks": [cb]}) + print(session.summary()) +``` + +```python Python — Cascade routing +from langchain_openai import ChatOpenAI +from langchain_anthropic import ChatAnthropic +from cascadeflow.integrations.langchain import CascadeFlow + +cascade = CascadeFlow( + drafter=ChatOpenAI(model="gpt-4o-mini"), + verifier=ChatAnthropic(model="claude-sonnet-4"), + quality_threshold=0.8, +) + +result = await cascade.ainvoke("Explain quantum computing") +``` + +```typescript TypeScript — Drop-in cascade +import { ChatOpenAI } from '@langchain/openai'; +import { ChatAnthropic } from '@langchain/anthropic'; +import { withCascade } from '@cascadeflow/langchain'; + +const cascade = withCascade({ + drafter: new ChatOpenAI({ model: 'gpt-4o-mini' }), + verifier: new ChatAnthropic({ model: 'claude-sonnet-4' }), + qualityThreshold: 0.8, +}); + +const result = await cascade.invoke('Explain quantum computing'); +``` + + + +## Features + +- Full LCEL support (pipes, sequences, batch) +- Streaming with pre-routing +- Tool calling and structured output +- LangSmith cost tracking metadata +- Cost tracking callbacks +- Domain policies with `cascadeflow_domain` metadata + +## Cost Tracking Callback + +```python +from cascadeflow.integrations.langchain.langchain_callbacks import get_cascade_callback + +with get_cascade_callback() as cb: + response = await cascade.ainvoke("What is Python?") + print(f"Total cost: 
${cb.total_cost:.6f}") + print(f"Drafter cost: ${cb.drafter_cost:.6f}") + print(f"Verifier cost: ${cb.verifier_cost:.6f}") +``` + +## LangSmith Integration + +When LangSmith tracing is enabled, cascadeflow adds metadata to runs: +- `cascade_decision`: whether the drafter was accepted +- `modelUsed`: which model produced the final response +- `drafterQuality`: quality score from validation +- `savingsPercentage`: cost savings achieved + +```bash +export LANGSMITH_API_KEY="..." +export LANGSMITH_PROJECT="my-project" +export LANGSMITH_TRACING=true +``` diff --git a/docs-site/integrations/n8n.mdx b/docs-site/integrations/n8n.mdx new file mode 100644 index 00000000..efb89f51 --- /dev/null +++ b/docs-site/integrations/n8n.mdx @@ -0,0 +1,70 @@ +--- +title: n8n +description: cascadeflow community nodes for n8n with cascade model routing, tool gating, and harness modes for no-code AI workflows. +--- + +# n8n Integration + +cascadeflow provides two community nodes for n8n workflows: a Model sub-node for drop-in cascade routing and an Agent node for standalone multi-step reasoning. + +## Install + +In n8n: +1. Go to **Settings** > **Community Nodes** +2. Search for: `@cascadeflow/n8n-nodes-cascadeflow` +3. Click **Install** + +Or via npm: +```bash +npm install @cascadeflow/n8n-nodes-cascadeflow +``` + +## Two Nodes + +| Node | Type | Use Case | +|---|---|---| +| **CascadeFlow (Model)** | Language Model sub-node | Drop-in for any Chain/LLM node | +| **CascadeFlow Agent** | Standalone agent | Tool calling, memory, multi-step reasoning | + +## CascadeFlow (Model) + +Drop-in replacement for any AI Chat Model in n8n chains: + +1. Add two **AI Chat Model** nodes (cheap drafter + powerful verifier) +2. Add **CascadeFlow (Model)** and connect both models +3. Connect to a **Basic LLM Chain** or **Chain** node +4. 
Check the **Logs tab** to see cascade decisions + +**Features:** +- Quality threshold (default: 0.4) +- 16 supported domains (Code, Math, Data, Legal, Medical, Financial, etc.) +- Complexity thresholds for automatic routing + +## CascadeFlow Agent + +Standalone agent with tool calling and multi-step reasoning: + +1. Add a **Chat Trigger** node +2. Add **CascadeFlow Agent** and connect to the trigger +3. Connect **Drafter**, **Verifier**, optional **Memory** and **Tools** +4. Check the **Output tab** for cascade metadata and decision trace + +**Features:** +- Harness mode: `observe` or `enforce` +- Budget caps and tool call limits +- Tool routing rules: Cascade (default) or Verifier (for high-stakes tools) +- Tool call validation with JSON schema checking + +## Complexity Thresholds + +| Level | Threshold | Routing | +|---|---|---| +| Trivial | 0.25 | Always use drafter | +| Simple | 0.40 | Prefer drafter | +| Moderate | 0.55 | Quality-dependent | +| Hard | 0.70 | Prefer verifier | +| Expert | 0.80 | Always use verifier | + +## Result + +40-85% cost savings in n8n workflows with zero changes to existing chains. diff --git a/docs-site/integrations/openai-agents.mdx b/docs-site/integrations/openai-agents.mdx new file mode 100644 index 00000000..1a189a6b --- /dev/null +++ b/docs-site/integrations/openai-agents.mdx @@ -0,0 +1,77 @@ +--- +title: OpenAI Agents SDK +description: CascadeFlowModelProvider for OpenAI Agents SDK with model candidates, tool gating, and budget tracking. +--- + +# OpenAI Agents SDK Integration + +cascadeflow provides a `CascadeFlowModelProvider` that integrates with the OpenAI Agents SDK as an explicit `ModelProvider`. Supports model candidates, tool gating, and scoped budget tracking. 
+ +## Install + +```bash +pip install "cascadeflow[openai-agents]" +``` + +## Quick Start + +```python +import asyncio +from agents import Agent, Runner +import cascadeflow +from cascadeflow.integrations.openai_agents import ( + CascadeFlowModelProvider, + OpenAIAgentsIntegrationConfig, +) + +cascadeflow.init(mode="observe") + +# Configure integration +config = OpenAIAgentsIntegrationConfig( + model_candidates=["gpt-4o-mini", "gpt-4o"], + enable_tool_gating=True, +) + +provider = CascadeFlowModelProvider(config=config) + +agent = Agent( + name="research_agent", + instructions="You are a helpful research assistant.", + model_provider=provider, +) + +async def main(): + with cascadeflow.run(budget=0.50) as session: + result = await Runner.run(agent, "Explain cascadeflow") + print(result.final_output) + print(session.summary()) + +asyncio.run(main()) +``` + +## Features + +- **Model candidates**: List of models the provider can select from based on harness scoring +- **Tool gating**: Block tool calls when `max_tool_calls` is reached +- **Scoped runs**: Use `cascadeflow.run()` for per-task budget tracking +- **Decision traces**: Full audit trail of model selection and tool gating decisions +- **Fail-open**: If the harness encounters an error, execution continues with the default model + +## Configuration + +```python +config = OpenAIAgentsIntegrationConfig( + model_candidates=["gpt-4o-mini", "gpt-4o"], # Models to choose from + enable_tool_gating=True, # Block tools at cap +) +``` + +## Session Metrics + +After a run, `session.summary()` includes: +- `cost_total`: cumulative USD spent +- `budget_remaining`: USD left in the budget +- `step_count`: number of LLM calls +- `tool_calls`: number of tool executions +- `latency_used_ms`: total latency +- `energy_used`: total energy units diff --git a/docs-site/integrations/overview.mdx b/docs-site/integrations/overview.mdx new file mode 100644 index 00000000..92bda53e --- /dev/null +++ b/docs-site/integrations/overview.mdx @@ 
-0,0 +1,53 @@ +--- +title: Integrations Overview +description: Matrix of all cascadeflow framework integrations with supported features, languages, and integration patterns. +--- + +# Integrations Overview + +cascadeflow integrates with six agent frameworks. All integrations are opt-in — install the extra and explicitly enable. + +## Integration Matrix + +| Framework | Language | Package | Integration Type | Budget Gating | Tool Gating | Traces | +|---|---|---|---|---|---|---| +| [LangChain](/integrations/langchain) | Python, TS | `cascadeflow[langchain]`, `@cascadeflow/langchain` | Callback handler | Yes | Yes | Yes | +| [OpenAI Agents SDK](/integrations/openai-agents) | Python | `cascadeflow[openai-agents]` | ModelProvider | Yes | Yes | Yes | +| [CrewAI](/integrations/crewai) | Python | `cascadeflow[crewai]` | llm_hooks | Yes | No | Yes | +| [Google ADK](/integrations/google-adk) | Python | `cascadeflow[google-adk]` | BasePlugin | Yes | No | Yes | +| [n8n](/integrations/n8n) | TypeScript | `@cascadeflow/n8n-nodes-cascadeflow` | Community node | Yes | Yes | Yes | +| [Vercel AI SDK](/integrations/vercel-ai) | TypeScript | `@cascadeflow/vercel-ai` | Middleware | Yes | No | Yes | + +## Integration Patterns + +Each integration follows the same principle: wrap the framework's extension point with cascadeflow's harness, without modifying agent code. 
+ +### Python + +```python +import cascadeflow +cascadeflow.init(mode="observe") + +# Framework-specific activation +from cascadeflow.integrations.langchain import get_harness_callback +from cascadeflow.integrations.openai_agents import CascadeFlowModelProvider +from cascadeflow.integrations.crewai import enable as enable_crewai +from cascadeflow.integrations.google_adk import enable as enable_adk +``` + +### TypeScript + +```bash +npm install @cascadeflow/langchain +npm install @cascadeflow/vercel-ai +npm install @cascadeflow/n8n-nodes-cascadeflow +``` + +## Choosing an Integration + +- **LangChain/LangGraph**: Use if you have existing LangChain chains or agents. The callback handler wraps any `BaseChatModel`. +- **OpenAI Agents SDK**: Use if you're building with OpenAI's Agents SDK. The `ModelProvider` supports model candidates and tool gating. +- **CrewAI**: Use if you're building multi-agent crews. The `llm_hooks` integration tracks all crew steps. +- **Google ADK**: Use if you're building with Google's Agent Development Kit. The plugin integrates with `Runner`. +- **n8n**: Use if you're building no-code workflows. The community node adds cascade routing to any n8n flow. +- **Vercel AI SDK**: Use if you're building TypeScript server-side agents. The middleware wraps AI SDK streams. diff --git a/docs-site/integrations/vercel-ai.mdx b/docs-site/integrations/vercel-ai.mdx new file mode 100644 index 00000000..9b2d9257 --- /dev/null +++ b/docs-site/integrations/vercel-ai.mdx @@ -0,0 +1,88 @@ +--- +title: Vercel AI SDK +description: TypeScript middleware integration for Vercel AI SDK with cascade routing, multi-turn chat, and tool execution. +--- + +# Vercel AI SDK Integration + +cascadeflow integrates with the Vercel AI SDK as middleware, providing cascade routing for server-side AI applications with streaming support. 
+ +## Install + +```bash +npm install @cascadeflow/vercel-ai +``` + +## Quick Start + +```typescript +import { createChatHandler } from '@cascadeflow/vercel-ai'; +import { CascadeAgent } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const handler = createChatHandler(agent, { + protocol: 'data', // AI SDK v4 data stream + tools, // Tool definitions + toolHandlers, // Server-side tool execution + maxSteps: 5, // Multi-step tool loops +}); + +// Use in Next.js API route, Express, or any Node.js server +export const POST = handler; +``` + +## Features + +- **AI SDK v4 `data` stream** and **AI SDK v5/v6 UI streams** +- **`useChat` multi-turn support** — conversation history preserved +- **`parts` message format** (AI SDK v6) +- **Tool call streaming visibility** — see tool calls as they happen +- **Server-side tool execution** via `toolExecutor` or `toolHandlers` +- **Multi-step controls**: `maxSteps`, `forceDirect` +- **Cascade decision stream parts** — optional metadata in the stream +- **Request-level overrides** with allowlist + shared-secret guard + +## Multi-Turn Chat + +```typescript +import { useChat } from 'ai/react'; + +export default function Chat() { + const { messages, input, handleSubmit, handleInputChange } = useChat({ + api: '/api/chat', + }); + + return ( +
+    <div> +      {messages.map((m) => ( +        <div key={m.id}>{m.content}</div> +      ))} +      <form onSubmit={handleSubmit}> +        <input value={input} onChange={handleInputChange} /> +        <button type="submit">Send</button> +      </form> +    </div>
+ ); +} +``` + +## Request Overrides + +Override cascade behavior per request (protected by shared secret): + +```typescript +const handler = createChatHandler(agent, { + protocol: 'data', + allowOverrides: ['forceDirect', 'maxSteps'], + overrideSecret: process.env.OVERRIDE_SECRET, +}); +``` + +## Result + +40-85% cost savings for Vercel AI SDK applications with streaming support and zero client-side changes. diff --git a/docs-site/logo/cascadeflow-dark.svg b/docs-site/logo/cascadeflow-dark.svg new file mode 100644 index 00000000..3c1a2870 --- /dev/null +++ b/docs-site/logo/cascadeflow-dark.svg @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs-site/logo/cascadeflow-light.svg b/docs-site/logo/cascadeflow-light.svg new file mode 100644 index 00000000..8ca48234 --- /dev/null +++ b/docs-site/logo/cascadeflow-light.svg @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md index c291bd93..6e44cdec 100644 --- a/docs/INSTALLATION.md +++ b/docs/INSTALLATION.md @@ -108,6 +108,24 @@ TOGETHER_API_KEY=... # vLLM - no API key needed! (local) ``` +## 🔌 Optional Integration Extras + +Integration packages are opt-in and never enabled by default. + +| Integration | Install Command | Python Requirement | Notes | +|------------|-----------------|--------------------|-------| +| OpenAI Agents SDK | `pip install "cascadeflow[openai,openai-agents]"` | 3.9+ (3.10+ recommended) | Uses explicit `ModelProvider` integration | +| CrewAI | `pip install "cascadeflow[crewai,openai]"` | 3.10+ | Uses explicit CrewAI hook registration | +| Google ADK | `pip install "cascadeflow[google-adk]"` | 3.10+ | Uses explicit ADK plugin in `Runner(plugins=[...])` | + +Optional for richer provider/model normalization in cost tracking: + +```bash +pip install litellm +``` + +Without `litellm`, cascadeflow still provides built-in pricing-based cost estimates. 
+ ## 🚀 Quick Start ### For Production diff --git a/docs/README.md b/docs/README.md index 1238d7f8..08c5c0c8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,8 +1,10 @@ # cascadeflow Documentation -Welcome to cascadeflow documentation! 🌊 +> **Full documentation is now at [docs.cascadeflow.dev](https://docs.cascadeflow.dev)** — the Mintlify-powered docs site is the primary reference for cascadeflow's agent runtime intelligence layer. The guides below remain for quick reference and deep links. -## 📖 Quick Links +Agent runtime intelligence layer — optimize cost, latency, quality, budget, compliance, and energy across AI agent workflows. In-process harness, not a proxy. + +## Quick Links - [Installation Guide](INSTALLATION.md) - [Quick Start Guide](guides/quickstart.md) @@ -11,6 +13,7 @@ Welcome to cascadeflow documentation! 🌊 ### Core Concepts - [Quickstart](guides/quickstart.md) - Get started with cascadeflow in 5 minutes +- [Python Harness Quickstart](guides/python_harness_quickstart.md) - `init`, `run`, and `@agent` for in-process policy control - [Providers](guides/providers.md) - Configure and use different AI providers (OpenAI, Anthropic, Groq, Ollama, etc.) - [Presets](guides/presets.md) - Use built-in presets for common use cases - [Gateway Server](guides/gateway.md) - Drop-in OpenAI/Anthropic-compatible endpoint for existing apps @@ -20,6 +23,7 @@ Welcome to cascadeflow documentation! 
🌊 - [Tools](guides/tools.md) - Function calling and tool usage with cascades - [Agentic Patterns (Python)](guides/agentic-python.md) - Tool loops and multi-agent orchestration in Python - [Agentic Patterns (TypeScript)](guides/agentic-typescript.md) - Tool loops, multi-agent orchestration, and message best practices +- [Harness Telemetry & Privacy](guides/harness_telemetry_privacy.md) - Decision traces, callbacks, and privacy-safe observability - [Cost Tracking](guides/cost_tracking.md) - Track and analyze API costs across queries - [Proxy Routing](guides/proxy.md) - Route requests through provider-aware proxy plans @@ -38,9 +42,12 @@ Welcome to cascadeflow documentation! 🌊 - [Agent Intelligence V2/V2.1 Plan](strategy/agent-intelligence-v2-plan.md) - Unified strategic and execution plan for in-process agent intelligence harness delivery ### Integrations +- [LangChain Integration](guides/langchain_integration.md) - Callback handler for LangChain/LangGraph with harness-aware cascading +- [OpenAI Agents SDK Integration](guides/openai_agents_integration.md) - Harness-aware model provider for existing OpenAI Agents apps +- [CrewAI Integration](guides/crewai_integration.md) - Hook-based harness metrics + budget gating (opt-in) +- [Google ADK Integration](guides/google_adk_integration.md) - Plugin-based harness integration for ADK runners (opt-in) - [n8n Integration](guides/n8n_integration.md) - Use cascadeflow in n8n workflows - [Paygentic Integration](guides/paygentic_integration.md) - Usage metering and billing lifecycle helpers (opt-in) -- [OpenAI Agents SDK Integration](guides/openai_agents_integration.md) - Harness-aware model provider for existing OpenAI Agents apps ## 📚 Examples diff --git a/docs/guides/crewai_integration.md b/docs/guides/crewai_integration.md new file mode 100644 index 00000000..8c1cec8a --- /dev/null +++ b/docs/guides/crewai_integration.md @@ -0,0 +1,87 @@ +# CrewAI Integration + +Use cascadeflow as an explicit, opt-in harness integration for 
CrewAI via +`llm_hooks`. + +## Design Principles + +- Integration-only: nothing is enabled by default +- Works with existing CrewAI flows +- Harness behavior is controlled by `cascadeflow.init(...)` and `cascadeflow.run(...)` +- Fail-open integration path: harness integration errors should not break crew execution + +## Install + +```bash +pip install "cascadeflow[crewai,openai]" +``` + +`crewai` is optional and only installed when you request this extra. +Requires Python 3.10+. + +Optional (more precise provider/model cost tracking in harness telemetry): + +```bash +pip install litellm +``` + +## Quickstart + +```python +from crewai import Agent, Crew, Process, Task + +from cascadeflow import init, run +from cascadeflow.integrations.crewai import CrewAIHarnessConfig, enable + +# Global harness defaults. +init(mode="enforce", budget=1.0) + +# Explicitly register CrewAI hooks (integration-only behavior). +enable( + config=CrewAIHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) +) + +agent = Agent( + role="Support Agent", + goal="Answer support questions clearly and concisely.", + backstory="You are helpful and direct.", + allow_delegation=False, + llm="openai/gpt-4o-mini", +) + +task = Task( + description="Explain why model cascading helps control agent costs.", + expected_output="A concise explanation with one practical example.", + agent=agent, +) + +with run(budget=0.4) as session: + crew = Crew(agents=[agent], tasks=[task], process=Process.sequential, verbose=False) + result = crew.kickoff() + + print(result) + print(session.summary()) + print(session.trace()) +``` + +## What This Integration Adds + +- Budget gating in enforce mode (`before_llm_call` hook) +- Run metrics in `cascadeflow.run()` scope: + - `cost`, `budget_remaining`, `step_count`, `latency_used_ms`, `energy_used` +- Full decision trace through `run.trace()` + +## Current Scope + +- This integration uses CrewAI hook points, so it tracks and gates calls without + changing your 
crew/task definitions. +- Tool-level deny/switch actions are not currently applied in this integration path. + +## Notes + +- Existing non-CrewAI users are unaffected. +- If CrewAI is not installed, `enable()` returns `False` and no hooks are registered. +- Without `litellm`, cost tracking still works using cascadeflow's built-in pricing estimates. diff --git a/docs/guides/google_adk_integration.md b/docs/guides/google_adk_integration.md new file mode 100644 index 00000000..76529bfc --- /dev/null +++ b/docs/guides/google_adk_integration.md @@ -0,0 +1,172 @@ +# Google ADK Integration + +Integrate cascadeflow harness with Google's Agent Development Kit (ADK) to get +budget enforcement, cost/latency/energy tracking, tool call counting, and full +trace recording across all agents in an ADK Runner. + +--- + +## Design Principles + +- **Plugin-based** — Uses ADK's `BasePlugin` system to intercept every LLM call + across all agents in a Runner. One plugin covers the entire agent graph. +- **Opt-in** — Install `cascadeflow[google-adk]` and create a plugin explicitly. + Never enabled by default. Core cascadeflow behavior is unchanged unless you + explicitly wire this integration into `Runner(plugins=[...])`. +- **Fail-open** — Integration errors are logged but never break ADK execution + (configurable). +- **No tool gating** — ADK's `tools_dict` is part of agent definition, not + per-call. Budget gate via `before_model_callback` provides sufficient cost + control. This is an intentional difference from the OpenAI Agents integration. + +--- + +## Installation + +```bash +pip install "cascadeflow[google-adk]" +``` + +Requires Python 3.10+ (ADK requirement). 
+ +Optional (more precise provider/model cost tracking in harness telemetry): + +```bash +pip install litellm +``` + +--- + +## Quick Start + +```python +import asyncio +from google.adk.agents import Agent +from google.adk.runners import Runner +from google.adk.sessions import InMemorySessionService + +from cascadeflow import init, run +from cascadeflow.integrations.google_adk import enable + +# 1. Initialize harness +init(mode="observe", budget=1.0) + +# 2. Create the cascadeflow plugin +plugin = enable() + +# 3. Pass it to the Runner +agent = Agent(name="my_agent", model="gemini-2.5-flash", instruction="Be helpful.") +runner = Runner( + agent=agent, + app_name="my_app", + session_service=InMemorySessionService(), + plugins=[plugin], +) + +# 4. Run within a harness scope +async def main(): + with run(budget=0.5) as session: + # ... run your agent ... + print(f"Cost: ${session.cost:.6f}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + +asyncio.run(main()) +``` + +--- + +## Features + +### Budget Enforcement + +In `enforce` mode, the plugin short-circuits LLM calls when the budget is +exhausted by returning an `LlmResponse` with `error_code="BUDGET_EXCEEDED"`. + +```python +init(mode="enforce", budget=0.10) # Hard limit: $0.10 +plugin = enable() +``` + +### Cost and Energy Tracking + +Every LLM call is tracked with: +- **Cost** — Estimated from model pricing (USD per 1M tokens) +- **Energy** — Deterministic proxy coefficient for compute intensity +- **Latency** — Wall-clock time per call +- **Tool calls** — Count of `function_call` parts in responses + +By default this uses cascadeflow's built-in pricing table. If you install +`litellm`, provider/model normalization can be more precise for some aliased +model identifiers. + +### Trace Recording + +All decisions are recorded in the `HarnessRunContext` trace: + +```python +with run() as session: + # ... run agents ... 
+ for event in session.trace(): + print(event) + # {"action": "allow", "reason": "observe", "model": "gemini-2.5-flash", ...} +``` + +### Configuration + +```python +from cascadeflow.integrations.google_adk import enable, GoogleADKHarnessConfig + +plugin = enable( + config=GoogleADKHarnessConfig( + fail_open=True, # Default: True. Never break ADK on integration errors. + enable_budget_gate=True, # Default: True. Block calls when budget exhausted. + ) +) +``` + +--- + +## Zero-Code Alternative + +If you don't need per-agent plugin integration, you can route ADK through a +cascadeflow LiteLlm proxy by setting `base_url` on your Gemini model: + +```python +# ADK uses LiteLlm under the hood — point it at your cascadeflow proxy +agent = Agent( + name="my_agent", + model="openai/gemini-2.5-flash", # LiteLlm format + instruction="...", +) +# Set OPENAI_API_BASE=http://localhost:8080/v1 to route through cascadeflow proxy +``` + +This gives you cost tracking at the proxy level without a plugin, but doesn't +provide budget enforcement or per-agent trace recording. + +--- + +## Supported Gemini Models + +| Model | Input $/1M | Output $/1M | Energy Coefficient | +|-------|-----------|-------------|-------------------| +| gemini-2.5-flash | $0.15 | $0.60 | 0.3 | +| gemini-2.5-pro | $1.25 | $10.00 | 1.2 | +| gemini-2.0-flash | $0.10 | $0.40 | 0.25 | +| gemini-1.5-flash | $0.075 | $0.30 | 0.2 | +| gemini-1.5-pro | $1.25 | $5.00 | 1.0 | + +All OpenAI and Anthropic models from the shared pricing table are also +supported (e.g., when using LiteLlm provider prefixes). 
+ +--- + +## Troubleshooting + +| Symptom | Solution | +|---------|----------| +| `ImportError: google.adk` | `pip install "cascadeflow[google-adk]"` | +| Plugin not tracking calls | Ensure `plugin` is passed to `Runner(plugins=[plugin])` | +| Budget not enforced | Check `init(mode="enforce", ...)` — observe mode never blocks | +| Zero cost reported | Model name may not match pricing table; check for provider prefix stripping | diff --git a/docs/guides/harness_telemetry_privacy.md b/docs/guides/harness_telemetry_privacy.md new file mode 100644 index 00000000..01e75402 --- /dev/null +++ b/docs/guides/harness_telemetry_privacy.md @@ -0,0 +1,59 @@ +# Harness Telemetry and Privacy + +Use this guide when you want harness observability without leaking user content. + +## What the Harness Records + +Each `run.trace()` decision entry includes: + +- `action`, `reason`, `model` +- `run_id`, `mode`, `step`, `timestamp_ms` +- `cost_total`, `latency_used_ms`, `energy_used`, `tool_calls_total` +- `budget_state` (`max`, `remaining`) +- `applied`, `decision_mode` (when available) + +The trace is scoped to the current `run()` context. + +## What the Harness Does Not Record + +By default, harness decision traces do not include: + +- raw prompts or user messages +- model response text +- tool argument payloads + +This keeps decision telemetry focused on policy/routing state instead of request content. + +## Callback Emission (Optional) + +If you provide a callback manager, each harness decision emits `CallbackEvent.CASCADE_DECISION`. + +```python +from cascadeflow import init, run +from cascadeflow.telemetry.callbacks import CallbackEvent, CallbackManager + +manager = CallbackManager() + +def on_decision(event): + print(event.data["action"], event.data["model"]) + +manager.register(CallbackEvent.CASCADE_DECISION, on_decision) + +init(mode="observe", callback_manager=manager) + +with run(budget=1.0) as r: + ... 
+``` + +The emitted callback uses `query="[harness]"` and `workflow="harness"` to avoid passing user prompt content. + +## Per-Run Summary Logging + +When a scoped run exits (and recorded at least one step), the harness logs a summary on logger `cascadeflow.harness`: + +- run id, mode, steps, tool calls +- cost/latency/energy totals +- last action/model +- remaining budget + +Use standard Python logging controls to direct this to your existing log sink. diff --git a/docs/guides/langchain_integration.md b/docs/guides/langchain_integration.md index eb385654..8eccba62 100644 --- a/docs/guides/langchain_integration.md +++ b/docs/guides/langchain_integration.md @@ -12,6 +12,7 @@ This guide shows how to use cascadeflow with LangChain for intelligent AI model 6. [Use Cases](#use-cases) 7. [Best Practices](#best-practices) 8. [Troubleshooting](#troubleshooting) +9. [Harness Integration (Python)](#harness-integration-python) --- @@ -822,6 +823,132 @@ console.log(result.response_metadata?.cascade); // Not result.metadata (wrong) ``` +--- + +## Harness Integration (Python) + +The cascadeflow harness adds multi-dimensional budget enforcement, energy tracking, +tool call gating, and trace recording to LangChain applications via a callback handler. + +### Design Principles + +- **Callback-based** — Uses LangChain's native callback system to intercept every + LLM and tool call. Works with any chain, agent, or LangGraph graph. +- **Opt-in** — Install `cascadeflow[langchain]` and pass the callback explicitly. + Never enabled by default. +- **Fail-open** — Integration errors are logged but never break chain execution + (configurable). +- **No model switching** — LangChain dispatches the LLM call before `on_llm_start` + returns, so the callback cannot redirect to a different model. `switch_model` + decisions are recorded with `applied=False` for observability. + +### Install + +```bash +pip install "cascadeflow[langchain]" +``` + +Requires Python 3.10+. 
+ +### Quick Start + +```python +from langchain_openai import ChatOpenAI +from cascadeflow import init, run +from cascadeflow.integrations.langchain import get_harness_callback + +# 1. Initialize harness globally +init(mode="observe", budget=1.0) + +model = ChatOpenAI(model="gpt-4o-mini") + +# 2. Use the harness-aware callback in a run scope +with run(budget=0.5) as session: + with get_harness_callback() as cb: + response = model.invoke( + "Explain why model routing helps agent budgets.", + config={"callbacks": [cb]}, + ) + + print(response.content) + print(f"Cost: ${session.cost:.6f}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + for event in session.trace(): + print(event) +``` + +### What This Integration Adds + +- Budget gating in enforce mode (`on_llm_start` raises `HarnessStopError`) +- Tool call gating in enforce mode (`on_tool_start` raises `HarnessStopError`) +- Run metrics on `cascadeflow.run()` scope: + - `cost`, `budget_remaining`, `step_count`, `tool_calls`, `latency_used_ms`, `energy_used` +- Full decision trace through `session.trace()` +- LangGraph state extraction — automatically syncs `step_count`, `tool_calls`, + `budget_remaining`, `latency_used_ms`, `energy_used` from graph state payloads + +### Enforce-Mode Limitations + +| Decision | Enforced? 
| Notes | +|----------|-----------|-------| +| `stop` (budget/latency/energy) | Yes | Raises `HarnessStopError` from `on_llm_start` | +| `deny_tool` (tool cap) | Yes | Raises `HarnessStopError` from `on_tool_start` | +| `switch_model` | Observe-only | Recorded with `applied=False` — LangChain cannot redirect mid-call | +| `deny_tool` (LLM-level) | Observe-only | Cannot strip tools from already-dispatched request | + +### Configuration + +```python +from cascadeflow.integrations.langchain import ( + HarnessAwareCascadeFlowCallbackHandler, + get_harness_callback, +) + +# Context manager (recommended) +with get_harness_callback(fail_open=True) as cb: + result = model.invoke("...", config={"callbacks": [cb]}) + +# Direct instantiation +cb = HarnessAwareCascadeFlowCallbackHandler(fail_open=True) +result = model.invoke("...", config={"callbacks": [cb]}) +``` + +### With LangGraph + +The callback automatically extracts harness-relevant state from LangGraph payloads +(via `langgraph_state`, `graph_state`, or `state` keys in metadata/configurable). + +```python +from langgraph.graph import StateGraph +from cascadeflow import init, run +from cascadeflow.integrations.langchain import get_harness_callback + +init(mode="observe", budget=1.0) + +# Build your graph as normal +graph = builder.compile() + +with run(budget=0.5) as session: + with get_harness_callback() as cb: + result = graph.invoke( + {"messages": [("user", "What is model routing?")]}, + config={"callbacks": [cb]}, + ) + print(session.summary()) +``` + +### Troubleshooting + +| Symptom | Solution | +|---------|----------| +| `ImportError: cascadeflow.integrations.langchain` | `pip install "cascadeflow[langchain]"` | +| Callback not tracking calls | Ensure `cb` is passed in `config={"callbacks": [cb]}` | +| Budget not enforced | Check `init(mode="enforce", ...)` — observe mode never blocks | +| Zero cost reported | Model name may not match pricing table; check `response.response_metadata` | + +--- + ## Next Steps 1. 
**Examples**: Check the `examples/` directory for more patterns diff --git a/docs/guides/openai_agents_integration.md b/docs/guides/openai_agents_integration.md index 2db6b8b7..db8b1e34 100644 --- a/docs/guides/openai_agents_integration.md +++ b/docs/guides/openai_agents_integration.md @@ -15,6 +15,14 @@ Use cascadeflow as an explicit, opt-in `ModelProvider` integration for the OpenA pip install "cascadeflow[openai,openai-agents]" ``` +Recommended: Python 3.10+. + +Optional (more precise provider/model cost tracking in harness telemetry): + +```bash +pip install litellm +``` + ## Quickstart ```python @@ -71,3 +79,4 @@ if __name__ == "__main__": - This is a Python integration for OpenAI Agents SDK. - The SDK remains optional and is only installed via the `openai-agents` extra. - Existing non-Agents users are unaffected. +- Without `litellm`, cost tracking still works using cascadeflow's built-in pricing estimates. diff --git a/docs/guides/python_harness_quickstart.md b/docs/guides/python_harness_quickstart.md new file mode 100644 index 00000000..c757e48d --- /dev/null +++ b/docs/guides/python_harness_quickstart.md @@ -0,0 +1,95 @@ +# Python Harness Quickstart + +This guide covers the in-process harness API: + +- `init(...)` for global defaults and SDK instrumentation +- `run(...)` for per-request scoped budgets/limits and traceability +- `@agent(...)` for attaching policy metadata to agent functions + +## Install + +```bash +pip install "cascadeflow[openai]" +``` + +Optional integrations stay opt-in: + +```bash +pip install "cascadeflow[openai,openai-agents]" +pip install "cascadeflow[crewai]" +pip install "cascadeflow[google-adk]" +``` + +Version notes: +- `crewai` and `google-adk` integrations require Python 3.10+. +- `openai-agents` is recommended on Python 3.10+. 
+
+Optional for richer cost normalization across aliased provider model names:
+
+```bash
+pip install litellm
+```
+
+## 1) Initialize Harness
+
+```python
+from cascadeflow import init
+
+report = init(
+    mode="observe",          # off | observe | enforce
+    budget=1.0,              # default per-run budget cap
+    max_tool_calls=8,        # default per-run tool call cap
+)
+
+print(report.mode)
+print(report.instrumented)
+print(report.detected_but_not_instrumented)
+```
+
+`init(...)` is explicit and never auto-enables integrations.
+
+## 2) Track One Scoped Run
+
+```python
+from openai import OpenAI
+
+from cascadeflow import run
+
+client = OpenAI()
+
+with run(budget=0.25, max_tool_calls=4) as session:
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[{"role": "user", "content": "Summarize model cascading in one sentence."}],
+    )
+
+    print(response.choices[0].message.content)
+    print(session.summary())
+    print(session.trace())
+```
+
+## 3) Attach Agent Metadata
+
+`@agent(...)` attaches policy metadata to your function without changing how the
+function executes.
+
+```python
+from cascadeflow import agent
+
+@agent(
+    budget=0.2,
+    kpi_targets={"quality": 0.9},
+    kpi_weights={"cost": 0.5, "latency": 0.5},
+    compliance="strict",
+)
+def support_agent(task: str) -> str:
+    return f"Handled: {task}"
+
+print(support_agent.__cascadeflow_agent_policy__)
+```
+
+## Minimal Checklist
+
+1. Call `init(...)` once at process startup.
+2. Wrap each unit of work in `with run(...):`.
+3. Use `session.summary()` and `session.trace()` for auditability and tuning. 
diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 267ddc69..295a713d 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -1,7 +1,7 @@ # Agent Intelligence V2 Plan -Last updated: February 25, 2026 -Status: Planning (no implementation in this document) +Last updated: March 5, 2026 +Status: V2/V2.1 execution plan with implementation tracking (historical + active reference) Supersedes: agent-intelligence-v1-plan.md ## 1. Objective @@ -197,9 +197,6 @@ Framework-specific packages provide deeper integration (state extraction, middle ### TypeScript Equivalent ```typescript -// Target API — does not exist in @cascadeflow/core today. -// TS parity is a V2.1 deliverable (see Section 16, Phase F). - import { cascadeflow } from '@cascadeflow/core'; // Tier 1: Auto-instrument @@ -831,9 +828,9 @@ Estimated: 6-8 weeks after V2 Python launch. Estimated: 3-4 weeks (can parallel with Phase F). -### 16.1 Parallel Branch Workboard (Tick-Off) +### 16.1 Parallel Branch Workboard (Historical Tick-Off) -Use this section as the single coordination board for parallel execution. +Use this section as the historical coordination board for parallel execution. Branching model: - Keep `main` always releasable. @@ -842,15 +839,17 @@ Branching model: - Merge to `main` only after integration branch CI + benchmark gates are green. 
Claim checklist (one owner per branch at a time): -- [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` -- [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress` -- [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` -- [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` -- [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` -- [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [x] `feat/v2-bench-repro-pipeline` — Owner: `@codex` — PR: `#163` — Status: `review` -- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [x] `feat/v2-core-harness-api` — Owner: `@codex` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-enforce-actions` — Owner: `@codex` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-openai-agents-integration` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 7 tests + docs + example +- [x] `feat/v2-crewai-integration` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 44 tests + docs + example +- [x] `feat/v2-langchain-harness-extension` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 79 tests + docs + example +- [x] `feat/v2-dx-docs-quickstarts` — Owner: `@codex` — Status: `completed (merged to integration branch)` — quickstart + llms.txt +- [x] `feat/v2-bench-repro-pipeline` — Owner: `@codex` — PR: `#163` — Status: `completed (merged to integration branch)` +- [x] 
`feat/v2-security-privacy-telemetry` — Owner: `@codex` — PR: `#162` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-google-adk-integration` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 63 tests + docs + example +- [x] `feat/v2-n8n-harness` — Owner: `@codex` — PR: `#164` — Status: `completed (merged to integration branch)` — TS harness + 50 tests + UI Merge gates per feature branch: - [ ] Unit/integration tests green for touched scope @@ -864,6 +863,21 @@ Integration-branch promotion gates: - [ ] Quickstart verification for existing app and framework paths - [ ] Go/No-Go checklist in Section 18 satisfied before merging to `main` +### 16.2 V2.1 Parallel Execution Split + +To enable parallel work without merge collisions, split V2.1 into Python and TS tracks: + +- `feat/v2.1-anthropic-python-auto-instrumentation` (completed in this branch) + - Scope: `cascadeflow/harness/*`, Python harness tests, Python docs notes + - Deliverables: Anthropic Python auto-instrumentation, validation for `init()/run()` harness path +- `feat/v2.1-ts-harness-api-parity` (completed and merged into this branch scope) + - Scope: `packages/core/*`, TS parity fixtures, TS docs notes + - Deliverables: `@cascadeflow/core` exports parity (`init()/run()`), TS fixture parity validation + +Parallel-safe rule: +- Python track does not touch `packages/core/*` +- TS track does not touch `cascadeflow/harness/*` + ## 17. Future Phases (Post-V2, Not in Scope) For roadmap visibility. These inform V2 telemetry design but are not V2 deliverables. @@ -903,29 +917,29 @@ For roadmap visibility. 
These inform V2 telemetry design but are not V2 delivera Go when all are true (V2 Python launch): -- [ ] Harness layer is opt-in and backward compatible -- [ ] `cascadeflow.init()` auto-instruments `openai` Python client -- [ ] `observe` mode produces zero behavior change (benchmark-validated) -- [ ] `enforce` mode actions work correctly (switch_model, deny_tool, stop) -- [ ] Harness decision overhead <5ms p95 -- [ ] Python parity fixture tests pass -- [ ] Core + integration CI green -- [ ] Benchmark comparison acceptable vs latest baseline -- [ ] OpenAI Agents SDK integration documented and validated -- [ ] CrewAI integration documented and validated -- [ ] LangChain integration extended and validated -- [ ] Existing integrations (Vercel AI, n8n) verified compatible (no regressions) -- [ ] DX quickstart works for existing app/agent users with 1-3 lines of code change +- [x] Harness layer is opt-in and backward compatible +- [x] `cascadeflow.init()` auto-instruments `openai` Python client +- [x] `observe` mode produces zero behavior change (benchmark-validated) +- [x] `enforce` mode actions work correctly (switch_model, deny_tool, stop) +- [x] Harness decision overhead <5ms p95 +- [x] Python parity fixture tests pass +- [x] Core + integration CI green +- [x] Benchmark comparison acceptable vs latest baseline +- [x] OpenAI Agents SDK integration documented and validated +- [x] CrewAI integration documented and validated +- [x] LangChain integration extended and validated +- [x] Existing integrations (Vercel AI, n8n) verified compatible (no regressions) +- [x] DX quickstart works for existing app/agent users with 1-3 lines of code change - [ ] External pilot median time-to-first-value <15 minutes -- [ ] Public benchmark results ready for launch -- [ ] Benchmark scripts + raw artifacts are reproducible by third parties -- [ ] pyproject.toml extras (`openai-agents`, `crewai`, `langchain`) defined and installable +- [x] Public benchmark results ready for launch +- [x] 
Benchmark scripts + raw artifacts are reproducible by third parties +- [x] pyproject.toml extras (`openai-agents`, `crewai`, `langchain`, `google-adk`) defined and installable V2.1 Go/No-Go (TS parity + anthropic): -- [ ] TS parity fixtures pass -- [ ] `@cascadeflow/core` exports `cascadeflow.init()` and `cascadeflow.run()` -- [ ] `anthropic` Python client auto-instrumentation validated -- [ ] `@anthropic-ai/sdk` TS client auto-instrumentation validated +- [x] TS parity fixtures pass +- [x] `@cascadeflow/core` exports `cascadeflow.init()` and `cascadeflow.run()` +- [x] `anthropic` Python client auto-instrumentation validated +- [x] `@anthropic-ai/sdk` TS client auto-instrumentation validated ## 19. Academic Validation diff --git a/examples/integrations/README.md b/examples/integrations/README.md index e7e7906a..556efe7a 100644 --- a/examples/integrations/README.md +++ b/examples/integrations/README.md @@ -6,6 +6,8 @@ This directory contains production-ready integration examples for cascadeflow wi - [LiteLLM Integration](#-litellm-integration) - Access 10+ providers with automatic cost tracking - [OpenAI Agents SDK Integration](#-openai-agents-sdk-integration) - Harness-aware ModelProvider for existing agent apps +- [CrewAI Integration](#-crewai-integration) - Hook-based harness metrics and budget gating +- [Google ADK Integration](#-google-adk-integration) - Plugin-based harness integration for ADK runners - [Paygentic Integration](#-paygentic-integration) - Usage event reporting and billing lifecycle helpers - [Local Providers](#-local-providers-setup) - Ollama and vLLM configuration examples - [OpenTelemetry & Grafana](#-opentelemetry--grafana) - Production observability and metrics @@ -152,6 +154,9 @@ pip install "cascadeflow[openai,openai-agents]" python examples/integrations/openai_agents_harness.py ``` +Recommended: Python 3.10+. +Optional: `pip install litellm` for more precise provider/model cost normalization. 
+ ### What It Shows - Harness-aware model switching with candidate models @@ -160,6 +165,54 @@ python examples/integrations/openai_agents_harness.py --- +## 👥 CrewAI Integration + +**File:** [`crewai_harness.py`](crewai_harness.py) + +Use cascadeflow as an explicit, opt-in CrewAI hook integration. + +### Quick Start + +```bash +pip install "cascadeflow[crewai,openai]" +python examples/integrations/crewai_harness.py +``` + +Requires Python 3.10+. +Optional: `pip install litellm` for more precise provider/model cost normalization. + +### What It Shows + +- Explicit `enable(...)` hook registration (never on by default) +- Enforce-mode budget gating before CrewAI LLM calls +- Run metrics and decision trace via `cascadeflow.run(...)` + +--- + +## 🧠 Google ADK Integration + +**File:** [`google_adk_harness.py`](google_adk_harness.py) + +Use cascadeflow as an explicit, opt-in plugin integration for Google ADK. + +### Quick Start + +```bash +pip install "cascadeflow[google-adk]" +python examples/integrations/google_adk_harness.py +``` + +Requires Python 3.10+. +Optional: `pip install litellm` for more precise provider/model cost normalization. 
+ +### What It Shows + +- Explicit plugin creation with `enable(...)` (integration-only behavior) +- Runner-level plugin wiring via `Runner(..., plugins=[plugin])` +- Budget gate + run-scoped metrics and trace + +--- + ## 💳 Paygentic Integration **File:** [`paygentic_usage.py`](paygentic_usage.py) @@ -412,6 +465,9 @@ Cost Calculation Tests |------|---------|-------------------| | `litellm_providers.py` | Comprehensive LiteLLM demo with 8 examples | No (for cost info) | | `litellm_cost_tracking.py` | Cost tracking and provider validation | No (for cost info) | +| `openai_agents_harness.py` | OpenAI Agents SDK harness integration (ModelProvider) | Yes | +| `crewai_harness.py` | CrewAI hook-based harness integration (opt-in) | Yes | +| `google_adk_harness.py` | Google ADK plugin harness integration (opt-in) | Yes | | `paygentic_usage.py` | Usage event reporting to Paygentic (opt-in, fail-open) | Yes | | `local_providers_setup.py` | Ollama and vLLM setup guide | No | | `opentelemetry_grafana.py` | Production observability example | No | @@ -473,6 +529,18 @@ pip install cascadeflow[all] pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-http ``` +### "CrewAI hooks unavailable" +```bash +pip install "cascadeflow[crewai,openai]" +# Requires crewai>=1.5 for llm_hooks +``` + +### "Google ADK not installed" +```bash +pip install "cascadeflow[google-adk]" +# Google ADK requires Python 3.10+ +``` + ### "Metrics not appearing in Grafana" 1. Check OpenTelemetry Collector logs: `docker-compose logs otel-collector` 2. 
Verify metrics: `curl http://localhost:8889/metrics` @@ -490,6 +558,9 @@ Always use provider prefixes for LiteLLM: - **Provider Guide:** [docs/guides/providers.md](../../docs/guides/providers.md) - **Cost Tracking:** [docs/guides/cost_tracking.md](../../docs/guides/cost_tracking.md) +- **OpenAI Agents Guide:** [docs/guides/openai_agents_integration.md](../../docs/guides/openai_agents_integration.md) +- **CrewAI Guide:** [docs/guides/crewai_integration.md](../../docs/guides/crewai_integration.md) +- **Google ADK Guide:** [docs/guides/google_adk_integration.md](../../docs/guides/google_adk_integration.md) - **Paygentic Guide:** [docs/guides/paygentic_integration.md](../../docs/guides/paygentic_integration.md) - **Production Guide:** [docs/guides/production.md](../../docs/guides/production.md) @@ -498,10 +569,13 @@ Always use provider prefixes for LiteLLM: ## 🚀 Next Steps 1. **Try LiteLLM:** `python examples/integrations/litellm_providers.py` -2. **Try Paygentic usage reporting:** `python examples/integrations/paygentic_usage.py` -3. **Setup local providers:** `python examples/integrations/local_providers_setup.py` -4. **Test your API keys:** `python examples/integrations/test_all_providers.py` -5. **Add monitoring:** Follow OpenTelemetry section above +2. **Try OpenAI Agents integration:** `python examples/integrations/openai_agents_harness.py` +3. **Try CrewAI integration:** `python examples/integrations/crewai_harness.py` +4. **Try Google ADK integration:** `python examples/integrations/google_adk_harness.py` +5. **Try Paygentic usage reporting:** `python examples/integrations/paygentic_usage.py` +6. **Setup local providers:** `python examples/integrations/local_providers_setup.py` +7. **Test your API keys:** `python examples/integrations/test_all_providers.py` +8. 
**Add monitoring:** Follow OpenTelemetry section above --- diff --git a/examples/integrations/crewai_harness.py b/examples/integrations/crewai_harness.py new file mode 100644 index 00000000..a9df72c6 --- /dev/null +++ b/examples/integrations/crewai_harness.py @@ -0,0 +1,73 @@ +""" +CrewAI + cascadeflow harness integration example. + +Run: + pip install "cascadeflow[crewai,openai]" + export OPENAI_API_KEY="your-key" + python examples/integrations/crewai_harness.py +""" + +from __future__ import annotations + + +def main() -> None: + try: + from crewai import Agent, Crew, Process, Task + except ImportError as exc: + raise SystemExit( + "CrewAI is not installed. " 'Install with: pip install "cascadeflow[crewai,openai]"' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.crewai import CrewAIHarnessConfig, enable + + # 1) Initialize harness globally. + init(mode="observe", budget=1.0, max_tool_calls=6) + + # 2) Explicitly enable CrewAI integration hooks (opt-in). + enabled = enable( + config=CrewAIHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) + ) + if not enabled: + raise SystemExit( + "CrewAI hooks are unavailable in this environment. " "Ensure crewai>=1.5 is installed." 
+ ) + + agent = Agent( + role="Routing Analyst", + goal="Explain model routing impact on cost and latency in plain language.", + backstory="You are concise and practical.", + allow_delegation=False, + llm="openai/gpt-4o-mini", + verbose=False, + ) + + task = Task( + description="Explain why inside-the-loop routing helps agent workloads.", + expected_output="One short paragraph and three bullet points.", + agent=agent, + ) + + with run(budget=0.5, max_tool_calls=4) as session: + crew = Crew(agents=[agent], tasks=[task], process=Process.sequential, verbose=False) + result = crew.kickoff() + + print("=== Result ===") + print(result) + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print(f"Energy: {session.energy_used:.1f}") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + main() diff --git a/examples/integrations/google_adk_harness.py b/examples/integrations/google_adk_harness.py new file mode 100644 index 00000000..3f8c9743 --- /dev/null +++ b/examples/integrations/google_adk_harness.py @@ -0,0 +1,88 @@ +""" +Google ADK + cascadeflow harness integration example. + +Run: + pip install "cascadeflow[google-adk]" + export GOOGLE_API_KEY="your-key" + python examples/integrations/google_adk_harness.py +""" + +from __future__ import annotations + +import asyncio + + +async def main() -> None: + try: + from google.adk.agents import Agent + from google.adk.runners import Runner + from google.adk.sessions import InMemorySessionService + except ImportError as exc: + raise SystemExit( + "Google ADK is not installed. 
" 'Install with: pip install "cascadeflow[google-adk]"' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.google_adk import GoogleADKHarnessConfig, enable + + # 1. Initialize harness globally + init(mode="observe", budget=1.0) + + # 2. Create the cascadeflow ADK plugin + plugin = enable( + config=GoogleADKHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) + ) + + # 3. Define an ADK agent + agent = Agent( + name="demo_agent", + model="gemini-2.5-flash", + instruction="You are a helpful assistant. Answer concisely.", + ) + + # 4. Create a Runner with the cascadeflow plugin + session_service = InMemorySessionService() + runner = Runner( + agent=agent, + app_name="cascadeflow_demo", + session_service=session_service, + plugins=[plugin], # cascadeflow hooks into all LLM calls here + ) + + # 5. Run within a harness scope + with run(budget=0.5) as session: + user_session = await session_service.create_session( + app_name="cascadeflow_demo", + user_id="demo-user", + ) + + from google.genai.types import Content, Part + + async for event in runner.run_async( + user_id="demo-user", + session_id=user_session.id, + new_message=Content(parts=[Part(text="What is model routing?")]), + ): + if event.content and event.content.parts: + for part in event.content.parts: + if part.text: + print(part.text, end="") + print() + + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Energy: {session.energy_used:.1f}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/integrations/langchain_harness.py b/examples/integrations/langchain_harness.py new file mode 100644 index 00000000..c0be501f --- /dev/null +++ 
b/examples/integrations/langchain_harness.py @@ -0,0 +1,55 @@ +""" +LangChain + cascadeflow harness integration example. + +Run: + pip install "cascadeflow[langchain]" + export OPENAI_API_KEY="your-key" + python examples/integrations/langchain_harness.py +""" + +from __future__ import annotations + +import asyncio + + +async def main() -> None: + try: + from langchain_openai import ChatOpenAI + except ImportError as exc: + raise SystemExit( + "langchain-openai is not installed. " + 'Install with: pip install "cascadeflow[langchain]" langchain-openai' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.langchain import get_harness_callback + + # 1) Initialize harness globally. + init(mode="observe", budget=1.0, max_tool_calls=6) + + model = ChatOpenAI(model="gpt-4o-mini") + + # 2) Scoped run with harness-aware callback. + with run(budget=0.5, max_tool_calls=4) as session: + with get_harness_callback() as cb: + response = await model.ainvoke( + "Explain why inside-the-loop model routing helps agent budgets.", + config={"callbacks": [cb]}, + ) + + print("=== Result ===") + print(response.content) + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print(f"Energy: {session.energy_used:.1f}") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/llms.txt b/llms.txt new file mode 100644 index 00000000..dbba72ca --- /dev/null +++ b/llms.txt @@ -0,0 +1,214 @@ +# cascadeflow + +> Agent runtime intelligence layer for AI agent workflows. +> In-process harness (not a proxy). Works inside agent loops with full state awareness. 
+ +## Install + +pip install cascadeflow + +## Quickstart (3 lines) + +import cascadeflow +cascadeflow.init(mode="observe") +# All openai/anthropic SDK calls are now tracked. Switch to "enforce" for budget gating. + +## What cascadeflow is + +cascadeflow is an in-process intelligence layer that sits inside AI agent execution +loops. Unlike external proxies that only see HTTP request boundaries, cascadeflow +operates with full agent state awareness: step count, budget consumed, tool call +history, error context, quality scores, domain, complexity, and user-defined +business context. + +Eight things make cascadeflow different: + +1. Inside-the-loop control. Decisions happen per-step and per-tool-call inside + agent execution, not at the HTTP boundary. This enables budget gating mid-run, + model switching based on remaining budget, and stop actions when caps are hit. + +2. Multi-dimensional optimization. Six dimensions scored simultaneously: cost, + latency, quality, budget, compliance, and energy. Not just cost routing. + +3. Business logic injection. KPI weights and targets let teams encode business + priorities (e.g. 60% quality, 30% cost, 10% latency) into every model decision. + +4. Actionable decisions. Four actions: allow, switch_model, deny_tool, stop. + The harness does not just observe — it controls execution flow. + +5. Full transparency. Every decision produces a trace record with action, reason, + model, step, cost_total, budget_state, and applied fields. Audit-ready. + +6. Measurable value. Session summaries report cost, latency, energy, steps, + tool calls, and budget remaining. Before/after comparison is built in. + +7. Category leadership. Cross-framework policy layer with unified KPI semantics + across LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, Vercel AI SDK. + +8. Latency advantage. In-process instrumentation adds <1ms overhead per call. + External proxies add 10-50ms of network round-trip latency per LLM call. 
+ +## When to use cascadeflow + +- You run AI agents (LangChain, LangGraph, CrewAI, OpenAI Agents SDK, Google ADK, or custom) +- You want to reduce LLM costs without changing agent code +- You need budget enforcement across multi-step agent runs +- You need to inject business KPIs (quality, cost, latency, energy) into agent decisions +- You need compliance-aware model gating (GDPR, HIPAA, PCI, strict) +- You want full trace recording for auditability and tuning + +## When NOT to use cascadeflow + +- Single one-off LLM calls (overhead not justified) +- You only use one model and don't want routing +- You need a hosted proxy service (cascadeflow is a library, not a SaaS) + +## Proxy vs cascadeflow + +| Dimension | External proxy | cascadeflow harness | +|--------------------|----------------------------|------------------------------| +| Scope | HTTP request boundary | Inside agent execution loop | +| Dimensions | Cost only | Cost + quality + latency + budget + compliance + energy | +| Latency overhead | 10-50ms network RTT | <1ms in-process | +| Business logic | None | KPI weights and targets | +| Enforcement | None (observe only) | stop, deny_tool, switch_model | +| Auditability | Request logs | Per-step decision traces | + +## Key APIs + +- cascadeflow.init(mode) -- activate harness globally (off | observe | enforce) +- cascadeflow.run(budget, max_tool_calls) -- scoped agent run with budget/limits +- @cascadeflow.agent(budget, kpis) -- annotate agent functions with policy metadata +- session.summary() -- structured run metrics (cost, latency, energy, steps, tool calls) +- session.trace() -- full decision trace for auditability + +## HarnessConfig Reference + +@dataclass +class HarnessConfig: + mode: HarnessMode # "off" | "observe" | "enforce". Default: "off" + verbose: bool # Print decisions to stderr. Default: False + budget: Optional[float] # Max USD for the run. Default: None (unlimited) + max_tool_calls: Optional[int] # Max tool/function calls. 
Default: None + max_latency_ms: Optional[float] # Max wall-clock ms per call. Default: None + max_energy: Optional[float] # Max energy units. Default: None + kpi_targets: Optional[dict] # {"quality": 0.9, "cost": 0.5, ...} + kpi_weights: Optional[dict] # {"quality": 0.6, "cost": 0.3, "latency": 0.1} + compliance: Optional[str] # "gdpr" | "hipaa" | "pci" | "strict" + +## Harness Modes + +- off: no tracking, no enforcement +- observe: track all metrics and decisions, never block execution (safe for production rollout) +- enforce: track + enforce budget/tool/latency/energy caps (stop or deny_tool actions) + +## Harness Dimensions + +- Cost: estimated USD from model pricing table (18 models, fuzzy resolution) +- Latency: wall-clock milliseconds per LLM call +- Energy: deterministic compute-intensity proxy coefficient +- Tool calls: count of tool/function calls executed +- Quality: model quality priors for KPI-weighted scoring + +## Decision Actions + +- allow: proceed normally +- switch_model: route to cheaper/better model (where runtime allows) +- deny_tool: block tool execution when tool call cap reached +- stop: halt agent loop when budget/latency/energy cap exceeded + +## Decision Trace Format + +Each decision produces a record with these fields: +- action: "allow" | "switch_model" | "deny_tool" | "stop" +- reason: human-readable explanation +- model: model name used for the call +- step: integer step number in the run +- cost_total: cumulative cost in USD at this step +- budget_state: "ok" | "warning" | "exceeded" +- applied: true if the action was enforced (false in observe mode) + +## Compliance Model Allowlists + +- gdpr: gpt-4o, gpt-4o-mini, gpt-3.5-turbo +- hipaa: gpt-4o, gpt-4o-mini +- pci: gpt-4o-mini, gpt-3.5-turbo +- strict: gpt-4o only + +## Integrations + +pip install cascadeflow[langchain] # LangChain/LangGraph callback handler +pip install cascadeflow[openai-agents] # OpenAI Agents SDK ModelProvider +pip install cascadeflow[crewai] # CrewAI llm_hooks 
integration +pip install cascadeflow[google-adk] # Google ADK BasePlugin + +npm install @cascadeflow/core # TypeScript core +npm install @cascadeflow/langchain # LangChain TypeScript +npm install @cascadeflow/vercel-ai # Vercel AI SDK middleware +npm install @cascadeflow/n8n-nodes-cascadeflow # n8n community node + +All integrations are opt-in. Install the extra and explicitly enable the integration. + +## Integration Code Snippets + +LangChain: + from cascadeflow.integrations.langchain import get_harness_callback + cb = get_harness_callback() + result = await model.ainvoke("query", config={"callbacks": [cb]}) + +OpenAI Agents SDK: + from cascadeflow.integrations.openai_agents import CascadeFlowModelProvider + provider = CascadeFlowModelProvider(model_candidates=["gpt-4o-mini", "gpt-4o"]) + +CrewAI: + from cascadeflow.integrations.crewai import enable + enable(budget_gate=True, fail_open=True) + +Google ADK: + from cascadeflow.integrations.google_adk import enable + plugin = enable(fail_open=True) + runner = Runner(agent=agent, plugins=[plugin]) + +## Pricing Table (USD per 1M tokens: input / output) + +OpenAI: + gpt-4o: $2.50 / $10.00 + gpt-4o-mini: $0.15 / $0.60 + gpt-5: $1.25 / $10.00 + gpt-5-mini: $0.20 / $0.80 + gpt-4-turbo: $10.00 / $30.00 + gpt-4: $30.00 / $60.00 + gpt-3.5-turbo: $0.50 / $1.50 + o1: $15.00 / $60.00 + o1-mini: $3.00 / $12.00 + o3-mini: $1.10 / $4.40 + +Anthropic: + claude-sonnet-4: $3.00 / $15.00 + claude-haiku-3.5: $1.00 / $5.00 + claude-opus-4.5: $5.00 / $25.00 + +Google: + gemini-2.5-flash: $0.15 / $0.60 + gemini-2.5-pro: $1.25 / $10.00 + gemini-2.0-flash: $0.10 / $0.40 + gemini-1.5-flash: $0.075 / $0.30 + gemini-1.5-pro: $1.25 / $5.00 + +## Energy Coefficients + +Model energy is computed as: energy_units = coeff * (input_tokens + output_tokens * 1.5) + + gpt-4o: 1.0 gpt-4o-mini: 0.3 gpt-5: 1.2 + gpt-5-mini: 0.35 gpt-4-turbo: 1.5 gpt-4: 1.5 + gpt-3.5-turbo: 0.2 o1: 2.0 o1-mini: 0.8 + o3-mini: 0.5 claude-sonnet-4: 1.0 claude-haiku-3.5: 0.3 
+ claude-opus-4.5: 1.8 gemini-2.5-flash: 0.3 gemini-2.5-pro: 1.2 + gemini-2.0-flash: 0.25 gemini-1.5-flash: 0.2 gemini-1.5-pro: 1.0 + +## Links + +- Docs: https://docs.cascadeflow.dev +- Source: https://github.com/lemony-ai/cascadeflow +- PyPI: pip install cascadeflow +- npm: npm install @cascadeflow/core diff --git a/packages/core/README.md b/packages/core/README.md index a0918d78..3188df91 100644 --- a/packages/core/README.md +++ b/packages/core/README.md @@ -33,6 +33,23 @@ pnpm add @cascadeflow/core yarn add @cascadeflow/core ``` +## Harness Quick Start (V2.1) + +```typescript +import { cascadeflow } from '@cascadeflow/core'; + +// 1) Turn on in-process harness decisions + SDK auto-instrumentation +cascadeflow.init({ mode: 'enforce', budget: 0.5 }); + +// 2) Scope one run (global defaults are inherited) +const result = await cascadeflow.run({ maxToolCalls: 8 }, async (run) => { + // Any OpenAI / Anthropic SDK calls made here are evaluated by the harness. + return { runId: run.runId }; +}); + +console.log(result); +``` + ## Quick Start ### Recommended Setup (Claude Haiku + GPT-5) diff --git a/packages/core/src/__tests__/harness.test.ts b/packages/core/src/__tests__/harness.test.ts new file mode 100644 index 00000000..bad03376 --- /dev/null +++ b/packages/core/src/__tests__/harness.test.ts @@ -0,0 +1,232 @@ +import { afterEach, describe, expect, it } from 'vitest'; + +import { + BudgetExceededError, + cascadeflow, + getCurrentRun, + getHarnessConfig, + init, + reset, + run, +} from '../harness'; +import { + __resetInstrumentationLoadersForTest, + __resetInstrumentationStateForTest, + __setInstrumentationLoadersForTest, + isAnthropicPatched, + isOpenAIPatched, +} from '../harness-instrument'; + +class FakeOpenAICompletions { + constructor(private readonly calls: Array>) {} + + create(request: Record): Promise> { + this.calls.push({ ...request }); + return Promise.resolve({ + usage: { + prompt_tokens: 100, + completion_tokens: 25, + }, + choices: [ + { + message: { 
+ tool_calls: [{ id: 'tool_1', type: 'function' }], + }, + }, + ], + }); + } +} + +class FakeAnthropicMessages { + constructor(private readonly calls: Array>) {} + + create(request: Record): Promise> { + this.calls.push({ ...request }); + return Promise.resolve({ + usage: { + input_tokens: 120, + output_tokens: 40, + }, + content: [ + { type: 'text', text: 'hello' }, + { type: 'tool_use', id: 'tool_1', name: 'search', input: { q: 'x' } }, + ], + }); + } +} + +afterEach(() => { + reset(); + __resetInstrumentationStateForTest(); + __resetInstrumentationLoadersForTest(); +}); + +describe('harness API (TypeScript parity)', () => { + it('exposes cascadeflow init/run object API', async () => { + expect(typeof cascadeflow.init).toBe('function'); + expect(typeof cascadeflow.run).toBe('function'); + + init({ mode: 'observe' }); + const value = await cascadeflow.run(async (scope) => { + expect(scope.mode).toBe('observe'); + expect(getCurrentRun()).toBe(scope); + return 42; + }); + + expect(value).toBe(42); + expect(getCurrentRun()).toBeNull(); + }); + + it('honors code > env precedence and preserves nested scope isolation', async () => { + const previousMode = process.env.CASCADEFLOW_HARNESS_MODE; + process.env.CASCADEFLOW_HARNESS_MODE = 'observe'; + + init(); + expect(getHarnessConfig().mode).toBe('observe'); + + init({ mode: 'enforce' }); + expect(getHarnessConfig().mode).toBe('enforce'); + + await run({ budget: 1.0 }, async (outer) => { + outer.cost = 0.1; + expect(outer.budgetMax).toBe(1.0); + expect(getCurrentRun()).toBe(outer); + + await run({ budget: 0.25 }, async (inner) => { + expect(getCurrentRun()).toBe(inner); + expect(inner.budgetMax).toBe(0.25); + inner.cost = 0.2; + }); + + expect(getCurrentRun()).toBe(outer); + expect(outer.budgetMax).toBe(1.0); + expect(outer.cost).toBe(0.1); + }); + + if (previousMode == null) { + delete process.env.CASCADEFLOW_HARNESS_MODE; + } else { + process.env.CASCADEFLOW_HARNESS_MODE = previousMode; + } + }); + + it('auto-instruments 
OpenAI and enforces switch_model decisions', async () => { + const openaiCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => ({ + Completions: FakeOpenAICompletions, + }), + anthropic: () => null, + }); + + init({ mode: 'enforce' }); + expect(isOpenAIPatched()).toBe(true); + + await run({ kpiWeights: { cost: 1 } }, async (scope) => { + const client = new FakeOpenAICompletions(openaiCalls); + await client.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + + expect(scope.stepCount).toBe(1); + expect(scope.cost).toBeGreaterThan(0); + expect(scope.toolCalls).toBe(1); + + const trace = scope.trace(); + expect(trace).toHaveLength(1); + expect(trace[0]?.action).toBe('switch_model'); + expect(trace[0]?.applied).toBe(true); + expect(trace[0]?.decisionMode).toBe('enforce'); + }); + + expect(openaiCalls).toHaveLength(1); + expect(openaiCalls[0]?.model).not.toBe('gpt-4o'); + }); + + it('observe mode logs non-allow decisions without mutating request', async () => { + const openaiCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => ({ + Completions: FakeOpenAICompletions, + }), + anthropic: () => null, + }); + + init({ mode: 'observe' }); + + await run({ kpiWeights: { cost: 1 } }, async (scope) => { + const client = new FakeOpenAICompletions(openaiCalls); + await client.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + + const trace = scope.trace(); + expect(trace).toHaveLength(1); + expect(trace[0]?.action).toBe('switch_model'); + expect(trace[0]?.applied).toBe(false); + expect(trace[0]?.decisionMode).toBe('observe'); + }); + + expect(openaiCalls).toHaveLength(1); + expect(openaiCalls[0]?.model).toBe('gpt-4o'); + }); + + it('enforce mode stops calls when budget is exhausted', async () => { + const openaiCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => ({ + Completions: FakeOpenAICompletions, + }), + anthropic: () => null, + }); + + init({ 
mode: 'enforce' }); + + await expect( + run({ budget: 0 }, async () => { + const client = new FakeOpenAICompletions(openaiCalls); + await client.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + }), + ).rejects.toBeInstanceOf(BudgetExceededError); + + expect(openaiCalls).toHaveLength(0); + }); + + it('auto-instruments Anthropic and tracks usage/tool calls', async () => { + const anthropicCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => null, + anthropic: () => ({ + Messages: FakeAnthropicMessages, + }), + }); + + init({ mode: 'enforce' }); + expect(isAnthropicPatched()).toBe(true); + + await run(async (scope) => { + const client = new FakeAnthropicMessages(anthropicCalls); + await client.create({ + model: 'claude-sonnet-4-5-20250929', + messages: [{ role: 'user', content: 'hello' }], + }); + + expect(scope.stepCount).toBe(1); + expect(scope.toolCalls).toBe(1); + expect(scope.cost).toBeGreaterThan(0); + expect(scope.trace()[0]?.action).toBe('allow'); + }); + + expect(anthropicCalls).toHaveLength(1); + }); +}); diff --git a/packages/core/src/harness-instrument.ts b/packages/core/src/harness-instrument.ts new file mode 100644 index 00000000..901af4ae --- /dev/null +++ b/packages/core/src/harness-instrument.ts @@ -0,0 +1,746 @@ +type Action = 'allow' | 'switch_model' | 'deny_tool' | 'stop'; + +type CreateFunction = (this: any, ...args: any[]) => any; + +type OpenAIModuleLike = { + Completions?: { + prototype?: { + create?: CreateFunction; + }; + }; +}; + +type AnthropicModuleLike = { + Messages?: { + prototype?: { + create?: CreateFunction; + }; + }; +}; + +type Pricing = { input: number; output: number }; + +type PreCallDecision = { + action: Action; + reason: string; + targetModel: string; +}; + +type HarnessRuntime = { + getCurrentRun: () => HarnessRunContextLike | null; + getHarnessMode: () => HarnessModeLike; + createBudgetExceededError: (message: string, remaining?: number) => Error; + 
createHarnessStopError: (message: string, reason?: string) => Error; +}; + +type HarnessModeLike = 'off' | 'observe' | 'enforce'; + +type HarnessRunContextLike = { + mode: HarnessModeLike; + cost: number; + stepCount: number; + toolCalls: number; + latencyUsedMs: number; + energyUsed: number; + budgetMax?: number; + budgetRemaining?: number; + toolCallsMax?: number; + latencyMaxMs?: number; + energyMax?: number; + compliance?: string; + kpiWeights?: Record; + record: ( + action: string, + reason: string, + model?: string, + options?: { + applied?: boolean; + decisionMode?: HarnessModeLike; + }, + ) => void; +}; + +const MODEL_PRICING_PER_MILLION: Record = { + // OpenAI + 'gpt-5': { input: 1.25, output: 10.0 }, + 'gpt-5-mini': { input: 0.25, output: 2.0 }, + 'gpt-5-nano': { input: 0.05, output: 0.4 }, + 'gpt-4o': { input: 2.5, output: 10.0 }, + 'gpt-4o-mini': { input: 0.15, output: 0.6 }, + 'o1': { input: 15.0, output: 60.0 }, + 'o1-mini': { input: 3.0, output: 12.0 }, + 'o3-mini': { input: 1.0, output: 5.0 }, + + // Anthropic + 'claude-opus-4-5-20251101': { input: 15.0, output: 75.0 }, + 'claude-opus-4-20250514': { input: 15.0, output: 75.0 }, + 'claude-sonnet-4-5-20250929': { input: 3.0, output: 15.0 }, + 'claude-sonnet-4-20250514': { input: 3.0, output: 15.0 }, + 'claude-haiku-4-5-20251001': { input: 1.0, output: 5.0 }, + 'claude-3-5-haiku-20241022': { input: 1.0, output: 5.0 }, +}; + +const ENERGY_COEFFICIENTS: Record = { + 'gpt-5': 1.15, + 'gpt-5-mini': 0.72, + 'gpt-5-nano': 0.45, + 'gpt-4o': 1.0, + 'gpt-4o-mini': 0.55, + 'o1': 1.25, + 'o1-mini': 0.85, + 'o3-mini': 0.75, + 'claude-opus-4-5-20251101': 1.2, + 'claude-opus-4-20250514': 1.15, + 'claude-sonnet-4-5-20250929': 0.95, + 'claude-sonnet-4-20250514': 0.92, + 'claude-haiku-4-5-20251001': 0.7, + 'claude-3-5-haiku-20241022': 0.68, +}; + +const LATENCY_PRIORS: Record = { + 'gpt-5': 0.45, + 'gpt-5-mini': 0.72, + 'gpt-5-nano': 0.9, + 'gpt-4o': 0.58, + 'gpt-4o-mini': 0.82, + 'o1': 0.35, + 'o1-mini': 0.62, + 
'o3-mini': 0.7, + 'claude-opus-4-5-20251101': 0.4, + 'claude-opus-4-20250514': 0.44, + 'claude-sonnet-4-5-20250929': 0.6, + 'claude-sonnet-4-20250514': 0.63, + 'claude-haiku-4-5-20251001': 0.85, + 'claude-3-5-haiku-20241022': 0.86, +}; + +const QUALITY_PRIORS: Record = { + 'gpt-5': 0.95, + 'gpt-5-mini': 0.86, + 'gpt-5-nano': 0.74, + 'gpt-4o': 0.9, + 'gpt-4o-mini': 0.82, + 'o1': 0.93, + 'o1-mini': 0.84, + 'o3-mini': 0.86, + 'claude-opus-4-5-20251101': 0.94, + 'claude-opus-4-20250514': 0.92, + 'claude-sonnet-4-5-20250929': 0.9, + 'claude-sonnet-4-20250514': 0.88, + 'claude-haiku-4-5-20251001': 0.82, + 'claude-3-5-haiku-20241022': 0.8, +}; + +const COMPLIANCE_ALLOWLISTS: Record> = { + strict: new Set(['gpt-4o', 'gpt-4o-mini', 'claude-sonnet-4-5-20250929', 'claude-haiku-4-5-20251001']), + regulated: new Set(['gpt-4o', 'claude-sonnet-4-5-20250929']), +}; + +const DEFAULT_ENERGY_COEFFICIENT = 0.9; +const DEFAULT_OUTPUT_WEIGHT = 1.5; + +const PRICING_MODELS = Object.keys(MODEL_PRICING_PER_MILLION); + +let openAIPatched = false; +let anthropicPatched = false; + +let originalOpenAICreate: CreateFunction | null = null; +let originalAnthropicCreate: CreateFunction | null = null; +let patchedOpenAIClass: { prototype?: { create?: CreateFunction } } | null = null; +let patchedAnthropicClass: { prototype?: { create?: CreateFunction } } | null = null; + +const defaultOpenAILoader = (): OpenAIModuleLike | null => { + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + return require('openai/resources/chat/completions') as OpenAIModuleLike; + } catch { + return null; + } +}; + +const defaultAnthropicLoader = (): AnthropicModuleLike | null => { + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + return require('@anthropic-ai/sdk/resources/messages') as AnthropicModuleLike; + } catch { + return null; + } +}; + +let loadOpenAIModule = defaultOpenAILoader; +let loadAnthropicModule = defaultAnthropicLoader; +let harnessRuntimeBindings: 
HarnessRuntime | null = null; + +function getHarnessRuntime(): HarnessRuntime { + if (!harnessRuntimeBindings) { + throw new Error('Harness runtime bindings not configured'); + } + return harnessRuntimeBindings; +} + +export function setHarnessRuntimeBindingsForInstrumentation(bindings: HarnessRuntime): void { + harnessRuntimeBindings = bindings; +} + +function nowMonotonicMs(): number { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof globalThis !== 'undefined' && (globalThis as any).performance?.now) { + return (globalThis as any).performance.now() as number; + } + + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process !== 'undefined' && process.hrtime?.bigint) { + return Number(process.hrtime.bigint()) / 1_000_000; + } + + return Date.now(); +} + +function normalizeModelName(model: string): string { + return model.trim().toLowerCase(); +} + +function estimateCost(model: string, promptTokens: number, completionTokens: number): number { + const price = MODEL_PRICING_PER_MILLION[normalizeModelName(model)]; + if (!price) { + return 0; + } + + return (promptTokens / 1_000_000) * price.input + (completionTokens / 1_000_000) * price.output; +} + +function estimateEnergy(model: string, promptTokens: number, completionTokens: number): number { + const coefficient = ENERGY_COEFFICIENTS[normalizeModelName(model)] ?? 
DEFAULT_ENERGY_COEFFICIENT; + return coefficient * (promptTokens + completionTokens * DEFAULT_OUTPUT_WEIGHT) / 1000; +} + +function modelTotalCost(model: string): number { + const price = MODEL_PRICING_PER_MILLION[normalizeModelName(model)]; + if (!price) { + return Number.POSITIVE_INFINITY; + } + return price.input + price.output; +} + +function selectCheaperModel(currentModel: string): string { + const currentCost = modelTotalCost(currentModel); + let bestModel = currentModel; + let bestCost = currentCost; + + for (const candidate of PRICING_MODELS) { + const candidateCost = modelTotalCost(candidate); + if (candidateCost < bestCost) { + bestModel = candidate; + bestCost = candidateCost; + } + } + + return bestModel; +} + +function selectLowerEnergyModel(currentModel: string): string { + const currentCoeff = ENERGY_COEFFICIENTS[normalizeModelName(currentModel)] ?? DEFAULT_ENERGY_COEFFICIENT; + let bestModel = currentModel; + let bestCoeff = currentCoeff; + + for (const candidate of PRICING_MODELS) { + const coeff = ENERGY_COEFFICIENTS[candidate] ?? DEFAULT_ENERGY_COEFFICIENT; + if (coeff < bestCoeff) { + bestModel = candidate; + bestCoeff = coeff; + } + } + + return bestModel; +} + +function selectFasterModel(currentModel: string): string { + const currentLatency = LATENCY_PRIORS[normalizeModelName(currentModel)] ?? 0.7; + let bestModel = currentModel; + let bestLatency = currentLatency; + + for (const candidate of PRICING_MODELS) { + const score = LATENCY_PRIORS[candidate] ?? 
0.7; + if (score > bestLatency) { + bestModel = candidate; + bestLatency = score; + } + } + + return bestModel; +} + +function normalizeWeights(weights: Record): Record { + const normalized: Record = {}; + let total = 0; + + for (const [key, value] of Object.entries(weights)) { + if (!Number.isFinite(value) || value <= 0) { + continue; + } + normalized[key] = value; + total += value; + } + + if (total <= 0) { + return {}; + } + + for (const key of Object.keys(normalized)) { + normalized[key] /= total; + } + + return normalized; +} + +function costUtility(model: string): number { + const costs = PRICING_MODELS.map(modelTotalCost).filter(Number.isFinite); + const min = Math.min(...costs); + const max = Math.max(...costs); + const current = modelTotalCost(model); + + if (!Number.isFinite(current) || max === min) { + return 0.5; + } + + return (max - current) / (max - min); +} + +function energyUtility(model: string): number { + const coeffs = PRICING_MODELS.map((name) => ENERGY_COEFFICIENTS[name] ?? DEFAULT_ENERGY_COEFFICIENT); + const min = Math.min(...coeffs); + const max = Math.max(...coeffs); + const current = ENERGY_COEFFICIENTS[normalizeModelName(model)] ?? DEFAULT_ENERGY_COEFFICIENT; + + if (max === min) { + return 0.5; + } + + return (max - current) / (max - min); +} + +function kpiScore(model: string, weights: Record): number { + const normalized = normalizeWeights(weights); + if (Object.keys(normalized).length === 0) { + return 0; + } + + const key = normalizeModelName(model); + const quality = QUALITY_PRIORS[key] ?? 0.7; + const latency = LATENCY_PRIORS[key] ?? 0.7; + const cost = costUtility(key); + const energy = energyUtility(key); + + return ( + (normalized.quality ?? 0) * quality + + (normalized.latency ?? 0) * latency + + (normalized.cost ?? 0) * cost + + (normalized.energy ?? 
0) * energy + ); +} + +function selectKPIWeightedModel(currentModel: string, weights: Record): string { + const normalized = normalizeWeights(weights); + if (Object.keys(normalized).length === 0) { + return currentModel; + } + + let bestModel = currentModel; + let bestScore = kpiScore(currentModel, normalized); + + for (const candidate of PRICING_MODELS) { + const score = kpiScore(candidate, normalized); + if (score > bestScore) { + bestModel = candidate; + bestScore = score; + } + } + + return bestModel; +} + +function extractOpenAIUsage(response: any): [number, number] { + const usage = response?.usage; + if (!usage || typeof usage !== 'object') { + return [0, 0]; + } + const promptTokens = Number(usage.prompt_tokens ?? usage.input_tokens ?? 0); + const completionTokens = Number(usage.completion_tokens ?? usage.output_tokens ?? 0); + return [ + Number.isFinite(promptTokens) ? promptTokens : 0, + Number.isFinite(completionTokens) ? completionTokens : 0, + ]; +} + +function extractAnthropicUsage(response: any): [number, number] { + const usage = response?.usage; + if (!usage || typeof usage !== 'object') { + return [0, 0]; + } + + const inputTokens = Number(usage.input_tokens ?? usage.prompt_tokens ?? 0); + const outputTokens = Number(usage.output_tokens ?? usage.completion_tokens ?? 0); + return [ + Number.isFinite(inputTokens) ? inputTokens : 0, + Number.isFinite(outputTokens) ? 
outputTokens : 0, + ]; +} + +function countOpenAIToolCalls(response: any): number { + const toolCalls = response?.choices?.[0]?.message?.tool_calls; + if (!Array.isArray(toolCalls)) { + return 0; + } + return toolCalls.length; +} + +function countAnthropicToolCalls(response: any): number { + const content = response?.content; + if (!Array.isArray(content)) { + return 0; + } + return content.filter((item: any) => item?.type === 'tool_use').length; +} + +function evaluatePreCallDecision(ctx: HarnessRunContextLike, model: string, hasTools: boolean): PreCallDecision { + if (ctx.budgetMax != null && ctx.cost >= ctx.budgetMax) { + return { action: 'stop', reason: 'budget_exceeded', targetModel: model }; + } + + if (hasTools && ctx.toolCallsMax != null && ctx.toolCalls >= ctx.toolCallsMax) { + return { action: 'deny_tool', reason: 'max_tool_calls_reached', targetModel: model }; + } + + if (ctx.compliance) { + const profile = COMPLIANCE_ALLOWLISTS[ctx.compliance.trim().toLowerCase()]; + if (profile) { + const normalized = normalizeModelName(model); + if (!profile.has(normalized)) { + const next = PRICING_MODELS.find((candidate) => profile.has(candidate)); + if (next) { + return { action: 'switch_model', reason: 'compliance_model_policy', targetModel: next }; + } + return { + action: hasTools ? 'deny_tool' : 'stop', + reason: hasTools ? 
'compliance_no_approved_tool_path' : 'compliance_no_approved_model', + targetModel: model, + }; + } + if (ctx.compliance.trim().toLowerCase() === 'strict' && hasTools) { + return { action: 'deny_tool', reason: 'compliance_tool_restriction', targetModel: model }; + } + } + } + + if (ctx.latencyMaxMs != null && ctx.latencyUsedMs >= ctx.latencyMaxMs) { + const faster = selectFasterModel(model); + if (normalizeModelName(faster) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'latency_limit_exceeded', targetModel: faster }; + } + return { action: 'stop', reason: 'latency_limit_exceeded', targetModel: model }; + } + + if (ctx.energyMax != null && ctx.energyUsed >= ctx.energyMax) { + const lower = selectLowerEnergyModel(model); + if (normalizeModelName(lower) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'energy_limit_exceeded', targetModel: lower }; + } + return { action: 'stop', reason: 'energy_limit_exceeded', targetModel: model }; + } + + if ( + ctx.budgetMax != null + && ctx.budgetMax > 0 + && ctx.budgetRemaining != null + && (ctx.budgetRemaining / ctx.budgetMax) < 0.2 + ) { + const cheaper = selectCheaperModel(model); + if (normalizeModelName(cheaper) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'budget_pressure', targetModel: cheaper }; + } + } + + if (ctx.kpiWeights && Object.keys(ctx.kpiWeights).length > 0) { + const candidate = selectKPIWeightedModel(model, ctx.kpiWeights); + if (normalizeModelName(candidate) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'kpi_weight_optimization', targetModel: candidate }; + } + } + + return { action: 'allow', reason: ctx.mode, targetModel: model }; +} + +function raiseStopError(ctx: HarnessRunContextLike, reason: string): never { + const runtime = getHarnessRuntime(); + if (reason === 'budget_exceeded') { + const remaining = Math.max(0, (ctx.budgetMax ?? 
0) - ctx.cost); + throw runtime.createBudgetExceededError( + `Budget exhausted: spent $${ctx.cost.toFixed(4)} of $${(ctx.budgetMax ?? 0).toFixed(4)} max`, + remaining, + ); + } + + throw runtime.createHarnessStopError(`cascadeflow harness stop: ${reason}`, reason); +} + +function updateContext( + ctx: HarnessRunContextLike, + mode: HarnessModeLike, + model: string, + promptTokens: number, + completionTokens: number, + toolCalls: number, + elapsedMs: number, + decision: PreCallDecision, + applied: boolean, +): void { + const cost = estimateCost(model, promptTokens, completionTokens); + const energy = estimateEnergy(model, promptTokens, completionTokens); + + ctx.cost += cost; + ctx.stepCount += 1; + ctx.toolCalls += toolCalls; + ctx.latencyUsedMs += elapsedMs; + ctx.energyUsed += energy; + + if (ctx.budgetMax != null) { + ctx.budgetRemaining = ctx.budgetMax - ctx.cost; + } + + ctx.record(decision.action, decision.reason, decision.targetModel, { + applied, + decisionMode: mode, + }); +} + +function isThenable(value: any): value is Promise { + return Boolean(value) && typeof value.then === 'function'; +} + +function makePatchedCreate(provider: 'openai' | 'anthropic', original: CreateFunction): CreateFunction { + return function patchedCreate(this: any, ...args: any[]): any { + const runtime = getHarnessRuntime(); + const activeRun = runtime.getCurrentRun(); + const mode = activeRun?.mode ?? runtime.getHarnessMode(); + + if (mode === 'off') { + return original.apply(this, args); + } + + const firstArg = args[0]; + const request = firstArg && typeof firstArg === 'object' ? { ...firstArg } : {}; + const model = typeof request.model === 'string' ? request.model : 'unknown'; + const hasTools = Array.isArray(request.tools) && request.tools.length > 0; + + const decision = activeRun ? 
evaluatePreCallDecision(activeRun, model, hasTools) : { + action: 'allow' as const, + reason: mode, + targetModel: model, + }; + + let applied = decision.action === 'allow'; + let effectiveModel = model; + + if (activeRun && mode === 'enforce') { + if (decision.action === 'stop') { + activeRun.record('stop', decision.reason, model, { + applied: true, + decisionMode: mode, + }); + raiseStopError(activeRun, decision.reason); + } + + if (decision.action === 'switch_model') { + if (normalizeModelName(decision.targetModel) !== normalizeModelName(model)) { + request.model = decision.targetModel; + effectiveModel = decision.targetModel; + applied = true; + } else { + applied = false; + } + } + + if (decision.action === 'deny_tool') { + if (Array.isArray(request.tools) && request.tools.length > 0) { + request.tools = []; + applied = true; + } else { + applied = false; + } + } + } else if (decision.action !== 'allow') { + applied = false; + } + + const interceptedArgs = firstArg && typeof firstArg === 'object' + ? 
[request, ...args.slice(1)] + : args; + + const isStream = Boolean(request.stream); + const startedAt = nowMonotonicMs(); + const result = original.apply(this, interceptedArgs); + + if (!activeRun) { + return result; + } + + const finalize = (response: any): any => { + const elapsedMs = Math.max(0, nowMonotonicMs() - startedAt); + + let promptTokens = 0; + let completionTokens = 0; + let toolCallCount = 0; + + if (!isStream) { + if (provider === 'openai') { + [promptTokens, completionTokens] = extractOpenAIUsage(response); + toolCallCount = countOpenAIToolCalls(response); + } else { + [promptTokens, completionTokens] = extractAnthropicUsage(response); + toolCallCount = countAnthropicToolCalls(response); + } + } + + updateContext( + activeRun, + mode, + effectiveModel, + promptTokens, + completionTokens, + toolCallCount, + elapsedMs, + decision, + applied, + ); + + return response; + }; + + if (isThenable(result)) { + result + .then((response) => { + finalize(response); + }) + .catch(() => { + // fail-open: harness instrumentation errors must not crash user flow. 
+ }); + return result; + } + + return finalize(result); + }; +} + +export function detectOpenAIInstrumentationTarget(): boolean { + const module = loadOpenAIModule(); + return Boolean(module?.Completions?.prototype?.create); +} + +export function detectAnthropicInstrumentationTarget(): boolean { + const module = loadAnthropicModule(); + return Boolean(module?.Messages?.prototype?.create); +} + +export function patchOpenAI(): boolean { + if (openAIPatched) { + return true; + } + + const module = loadOpenAIModule(); + const cls = module?.Completions; + const prototype = cls?.prototype; + const create = prototype?.create; + + if (!cls || !prototype || typeof create !== 'function') { + return false; + } + + originalOpenAICreate = create; + patchedOpenAIClass = cls; + prototype.create = makePatchedCreate('openai', create); + openAIPatched = true; + return true; +} + +export function patchAnthropic(): boolean { + if (anthropicPatched) { + return true; + } + + const module = loadAnthropicModule(); + const cls = module?.Messages; + const prototype = cls?.prototype; + const create = prototype?.create; + + if (!cls || !prototype || typeof create !== 'function') { + return false; + } + + originalAnthropicCreate = create; + patchedAnthropicClass = cls; + prototype.create = makePatchedCreate('anthropic', create); + anthropicPatched = true; + return true; +} + +export function unpatchOpenAI(): void { + if (!openAIPatched) { + return; + } + + if (patchedOpenAIClass?.prototype && originalOpenAICreate) { + patchedOpenAIClass.prototype.create = originalOpenAICreate; + } + + openAIPatched = false; + originalOpenAICreate = null; + patchedOpenAIClass = null; +} + +export function unpatchAnthropic(): void { + if (!anthropicPatched) { + return; + } + + if (patchedAnthropicClass?.prototype && originalAnthropicCreate) { + patchedAnthropicClass.prototype.create = originalAnthropicCreate; + } + + anthropicPatched = false; + originalAnthropicCreate = null; + patchedAnthropicClass = null; +} + 
+export function isOpenAIPatched(): boolean { + return openAIPatched; +} + +export function isAnthropicPatched(): boolean { + return anthropicPatched; +} + +export function isPatched(): boolean { + return openAIPatched || anthropicPatched; +} + +export function __setInstrumentationLoadersForTest(loaders: { + openai?: () => OpenAIModuleLike | null; + anthropic?: () => AnthropicModuleLike | null; +}): void { + if (loaders.openai) { + loadOpenAIModule = loaders.openai; + } + if (loaders.anthropic) { + loadAnthropicModule = loaders.anthropic; + } +} + +export function __resetInstrumentationLoadersForTest(): void { + loadOpenAIModule = defaultOpenAILoader; + loadAnthropicModule = defaultAnthropicLoader; +} + +export function __resetInstrumentationStateForTest(): void { + unpatchOpenAI(); + unpatchAnthropic(); +} diff --git a/packages/core/src/harness.ts b/packages/core/src/harness.ts new file mode 100644 index 00000000..3815360e --- /dev/null +++ b/packages/core/src/harness.ts @@ -0,0 +1,754 @@ +import { + __resetInstrumentationStateForTest, + detectAnthropicInstrumentationTarget, + detectOpenAIInstrumentationTarget, + patchAnthropic, + patchOpenAI, + setHarnessRuntimeBindingsForInstrumentation, + unpatchAnthropic, + unpatchOpenAI, +} from './harness-instrument'; + +export type HarnessMode = 'off' | 'observe' | 'enforce'; + +export type HarnessConfig = { + mode: HarnessMode; + verbose: boolean; + budget?: number; + maxToolCalls?: number; + maxLatencyMs?: number; + maxEnergy?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; +}; + +export type HarnessInitOptions = Partial; + +export type HarnessRunOptions = { + budget?: number; + maxToolCalls?: number; + maxLatencyMs?: number; + maxEnergy?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; +}; + +export type HarnessInitReport = { + mode: HarnessMode; + instrumented: string[]; + detectedButNotInstrumented: string[]; + configSources: Record; +}; + +export type 
HarnessRecordOptions = { + applied?: boolean; + decisionMode?: HarnessMode; +}; + +export type HarnessTraceEntry = { + action: string; + reason: string; + model?: string; + runId: string; + mode: HarnessMode; + step: number; + timestampMs: number; + toolCallsTotal: number; + costTotal: number; + latencyUsedMs: number; + energyUsed: number; + budgetState: { + max?: number; + remaining?: number; + }; + applied?: boolean; + decisionMode?: HarnessMode; +}; + +export type HarnessRunSummary = { + runId: string; + mode: HarnessMode; + stepCount: number; + toolCalls: number; + cost: number; + savings: number; + latencyUsedMs: number; + energyUsed: number; + budgetMax?: number; + budgetRemaining?: number; + lastAction: string; + modelUsed?: string; + durationMs?: number; +}; + +export class HarnessStopError extends Error { + reason: string; + + constructor(message: string, reason = 'stop') { + super(message); + this.name = 'HarnessStopError'; + this.reason = reason; + } +} + +export class BudgetExceededError extends HarnessStopError { + remaining: number; + + constructor(message: string, remaining = 0) { + super(message, 'budget_exceeded'); + this.name = 'BudgetExceededError'; + this.remaining = remaining; + } +} + +function randomRunId(): string { + return Math.random().toString(36).slice(2, 14); +} + +function nowMonotonicMs(): number { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof globalThis !== 'undefined' && (globalThis as any).performance?.now) { + return (globalThis as any).performance.now() as number; + } + + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process !== 'undefined' && process.hrtime?.bigint) { + return Number(process.hrtime.bigint()) / 1_000_000; + } + + return Date.now(); +} + +const MAX_ACTION_LEN = 64; +const MAX_REASON_LEN = 160; +const MAX_MODEL_LEN = 128; + +function sanitizeTraceValue(value: unknown, maxLength: number): string | undefined { + if (value == null) { + 
return undefined; + } + + const text = String(value).replace(/\r?\n/g, ' ').trim(); + if (!text) { + return undefined; + } + + if (text.length <= maxLength) { + return text; + } + + return `${text.slice(0, Math.max(0, maxLength - 3))}...`; +} + +export class HarnessRunContext { + runId: string; + startedAtMs: number; + endedAtMs?: number; + durationMs?: number; + + mode: HarnessMode; + budgetMax?: number; + toolCallsMax?: number; + latencyMaxMs?: number; + energyMax?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; + + cost = 0; + savings = 0; + toolCalls = 0; + stepCount = 0; + latencyUsedMs = 0; + energyUsed = 0; + verbose = false; + budgetRemaining?: number; + modelUsed?: string; + lastAction = 'allow'; + draftAccepted?: boolean; + + private readonly _startedMonotonic: number; + private readonly _trace: HarnessTraceEntry[] = []; + private _finalized = false; + + constructor(config: { + mode: HarnessMode; + budgetMax?: number; + toolCallsMax?: number; + latencyMaxMs?: number; + energyMax?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; + verbose?: boolean; + }) { + this.runId = randomRunId(); + this.startedAtMs = Date.now(); + this._startedMonotonic = nowMonotonicMs(); + + this.mode = config.mode; + this.budgetMax = config.budgetMax; + this.toolCallsMax = config.toolCallsMax; + this.latencyMaxMs = config.latencyMaxMs; + this.energyMax = config.energyMax; + this.kpiTargets = config.kpiTargets; + this.kpiWeights = config.kpiWeights; + this.compliance = config.compliance; + this.verbose = Boolean(config.verbose); + + if (config.budgetMax != null) { + this.budgetRemaining = config.budgetMax; + } + } + + finish(): void { + if (this._finalized) { + return; + } + + this._finalized = true; + this.endedAtMs = Date.now(); + this.durationMs = Math.max(0, nowMonotonicMs() - this._startedMonotonic); + + if (this.verbose && this.mode !== 'off' && this.stepCount > 0) { + // Keep logging cheap and controlled. 
+ // eslint-disable-next-line no-console + console.info( + '[cascadeflow.harness] run summary', + { + runId: this.runId, + mode: this.mode, + steps: this.stepCount, + toolCalls: this.toolCalls, + cost: this.cost, + latencyMs: this.latencyUsedMs, + energy: this.energyUsed, + lastAction: this.lastAction, + model: this.modelUsed, + budgetRemaining: this.budgetRemaining, + durationMs: this.durationMs, + }, + ); + } + } + + record(action: string, reason: string, model?: string, options: HarnessRecordOptions = {}): void { + let safeAction = sanitizeTraceValue(action, MAX_ACTION_LEN); + if (!safeAction) { + safeAction = 'allow'; + } + + const safeReason = sanitizeTraceValue(reason, MAX_REASON_LEN) ?? 'unspecified'; + const safeModel = sanitizeTraceValue(model, MAX_MODEL_LEN); + + this.lastAction = safeAction; + this.modelUsed = safeModel; + + const entry: HarnessTraceEntry = { + action: safeAction, + reason: safeReason, + model: safeModel, + runId: this.runId, + mode: this.mode, + step: this.stepCount, + timestampMs: Date.now(), + toolCallsTotal: this.toolCalls, + costTotal: this.cost, + latencyUsedMs: this.latencyUsedMs, + energyUsed: this.energyUsed, + budgetState: { + max: this.budgetMax, + remaining: this.budgetRemaining, + }, + }; + + if (options.applied != null) { + entry.applied = options.applied; + } + + if (options.decisionMode != null) { + entry.decisionMode = options.decisionMode; + } + + this._trace.push(entry); + } + + trace(): HarnessTraceEntry[] { + return [...this._trace]; + } + + summary(): HarnessRunSummary { + return { + runId: this.runId, + mode: this.mode, + stepCount: this.stepCount, + toolCalls: this.toolCalls, + cost: this.cost, + savings: this.savings, + latencyUsedMs: this.latencyUsedMs, + energyUsed: this.energyUsed, + budgetMax: this.budgetMax, + budgetRemaining: this.budgetRemaining, + lastAction: this.lastAction, + modelUsed: this.modelUsed, + durationMs: this.durationMs, + }; + } +} + +type ConfigSource = 'code' | 'env' | 'file' | 'default'; 
+ +type ConfigWithSources = { + config: HarnessConfig; + sources: Record; +}; + +let _harnessConfig: HarnessConfig = { + mode: 'off', + verbose: false, +}; + +let _isInstrumented = false; +let fallbackCurrentRun: HarnessRunContext | null = null; + +let asyncLocalStorageInstance: { run: (store: HarnessRunContext, callback: () => Promise) => Promise; getStore: () => HarnessRunContext | undefined } | null = null; + +function getAsyncLocalStorage(): typeof asyncLocalStorageInstance { + if (asyncLocalStorageInstance) { + return asyncLocalStorageInstance; + } + + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const mod = require('node:async_hooks') as { + AsyncLocalStorage: new () => { run: (store: T, callback: () => Promise) => Promise; getStore: () => T | undefined }; + }; + + asyncLocalStorageInstance = new mod.AsyncLocalStorage(); + } catch { + asyncLocalStorageInstance = null; + } + + return asyncLocalStorageInstance; +} + +function parseBoolean(raw: string): boolean { + const normalized = raw.trim().toLowerCase(); + return normalized === '1' || normalized === 'true' || normalized === 'yes' || normalized === 'on'; +} + +function parseNumber(raw: string): number { + const value = Number(raw); + if (!Number.isFinite(value)) { + throw new Error(`Invalid numeric value: ${raw}`); + } + return value; +} + +function parseJSONMap(raw: string): Record { + const parsed = JSON.parse(raw); + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + throw new Error('Expected object'); + } + + const result: Record = {}; + for (const [key, value] of Object.entries(parsed as Record)) { + result[String(key)] = Number(value); + } + return result; +} + +function normalizeMode(mode: unknown): HarnessMode { + if (mode === 'off' || mode === 'observe' || mode === 'enforce') { + return mode; + } + + throw new Error('mode must be one of: off, observe, enforce'); +} + +function normalizeConfigRecord(raw: Record): HarnessInitOptions { + const out: 
HarnessInitOptions = {}; + + const mode = raw.mode ?? raw.harness_mode; + if (typeof mode === 'string') { + out.mode = normalizeMode(mode); + } + + const verbose = raw.verbose ?? raw.harness_verbose; + if (typeof verbose === 'boolean') { + out.verbose = verbose; + } + + const budget = raw.budget ?? raw.max_budget; + if (typeof budget === 'number') { + out.budget = budget; + } + + const maxToolCalls = raw.maxToolCalls ?? raw.max_tool_calls; + if (typeof maxToolCalls === 'number') { + out.maxToolCalls = maxToolCalls; + } + + const maxLatencyMs = raw.maxLatencyMs ?? raw.max_latency_ms; + if (typeof maxLatencyMs === 'number') { + out.maxLatencyMs = maxLatencyMs; + } + + const maxEnergy = raw.maxEnergy ?? raw.max_energy; + if (typeof maxEnergy === 'number') { + out.maxEnergy = maxEnergy; + } + + const kpiTargets = raw.kpiTargets ?? raw.kpi_targets; + if (kpiTargets && typeof kpiTargets === 'object' && !Array.isArray(kpiTargets)) { + out.kpiTargets = kpiTargets as Record; + } + + const kpiWeights = raw.kpiWeights ?? raw.kpi_weights; + if (kpiWeights && typeof kpiWeights === 'object' && !Array.isArray(kpiWeights)) { + out.kpiWeights = kpiWeights as Record; + } + + const compliance = raw.compliance; + if (typeof compliance === 'string') { + out.compliance = compliance; + } + + return out; +} + +function readEnvConfig(): HarnessInitOptions { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process === 'undefined' || !process.env) { + return {}; + } + + const env = process.env; + const config: HarnessInitOptions = {}; + + const mode = env.CASCADEFLOW_HARNESS_MODE ?? env.CASCADEFLOW_MODE; + if (mode) { + config.mode = normalizeMode(mode); + } + + if (env.CASCADEFLOW_HARNESS_VERBOSE != null) { + config.verbose = parseBoolean(env.CASCADEFLOW_HARNESS_VERBOSE); + } + + const budget = env.CASCADEFLOW_HARNESS_BUDGET ?? 
env.CASCADEFLOW_BUDGET; + if (budget != null) { + config.budget = parseNumber(budget); + } + + if (env.CASCADEFLOW_HARNESS_MAX_TOOL_CALLS != null) { + config.maxToolCalls = parseNumber(env.CASCADEFLOW_HARNESS_MAX_TOOL_CALLS); + } + + if (env.CASCADEFLOW_HARNESS_MAX_LATENCY_MS != null) { + config.maxLatencyMs = parseNumber(env.CASCADEFLOW_HARNESS_MAX_LATENCY_MS); + } + + if (env.CASCADEFLOW_HARNESS_MAX_ENERGY != null) { + config.maxEnergy = parseNumber(env.CASCADEFLOW_HARNESS_MAX_ENERGY); + } + + if (env.CASCADEFLOW_HARNESS_KPI_TARGETS != null) { + config.kpiTargets = parseJSONMap(env.CASCADEFLOW_HARNESS_KPI_TARGETS); + } + + if (env.CASCADEFLOW_HARNESS_KPI_WEIGHTS != null) { + config.kpiWeights = parseJSONMap(env.CASCADEFLOW_HARNESS_KPI_WEIGHTS); + } + + if (env.CASCADEFLOW_HARNESS_COMPLIANCE != null) { + config.compliance = env.CASCADEFLOW_HARNESS_COMPLIANCE; + } + + return config; +} + +function readFileConfig(): HarnessInitOptions { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process === 'undefined' || !process.cwd) { + return {}; + } + + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const fs = require('node:fs') as typeof import('node:fs'); + // eslint-disable-next-line @typescript-eslint/no-var-requires + const path = require('node:path') as typeof import('node:path'); + + const configuredPath = process.env.CASCADEFLOW_CONFIG; + const candidates = configuredPath + ? [configuredPath] + : ['cascadeflow.json', 'cascadeflow.config.json']; + + for (const candidate of candidates) { + const full = path.isAbsolute(candidate) ? candidate : path.join(process.cwd(), candidate); + if (!fs.existsSync(full)) { + continue; + } + + const content = fs.readFileSync(full, 'utf8'); + const parsed = JSON.parse(content) as Record; + const harnessBlock = ( + parsed.harness && typeof parsed.harness === 'object' && !Array.isArray(parsed.harness) + ) + ? 
(parsed.harness as Record) + : parsed; + + return normalizeConfigRecord(harnessBlock); + } + } catch { + return {}; + } + + return {}; +} + +function resolveConfig(options: HarnessInitOptions): ConfigWithSources { + const env = readEnvConfig(); + const file = readFileConfig(); + const sources: Record = {}; + + const resolve = ( + key: keyof HarnessConfig, + explicit: T | undefined, + envValue: T | undefined, + fileValue: T | undefined, + defaultValue: T, + ): T => { + if (explicit !== undefined) { + sources[key] = 'code'; + return explicit; + } + if (envValue !== undefined) { + sources[key] = 'env'; + return envValue; + } + if (fileValue !== undefined) { + sources[key] = 'file'; + return fileValue; + } + sources[key] = 'default'; + return defaultValue; + }; + + const mode = resolve('mode', options.mode, env.mode, file.mode, 'off'); + const verbose = resolve('verbose', options.verbose, env.verbose, file.verbose, false); + const budget = resolve('budget', options.budget, env.budget, file.budget, undefined); + const maxToolCalls = resolve( + 'maxToolCalls', + options.maxToolCalls, + env.maxToolCalls, + file.maxToolCalls, + undefined, + ); + const maxLatencyMs = resolve( + 'maxLatencyMs', + options.maxLatencyMs, + env.maxLatencyMs, + file.maxLatencyMs, + undefined, + ); + const maxEnergy = resolve('maxEnergy', options.maxEnergy, env.maxEnergy, file.maxEnergy, undefined); + const kpiTargets = resolve( + 'kpiTargets', + options.kpiTargets, + env.kpiTargets, + file.kpiTargets, + undefined, + ); + const kpiWeights = resolve( + 'kpiWeights', + options.kpiWeights, + env.kpiWeights, + file.kpiWeights, + undefined, + ); + const compliance = resolve( + 'compliance', + options.compliance, + env.compliance, + file.compliance, + undefined, + ); + + return { + config: { + mode, + verbose, + budget, + maxToolCalls, + maxLatencyMs, + maxEnergy, + kpiTargets, + kpiWeights, + compliance, + }, + sources, + }; +} + +export function getHarnessConfig(): HarnessConfig { + return { 
..._harnessConfig }; +} + +export function getCurrentRun(): HarnessRunContext | null { + const als = getAsyncLocalStorage(); + if (als) { + return als.getStore() ?? null; + } + + return fallbackCurrentRun; +} + +export function reset(): void { + unpatchOpenAI(); + unpatchAnthropic(); + __resetInstrumentationStateForTest(); + + _harnessConfig = { mode: 'off', verbose: false }; + _isInstrumented = false; + fallbackCurrentRun = null; +} + +export function init(options: HarnessInitOptions = {}): HarnessInitReport { + const { config, sources } = resolveConfig(options); + config.mode = normalizeMode(config.mode); + + _harnessConfig = config; + + const instrumented: string[] = []; + const detectedButNotInstrumented: string[] = []; + + const openaiDetected = detectOpenAIInstrumentationTarget(); + const anthropicDetected = detectAnthropicInstrumentationTarget(); + + if (config.mode !== 'off' && openaiDetected) { + if (patchOpenAI()) { + instrumented.push('openai'); + } else { + detectedButNotInstrumented.push('openai'); + } + } + + if (config.mode !== 'off' && anthropicDetected) { + if (patchAnthropic()) { + instrumented.push('anthropic'); + } else { + detectedButNotInstrumented.push('anthropic'); + } + } + + if (config.mode === 'off') { + unpatchOpenAI(); + unpatchAnthropic(); + } + + _isInstrumented = true; + + if (config.verbose) { + // eslint-disable-next-line no-console + console.info('[cascadeflow.harness] init', { + mode: config.mode, + instrumented, + detectedButNotInstrumented, + }); + } + + return { + mode: config.mode, + instrumented, + detectedButNotInstrumented, + configSources: sources, + }; +} + +type RunCallback = (run: HarnessRunContext) => Promise | T; + +async function executeScopedRun(runContext: HarnessRunContext, fn: RunCallback): Promise { + try { + return await fn(runContext); + } finally { + runContext.finish(); + } +} + +export async function run(callback: RunCallback): Promise; +export async function run(options: HarnessRunOptions, callback: 
RunCallback): Promise; +export async function run( + optionsOrCallback: HarnessRunOptions | RunCallback, + callback?: RunCallback, +): Promise { + const options = typeof optionsOrCallback === 'function' ? {} : optionsOrCallback; + const cb = (typeof optionsOrCallback === 'function' ? optionsOrCallback : callback) as RunCallback | undefined; + + if (!cb) { + throw new Error('run() requires a callback: run(options?, async (run) => { ... })'); + } + + const cfg = getHarnessConfig(); + const runContext = new HarnessRunContext({ + mode: cfg.mode, + budgetMax: options.budget ?? cfg.budget, + toolCallsMax: options.maxToolCalls ?? cfg.maxToolCalls, + latencyMaxMs: options.maxLatencyMs ?? cfg.maxLatencyMs, + energyMax: options.maxEnergy ?? cfg.maxEnergy, + kpiTargets: options.kpiTargets ?? cfg.kpiTargets, + kpiWeights: options.kpiWeights ?? cfg.kpiWeights, + compliance: options.compliance ?? cfg.compliance, + verbose: cfg.verbose, + }); + + const als = getAsyncLocalStorage(); + if (als) { + return als.run(runContext, async () => executeScopedRun(runContext, cb)) as Promise; + } + + const previous = fallbackCurrentRun; + fallbackCurrentRun = runContext; + try { + return await executeScopedRun(runContext, cb); + } finally { + fallbackCurrentRun = previous; + } +} + +export function agent(policy: HarnessRunOptions): any>(fn: T) => T { + return any>(fn: T): T => { + const wrapped = ((...args: any[]) => fn(...args)) as T; + (wrapped as any).__cascadeflow_agent_policy__ = { + budget: policy.budget, + kpiTargets: policy.kpiTargets, + kpiWeights: policy.kpiWeights, + compliance: policy.compliance, + }; + return wrapped; + }; +} + +setHarnessRuntimeBindingsForInstrumentation({ + getCurrentRun, + getHarnessMode: () => getHarnessConfig().mode, + createBudgetExceededError: (message: string, remaining?: number) => + new BudgetExceededError(message, remaining), + createHarnessStopError: (message: string, reason?: string) => + new HarnessStopError(message, reason), +}); + +export const 
cascadeflow = { + init, + run, + agent, + reset, + getHarnessConfig, + getCurrentRun, +}; + +export function isHarnessInstrumented(): boolean { + return _isInstrumented; +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 29819183..c919f67e 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -42,6 +42,31 @@ export { DEFAULT_CASCADE_CONFIG, } from './config'; +// Harness API (v2.1+) +export type { + HarnessMode, + HarnessConfig, + HarnessInitOptions, + HarnessRunOptions, + HarnessInitReport, + HarnessRecordOptions, + HarnessTraceEntry, + HarnessRunSummary, +} from './harness'; +export { + HarnessRunContext, + HarnessStopError, + BudgetExceededError, + init, + run, + agent as harnessAgent, + reset as resetHarness, + getHarnessConfig, + getCurrentRun, + isHarnessInstrumented, + cascadeflow, +} from './harness'; + // Results export type { CascadeResult } from './result'; export { resultToObject } from './result'; diff --git a/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts b/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts index b3f52a60..925a9a96 100644 --- a/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts +++ b/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts @@ -21,6 +21,7 @@ import { type DomainType, getEnabledDomains, } from '../LmChatCascadeFlow/config'; +import { HarnessRunContext, type HarnessConfig, type HarnessMode, type KpiWeights } from '../harness'; // Tool cascade validator - optional import let ToolCascadeValidator: any; @@ -65,6 +66,7 @@ export class CascadeFlowAgentExecutor { private routingRules: Map; private enableToolCascadeValidation: boolean; private toolCascadeValidator: any; + private harnessCtx: HarnessRunContext | null; constructor( private cascadeModel: CascadeChatModel, @@ -72,7 +74,9 @@ export class CascadeFlowAgentExecutor { routingRules: ToolRoutingRule[], private maxIterations: number, 
enableToolCascadeValidation: boolean = false, + harnessCtx: HarnessRunContext | null = null, ) { + this.harnessCtx = harnessCtx; this.toolMap = new Map( tools.filter((tool) => tool?.name).map((tool) => [tool.name as string, tool]) ); @@ -295,6 +299,18 @@ export class CascadeFlowAgentExecutor { let iterations = 0; while (iterations < this.maxIterations) { + // Harness enforce-mode pre-checks + if (this.harnessCtx?.config.mode === 'enforce') { + if (this.harnessCtx.isBudgetExhausted()) { + finalMessage = new AIMessage(`[Harness] Budget exhausted ($${this.harnessCtx.cost.toFixed(4)} of $${this.harnessCtx.config.budgetMax?.toFixed(4)} max). Agent stopped.`); + break; + } + if (this.harnessCtx.isToolCapReached()) { + finalMessage = new AIMessage(`[Harness] Tool call cap reached (${this.harnessCtx.toolCalls} of ${this.harnessCtx.config.toolCallsMax} max). Agent stopped.`); + break; + } + } + const message = await this.cascadeModel.invoke(currentMessages, options); const toolCalls = this.extractToolCalls(message); trace.push(this.buildTraceEntry(message, toolCalls)); @@ -350,6 +366,12 @@ export class CascadeFlowAgentExecutor { ); } + // Track tool calls in harness (CascadeChatModel records LLM token costs; + // agent executor tracks tool-call counts from the loop itself) + if (this.harnessCtx) { + this.harnessCtx.toolCalls += toolCalls.length; + } + if (routing === 'verifier') { const verifierMessage = await this.cascadeModel.invokeVerifierDirect(currentMessages, options); trace.push(this.buildTraceEntry(verifierMessage)); @@ -377,6 +399,7 @@ export class CascadeFlowAgentExecutor { output: finalMessage.content.toString(), message: finalMessage, trace, + harness: this.harnessCtx?.summary() ?? 
null, }; } @@ -753,6 +776,99 @@ export class CascadeFlowAgent implements INodeType { default: '', }, ...generateDomainProperties(), + // ----------------------------------------------------------------- + // Harness: Multi-Dimensional Cascading + // ----------------------------------------------------------------- + { + displayName: 'Harness', + name: 'harnessHeading', + type: 'notice', + default: '', + }, + { + displayName: 'Harness Mode', + name: 'harnessMode', + type: 'options', + options: [ + { name: 'Off', value: 'off', description: 'Harness disabled, zero overhead' }, + { name: 'Observe', value: 'observe', description: 'Track all dimensions, record trace, no enforcement' }, + { name: 'Enforce', value: 'enforce', description: 'Stop agent loop when limits are hit' }, + ], + default: 'observe', + description: 'Harness mode: off (disabled), observe (telemetry only), or enforce (stop when limits hit)', + }, + { + displayName: 'Budget (USD)', + name: 'harnessBudget', + type: 'number', + default: 0, + typeOptions: { minValue: 0, numberPrecision: 4 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max budget in USD. 0 = unlimited.', + }, + { + displayName: 'Max Tool Calls', + name: 'harnessMaxToolCalls', + type: 'number', + default: 0, + typeOptions: { minValue: 0 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max tool call count. 0 = unlimited.', + }, + { + displayName: 'Max Latency (Ms)', + name: 'harnessMaxLatencyMs', + type: 'number', + default: 0, + typeOptions: { minValue: 0 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max cumulative latency in milliseconds. 0 = unlimited.', + }, + { + displayName: 'Max Energy', + name: 'harnessMaxEnergy', + type: 'number', + default: 0, + typeOptions: { minValue: 0, numberPrecision: 2 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max energy proxy units. 
0 = unlimited.', + }, + { + displayName: 'Compliance', + name: 'harnessCompliance', + type: 'options', + options: [ + { name: 'GDPR', value: 'gdpr' }, + { name: 'HIPAA', value: 'hipaa' }, + { name: 'None', value: '' }, + { name: 'PCI', value: 'pci' }, + { name: 'Strict', value: 'strict' }, + ], + default: '', + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Compliance policy to enforce model allowlists', + }, + { + displayName: 'KPI Weights', + name: 'harnessKpiWeights', + type: 'fixedCollection', + typeOptions: { multipleValues: false }, + displayOptions: { hide: { harnessMode: ['off'] } }, + default: { weights: [{ quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }] }, + options: [ + { + name: 'weights', + displayName: 'Weights', + values: [ + { displayName: 'Quality', name: 'quality', type: 'number', default: 0.4, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + { displayName: 'Cost', name: 'cost', type: 'number', default: 0.3, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + { displayName: 'Latency', name: 'latency', type: 'number', default: 0.2, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + { displayName: 'Energy', name: 'energy', type: 'number', default: 0.1, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + ], + }, + ], + description: 'KPI dimension weights for optimization scoring (normalized automatically)', + }, ], }; @@ -782,6 +898,35 @@ export class CascadeFlowAgent implements INodeType { const toolRoutingRaw = this.getNodeParameter('toolRoutingRules', 0, { rule: [] }) as any; const toolRoutingRules = (toolRoutingRaw?.rule ?? 
[]) as ToolRoutingRule[]; + // Harness parameters + const harnessMode = this.getNodeParameter('harnessMode', 0, 'observe') as HarnessMode; + let harnessCtx: HarnessRunContext | null = null; + if (harnessMode !== 'off') { + const rawBudget = this.getNodeParameter('harnessBudget', 0, 0) as number; + const rawToolCalls = this.getNodeParameter('harnessMaxToolCalls', 0, 0) as number; + const rawLatency = this.getNodeParameter('harnessMaxLatencyMs', 0, 0) as number; + const rawEnergy = this.getNodeParameter('harnessMaxEnergy', 0, 0) as number; + const compliance = this.getNodeParameter('harnessCompliance', 0, '') as string; + const kpiRaw = this.getNodeParameter('harnessKpiWeights', 0, { weights: [{ quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }] }) as any; + const kpiEntry = kpiRaw?.weights?.[0] ?? { quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }; + + const config: HarnessConfig = { + mode: harnessMode, + budgetMax: rawBudget > 0 ? rawBudget : null, + toolCallsMax: rawToolCalls > 0 ? rawToolCalls : null, + latencyMaxMs: rawLatency > 0 ? rawLatency : null, + energyMax: rawEnergy > 0 ? rawEnergy : null, + compliance: compliance || null, + kpiWeights: { + quality: kpiEntry.quality ?? 0.4, + cost: kpiEntry.cost ?? 0.3, + latency: kpiEntry.latency ?? 0.2, + energy: kpiEntry.energy ?? 
0.1, + }, + }; + harnessCtx = new HarnessRunContext(config); + } + // Domain routing parameters const enableDomainRouting = this.getNodeParameter('enableDomainRouting', 0, false) as boolean; @@ -887,12 +1032,18 @@ export class CascadeFlowAgent implements INodeType { domainVerifierGetters, ); + // Wire harness context into cascade model for per-call recording + if (harnessCtx) { + cascadeModel.setHarnessContext(harnessCtx); + } + const agentExecutor = new CascadeFlowAgentExecutor( cascadeModel, tools, toolRoutingRules, maxIterations, enableToolCascadeValidation, + harnessCtx, ); // --- Process each input item --- @@ -933,6 +1084,7 @@ export class CascadeFlowAgent implements INodeType { output: result.output, ...cascadeflowMeta, trace: result.trace, + harness: result.harness ?? null, }, pairedItem: { item: itemIndex }, }); diff --git a/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts b/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts index 8c39ae41..ad2d603e 100644 --- a/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts +++ b/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts @@ -23,6 +23,8 @@ import { getEnabledDomains, } from './config'; import { buildCascadeMetadata } from './cascade-metadata'; +import { estimateCost as harnessEstimateCost } from '../harness/pricing'; +import type { HarnessRunContext } from '../harness/harness'; // Quality validation, cost tracking, and routing - optional import let QualityValidator: any; @@ -110,6 +112,29 @@ export class CascadeChatModel extends BaseChatModel { private domainVerifiers: Map = new Map(); private domainVerifierGetters: Map Promise> = new Map(); + // Harness context (set by agent node) + private harnessCtx: HarnessRunContext | null = null; + + setHarnessContext(ctx: HarnessRunContext | null): void { + this.harnessCtx = ctx; + } + + private recordHarnessCall(message: BaseMessage, model: BaseChatModel, elapsedMs: 
number): void { + if (!this.harnessCtx) return; + const responseMetadata = (message as any).response_metadata || {}; + const tokenUsage = responseMetadata.tokenUsage || responseMetadata.usage || {}; + const inputTokens = tokenUsage.promptTokens || tokenUsage.prompt_tokens || 0; + const outputTokens = tokenUsage.completionTokens || tokenUsage.completion_tokens || 0; + const modelName = (model as any).modelName || (model as any).model || 'unknown'; + this.harnessCtx.recordCall({ + model: modelName, + inputTokens, + outputTokens, + toolCallCount: 0, + elapsedMs, + }); + } + constructor( drafterModelGetter: () => Promise, verifierModelGetter: () => Promise, @@ -257,6 +282,7 @@ export class CascadeChatModel extends BaseChatModel { const latency = Date.now() - start; const verifierCost = await this.calculateMessageCost(verifierMessage, verifierModel); + this.recordHarnessCall(verifierMessage, verifierModel, latency); const costBreakdown = { drafter: 0, verifier: verifierCost, @@ -584,37 +610,8 @@ export class CascadeChatModel extends BaseChatModel { } } - // Fallback to rough estimates based on model name - const estimatesPerMillion: Record = { - 'gpt-4o-mini': { input: 0.15, output: 0.6 }, - 'gpt-4o': { input: 2.5, output: 10.0 }, - 'gpt-5-mini': { input: 0.20, output: 0.80 }, - 'gpt-4-turbo': { input: 10.0, output: 30.0 }, - 'gpt-4': { input: 30.0, output: 60.0 }, - 'gpt-3.5-turbo': { input: 0.5, output: 1.5 }, - 'claude-3-5-haiku': { input: 1.0, output: 5.0 }, - 'claude-haiku-4-5': { input: 1.0, output: 5.0 }, - 'claude-3-5-sonnet': { input: 3.0, output: 15.0 }, - 'claude-sonnet-4-5': { input: 3.0, output: 15.0 }, - 'claude-sonnet-4': { input: 3.0, output: 15.0 }, - 'claude-opus-4-5': { input: 5.0, output: 25.0 }, - 'claude-3-haiku': { input: 0.25, output: 1.25 }, - default: { input: 1.0, output: 2.0 }, - }; - - let estimate = estimatesPerMillion.default; - for (const [key, value] of Object.entries(estimatesPerMillion)) { - if (modelName.includes(key)) { - estimate = 
value; - break; - } - } - - const cost = - (inputTokens / 1_000_000) * estimate.input + - (outputTokens / 1_000_000) * estimate.output; - - return cost; + // Use shared harness pricing (fuzzy model resolution, 18 models) + return harnessEstimateCost(modelName, inputTokens, outputTokens); } /** @@ -711,6 +708,7 @@ export class CascadeChatModel extends BaseChatModel { this.verifierCount++; const verifierCost = await this.calculateMessageCost(verifierMessage, verifierModel); + this.recordHarnessCall(verifierMessage, verifierModel, verifierLatency); const costBreakdown = { drafter: 0, verifier: verifierCost, @@ -772,6 +770,7 @@ export class CascadeChatModel extends BaseChatModel { const drafterStartTime = Date.now(); const drafterMessage = await modelToUse.invoke(messages, options); const drafterLatency = Date.now() - drafterStartTime; + this.recordHarnessCall(drafterMessage, modelToUse, drafterLatency); if (domainModel && detectedDomain) { this.domainCounts.set(detectedDomain, (this.domainCounts.get(detectedDomain) || 0) + 1); @@ -798,6 +797,7 @@ export class CascadeChatModel extends BaseChatModel { const verifierStartTime = Date.now(); const verifierMessage = await verifierModel.invoke(messages, options); const verifierLatency = Date.now() - verifierStartTime; + this.recordHarnessCall(verifierMessage, verifierModel, verifierLatency); this.verifierCount++; @@ -1060,6 +1060,7 @@ export class CascadeChatModel extends BaseChatModel { const verifierInfo = this.getModelInfo(verifierModel); const verifierMessage = await verifierModel.invoke(messages, options); const verifierLatency = Date.now() - verifierStartTime; + this.recordHarnessCall(verifierMessage, verifierModel, verifierLatency); this.verifierCount++; @@ -1136,7 +1137,9 @@ export class CascadeChatModel extends BaseChatModel { const verifierModel = await this.getVerifierModel(); const verifierInfo = this.getModelInfo(verifierModel); + const fallbackStart = Date.now(); const verifierMessage = await 
verifierModel.invoke(messages, options); + this.recordHarnessCall(verifierMessage, verifierModel, Date.now() - fallbackStart); this.verifierCount++; const verifierCost = await this.calculateMessageCost(verifierMessage, verifierModel); diff --git a/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts b/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts index d539d5b7..e93f7b23 100644 --- a/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts +++ b/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts @@ -1,4 +1,5 @@ import type { DomainType } from './config'; +import type { HarnessSummary } from '../harness'; export interface CostBreakdown { drafter: number; @@ -12,12 +13,15 @@ export interface SavingsBreakdown { percent: number; } +export interface HarnessSummaryOutput extends HarnessSummary {} + export interface CascadeFlowMetadata { model_used: string; domain: DomainType | null; confidence?: number; costs: CostBreakdown; savings: SavingsBreakdown; + harness?: HarnessSummaryOutput | null; } export const calculateSavings = ( diff --git a/packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts b/packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts new file mode 100644 index 00000000..5c003e42 --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts @@ -0,0 +1,368 @@ +import { describe, expect, it } from 'vitest'; + +import { + PRICING_USD_PER_M, + DEFAULT_PRICING_USD_PER_M, + ENERGY_COEFFICIENTS, + DEFAULT_ENERGY_COEFFICIENT, + ENERGY_OUTPUT_WEIGHT, + resolvePricingKey, + estimateCost, + estimateEnergy, + modelTotalPrice, +} from '../pricing'; + +import { + HarnessRunContext, + COMPLIANCE_MODEL_ALLOWLISTS, + QUALITY_PRIORS, + LATENCY_PRIORS, + normalizeWeights, + type HarnessConfig, +} from '../harness'; + +// --------------------------------------------------------------------------- +// Pricing data fidelity +// 
--------------------------------------------------------------------------- + +describe('pricing data', () => { + it('has 18 models in PRICING_USD_PER_M', () => { + expect(Object.keys(PRICING_USD_PER_M)).toHaveLength(18); + }); + + it('matches Python values for gpt-4o', () => { + expect(PRICING_USD_PER_M['gpt-4o']).toEqual([2.50, 10.00]); + }); + + it('matches Python values for gpt-4o-mini', () => { + expect(PRICING_USD_PER_M['gpt-4o-mini']).toEqual([0.15, 0.60]); + }); + + it('matches Python values for claude-sonnet-4', () => { + expect(PRICING_USD_PER_M['claude-sonnet-4']).toEqual([3.00, 15.00]); + }); + + it('matches Python values for gemini-2.5-flash', () => { + expect(PRICING_USD_PER_M['gemini-2.5-flash']).toEqual([0.15, 0.60]); + }); + + it('has correct default pricing', () => { + expect(DEFAULT_PRICING_USD_PER_M).toEqual([2.50, 10.00]); + }); + + it('has 18 models in ENERGY_COEFFICIENTS', () => { + expect(Object.keys(ENERGY_COEFFICIENTS)).toHaveLength(18); + }); + + it('has correct energy defaults', () => { + expect(DEFAULT_ENERGY_COEFFICIENT).toBe(1.0); + expect(ENERGY_OUTPUT_WEIGHT).toBe(1.5); + }); +}); + +// --------------------------------------------------------------------------- +// estimateCost / estimateEnergy +// --------------------------------------------------------------------------- + +describe('estimateCost', () => { + it('calculates gpt-4o cost correctly (1000 in, 500 out = $0.0075)', () => { + const cost = estimateCost('gpt-4o', 1000, 500); + expect(cost).toBeCloseTo(0.0075, 6); + }); + + it('calculates gpt-4o-mini cost correctly', () => { + const cost = estimateCost('gpt-4o-mini', 1_000_000, 1_000_000); + expect(cost).toBeCloseTo(0.15 + 0.60, 6); + }); + + it('uses default pricing for unknown models', () => { + const cost = estimateCost('unknown-model', 1_000_000, 1_000_000); + expect(cost).toBeCloseTo(2.50 + 10.00, 6); + }); +}); + +describe('estimateEnergy', () => { + it('calculates gpt-4o energy correctly (100 in, 50 out)', () => { + 
// coeff=1.0, energy = 1.0 * (100 + 50 * 1.5) = 175.0 + const energy = estimateEnergy('gpt-4o', 100, 50); + expect(energy).toBeCloseTo(175.0, 4); + }); + + it('uses default coefficient for unknown models', () => { + // coeff=1.0, energy = 1.0 * (100 + 50 * 1.5) = 175.0 + const energy = estimateEnergy('unknown-model', 100, 50); + expect(energy).toBeCloseTo(175.0, 4); + }); + + it('uses correct coefficient for gpt-4o-mini', () => { + // coeff=0.3, energy = 0.3 * (100 + 50 * 1.5) = 52.5 + const energy = estimateEnergy('gpt-4o-mini', 100, 50); + expect(energy).toBeCloseTo(52.5, 4); + }); +}); + +describe('modelTotalPrice', () => { + it('returns input + output for gpt-4o', () => { + expect(modelTotalPrice('gpt-4o')).toBeCloseTo(12.50, 6); + }); + + it('returns default for unknown model', () => { + expect(modelTotalPrice('unknown')).toBeCloseTo(12.50, 6); + }); +}); + +// --------------------------------------------------------------------------- +// Fuzzy model resolution +// --------------------------------------------------------------------------- + +describe('resolvePricingKey', () => { + it('exact match', () => { + expect(resolvePricingKey('gpt-4o')).toBe('gpt-4o'); + }); + + it('strips version suffix (-20250120)', () => { + expect(resolvePricingKey('gpt-4o-20250120')).toBe('gpt-4o'); + }); + + it('strips -preview suffix', () => { + expect(resolvePricingKey('gpt-4o-preview')).toBe('gpt-4o'); + }); + + it('strips -latest suffix', () => { + expect(resolvePricingKey('gpt-4o-latest')).toBe('gpt-4o'); + }); + + it('longest-prefix match (gemini-2.5-flash-8b → gemini-2.5-flash)', () => { + expect(resolvePricingKey('gemini-2.5-flash-8b')).toBe('gemini-2.5-flash'); + }); + + it('returns null for completely unknown model', () => { + expect(resolvePricingKey('totally-unknown-model')).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// HarnessRunContext — evaluatePreCall +// 
---------------------------------------------------------------------------
+
+function makeConfig(overrides: Partial<HarnessConfig> = {}): HarnessConfig {
+  return {
+    mode: 'enforce',
+    budgetMax: null,
+    toolCallsMax: null,
+    latencyMaxMs: null,
+    energyMax: null,
+    compliance: null,
+    kpiWeights: {},
+    ...overrides,
+  };
+}
+
+describe('evaluatePreCall', () => {
+  it('returns allow when no limits set', () => {
+    const ctx = new HarnessRunContext(makeConfig());
+    const decision = ctx.evaluatePreCall('gpt-4o', false);
+    expect(decision.action).toBe('allow');
+  });
+
+  it('returns stop when budget exhausted', () => {
+    const ctx = new HarnessRunContext(makeConfig({ budgetMax: 0.01 }));
+    ctx.cost = 0.01; // exhaust budget
+    const decision = ctx.evaluatePreCall('gpt-4o', false);
+    expect(decision.action).toBe('stop');
+    expect(decision.reason).toBe('budget_exceeded');
+  });
+
+  it('returns deny_tool when tool cap reached', () => {
+    const ctx = new HarnessRunContext(makeConfig({ toolCallsMax: 3 }));
+    ctx.toolCalls = 3;
+    const decision = ctx.evaluatePreCall('gpt-4o', true);
+    expect(decision.action).toBe('deny_tool');
+    expect(decision.reason).toBe('max_tool_calls_reached');
+  });
+
+  it('returns stop for compliance violation (non-compliant model)', () => {
+    const ctx = new HarnessRunContext(makeConfig({ compliance: 'gdpr' }));
+    const decision = ctx.evaluatePreCall('claude-sonnet-4', false);
+    expect(decision.action).toBe('stop');
+    expect(decision.reason).toBe('compliance_no_approved_model');
+  });
+
+  it('allows compliant model under GDPR', () => {
+    const ctx = new HarnessRunContext(makeConfig({ compliance: 'gdpr' }));
+    const decision = ctx.evaluatePreCall('gpt-4o', false);
+    expect(decision.action).toBe('allow');
+  });
+
+  it('returns stop when latency cap exceeded', () => {
+    const ctx = new HarnessRunContext(makeConfig({ latencyMaxMs: 1000 }));
+    ctx.latencyUsedMs = 1000;
+    const decision = ctx.evaluatePreCall('gpt-3.5-turbo', false);
+    // gpt-3.5-turbo is already the
fastest → can't switch → stop + expect(decision.action).toBe('stop'); + expect(decision.reason).toBe('latency_limit_exceeded'); + }); + + it('returns stop when energy cap exceeded', () => { + const ctx = new HarnessRunContext(makeConfig({ energyMax: 100 })); + ctx.energyUsed = 100; + const decision = ctx.evaluatePreCall('gpt-3.5-turbo', false); + // gpt-3.5-turbo is already lowest energy → can't switch → stop + expect(decision.action).toBe('stop'); + expect(decision.reason).toBe('energy_limit_exceeded'); + }); + + it('returns switch_model observation for budget pressure', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 1.0 })); + ctx.cost = 0.85; // 85% spent, < 20% remaining + ctx.budgetRemaining = 0.15; + const decision = ctx.evaluatePreCall('gpt-4o', false); + // Budget pressure suggests cheaper model + expect(decision.action).toBe('switch_model'); + expect(decision.reason).toBe('budget_pressure'); + }); + + it('returns switch_model observation for KPI optimization', () => { + const ctx = new HarnessRunContext(makeConfig({ + kpiWeights: { quality: 0, cost: 1, latency: 0, energy: 0 }, + })); + // gpt-4 is very expensive, KPI weights purely on cost → should suggest cheaper + const decision = ctx.evaluatePreCall('gpt-4', false); + expect(decision.action).toBe('switch_model'); + expect(decision.reason).toBe('kpi_weight_optimization'); + }); +}); + +// --------------------------------------------------------------------------- +// Budget tracking across multiple recordCall invocations +// --------------------------------------------------------------------------- + +describe('recordCall and budget tracking', () => { + it('accumulates cost across calls', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 0.10 })); + ctx.recordCall({ model: 'gpt-4o-mini', inputTokens: 100, outputTokens: 50, toolCallCount: 0, elapsedMs: 50 }); + expect(ctx.cost).toBeGreaterThan(0); + expect(ctx.stepCount).toBe(1); + 
expect(ctx.budgetRemaining).toBeLessThan(0.10); + + ctx.recordCall({ model: 'gpt-4o-mini', inputTokens: 200, outputTokens: 100, toolCallCount: 1, elapsedMs: 60 }); + expect(ctx.stepCount).toBe(2); + expect(ctx.toolCalls).toBe(1); + expect(ctx.latencyUsedMs).toBe(110); + }); + + it('detects budget exhaustion', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 0.0001 })); + ctx.recordCall({ model: 'gpt-4o', inputTokens: 10000, outputTokens: 5000, toolCallCount: 0, elapsedMs: 100 }); + expect(ctx.isBudgetExhausted()).toBe(true); + }); + + it('detects tool cap reached', () => { + const ctx = new HarnessRunContext(makeConfig({ toolCallsMax: 2 })); + ctx.toolCalls = 2; + expect(ctx.isToolCapReached()).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Observe vs enforce mode behavior +// --------------------------------------------------------------------------- + +describe('observe vs enforce mode', () => { + it('observe mode evaluatePreCall still returns decisions', () => { + const ctx = new HarnessRunContext(makeConfig({ mode: 'observe', budgetMax: 0.01 })); + ctx.cost = 0.01; + const decision = ctx.evaluatePreCall('gpt-4o', false); + // Decision is evaluated regardless of mode + expect(decision.action).toBe('stop'); + }); + + it('off mode has no context created (by design)', () => { + // In the actual agent node, harnessCtx is null when mode=off + // This test validates that a context with mode=off still works + const ctx = new HarnessRunContext(makeConfig({ mode: 'off' })); + const decision = ctx.evaluatePreCall('gpt-4o', false); + expect(decision.action).toBe('allow'); + expect(decision.reason).toBe('off'); + }); +}); + +// --------------------------------------------------------------------------- +// Compliance allowlists +// --------------------------------------------------------------------------- + +describe('compliance allowlists', () => { + it('GDPR allows gpt-4o, gpt-4o-mini, 
gpt-3.5-turbo', () => { + const allowlist = COMPLIANCE_MODEL_ALLOWLISTS['gdpr']; + expect(allowlist.has('gpt-4o')).toBe(true); + expect(allowlist.has('gpt-4o-mini')).toBe(true); + expect(allowlist.has('gpt-3.5-turbo')).toBe(true); + expect(allowlist.has('claude-sonnet-4')).toBe(false); + }); + + it('strict allows only gpt-4o', () => { + const allowlist = COMPLIANCE_MODEL_ALLOWLISTS['strict']; + expect(allowlist.size).toBe(1); + expect(allowlist.has('gpt-4o')).toBe(true); + }); + + it('strict mode denies tools even for compliant model', () => { + const ctx = new HarnessRunContext(makeConfig({ compliance: 'strict' })); + const decision = ctx.evaluatePreCall('gpt-4o', true); + expect(decision.action).toBe('deny_tool'); + expect(decision.reason).toBe('compliance_tool_restriction'); + }); +}); + +// --------------------------------------------------------------------------- +// KPI weight normalization +// --------------------------------------------------------------------------- + +describe('normalizeWeights', () => { + it('normalizes to sum=1', () => { + const result = normalizeWeights({ quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }); + const sum = Object.values(result).reduce((a, b) => a + b, 0); + expect(sum).toBeCloseTo(1.0, 6); + }); + + it('filters out zero and negative values', () => { + const result = normalizeWeights({ quality: 1, cost: 0, latency: -1, energy: 1 }); + expect(result.cost).toBeUndefined(); + expect(result.latency).toBeUndefined(); + expect(result.quality).toBeCloseTo(0.5, 6); + expect(result.energy).toBeCloseTo(0.5, 6); + }); + + it('returns empty for all-zero weights', () => { + const result = normalizeWeights({ quality: 0, cost: 0, latency: 0, energy: 0 }); + expect(Object.keys(result)).toHaveLength(0); + }); +}); + +// --------------------------------------------------------------------------- +// summary() structure +// --------------------------------------------------------------------------- + +describe('summary()', () => { + 
it('returns correct structure', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 1.0 })); + ctx.recordCall({ model: 'gpt-4o-mini', inputTokens: 100, outputTokens: 50, toolCallCount: 0, elapsedMs: 42 }); + + const s = ctx.summary(); + expect(s.runId).toBeTruthy(); + expect(s.mode).toBe('enforce'); + expect(s.stepCount).toBe(1); + expect(s.toolCalls).toBe(0); + expect(s.cost).toBeGreaterThan(0); + expect(s.latencyUsedMs).toBe(42); + expect(s.energyUsed).toBeGreaterThan(0); + expect(s.budgetMax).toBe(1.0); + expect(s.budgetRemaining).toBeLessThan(1.0); + expect(s.lastAction).toBe('allow'); + expect(s.durationMs).toBeGreaterThanOrEqual(0); + expect(Array.isArray(s.trace)).toBe(true); + expect(s.trace).toHaveLength(1); + expect(s.trace[0].action).toBe('allow'); + expect(s.trace[0].budgetState.max).toBe(1.0); + }); +}); diff --git a/packages/integrations/n8n/nodes/harness/harness.ts b/packages/integrations/n8n/nodes/harness/harness.ts new file mode 100644 index 00000000..ab3943d5 --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/harness.ts @@ -0,0 +1,444 @@ +/** + * HarnessRunContext — multi-dimensional decision engine for n8n (TypeScript port). + * + * Ported from cascadeflow/harness/api.py (HarnessRunContext) and + * cascadeflow/harness/instrument.py (pre-call decision logic, compliance, + * quality/latency priors, KPI scoring). + * + * Key n8n constraint: models are graph connections (sub-nodes), not string + * parameters. The harness cannot switch models at runtime. Only `stop` and + * `deny_tool` actions have enforcement effects. `switch_model` decisions are + * recorded in the trace as observations. 
+ */
+
+import {
+  ENERGY_COEFFICIENTS,
+  DEFAULT_ENERGY_COEFFICIENT,
+  estimateCost,
+  estimateEnergy,
+  modelTotalPrice,
+  PRICING_USD_PER_M,
+} from './pricing';
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+export type HarnessMode = 'off' | 'observe' | 'enforce';
+
+export interface KpiWeights {
+  quality?: number;
+  cost?: number;
+  latency?: number;
+  energy?: number;
+}
+
+export interface HarnessConfig {
+  mode: HarnessMode;
+  budgetMax: number | null;
+  toolCallsMax: number | null;
+  latencyMaxMs: number | null;
+  energyMax: number | null;
+  compliance: string | null;
+  kpiWeights: KpiWeights;
+}
+
+export interface PreCallDecision {
+  action: 'allow' | 'stop' | 'switch_model' | 'deny_tool';
+  reason: string;
+  targetModel: string;
+}
+
+export interface HarnessTraceEntry {
+  action: string;
+  reason: string;
+  model: string | null;
+  step: number;
+  timestampMs: number;
+  costTotal: number;
+  budgetState: { max: number | null; remaining: number | null };
+  applied: boolean;
+  decisionMode: string;
+}
+
+export interface HarnessSummary {
+  runId: string;
+  mode: HarnessMode;
+  stepCount: number;
+  toolCalls: number;
+  cost: number;
+  latencyUsedMs: number;
+  energyUsed: number;
+  budgetMax: number | null;
+  budgetRemaining: number | null;
+  lastAction: string;
+  durationMs: number;
+  trace: HarnessTraceEntry[];
+}
+
+export interface RecordCallParams {
+  model: string;
+  inputTokens: number;
+  outputTokens: number;
+  toolCallCount: number;
+  elapsedMs: number;
+  decision?: PreCallDecision;
+}
+
+// ---------------------------------------------------------------------------
+// Compliance allowlists (from instrument.py lines 107-112)
+// ---------------------------------------------------------------------------
+
+const COMPLIANCE_MODEL_ALLOWLISTS: Record<string, Set<string>> = {
+  gdpr: new Set(['gpt-4o', 'gpt-4o-mini', 'gpt-3.5-turbo']),
+  hipaa:
new Set(['gpt-4o', 'gpt-4o-mini']),
+  pci: new Set(['gpt-4o-mini', 'gpt-3.5-turbo']),
+  strict: new Set(['gpt-4o']),
+};
+
+// ---------------------------------------------------------------------------
+// Quality & latency priors for KPI scoring (from instrument.py lines 74-95)
+// ---------------------------------------------------------------------------
+
+const QUALITY_PRIORS: Record<string, number> = {
+  'gpt-4o': 0.90,
+  'gpt-4o-mini': 0.75,
+  'gpt-5-mini': 0.86,
+  'gpt-4-turbo': 0.88,
+  'gpt-4': 0.87,
+  'gpt-3.5-turbo': 0.65,
+  'o1': 0.95,
+  'o1-mini': 0.82,
+  'o3-mini': 0.80,
+};
+
+const LATENCY_PRIORS: Record<string, number> = {
+  'gpt-4o': 0.72,
+  'gpt-4o-mini': 0.93,
+  'gpt-5-mini': 0.84,
+  'gpt-4-turbo': 0.66,
+  'gpt-4': 0.52,
+  'gpt-3.5-turbo': 1.00,
+  'o1': 0.40,
+  'o1-mini': 0.60,
+  'o3-mini': 0.78,
+};
+
+// Pre-computed model cost/energy bounds for utility functions.
+const MODEL_POOL = Object.keys(PRICING_USD_PER_M);
+const MODEL_TOTAL_COSTS = new Map<string, number>(MODEL_POOL.map(m => [m, modelTotalPrice(m)]));
+const MIN_TOTAL_COST = Math.min(...MODEL_TOTAL_COSTS.values());
+const MAX_TOTAL_COST = Math.max(...MODEL_TOTAL_COSTS.values());
+
+const MODEL_ENERGY_COEFFS = new Map<string, number>(
+  MODEL_POOL.map(m => [m, ENERGY_COEFFICIENTS[m] ??
DEFAULT_ENERGY_COEFFICIENT]),
+);
+const MIN_ENERGY_COEFF = Math.min(...MODEL_ENERGY_COEFFS.values());
+const MAX_ENERGY_COEFF = Math.max(...MODEL_ENERGY_COEFFS.values());
+
+// ---------------------------------------------------------------------------
+// KPI scoring helpers (from instrument.py lines 234-267)
+// ---------------------------------------------------------------------------
+
+function normalizeWeights(weights: KpiWeights): Record<string, number> {
+  const raw: Record<string, number> = {};
+  for (const [key, val] of Object.entries(weights)) {
+    if (['cost', 'quality', 'latency', 'energy'].includes(key) && typeof val === 'number' && val > 0) {
+      raw[key] = val;
+    }
+  }
+  const total = Object.values(raw).reduce((a, b) => a + b, 0);
+  if (total <= 0) return {};
+  const normalized: Record<string, number> = {};
+  for (const [key, val] of Object.entries(raw)) {
+    normalized[key] = val / total;
+  }
+  return normalized;
+}
+
+function costUtility(model: string): number {
+  const modelCost = MODEL_TOTAL_COSTS.get(model) ?? modelTotalPrice(model);
+  if (MAX_TOTAL_COST === MIN_TOTAL_COST) return 1.0;
+  return (MAX_TOTAL_COST - modelCost) / (MAX_TOTAL_COST - MIN_TOTAL_COST);
+}
+
+function energyUtility(model: string): number {
+  const coeff = ENERGY_COEFFICIENTS[model] ?? DEFAULT_ENERGY_COEFFICIENT;
+  if (MAX_ENERGY_COEFF === MIN_ENERGY_COEFF) return 1.0;
+  return (MAX_ENERGY_COEFF - coeff) / (MAX_ENERGY_COEFF - MIN_ENERGY_COEFF);
+}
+
+function kpiScoreWithNormalized(model: string, normalized: Record<string, number>): number {
+  if (Object.keys(normalized).length === 0) return 0.0;
+  const quality = QUALITY_PRIORS[model] ?? 0.7;
+  const latency = LATENCY_PRIORS[model] ?? 0.7;
+  const cost = costUtility(model);
+  const energy = energyUtility(model);
+  return (
+    (normalized.quality ?? 0) * quality +
+    (normalized.latency ?? 0) * latency +
+    (normalized.cost ?? 0) * cost +
+    (normalized.energy ??
0) * energy + ); +} + +function selectKpiWeightedModel(currentModel: string, weights: KpiWeights): string { + const normalized = normalizeWeights(weights); + if (Object.keys(normalized).length === 0) return currentModel; + let bestModel = currentModel; + let bestScore = kpiScoreWithNormalized(currentModel, normalized); + for (const candidate of MODEL_POOL) { + const score = kpiScoreWithNormalized(candidate, normalized); + if (score > bestScore) { + bestModel = candidate; + bestScore = score; + } + } + return bestModel; +} + +// Cheapest/fastest/lowest-energy helpers +function selectCheaperModel(currentModel: string): string { + let cheapest = currentModel; + let cheapestCost = MODEL_TOTAL_COSTS.get(currentModel) ?? modelTotalPrice(currentModel); + for (const [m, c] of MODEL_TOTAL_COSTS) { + if (c < cheapestCost) { + cheapest = m; + cheapestCost = c; + } + } + return cheapest; +} + +function selectFasterModel(currentModel: string): string { + const currentLatency = LATENCY_PRIORS[currentModel] ?? 0.7; + let best = currentModel; + let bestLatency = currentLatency; + for (const [m, lat] of Object.entries(LATENCY_PRIORS)) { + if (lat > bestLatency) { + best = m; + bestLatency = lat; + } + } + return best; +} + +function selectLowerEnergyModel(currentModel: string): string { + const currentCoeff = ENERGY_COEFFICIENTS[currentModel] ?? DEFAULT_ENERGY_COEFFICIENT; + let best = currentModel; + let bestCoeff = currentCoeff; + for (const [m, c] of MODEL_ENERGY_COEFFS) { + if (c < bestCoeff) { + best = m; + bestCoeff = c; + } + } + return best; +} + +// --------------------------------------------------------------------------- +// HarnessRunContext +// --------------------------------------------------------------------------- + +const MAX_TRACE_ENTRIES = 1000; + +/** Coerce NaN, Infinity, or negative values to null (unlimited). 
*/ +function sanitizeNumericParam(value: number | null): number | null { + if (value === null || value === undefined) return null; + if (!Number.isFinite(value) || value < 0) return null; + return value; +} + +let runIdCounter = 0; + +function generateRunId(): string { + runIdCounter += 1; + const ts = Date.now().toString(36); + const counter = runIdCounter.toString(36); + return `${ts}${counter}`.slice(-8); +} + +export class HarnessRunContext { + readonly runId: string; + readonly config: HarnessConfig; + + stepCount = 0; + toolCalls = 0; + cost = 0; + latencyUsedMs = 0; + energyUsed = 0; + budgetRemaining: number | null; + lastAction = 'allow'; + + private startedAt: number; + private trace: HarnessTraceEntry[] = []; + + constructor(config: HarnessConfig) { + this.runId = generateRunId(); + this.config = { + ...config, + budgetMax: sanitizeNumericParam(config.budgetMax), + toolCallsMax: sanitizeNumericParam(config.toolCallsMax), + latencyMaxMs: sanitizeNumericParam(config.latencyMaxMs), + energyMax: sanitizeNumericParam(config.energyMax), + }; + this.budgetRemaining = this.config.budgetMax; + this.startedAt = Date.now(); + } + + // ----------------------------------------------------------------------- + // Pre-call decision cascade (ported from instrument.py _evaluate_pre_call_decision) + // ----------------------------------------------------------------------- + + evaluatePreCall(model: string, hasTools: boolean): PreCallDecision { + const cfg = this.config; + + // 1. Budget exhausted + if (cfg.budgetMax !== null && this.cost >= cfg.budgetMax) { + return { action: 'stop', reason: 'budget_exceeded', targetModel: model }; + } + + // 2. Tool call cap + if (hasTools && cfg.toolCallsMax !== null && this.toolCalls >= cfg.toolCallsMax) { + return { action: 'deny_tool', reason: 'max_tool_calls_reached', targetModel: model }; + } + + // 3. 
Compliance + if (cfg.compliance) { + const allowlist = COMPLIANCE_MODEL_ALLOWLISTS[cfg.compliance.trim().toLowerCase()]; + if (allowlist) { + if (!allowlist.has(model)) { + // Can't switch models in n8n — stop if no compliant model possible + return { action: 'stop', reason: 'compliance_no_approved_model', targetModel: model }; + } + if (cfg.compliance.trim().toLowerCase() === 'strict' && hasTools) { + return { action: 'deny_tool', reason: 'compliance_tool_restriction', targetModel: model }; + } + } + } + + // 4. Latency cap + if (cfg.latencyMaxMs !== null && this.latencyUsedMs >= cfg.latencyMaxMs) { + const faster = selectFasterModel(model); + if (faster !== model) { + return { action: 'switch_model', reason: 'latency_limit_exceeded', targetModel: faster }; + } + return { action: 'stop', reason: 'latency_limit_exceeded', targetModel: model }; + } + + // 5. Energy cap + if (cfg.energyMax !== null && this.energyUsed >= cfg.energyMax) { + const lower = selectLowerEnergyModel(model); + if (lower !== model) { + return { action: 'switch_model', reason: 'energy_limit_exceeded', targetModel: lower }; + } + return { action: 'stop', reason: 'energy_limit_exceeded', targetModel: model }; + } + + // 6. Budget pressure (<20% remaining) — observation only in n8n + if ( + cfg.budgetMax !== null && + cfg.budgetMax > 0 && + this.budgetRemaining !== null && + this.budgetRemaining / cfg.budgetMax < 0.2 + ) { + const cheaper = selectCheaperModel(model); + if (cheaper !== model) { + return { action: 'switch_model', reason: 'budget_pressure', targetModel: cheaper }; + } + } + + // 7. KPI-weighted — observation only in n8n + const kw = cfg.kpiWeights; + if (kw && Object.values(kw).some(v => typeof v === 'number' && v > 0)) { + const weighted = selectKpiWeightedModel(model, kw); + if (weighted !== model) { + return { action: 'switch_model', reason: 'kpi_weight_optimization', targetModel: weighted }; + } + } + + // 8. 
Default: allow + return { action: 'allow', reason: cfg.mode, targetModel: model }; + } + + // ----------------------------------------------------------------------- + // Record a completed call + // ----------------------------------------------------------------------- + + recordCall(params: RecordCallParams): void { + const { model, inputTokens, outputTokens, toolCallCount, elapsedMs, decision } = params; + + const callCost = estimateCost(model, inputTokens, outputTokens); + const energy = estimateEnergy(model, inputTokens, outputTokens); + + this.cost += callCost; + this.stepCount += 1; + this.latencyUsedMs += elapsedMs; + this.energyUsed += energy; + this.toolCalls += toolCallCount; + + if (this.config.budgetMax !== null) { + this.budgetRemaining = this.config.budgetMax - this.cost; + } + + const action = decision?.action ?? 'allow'; + const reason = decision?.reason ?? this.config.mode; + const applied = action === 'allow' || (this.config.mode === 'enforce' && (action === 'stop' || action === 'deny_tool')); + + this.lastAction = action; + + this.trace.push({ + action, + reason, + model, + step: this.stepCount, + timestampMs: Date.now(), + costTotal: this.cost, + budgetState: { + max: this.config.budgetMax, + remaining: this.budgetRemaining, + }, + applied, + decisionMode: this.config.mode, + }); + if (this.trace.length > MAX_TRACE_ENTRIES) { + this.trace = this.trace.slice(-MAX_TRACE_ENTRIES); + } + } + + // ----------------------------------------------------------------------- + // Quick checks for agent loop + // ----------------------------------------------------------------------- + + isBudgetExhausted(): boolean { + return this.config.budgetMax !== null && this.cost >= this.config.budgetMax; + } + + isToolCapReached(): boolean { + return this.config.toolCallsMax !== null && this.toolCalls >= this.config.toolCallsMax; + } + + // ----------------------------------------------------------------------- + // Summary + // 
----------------------------------------------------------------------- + + summary(): HarnessSummary { + return { + runId: this.runId, + mode: this.config.mode, + stepCount: this.stepCount, + toolCalls: this.toolCalls, + cost: this.cost, + latencyUsedMs: this.latencyUsedMs, + energyUsed: this.energyUsed, + budgetMax: this.config.budgetMax, + budgetRemaining: this.budgetRemaining, + lastAction: this.lastAction, + durationMs: Date.now() - this.startedAt, + trace: [...this.trace], + }; + } +} + +// Re-export for external test access +export { COMPLIANCE_MODEL_ALLOWLISTS, QUALITY_PRIORS, LATENCY_PRIORS, normalizeWeights }; diff --git a/packages/integrations/n8n/nodes/harness/index.ts b/packages/integrations/n8n/nodes/harness/index.ts new file mode 100644 index 00000000..663f93b3 --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/index.ts @@ -0,0 +1,22 @@ +export { + PRICING_USD_PER_M, + DEFAULT_PRICING_USD_PER_M, + ENERGY_COEFFICIENTS, + DEFAULT_ENERGY_COEFFICIENT, + ENERGY_OUTPUT_WEIGHT, + resolvePricingKey, + estimateCost, + estimateEnergy, + modelTotalPrice, +} from './pricing'; + +export { + type HarnessMode, + type KpiWeights, + type HarnessConfig, + type PreCallDecision, + type HarnessTraceEntry, + type HarnessSummary, + type RecordCallParams, + HarnessRunContext, +} from './harness'; diff --git a/packages/integrations/n8n/nodes/harness/pricing.ts b/packages/integrations/n8n/nodes/harness/pricing.ts new file mode 100644 index 00000000..fd13f43a --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/pricing.ts @@ -0,0 +1,135 @@ +/** + * Shared harness pricing and energy profiles (TypeScript port). + * + * Ported from cascadeflow/harness/pricing.py — single source of truth for + * cost/energy estimation in the n8n integration. + */ + +// USD per 1M tokens [input, output]. 
+export const PRICING_USD_PER_M: Record<string, [number, number]> = {
+  // OpenAI
+  'gpt-4o': [2.50, 10.00],
+  'gpt-4o-mini': [0.15, 0.60],
+  'gpt-5': [1.25, 10.00],
+  'gpt-5-mini': [0.20, 0.80],
+  'gpt-4-turbo': [10.00, 30.00],
+  'gpt-4': [30.00, 60.00],
+  'gpt-3.5-turbo': [0.50, 1.50],
+  'o1': [15.00, 60.00],
+  'o1-mini': [3.00, 12.00],
+  'o3-mini': [1.10, 4.40],
+  // Anthropic
+  'claude-sonnet-4': [3.00, 15.00],
+  'claude-haiku-3.5': [1.00, 5.00],
+  'claude-opus-4.5': [5.00, 25.00],
+  // Google Gemini
+  'gemini-2.5-flash': [0.15, 0.60],
+  'gemini-2.5-pro': [1.25, 10.00],
+  'gemini-2.0-flash': [0.10, 0.40],
+  'gemini-1.5-flash': [0.075, 0.30],
+  'gemini-1.5-pro': [1.25, 5.00],
+};
+
+export const DEFAULT_PRICING_USD_PER_M: [number, number] = [2.50, 10.00];
+
+// Deterministic proxy coefficients for energy tracking.
+export const ENERGY_COEFFICIENTS: Record<string, number> = {
+  // OpenAI
+  'gpt-4o': 1.0,
+  'gpt-4o-mini': 0.3,
+  'gpt-5': 1.2,
+  'gpt-5-mini': 0.35,
+  'gpt-4-turbo': 1.5,
+  'gpt-4': 1.5,
+  'gpt-3.5-turbo': 0.2,
+  'o1': 2.0,
+  'o1-mini': 0.8,
+  'o3-mini': 0.5,
+  // Anthropic
+  'claude-sonnet-4': 1.0,
+  'claude-haiku-3.5': 0.3,
+  'claude-opus-4.5': 1.8,
+  // Google Gemini
+  'gemini-2.5-flash': 0.3,
+  'gemini-2.5-pro': 1.2,
+  'gemini-2.0-flash': 0.25,
+  'gemini-1.5-flash': 0.2,
+  'gemini-1.5-pro': 1.0,
+};
+
+export const DEFAULT_ENERGY_COEFFICIENT = 1.0;
+export const ENERGY_OUTPUT_WEIGHT = 1.5;
+
+// ---------------------------------------------------------------------------
+// Fuzzy model-name resolution
+// ---------------------------------------------------------------------------
+
+// Strips version/preview/date suffixes.
+// Matches: -preview, -preview-05-20, -20250120, -latest, -exp-0827, -it
+const VERSION_SUFFIX_RE = /(-preview(?:-\d{2,4}-\d{2})?|-\d{8,}|-latest|-exp(?:-\d+)?|-it)$/;
+
+// Cache for resolved model → pricing key lookups.
+const pricingKeyCache = new Map(); + +export function resolvePricingKey(model: string): string | null { + const cached = pricingKeyCache.get(model); + if (cached !== undefined) return cached; + + // Exact match + if (model in PRICING_USD_PER_M) { + pricingKeyCache.set(model, model); + return model; + } + + // Strip version suffixes and retry + const stripped = model.replace(VERSION_SUFFIX_RE, ''); + if (stripped !== model && stripped in PRICING_USD_PER_M) { + pricingKeyCache.set(model, stripped); + return stripped; + } + + // Longest-prefix match (e.g. "gemini-2.5-flash-8b" → "gemini-2.5-flash") + let best: string | null = null; + let bestLen = 0; + for (const known of Object.keys(PRICING_USD_PER_M)) { + if (model.startsWith(known) && known.length > bestLen) { + best = known; + bestLen = known.length; + } + } + if (best !== null) { + pricingKeyCache.set(model, best); + return best; + } + + pricingKeyCache.set(model, null); + return null; +} + +// --------------------------------------------------------------------------- +// Public estimation helpers +// --------------------------------------------------------------------------- + +export function estimateCost(model: string, inputTokens: number, outputTokens: number): number { + const key = resolvePricingKey(model); + const [inPrice, outPrice] = key !== null + ? (PRICING_USD_PER_M[key] ?? DEFAULT_PRICING_USD_PER_M) + : DEFAULT_PRICING_USD_PER_M; + return (inputTokens / 1_000_000) * inPrice + (outputTokens / 1_000_000) * outPrice; +} + +export function estimateEnergy(model: string, inputTokens: number, outputTokens: number): number { + const key = resolvePricingKey(model); + const coeff = key !== null + ? (ENERGY_COEFFICIENTS[key] ?? DEFAULT_ENERGY_COEFFICIENT) + : DEFAULT_ENERGY_COEFFICIENT; + return coeff * (inputTokens + outputTokens * ENERGY_OUTPUT_WEIGHT); +} + +export function modelTotalPrice(model: string): number { + const key = resolvePricingKey(model); + const [inPrice, outPrice] = key !== null + ? 
(PRICING_USD_PER_M[key] ?? DEFAULT_PRICING_USD_PER_M) + : DEFAULT_PRICING_USD_PER_M; + return inPrice + outPrice; +} diff --git a/pyproject.toml b/pyproject.toml index eaadb6b7..bc7c7072 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "cascadeflow" version = "1.0.0" -description = "Smart AI model cascading for cost optimization - Save 40-85% on LLM costs with 2-6x faster responses. Available for Python and TypeScript/JavaScript." +description = "Agent runtime intelligence layer — optimize cost, latency, quality, budget, compliance, and energy across AI agent workflows." readme = "README.md" requires-python = ">=3.9" license = "MIT" @@ -32,9 +32,17 @@ keywords = [ "javascript", "browser", "edge-functions", + "agent-intelligence", + "runtime-optimization", + "budget-enforcement", + "compliance", + "harness", + "agent-runtime", + "kpi", + "energy-tracking", ] classifiers = [ - "Development Status :: 4 - Beta", + "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", @@ -93,7 +101,7 @@ semantic = [ openclaw = ["fastembed>=0.7.0"] # CrewAI harness integration (opt-in) -crewai = ["crewai>=1.5.0"] +crewai = ["crewai>=1.5.0; python_version >= '3.10'"] # OpenAI Agents SDK integration (opt-in) openai-agents = [ @@ -101,6 +109,9 @@ openai-agents = [ "openai-agents>=0.9.0; python_version >= '3.10'", ] +# Google ADK integration (opt-in, requires Python 3.10+) +google-adk = ["google-adk>=1.0.0; python_version >= '3.10'"] + # Development tools (includes rich for terminal output) dev = [ "pytest>=7.4.0", @@ -138,7 +149,7 @@ all = [ [project.urls] Homepage = "https://lemony.ai" -Documentation = "https://github.com/lemony-ai/cascadeflow" +Documentation = "https://docs.cascadeflow.dev" Repository = "https://github.com/lemony-ai/cascadeflow" "Bug Tracker" = "https://github.com/lemony-ai/cascadeflow/issues" 
Changelog = "https://github.com/lemony-ai/cascadeflow/releases" diff --git a/tests/benchmarks/bfcl/agentic_benchmark.py b/tests/benchmarks/bfcl/agentic_benchmark.py index 1386cb60..2b450e68 100644 --- a/tests/benchmarks/bfcl/agentic_benchmark.py +++ b/tests/benchmarks/bfcl/agentic_benchmark.py @@ -61,6 +61,7 @@ class AgenticResult: correct: bool draft_accepted: bool cost: float + baseline_cost: float latency_ms: float draft_accepted_turns: int = 0 draft_acceptance_rate: float = 0.0 @@ -761,6 +762,23 @@ def _format_tools_desc(self, tools: list[dict[str, Any]]) -> str: lines.append(f"- {name}: {description} (params: {param_names})") return "\n".join(lines) + @staticmethod + def _extract_baseline_cost(result: Any) -> float: + """Extract baseline cost for a call from cascade metadata. + + ``cost_saved`` is defined relative to a verifier-only baseline. + """ + total_cost = float(getattr(result, "total_cost", 0.0) or 0.0) + metadata = getattr(result, "metadata", {}) or {} + raw_saved = metadata.get("cost_saved", 0.0) or 0.0 + try: + cost_saved = float(raw_saved) + except (TypeError, ValueError): + cost_saved = 0.0 + + baseline_cost = total_cost + cost_saved + return baseline_cost if baseline_cost > 0 else total_cost + def _extract_parameters(self, response: str) -> list[dict[str, Any]]: """Extract JSON parameter blocks from a tool response.""" parameters = [] @@ -939,6 +957,7 @@ async def run_single_turn(self, task: dict) -> AgenticResult: draft_accepted_turns=1 if draft_accepted else 0, draft_acceptance_rate=1.0 if draft_accepted else 0.0, cost=result.total_cost, + baseline_cost=self._extract_baseline_cost(result), latency_ms=latency_ms, turns_completed=1, tools_called=tools_called, @@ -952,6 +971,7 @@ async def run_single_turn(self, task: dict) -> AgenticResult: correct=False, draft_accepted=False, cost=0.0, + baseline_cost=0.0, latency_ms=latency_ms, error=str(e), ) @@ -976,6 +996,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: start_time = 
time.time() total_cost = 0.0 + total_baseline_cost = 0.0 all_tools_called = [] turns_completed = 0 state_maintained = True @@ -1011,6 +1032,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: result = await agent.run(prompt, max_tokens=500) total_cost += result.total_cost + total_baseline_cost += self._extract_baseline_cost(result) tools_in_turn = self._extract_tool_calls(result.content) params_in_turn = self._extract_parameters(result.content) @@ -1057,6 +1079,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: draft_accepted_turns=draft_accepted_turns, draft_acceptance_rate=draft_acceptance_rate, cost=total_cost, + baseline_cost=total_baseline_cost if total_baseline_cost > 0 else total_cost, latency_ms=latency_ms, turns_completed=turns_completed, tools_called=all_tools_called, @@ -1072,6 +1095,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: draft_accepted_turns=draft_accepted_turns, draft_acceptance_rate=0.0, cost=total_cost, + baseline_cost=total_baseline_cost if total_baseline_cost > 0 else total_cost, latency_ms=latency_ms, turns_completed=turns_completed, error=str(e), @@ -1127,6 +1151,13 @@ def _calculate_metrics(self) -> dict: draft_accepted_turns = sum(r.draft_accepted_turns for r in self.results) dependency_handled = sum(1 for r in self.results if r.dependency_handled) total_cost = sum(r.cost for r in self.results) + total_baseline_cost = sum( + r.baseline_cost if r.baseline_cost > 0 else r.cost for r in self.results + ) + total_savings = total_baseline_cost - total_cost + cost_reduction_pct = ( + (total_savings / total_baseline_cost) * 100 if total_baseline_cost > 0 else 0.0 + ) total_turns = sum(r.turns_completed for r in self.results) # Group by task type @@ -1172,6 +1203,9 @@ def _calculate_metrics(self) -> dict: "draft_acceptance_by_task": draft_accepted / total if total > 0 else 0, "dependency_handling": dependency_rate, "total_cost": total_cost, + "baseline_cost": total_baseline_cost, + "total_savings": 
total_savings, + "cost_reduction_pct": cost_reduction_pct, "by_type": by_type, # Natural vs Explicit comparison "natural_language": { @@ -1198,6 +1232,8 @@ def _calculate_metrics(self) -> dict: print(f" Draft Acceptance: {draft_rate:.1%} (by turn)") print(f" Dependency Handling: {dependency_rate:.1%}") print(f" Total Cost: ${total_cost:.4f}") + print(f" Baseline Cost: ${total_baseline_cost:.4f}") + print(f" Cost Reduction: {cost_reduction_pct:.1f}%") # Natural vs Explicit comparison (key insight) print("\n" + "-" * 70) @@ -1287,6 +1323,7 @@ async def main(): "correct": r.correct, "draft_accepted": r.draft_accepted, "cost": r.cost, + "baseline_cost": r.baseline_cost, "latency_ms": r.latency_ms, "turns_completed": r.turns_completed, "tools_called": r.tools_called, diff --git a/tests/benchmarks/run_all.py b/tests/benchmarks/run_all.py index 739c0342..9c4a3f93 100644 --- a/tests/benchmarks/run_all.py +++ b/tests/benchmarks/run_all.py @@ -322,6 +322,10 @@ def generate_comparison_table(results: dict[str, Any]) -> str: ) table += f"- **Dependency Handling:** {agentic_summary.get('dependency_handling', 0) * 100:.1f}%\n" table += f"- **Total Cost:** ${agentic_summary.get('total_cost', 0):.6f}\n" + if "baseline_cost" in agentic_summary: + table += f"- **Baseline Cost:** ${agentic_summary.get('baseline_cost', 0):.6f}\n" + if "cost_reduction_pct" in agentic_summary: + table += f"- **Cost Reduction:** {agentic_summary.get('cost_reduction_pct', 0):.1f}%\n" natural = agentic_summary.get("natural_language", {}) explicit = agentic_summary.get("explicit_steps", {}) diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py new file mode 100644 index 00000000..688e39c4 --- /dev/null +++ b/tests/test_google_adk_integration.py @@ -0,0 +1,738 @@ +"""Tests for cascadeflow.integrations.google_adk harness integration. 
+ +google-adk is not installed in test environments, so we use fake ADK types +and test the integration logic directly against HarnessRunContext. +""" + +from __future__ import annotations + +import time +from unittest.mock import patch + +import pytest + +from cascadeflow.harness import init, reset, run + +# Import the module directly — it does not require google-adk at import time +# (GOOGLE_ADK_AVAILABLE will be False, but all functions/classes are still defined). +import cascadeflow.integrations.google_adk as adk_mod + + +# --------------------------------------------------------------------------- +# Fake ADK types +# --------------------------------------------------------------------------- + + +class FakeUsageMetadata: + """Stand-in for google.genai.types.GenerateContentResponseUsageMetadata.""" + + def __init__( + self, + prompt_token_count: int = 0, + candidates_token_count: int = 0, + ): + self.prompt_token_count = prompt_token_count + self.candidates_token_count = candidates_token_count + + +class FakePart: + """Stand-in for google.genai.types.Part.""" + + def __init__(self, *, text: str | None = None, function_call: object | None = None): + self.text = text + self.function_call = function_call + + +class FakeContent: + """Stand-in for google.genai.types.Content.""" + + def __init__(self, parts: list | None = None): + self.parts = parts or [] + + +class FakeLlmResponse: + """Stand-in for google.adk.models.LlmResponse.""" + + def __init__( + self, + *, + content: FakeContent | None = None, + usage_metadata: FakeUsageMetadata | None = None, + ): + self.content = content + self.usage_metadata = usage_metadata + + +class FakeLlmRequest: + """Stand-in for google.adk.models.LlmRequest.""" + + def __init__(self, model: str = "gemini-2.5-flash"): + self.model = model + + +class FakeCallbackContext: + """Stand-in for google.adk.agents.CallbackContext.""" + + def __init__( + self, + invocation_id: str = "inv-001", + agent_name: str = "test-agent", + ): + 
self.invocation_id = invocation_id + self.agent_name = agent_name + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _reset_adk_state(): + """Reset harness and ADK module state before every test.""" + reset() + adk_mod._config = adk_mod.GoogleADKHarnessConfig() + adk_mod._plugin_instance = None + adk_mod._enabled = False + + +# --------------------------------------------------------------------------- +# _normalize_model_name +# --------------------------------------------------------------------------- + + +class TestNormalizeModelName: + def test_plain_model(self): + assert adk_mod._normalize_model_name("gemini-2.5-flash") == "gemini-2.5-flash" + + def test_strips_provider_prefix(self): + assert adk_mod._normalize_model_name("openai/gpt-4o") == "gpt-4o" + + def test_strips_models_prefix(self): + assert adk_mod._normalize_model_name("models/gemini-2.5-flash") == "gemini-2.5-flash" + + def test_strips_litellm_prefix(self): + assert adk_mod._normalize_model_name("vertex_ai/gemini-2.5-pro") == "gemini-2.5-pro" + + def test_no_slash_passthrough(self): + assert adk_mod._normalize_model_name("gpt-4o-mini") == "gpt-4o-mini" + + +# --------------------------------------------------------------------------- +# _count_function_calls +# --------------------------------------------------------------------------- + + +class TestCountFunctionCalls: + def test_no_content(self): + assert adk_mod._count_function_calls(None) == 0 + + def test_no_parts(self): + content = FakeContent(parts=[]) + assert adk_mod._count_function_calls(content) == 0 + + def test_text_only(self): + content = FakeContent(parts=[FakePart(text="hello")]) + assert adk_mod._count_function_calls(content) == 0 + + def test_counts_function_calls(self): + content = FakeContent( + parts=[ + FakePart(text="thinking..."), + FakePart(function_call={"name": 
"search", "args": {}}), + FakePart(function_call={"name": "calculate", "args": {}}), + ] + ) + assert adk_mod._count_function_calls(content) == 2 + + +# --------------------------------------------------------------------------- +# Cost / energy estimation (via shared pricing) +# --------------------------------------------------------------------------- + + +class TestEstimation: + def test_estimate_cost_known_model(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.15 + 0.60) + + def test_estimate_cost_unknown_model_uses_default(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("unknown-model", 1_000_000, 0) + assert cost == pytest.approx(2.50) + + def test_estimate_energy_known_model(self): + from cascadeflow.harness.pricing import estimate_energy + + energy = estimate_energy("gemini-2.5-flash", 100, 100) + # coeff=0.3, output_weight=1.5 + assert energy == pytest.approx(0.3 * (100 + 100 * 1.5)) + + def test_estimate_energy_unknown_model(self): + from cascadeflow.harness.pricing import estimate_energy + + energy = estimate_energy("unknown-model", 100, 100) + # default coeff=1.0 + assert energy == pytest.approx(1.0 * (100 + 100 * 1.5)) + + +# --------------------------------------------------------------------------- +# before_model_callback +# --------------------------------------------------------------------------- + + +class TestBeforeModelCallback: + @pytest.fixture + def plugin(self): + return adk_mod.CascadeFlowADKPlugin() + + async def test_no_run_context_returns_none(self, plugin): + ctx = FakeCallbackContext() + req = FakeLlmRequest() + result = await plugin.before_model_callback(ctx, req) + assert result is None + + async def test_observe_mode_allows_over_budget(self, plugin): + init(mode="observe", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + result = await 
plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) + assert result is None # observe never blocks + + async def test_enforce_blocks_when_budget_exhausted(self, plugin): + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.001 + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest("gemini-2.5-flash") + ) + assert result is not None # short-circuit response + assert run_ctx.last_action == "stop" + trace = run_ctx.trace() + assert trace[-1]["reason"] == "budget_exhausted" + + async def test_enforce_blocked_call_does_not_leak_state(self, plugin): + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.001 + cb_ctx = FakeCallbackContext() + await plugin.before_model_callback(cb_ctx, FakeLlmRequest()) + key = plugin._callback_key(cb_ctx) + assert key not in plugin._call_start_times + assert key not in plugin._call_models + + async def test_enforce_allows_under_budget(self, plugin): + init(mode="enforce", budget=1.0) + with run(budget=1.0) as run_ctx: + run_ctx.cost = 0.5 + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) + assert result is None + + async def test_records_start_time_and_model(self, plugin): + init(mode="observe") + with run(): + cb_ctx = FakeCallbackContext() + await plugin.before_model_callback(cb_ctx, FakeLlmRequest("gpt-4o")) + key = plugin._callback_key(cb_ctx) + assert key in plugin._call_start_times + assert plugin._call_models[key] == "gpt-4o" + + async def test_normalizes_model_name(self, plugin): + init(mode="observe") + with run(): + cb_ctx = FakeCallbackContext() + await plugin.before_model_callback(cb_ctx, FakeLlmRequest("openai/gpt-4o")) + key = plugin._callback_key(cb_ctx) + assert plugin._call_models[key] == "gpt-4o" + + async def test_budget_gate_disabled_in_config(self): + plugin = adk_mod.CascadeFlowADKPlugin( + config=adk_mod.GoogleADKHarnessConfig(enable_budget_gate=False) + 
) + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) + assert result is None # gate disabled + + async def test_fail_open_swallows_errors(self, plugin): + init(mode="enforce") + with run(): + with patch( + "cascadeflow.integrations.google_adk.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) + assert result is None + + +# --------------------------------------------------------------------------- +# after_model_callback +# --------------------------------------------------------------------------- + + +class TestAfterModelCallback: + @pytest.fixture + def plugin(self): + return adk_mod.CascadeFlowADKPlugin() + + async def test_no_run_context_returns_none(self, plugin): + result = await plugin.after_model_callback( + FakeCallbackContext(), + FakeLlmResponse(), + ) + assert result is None + + async def test_updates_run_metrics_with_usage_metadata(self, plugin): + init(mode="observe") + with run(budget=1.0) as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_start_times[key] = time.monotonic() - 0.1 + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata( + prompt_token_count=100, + candidates_token_count=50, + ), + content=FakeContent(parts=[FakePart(text="done")]), + ) + await plugin.after_model_callback(cb_ctx, response) + + assert run_ctx.step_count == 1 + assert run_ctx.cost > 0 + assert run_ctx.energy_used > 0 + assert run_ctx.latency_used_ms > 0 + assert run_ctx.model_used == "gemini-2.5-flash" + assert run_ctx.last_action == "allow" + + async def test_fallback_token_estimation(self, plugin): + """When usage_metadata is missing, estimate from content text.""" + init(mode="observe") + with run() as run_ctx: + cb_ctx = 
FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + content=FakeContent(parts=[FakePart(text="x" * 400)]), + ) + await plugin.after_model_callback(cb_ctx, response) + + assert run_ctx.cost > 0 + assert run_ctx.step_count == 1 + + async def test_counts_tool_calls(self, plugin): + init(mode="observe") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + content=FakeContent( + parts=[ + FakePart(function_call={"name": "search"}), + FakePart(function_call={"name": "calc"}), + ] + ), + ) + await plugin.after_model_callback(cb_ctx, response) + assert run_ctx.tool_calls == 2 + + async def test_updates_budget_remaining(self, plugin): + init(mode="enforce", budget=1.0) + with run(budget=1.0) as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + ) + await plugin.after_model_callback(cb_ctx, response) + assert run_ctx.budget_remaining is not None + assert run_ctx.budget_remaining == pytest.approx(1.0 - run_ctx.cost) + + async def test_trace_records_mode(self, plugin): + init(mode="enforce") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gpt-4o" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(10, 10), + ) + await plugin.after_model_callback(cb_ctx, response) + trace = run_ctx.trace() + assert len(trace) == 1 + assert trace[0]["reason"] == "enforce" + assert trace[0]["model"] == "gpt-4o" + + async def test_no_start_time_records_zero_latency(self, plugin): + init(mode="observe") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + 
plugin._call_models[key] = "gpt-4o" + # Don't set start time + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(10, 10), + ) + await plugin.after_model_callback(cb_ctx, response) + assert run_ctx.latency_used_ms == 0.0 + + async def test_fallback_key_tracks_across_distinct_context_objects(self, plugin): + """ADK runtimes may pass different callback_context objects per phase.""" + init(mode="observe") + with run() as run_ctx: + before_ctx = FakeCallbackContext(invocation_id="inv-x", agent_name="agent-a") + after_ctx = FakeCallbackContext(invocation_id="inv-x", agent_name="agent-a") + await plugin.before_model_callback(before_ctx, FakeLlmRequest("gemini-2.5-flash")) + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + ) + await plugin.after_model_callback(after_ctx, response) + assert run_ctx.model_used == "gemini-2.5-flash" + assert run_ctx.latency_used_ms >= 0.0 + + async def test_fail_open_swallows_errors(self, plugin): + init(mode="observe") + with run(): + with patch( + "cascadeflow.integrations.google_adk.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = await plugin.after_model_callback( + FakeCallbackContext(), + FakeLlmResponse(), + ) + assert result is None + + +# --------------------------------------------------------------------------- +# on_model_error_callback +# --------------------------------------------------------------------------- + + +class TestOnModelErrorCallback: + @pytest.fixture + def plugin(self): + return adk_mod.CascadeFlowADKPlugin() + + async def test_records_error_in_trace(self, plugin): + init(mode="observe") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + plugin._call_start_times[key] = time.monotonic() + + await plugin.on_model_error_callback(cb_ctx, ValueError("bad input")) + + trace = run_ctx.trace() + assert len(trace) == 1 + assert trace[0]["action"] == "error" + assert 
"ValueError" in trace[0]["reason"] + assert trace[0]["model"] == "gemini-2.5-flash" + + async def test_cleans_up_timing_state(self, plugin): + init(mode="observe") + with run(): + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + plugin._call_start_times[key] = time.monotonic() + + await plugin.on_model_error_callback(cb_ctx, RuntimeError("oops")) + + assert key not in plugin._call_models + assert key not in plugin._call_start_times + + async def test_fail_open_swallows_errors(self, plugin): + init(mode="observe") + with run(): + with patch( + "cascadeflow.integrations.google_adk.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = await plugin.on_model_error_callback( + FakeCallbackContext(), + ValueError("test"), + ) + assert result is None + + +# --------------------------------------------------------------------------- +# enable / disable lifecycle +# --------------------------------------------------------------------------- + + +class TestEnableDisable: + def test_enable_returns_plugin_instance(self): + plugin = adk_mod.enable() + assert isinstance(plugin, adk_mod.CascadeFlowADKPlugin) + assert plugin.name == "cascadeflow_harness" + assert adk_mod.is_enabled() + + def test_enable_is_idempotent(self): + p1 = adk_mod.enable() + p2 = adk_mod.enable() + assert p1 is p2 # same instance + + def test_enable_applies_config(self): + config = adk_mod.GoogleADKHarnessConfig(fail_open=False, enable_budget_gate=False) + plugin = adk_mod.enable(config=config) + assert plugin._config.fail_open is False + assert plugin._config.enable_budget_gate is False + + def test_disable_deactivates_plugin(self): + plugin = adk_mod.enable() + assert plugin._active is True + adk_mod.disable() + assert not adk_mod.is_enabled() + assert plugin._active is False + + def test_disable_when_not_enabled_is_safe(self): + adk_mod.disable() # should not raise + assert not adk_mod.is_enabled() + + +# 
--------------------------------------------------------------------------- +# Public API helpers +# --------------------------------------------------------------------------- + + +class TestPublicAPI: + def test_is_available_reflects_module_flag(self): + assert adk_mod.is_available() == adk_mod.GOOGLE_ADK_AVAILABLE + + def test_is_enabled_default_false(self): + assert adk_mod.is_enabled() is False + + def test_get_config_returns_copy(self): + cfg = adk_mod.get_config() + assert isinstance(cfg, adk_mod.GoogleADKHarnessConfig) + assert cfg.fail_open is True + assert cfg.enable_budget_gate is True + # Modifying the copy doesn't affect module state + cfg.fail_open = False + assert adk_mod.get_config().fail_open is True + + +# --------------------------------------------------------------------------- +# GoogleADKHarnessConfig +# --------------------------------------------------------------------------- + + +class TestConfig: + def test_defaults(self): + cfg = adk_mod.GoogleADKHarnessConfig() + assert cfg.fail_open is True + assert cfg.enable_budget_gate is True + + def test_custom_values(self): + cfg = adk_mod.GoogleADKHarnessConfig(fail_open=False, enable_budget_gate=False) + assert cfg.fail_open is False + assert cfg.enable_budget_gate is False + + +# --------------------------------------------------------------------------- +# Plugin deactivate +# --------------------------------------------------------------------------- + + +class TestDeactivate: + async def test_deactivated_plugin_skips_callbacks(self): + plugin = adk_mod.CascadeFlowADKPlugin() + plugin.deactivate() + + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) + assert result is None # no-op, not blocked + + async def test_deactivate_clears_state(self): + plugin = adk_mod.CascadeFlowADKPlugin() + plugin._call_start_times[12345] = 1.0 + plugin._call_models[12345] = "test" 
+ plugin.deactivate() + assert len(plugin._call_start_times) == 0 + assert len(plugin._call_models) == 0 + + +# --------------------------------------------------------------------------- +# _extract_tokens +# --------------------------------------------------------------------------- + + +class TestExtractTokens: + def test_from_usage_metadata(self): + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 200), + ) + assert adk_mod.CascadeFlowADKPlugin._extract_tokens(response) == (100, 200) + + def test_zero_usage_falls_back_to_content(self): + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(0, 0), + content=FakeContent(parts=[FakePart(text="x" * 80)]), + ) + inp, out = adk_mod.CascadeFlowADKPlugin._extract_tokens(response) + assert inp == 0 + assert out == 20 # 80 / 4 + + def test_no_usage_no_content(self): + response = FakeLlmResponse() + assert adk_mod.CascadeFlowADKPlugin._extract_tokens(response) == (0, 0) + + def test_content_with_no_text(self): + response = FakeLlmResponse( + content=FakeContent(parts=[FakePart(function_call={"name": "f"})]), + ) + inp, out = adk_mod.CascadeFlowADKPlugin._extract_tokens(response) + assert inp == 0 + assert out == 1 # max(0//4, 1) + + +class TestCallbackKeyCollision: + """Verify _callback_key uses id() for per-object uniqueness.""" + + def test_distinct_keys_for_different_objects(self): + """Two distinct context objects always produce distinct keys.""" + ctx_a = FakeCallbackContext(invocation_id="inv-1", agent_name="agent-a") + ctx_b = FakeCallbackContext(invocation_id="inv-1", agent_name="agent-a") + key_a = adk_mod.CascadeFlowADKPlugin._callback_key(ctx_a) + key_b = adk_mod.CascadeFlowADKPlugin._callback_key(ctx_b) + assert key_a != key_b, "Same IDs on different objects must produce distinct keys" + + def test_key_stable_for_same_object(self): + """Same context object always produces the same key.""" + ctx = FakeCallbackContext() + key1 = adk_mod.CascadeFlowADKPlugin._callback_key(ctx) + key2 
= adk_mod.CascadeFlowADKPlugin._callback_key(ctx) + assert key1 == key2 + + def test_key_is_int(self): + """Key type is int (object id).""" + ctx = FakeCallbackContext() + assert isinstance(adk_mod.CascadeFlowADKPlugin._callback_key(ctx), int) + + @pytest.mark.asyncio + async def test_concurrent_same_ids_track_independently(self): + """Two concurrent calls with same invocation_id+agent_name don't corrupt.""" + init(mode="observe") + with run(budget=1.0) as harness_ctx: + plugin = adk_mod.CascadeFlowADKPlugin() + # Same IDs — previously would collide + ctx_a = FakeCallbackContext(invocation_id="inv-1", agent_name="agent") + ctx_b = FakeCallbackContext(invocation_id="inv-1", agent_name="agent") + + req_a = FakeLlmRequest(model="gpt-4o") + req_b = FakeLlmRequest(model="gpt-4o-mini") + + await plugin.before_model_callback(ctx_a, req_a) + await plugin.before_model_callback(ctx_b, req_b) + + resp_b = FakeLlmResponse(usage_metadata=FakeUsageMetadata(50, 25)) + resp_a = FakeLlmResponse(usage_metadata=FakeUsageMetadata(100, 50)) + await plugin.after_model_callback(ctx_b, resp_b) + await plugin.after_model_callback(ctx_a, resp_a) + + assert harness_ctx.step_count == 2 + assert len(plugin._call_start_times) == 0 + assert len(plugin._call_models) == 0 + + +# --------------------------------------------------------------------------- +# Off-mode behavior +# --------------------------------------------------------------------------- + + +class TestOffMode: + """mode='off' must not track metrics or update run context.""" + + @pytest.mark.asyncio + async def test_off_mode_before_callback_returns_none(self): + init(mode="off") + plugin = adk_mod.CascadeFlowADKPlugin() + with run() as run_ctx: + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) + assert result is None + assert len(plugin._call_start_times) == 0 + + @pytest.mark.asyncio + async def test_off_mode_after_callback_does_not_track(self): + init(mode="off") + plugin = 
adk_mod.CascadeFlowADKPlugin() + with run() as run_ctx: + await plugin.after_model_callback( + FakeCallbackContext(), + FakeLlmResponse(usage_metadata=FakeUsageMetadata(1000, 500)), + ) + assert run_ctx.step_count == 0 + assert run_ctx.cost == 0.0 + assert run_ctx.energy_used == 0.0 + assert len(run_ctx.trace()) == 0 + + +# --------------------------------------------------------------------------- +# Versioned model name resolution +# --------------------------------------------------------------------------- + + +class TestVersionedModelPricing: + """Versioned model IDs must resolve to correct pricing, not default.""" + + def test_versioned_gemini_flash(self): + from cascadeflow.harness.pricing import estimate_cost + + # Should resolve to gemini-2.5-flash pricing ($0.15/$0.60) + cost = estimate_cost("gemini-2.5-flash-preview-05-20", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + def test_versioned_gemini_pro(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-pro-preview-05-06", 1_000_000, 1_000_000) + assert cost == pytest.approx(11.25, abs=0.01) + + def test_dated_model_suffix(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash-20250120", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + def test_latest_suffix(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash-latest", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + def test_unknown_model_still_uses_default(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("totally-unknown-model", 1_000_000, 0) + assert cost == pytest.approx(2.50) + + def test_exact_match_still_works(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + 
def test_prefix_match_variant(self): + """A variant like gemini-2.5-flash-8b matches the base model.""" + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash-8b", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 5669e845..f4e7f9cd 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -5,6 +5,7 @@ import cascadeflow import cascadeflow.harness.api as harness_api from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run +from cascadeflow.telemetry.callbacks import CallbackEvent, CallbackManager def setup_function() -> None: @@ -154,6 +155,17 @@ def fn(x: int) -> int: assert policy["compliance"] == "gdpr" +def test_agent_decorator_preserves_function_metadata(): + @agent(budget=0.5) + def fn(x: int) -> int: + """sample doc""" + return x + + assert fn.__name__ == "fn" + assert fn.__doc__ == "sample doc" + assert fn.__annotations__ == {"x": int, "return": int} + + @pytest.mark.asyncio async def test_agent_decorator_keeps_async_behavior_and_attaches_metadata(): @agent(budget=0.4, kpi_weights={"cost": 1.0}) @@ -172,6 +184,8 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.run) assert callable(cascadeflow.harness_agent) assert hasattr(cascadeflow.agent, "PROVIDER_REGISTRY") + assert callable(cascadeflow.get_harness_callback_manager) + assert callable(cascadeflow.set_harness_callback_manager) report = cascadeflow.init(mode="off") assert report.mode == "off" @@ -183,6 +197,8 @@ def test_run_record_and_trace_copy(): trace_b = ctx.trace() assert trace_a == trace_b assert trace_a[0]["action"] == "switch_model" + assert "budget_state" in trace_a[0] + assert trace_a[0]["budget_state"]["max"] == 1.0 trace_a.append({"action": "mutated"}) assert len(ctx.trace()) == 1 @@ -205,6 +221,12 @@ def test_init_reads_from_env(monkeypatch): assert report.config_sources["budget"] == "env" 
+def test_init_rejects_oversized_env_json(monkeypatch): + monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_TARGETS", "x" * 5000) + with pytest.raises(ValueError, match="JSON config exceeds"): + init() + + def test_init_reads_from_config_file(tmp_path, monkeypatch): config = tmp_path / "cascadeflow.json" config.write_text( @@ -327,3 +349,188 @@ def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch): monkeypatch.setattr(instrument, "patch_openai", lambda: True) report = init(mode="observe") assert report.instrumented == ["openai"] + + +def test_init_reports_anthropic_instrumented_when_patch_succeeds(monkeypatch): + monkeypatch.setattr( + harness_api, + "find_spec", + lambda name: object() if name == "anthropic" else None, + ) + + import cascadeflow.harness.instrument as instrument + + monkeypatch.setattr(instrument, "patch_anthropic", lambda: True) + report = init(mode="observe") + assert report.instrumented == ["anthropic"] + + +def test_init_reports_anthropic_detected_not_instrumented_on_patch_failure(monkeypatch): + monkeypatch.setattr( + harness_api, + "find_spec", + lambda name: object() if name == "anthropic" else None, + ) + + import cascadeflow.harness.instrument as instrument + + monkeypatch.setattr(instrument, "patch_anthropic", lambda: False) + report = init(mode="observe") + assert report.instrumented == [] + assert report.detected_but_not_instrumented == ["anthropic"] + + +def test_run_summary_populates_on_context_exit(): + init(mode="observe") + with run(budget=1.5) as ctx: + ctx.step_count = 2 + ctx.tool_calls = 1 + ctx.cost = 0.42 + ctx.latency_used_ms = 123.0 + ctx.energy_used = 33.0 + ctx.budget_remaining = 1.08 + ctx.last_action = "allow" + ctx.model_used = "gpt-4o-mini" + + summary = ctx.summary() + assert summary["run_id"] == ctx.run_id + assert summary["step_count"] == 2 + assert summary["budget_remaining"] == pytest.approx(1.08) + assert summary["duration_ms"] is not None + assert summary["duration_ms"] >= 0.0 + assert 
ctx.duration_ms is not None + assert ctx.duration_ms >= 0.0 + + +def test_run_context_logs_summary(caplog): + init(mode="observe") + with caplog.at_level("INFO", logger="cascadeflow.harness"): + with run(budget=1.0) as ctx: + ctx.step_count = 1 + ctx.cost = 0.01 + ctx.model_used = "gpt-4o-mini" + + assert any("harness run summary" in rec.message for rec in caplog.records) + + +def test_record_emits_cascade_decision_callback(): + manager = CallbackManager() + received = [] + + def _on_decision(data): + received.append(data) + + manager.register(CallbackEvent.CASCADE_DECISION, _on_decision) + report = init(mode="observe", callback_manager=manager) + assert report.config_sources["callback_manager"] == "code" + + with run(budget=1.0) as ctx: + ctx.step_count = 1 + ctx.record(action="switch_model", reason="budget_pressure", model="gpt-4o-mini") + + assert len(received) == 1 + event = received[0] + assert event.event == CallbackEvent.CASCADE_DECISION + assert event.query == "[harness]" + assert event.workflow == "harness" + assert event.data["action"] == "switch_model" + assert event.data["run_id"] == ctx.run_id + + +def test_record_sanitizes_trace_values(): + ctx = run() + ctx.record( + action="allow\nnewline", + reason="a" * 400, + model="model\r\nname", + ) + entry = ctx.trace()[0] + assert "\n" not in entry["action"] + assert "\r" not in entry["model"] + assert len(entry["reason"]) <= 160 + + +def test_record_sanitizes_non_printable_values(): + ctx = run() + ctx.record(action="allow\x00", reason="ok\x1f", model="gpt-4o-mini\x07") + entry = ctx.trace()[0] + assert "\x00" not in entry["action"] + assert "\x1f" not in entry["reason"] + assert "\x07" not in entry["model"] + + +def test_record_without_callback_manager_is_noop(): + init(mode="observe") + with run(budget=1.0) as ctx: + ctx.record(action="allow", reason="test", model="gpt-4o-mini") + assert len(ctx.trace()) == 1 + + +def test_record_empty_action_warns_and_defaults(caplog): + init(mode="observe") + with 
caplog.at_level("WARNING", logger="cascadeflow.harness"): + with run(budget=1.0) as ctx: + ctx.record(action="", reason="test", model="gpt-4o-mini") + entry = ctx.trace()[0] + assert entry["action"] == "allow" + assert any("empty action" in rec.message for rec in caplog.records) + + +def test_init_rejects_negative_budget(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", budget=-1.0) + + +def test_init_rejects_negative_max_tool_calls(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", max_tool_calls=-1) + + +def test_init_rejects_negative_max_latency(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", max_latency_ms=-100.0) + + +def test_init_rejects_negative_max_energy(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", max_energy=-0.5) + + +def test_init_rejects_invalid_compliance(): + with pytest.raises(ValueError, match="compliance"): + init(mode="observe", compliance="invalid_mode") + + +def test_run_rejects_negative_budget(): + init(mode="observe") + with pytest.raises(ValueError, match="non-negative"): + run(budget=-0.5) + + +def test_run_rejects_invalid_compliance(): + init(mode="observe") + with pytest.raises(ValueError, match="compliance"): + run(compliance="foobar") + + +def test_init_accepts_zero_budget(): + report = init(mode="observe", budget=0.0) + cfg = get_harness_config() + assert cfg.budget == 0.0 + + +def test_init_accepts_valid_compliance(): + for value in ("gdpr", "hipaa", "pci", "strict"): + reset() + report = init(mode="observe", compliance=value) + cfg = get_harness_config() + assert cfg.compliance == value + + +def test_trace_rotation_limits_entries(): + init(mode="observe") + with run(budget=100.0) as ctx: + for i in range(1050): + ctx.record(action="allow", reason="test", model="gpt-4o-mini") + trace = ctx.trace() + assert len(trace) <= 1000 diff --git a/tests/test_harness_instrument.py 
b/tests/test_harness_instrument.py index 75368522..a46cf8a6 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -1,7 +1,8 @@ -"""Tests for cascadeflow.harness.instrument — OpenAI auto-instrumentation.""" +"""Tests for cascadeflow.harness.instrument — OpenAI + Anthropic auto-instrumentation.""" from __future__ import annotations +from importlib.util import find_spec import time from typing import Optional from unittest.mock import AsyncMock, MagicMock @@ -12,14 +13,24 @@ from cascadeflow.harness import init, reset, run from cascadeflow.harness.instrument import ( + _InstrumentedAnthropicAsyncStream, + _InstrumentedAnthropicStream, _InstrumentedAsyncStream, _InstrumentedStream, + _count_tool_calls_in_anthropic_response, _estimate_cost, _estimate_energy, + _extract_anthropic_usage, + _make_patched_anthropic_async_create, + _make_patched_anthropic_create, _make_patched_async_create, _make_patched_create, + is_anthropic_patched, + is_openai_patched, is_patched, + patch_anthropic, patch_openai, + unpatch_anthropic, unpatch_openai, ) @@ -87,19 +98,19 @@ def _mock_stream_chunk( class TestPatchLifecycle: def test_patch_and_unpatch(self) -> None: - assert not is_patched() + assert not is_openai_patched() result = patch_openai() assert result is True - assert is_patched() + assert is_openai_patched() unpatch_openai() - assert not is_patched() + assert not is_openai_patched() def test_idempotent_patching(self) -> None: patch_openai() patch_openai() - assert is_patched() + assert is_openai_patched() unpatch_openai() - assert not is_patched() + assert not is_openai_patched() def test_unpatch_without_prior_patch(self) -> None: unpatch_openai() # should not raise @@ -107,12 +118,12 @@ def test_unpatch_without_prior_patch(self) -> None: def test_init_observe_patches(self) -> None: report = init(mode="observe") assert "openai" in report.instrumented - assert is_patched() + assert is_openai_patched() def test_init_enforce_patches(self) -> None: 
report = init(mode="enforce") assert "openai" in report.instrumented - assert is_patched() + assert is_openai_patched() def test_init_off_does_not_patch(self) -> None: init(mode="off") @@ -120,7 +131,7 @@ def test_init_off_does_not_patch(self) -> None: def test_reset_unpatches(self) -> None: init(mode="observe") - assert is_patched() + assert is_openai_patched() reset() assert not is_patched() @@ -133,6 +144,27 @@ def test_class_method_actually_replaced(self) -> None: unpatch_openai() assert Completions.create is original + def test_patch_and_unpatch_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + assert not is_anthropic_patched() + result = patch_anthropic() + assert result is True + assert is_anthropic_patched() + unpatch_anthropic() + assert not is_anthropic_patched() + + def test_anthropic_class_method_actually_replaced(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + from anthropic.resources.messages import Messages + + original = Messages.create + patch_anthropic() + assert Messages.create is not original + unpatch_anthropic() + assert Messages.create is original + # --------------------------------------------------------------------------- # Sync wrapper @@ -402,6 +434,31 @@ def test_stream_finalize_is_idempotent(self) -> None: assert ctx.step_count == 1 # Should not double-count + def test_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + chunk1 = _mock_stream_chunk("data", usage=_mock_usage(100, 50)) + + class _FailingStream: + def __init__(self) -> None: + self._done = False + + def __iter__(self): + return self + + def __next__(self): + if not self._done: + self._done = True + return chunk1 + raise RuntimeError("stream failed") + + with run(budget=1.0) as ctx: + wrapped = _InstrumentedStream(_FailingStream(), ctx, "gpt-4o-mini", time.monotonic()) + with pytest.raises(RuntimeError, match="stream failed"): + 
list(wrapped) + + assert ctx.step_count == 1 + assert ctx.cost > 0 + def test_stream_wrapper_via_patched_create(self) -> None: """Verify that stream=True in the wrapper returns an _InstrumentedStream.""" init(mode="observe") @@ -464,6 +521,26 @@ async def _async_iter(): assert ctx.step_count == 1 + @pytest.mark.asyncio + async def test_async_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + chunk1 = _mock_stream_chunk("data", usage=_mock_usage(100, 50)) + + async def _failing_iter(): + yield chunk1 + raise RuntimeError("async stream failed") + + async with run(budget=1.0) as ctx: + wrapped = _InstrumentedAsyncStream( + _failing_iter(), ctx, "gpt-4o-mini", time.monotonic() + ) + with pytest.raises(RuntimeError, match="async stream failed"): + async for _ in wrapped: + pass + + assert ctx.step_count == 1 + assert ctx.cost > 0 + # --------------------------------------------------------------------------- # Cost and energy estimation @@ -941,3 +1018,487 @@ def test_non_stream_does_not_inject_stream_options(self) -> None: call_kwargs = original.call_args[1] assert "stream_options" not in call_kwargs + + +# =========================================================================== +# Anthropic instrumentation tests +# =========================================================================== + + +def _mock_anthropic_usage( + input_tokens: Optional[int] = 100, + output_tokens: Optional[int] = 50, +) -> MagicMock: + u = MagicMock() + u.input_tokens = input_tokens + u.output_tokens = output_tokens + return u + + +def _mock_anthropic_response( + input_tokens: int = 100, + output_tokens: int = 50, + content: Optional[list] = None, +) -> MagicMock: + resp = MagicMock() + resp.usage = _mock_anthropic_usage(input_tokens, output_tokens) + resp.content = content or [] + return resp + + +def _mock_tool_use_block() -> MagicMock: + block = MagicMock() + block.type = "tool_use" + return block + + +def _mock_text_block() -> MagicMock: + block = MagicMock() 
+ block.type = "text" + return block + + +def _mock_anthropic_message_start_event( + input_tokens: int = 100, + output_tokens: int = 0, +) -> MagicMock: + event = MagicMock() + event.type = "message_start" + event.message = MagicMock() + event.message.usage = _mock_anthropic_usage(input_tokens, output_tokens) + return event + + +def _mock_anthropic_message_delta_event( + output_tokens: int = 50, +) -> MagicMock: + event = MagicMock() + event.type = "message_delta" + event.usage = _mock_anthropic_usage(None, output_tokens) + return event + + +def _mock_anthropic_content_block_start_event( + block_type: str = "tool_use", +) -> MagicMock: + event = MagicMock() + event.type = "content_block_start" + event.content_block = MagicMock() + event.content_block.type = block_type + return event + + +def _mock_anthropic_message_stop_event() -> MagicMock: + event = MagicMock() + event.type = "message_stop" + event.usage = None + return event + + +# --------------------------------------------------------------------------- +# Anthropic usage extraction +# --------------------------------------------------------------------------- + + +class TestAnthropicUsageExtraction: + def test_extract_usage(self) -> None: + resp = _mock_anthropic_response(input_tokens=200, output_tokens=100) + inp, out = _extract_anthropic_usage(resp) + assert inp == 200 + assert out == 100 + + def test_extract_usage_none(self) -> None: + resp = MagicMock() + resp.usage = None + inp, out = _extract_anthropic_usage(resp) + assert inp == 0 + assert out == 0 + + +# --------------------------------------------------------------------------- +# Anthropic tool call counting +# --------------------------------------------------------------------------- + + +class TestAnthropicToolCallCounting: + def test_counts_tool_use_blocks(self) -> None: + resp = _mock_anthropic_response( + content=[_mock_text_block(), _mock_tool_use_block(), _mock_tool_use_block()] + ) + assert _count_tool_calls_in_anthropic_response(resp) == 
2 + + def test_no_content(self) -> None: + resp = MagicMock() + resp.content = None + assert _count_tool_calls_in_anthropic_response(resp) == 0 + + def test_empty_content(self) -> None: + resp = _mock_anthropic_response(content=[]) + assert _count_tool_calls_in_anthropic_response(resp) == 0 + + def test_text_only(self) -> None: + resp = _mock_anthropic_response(content=[_mock_text_block()]) + assert _count_tool_calls_in_anthropic_response(resp) == 0 + + +# --------------------------------------------------------------------------- +# Anthropic sync wrapper +# --------------------------------------------------------------------------- + + +class TestAnthropicSyncWrapper: + def test_observe_passes_through_response(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + original.assert_called_once() + + def test_observe_tracks_cost(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + # claude-sonnet-4: $3.00/1M in + $15.00/1M out = $18.00 + assert ctx.cost == pytest.approx(18.0, abs=0.01) + + def test_observe_tracks_step_count(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.step_count == 2 + + def test_observe_tracks_tool_calls(self) -> None: + init(mode="observe") + mock_resp = 
_mock_anthropic_response( + content=[_mock_tool_use_block(), _mock_tool_use_block()] + ) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.tool_calls == 2 + + def test_observe_tracks_energy(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1000, output_tokens=500) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + # claude-sonnet-4 uses default coefficient=1.0, output_weight=1.5 + # energy = 1.0 * (1000 + 500 * 1.5) = 1750.0 + assert ctx.energy_used == pytest.approx(1750.0) + + def test_observe_tracks_latency(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.latency_used_ms > 0 + + def test_budget_remaining_decreases(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.budget_remaining is not None + assert ctx.budget_remaining == pytest.approx(100.0 - 18.0, abs=0.01) + + def test_trace_records_model_and_mode(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + trace = ctx.trace() + assert len(trace) == 1 + assert trace[0]["action"] == "allow" + assert 
trace[0]["reason"] == "observe" + assert trace[0]["model"] == "claude-sonnet-4" + + def test_off_mode_passthrough_no_tracking(self) -> None: + init(mode="off") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run() as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + assert ctx.cost == 0.0 + assert ctx.step_count == 0 + + def test_no_run_scope_returns_response(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + result = wrapper(MagicMock(), model="claude-sonnet-4") + assert result is mock_resp + + def test_stream_tracks_usage_and_tool_calls(self) -> None: + init(mode="observe") + mock_stream = iter( + [ + _mock_anthropic_message_start_event(input_tokens=1_000_000), + _mock_anthropic_content_block_start_event("tool_use"), + _mock_anthropic_message_delta_event(output_tokens=1_000_000), + _mock_anthropic_message_stop_event(), + ] + ) + original = MagicMock(return_value=mock_stream) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicStream) + list(result) + + assert ctx.cost == pytest.approx(18.0, abs=0.01) + assert ctx.step_count == 1 + assert ctx.tool_calls == 1 + + def test_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + + class _FailingAnthropicStream: + def __init__(self) -> None: + self._done = False + + def __iter__(self): + return self + + def __next__(self): + if not self._done: + self._done = True + return _mock_anthropic_message_start_event(input_tokens=1_000_000) + raise RuntimeError("anthropic stream failed") + + original = MagicMock(return_value=_FailingAnthropicStream()) + wrapper = 
_make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicStream) + with pytest.raises(RuntimeError, match="anthropic stream failed"): + list(result) + + assert ctx.step_count == 1 + assert ctx.cost > 0 + + def test_multiple_calls_accumulate(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.cost == pytest.approx(36.0, abs=0.01) + assert ctx.step_count == 2 + + +# --------------------------------------------------------------------------- +# Anthropic async wrapper +# --------------------------------------------------------------------------- + + +class TestAnthropicAsyncWrapper: + async def test_observe_passes_through_response(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + + async def test_observe_tracks_cost(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=100.0) as ctx: + await wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.cost == pytest.approx(18.0, abs=0.01) + assert ctx.step_count == 1 + + async def test_off_mode_passthrough(self) -> None: + init(mode="off") + mock_resp = _mock_anthropic_response() + original = 
AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run() as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + assert ctx.cost == 0.0 + + async def test_stream_tracks_usage_and_tool_calls(self) -> None: + init(mode="observe") + + async def _event_stream(): + yield _mock_anthropic_message_start_event(input_tokens=1_000_000) + yield _mock_anthropic_content_block_start_event("tool_use") + yield _mock_anthropic_message_delta_event(output_tokens=1_000_000) + yield _mock_anthropic_message_stop_event() + + original = AsyncMock(return_value=_event_stream()) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicAsyncStream) + async for _ in result: + pass + + assert ctx.cost == pytest.approx(18.0, abs=0.01) + assert ctx.step_count == 1 + assert ctx.tool_calls == 1 + + async def test_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + + async def _failing_event_stream(): + yield _mock_anthropic_message_start_event(input_tokens=1_000_000) + raise RuntimeError("anthropic async stream failed") + + original = AsyncMock(return_value=_failing_event_stream()) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicAsyncStream) + with pytest.raises(RuntimeError, match="anthropic async stream failed"): + async for _ in result: + pass + + assert ctx.step_count == 1 + assert ctx.cost > 0 + + +# --------------------------------------------------------------------------- +# Anthropic enforce mode +# --------------------------------------------------------------------------- + + +class TestAnthropicEnforceMode: + def 
test_enforce_trace_records_enforce_reason(self) -> None: + init(mode="enforce") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + trace = ctx.trace() + assert trace[0]["reason"] == "enforce" + + def test_enforce_raises_on_budget_exhausted(self) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError + + init(mode="enforce") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=0.001) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + with pytest.raises(BudgetExceededError): + wrapper(MagicMock(), model="claude-sonnet-4") + + def test_observe_does_not_raise_on_budget_exhausted(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=0.001) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.cost > ctx.budget_max + + async def test_async_enforce_raises_on_budget_exhausted(self) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError + + init(mode="enforce") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=0.001) as ctx: + await wrapper(MagicMock(), model="claude-sonnet-4") + with pytest.raises(BudgetExceededError): + await wrapper(MagicMock(), model="claude-sonnet-4") + + +# --------------------------------------------------------------------------- +# Anthropic init() integration 
+# --------------------------------------------------------------------------- + + +class TestAnthropicInitIntegration: + def test_init_observe_patches_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + report = init(mode="observe") + assert "anthropic" in report.instrumented + assert is_anthropic_patched() + + def test_init_off_unpatches_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + init(mode="observe") + assert is_anthropic_patched() + init(mode="off") + assert not is_anthropic_patched() + + def test_reset_unpatches_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + init(mode="observe") + assert is_anthropic_patched() + reset() + assert not is_anthropic_patched()