diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py
index aabe6191..6dd64b05 100644
--- a/cascadeflow/__init__.py
+++ b/cascadeflow/__init__.py
@@ -239,6 +239,22 @@
     get_tool_risk_routing,
 )
 
+# NEW: Harness API scaffold (V2 core branch)
+# NOTE: harness.agent is NOT re-exported here — it would shadow the
+# cascadeflow.agent *module* and break dotted-path resolution
+# (e.g. patch("cascadeflow.agent.PROVIDER_REGISTRY")).
+# Use ``from cascadeflow.harness import agent`` instead.
+from .harness import (
+    HarnessConfig,
+    HarnessInitReport,
+    HarnessRunContext,
+    init,
+    reset,
+    run,
+    get_harness_config,
+    get_current_run,
+)
+
 
 # ==================== MAIN AGENT & RESULT ====================
 
@@ -381,6 +397,15 @@
     "ToolRiskClassification",  # NEW: v0.8.0 - Classification result
     "ToolRiskClassifier",  # NEW: v0.8.0 - Tool risk classifier
     "get_tool_risk_routing",  # NEW: v0.8.0 - Routing by risk level
+    # ===== HARNESS API (V2 scaffold) =====
+    "HarnessConfig",
+    "HarnessInitReport",
+    "HarnessRunContext",
+    "init",
+    "reset",
+    "run",
+    "get_harness_config",
+    "get_current_run",
     # ===== PROVIDERS =====
     "ModelResponse",
     "BaseProvider",
diff --git a/cascadeflow/harness/__init__.py b/cascadeflow/harness/__init__.py
new file mode 100644
index 00000000..43a03662
--- /dev/null
+++ b/cascadeflow/harness/__init__.py
@@ -0,0 +1,34 @@
+"""
+Core harness API scaffold for V2 planning work.
+
+This module provides a minimal, backward-compatible surface:
+- init(): global harness settings (opt-in)
+- run(): scoped run context for budget/trace accounting
+- agent(): decorator for attaching policy metadata
+
+The implementation intentionally avoids modifying existing CascadeAgent behavior.
+"""
+
+from .api import (
+    HarnessConfig,
+    HarnessInitReport,
+    HarnessRunContext,
+    agent,
+    get_current_run,
+    get_harness_config,
+    init,
+    reset,
+    run,
+)
+
+__all__ = [
+    "HarnessConfig",
+    "HarnessInitReport",
+    "HarnessRunContext",
+    "init",
+    "run",
+    "agent",
+    "get_current_run",
+    "get_harness_config",
+    "reset",
+]
diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py
new file mode 100644
index 00000000..00289f9c
--- /dev/null
+++ b/cascadeflow/harness/api.py
@@ -0,0 +1,441 @@
+from __future__ import annotations
+
+import inspect
+import json
+import logging
+import os
+from contextvars import ContextVar, Token
+from dataclasses import dataclass, field
+from importlib.util import find_spec
+from pathlib import Path
+from typing import Any, Callable, Literal, Optional, TypeVar, cast
+from uuid import uuid4
+
+logger = logging.getLogger("cascadeflow.harness")
+
+HarnessMode = Literal["off", "observe", "enforce"]
+
+
+@dataclass
+class HarnessConfig:
+    mode: HarnessMode = "off"
+    verbose: bool = False
+    budget: Optional[float] = None
+    max_tool_calls: Optional[int] = None
+    max_latency_ms: Optional[float] = None
+    max_energy: Optional[float] = None
+    kpi_targets: Optional[dict[str, float]] = None
+    kpi_weights: Optional[dict[str, float]] = None
+    compliance: Optional[str] = None
+
+
+@dataclass
+class HarnessInitReport:
+    mode: HarnessMode
+    instrumented: list[str]
+    detected_but_not_instrumented: list[str]
+    config_sources: dict[str, str]
+
+
+@dataclass
+class HarnessRunContext:
+    run_id: str = field(default_factory=lambda: uuid4().hex[:12])
+    mode: HarnessMode = "off"
+    budget_max: Optional[float] = None
+    tool_calls_max: Optional[int] = None
+    latency_max_ms: Optional[float] = None
+    energy_max: Optional[float] = None
+
+    cost: float = 0.0
+    savings: float = 0.0
+    tool_calls: int = 0
+    step_count: int = 0
+    latency_used_ms: float = 0.0
+    energy_used: float = 0.0
+    budget_remaining: Optional[float] = None
+    model_used: Optional[str] = None
+    last_action: str = "allow"
+    draft_accepted: Optional[bool] = None
+    _trace: list[dict[str, Any]] = field(default_factory=list)
+    _token: Optional[Token[Optional[HarnessRunContext]]] = field(default=None, init=False, repr=False)
+
+    def __post_init__(self) -> None:
+        if self.budget_max is not None and self.budget_remaining is None:
+            self.budget_remaining = self.budget_max
+
+    def __enter__(self) -> HarnessRunContext:
+        self._token = _current_run.set(self)
+        return self
+
+    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
+        if self._token is not None:
+            _current_run.reset(self._token)
+            self._token = None
+
+    async def __aenter__(self) -> HarnessRunContext:
+        return self.__enter__()
+
+    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
+        self.__exit__(exc_type, exc, tb)
+
+    def trace(self) -> list[dict[str, Any]]:
+        return list(self._trace)
+
+    def record(self, action: str, reason: str, model: Optional[str] = None) -> None:
+        self.last_action = action
+        self.model_used = model
+        self._trace.append(
+            {
+                "action": action,
+                "reason": reason,
+                "model": model,
+                "run_id": self.run_id,
+            }
+        )
+
+
+_harness_config: HarnessConfig = HarnessConfig()
+_current_run: ContextVar[Optional[HarnessRunContext]] = ContextVar("cascadeflow_harness_run", default=None)
+_is_instrumented: bool = False
+_UNSET = object()
+
+
+def _validate_mode(mode: str) -> HarnessMode:
+    if mode not in {"off", "observe", "enforce"}:
+        raise ValueError("mode must be one of: off, observe, enforce")
+    return cast(HarnessMode, mode)
+
+
+def _detect_sdks() -> dict[str, bool]:
+    return {
+        "openai": find_spec("openai") is not None,
+        "anthropic": find_spec("anthropic") is not None,
+    }
+
+
+def get_harness_config() -> HarnessConfig:
+    return HarnessConfig(**_harness_config.__dict__)
+
+
+def get_current_run() -> Optional[HarnessRunContext]:
+    return _current_run.get()
+
+
+def reset() -> None:
+    """
+    Reset harness global state and unpatch instrumented clients.
+
+    Intended for tests and controlled shutdown paths.
+    """
+
+    global _harness_config
+    global _is_instrumented
+
+    from cascadeflow.harness.instrument import unpatch_openai
+
+    unpatch_openai()
+    _harness_config = HarnessConfig()
+    _is_instrumented = False
+    _current_run.set(None)
+
+
+def _parse_bool(raw: str) -> bool:
+    normalized = raw.strip().lower()
+    return normalized in {"1", "true", "yes", "on"}
+
+
+def _parse_float(raw: str) -> float:
+    return float(raw.strip())
+
+
+def _parse_int(raw: str) -> int:
+    return int(raw.strip())
+
+
+def _parse_json_dict(raw: str) -> dict[str, float]:
+    value = json.loads(raw)
+    if not isinstance(value, dict):
+        raise ValueError("expected JSON object")
+    parsed: dict[str, float] = {}
+    for key, item in value.items():
+        parsed[str(key)] = float(item)
+    return parsed
+
+
+def _read_env_config() -> dict[str, Any]:
+    env_config: dict[str, Any] = {}
+
+    mode = os.getenv("CASCADEFLOW_HARNESS_MODE") or os.getenv("CASCADEFLOW_MODE")
+    if mode:
+        env_config["mode"] = mode
+
+    verbose = os.getenv("CASCADEFLOW_HARNESS_VERBOSE")
+    if verbose is not None:
+        env_config["verbose"] = _parse_bool(verbose)
+
+    budget = os.getenv("CASCADEFLOW_HARNESS_BUDGET") or os.getenv("CASCADEFLOW_BUDGET")
+    if budget is not None:
+        env_config["budget"] = _parse_float(budget)
+
+    max_tool_calls = os.getenv("CASCADEFLOW_HARNESS_MAX_TOOL_CALLS")
+    if max_tool_calls is not None:
+        env_config["max_tool_calls"] = _parse_int(max_tool_calls)
+
+    max_latency_ms = os.getenv("CASCADEFLOW_HARNESS_MAX_LATENCY_MS")
+    if max_latency_ms is not None:
+        env_config["max_latency_ms"] = _parse_float(max_latency_ms)
+
+    max_energy = os.getenv("CASCADEFLOW_HARNESS_MAX_ENERGY")
+    if max_energy is not None:
+        env_config["max_energy"] = _parse_float(max_energy)
+
+    compliance = os.getenv("CASCADEFLOW_HARNESS_COMPLIANCE")
+    if compliance is not None:
+        env_config["compliance"] = compliance
+
+    kpi_targets = os.getenv("CASCADEFLOW_HARNESS_KPI_TARGETS")
+    if kpi_targets is not None:
+        env_config["kpi_targets"] = _parse_json_dict(kpi_targets)
+
+    kpi_weights = os.getenv("CASCADEFLOW_HARNESS_KPI_WEIGHTS")
+    if kpi_weights is not None:
+        env_config["kpi_weights"] = _parse_json_dict(kpi_weights)
+
+    return env_config
+
+
+def _read_file_config() -> tuple[dict[str, Any], Optional[str]]:
+    """
+    Read harness config from CASCADEFLOW_CONFIG path or default config discovery.
+    """
+
+    config_path: Optional[str] = os.getenv("CASCADEFLOW_CONFIG")
+    loaded_path: Optional[str] = None
+
+    try:
+        from cascadeflow.config_loader import find_config, load_config
+    except Exception:
+        logger.debug("config_loader unavailable while reading harness config", exc_info=True)
+        return {}, None
+
+    try:
+        if config_path:
+            loaded_path = str(Path(config_path))
+            raw = load_config(config_path)
+        else:
+            discovered = find_config()
+            if not discovered:
+                return {}, None
+            loaded_path = str(discovered)
+            raw = load_config(discovered)
+    except Exception:
+        logger.warning("failed to load harness config file", exc_info=True)
+        return {}, None
+
+    if not isinstance(raw, dict):
+        return {}, loaded_path
+
+    harness_block = raw.get("harness")
+    if isinstance(harness_block, dict):
+        return dict(harness_block), loaded_path
+
+    # Fallback: allow top-level harness keys.
+    keys = {
+        "mode",
+        "verbose",
+        "budget",
+        "max_tool_calls",
+        "max_latency_ms",
+        "max_energy",
+        "kpi_targets",
+        "kpi_weights",
+        "compliance",
+    }
+    fallback = {k: v for k, v in raw.items() if k in keys}
+    return fallback, loaded_path
+
+
+def _resolve_value(
+    name: str,
+    explicit: Any,
+    env_config: dict[str, Any],
+    file_config: dict[str, Any],
+    default: Any,
+    sources: dict[str, str],
+) -> Any:
+    if explicit is not _UNSET:
+        sources[name] = "code"
+        return explicit
+    if name in env_config:
+        sources[name] = "env"
+        return env_config[name]
+    if name in file_config:
+        sources[name] = "file"
+        return file_config[name]
+    sources[name] = "default"
+    return default
+
+
+def init(
+    *,
+    mode: HarnessMode | object = _UNSET,
+    verbose: bool | object = _UNSET,
+    budget: Optional[float] | object = _UNSET,
+    max_tool_calls: Optional[int] | object = _UNSET,
+    max_latency_ms: Optional[float] | object = _UNSET,
+    max_energy: Optional[float] | object = _UNSET,
+    kpi_targets: Optional[dict[str, float]] | object = _UNSET,
+    kpi_weights: Optional[dict[str, float]] | object = _UNSET,
+    compliance: Optional[str] | object = _UNSET,
+) -> HarnessInitReport:
+    """
+    Initialize global harness settings and instrument detected SDK clients.
+    """
+
+    global _harness_config
+    global _is_instrumented
+
+    env_config = _read_env_config()
+    file_config, file_path = _read_file_config()
+    sources: dict[str, str] = {}
+
+    resolved_mode = _resolve_value("mode", mode, env_config, file_config, "off", sources)
+    resolved_verbose = _resolve_value("verbose", verbose, env_config, file_config, False, sources)
+    resolved_budget = _resolve_value("budget", budget, env_config, file_config, None, sources)
+    resolved_max_tool_calls = _resolve_value(
+        "max_tool_calls", max_tool_calls, env_config, file_config, None, sources
+    )
+    resolved_max_latency_ms = _resolve_value(
+        "max_latency_ms", max_latency_ms, env_config, file_config, None, sources
+    )
+    resolved_max_energy = _resolve_value("max_energy", max_energy, env_config, file_config, None, sources)
+    resolved_kpi_targets = _resolve_value(
+        "kpi_targets", kpi_targets, env_config, file_config, None, sources
+    )
+    resolved_kpi_weights = _resolve_value(
+        "kpi_weights", kpi_weights, env_config, file_config, None, sources
+    )
+    resolved_compliance = _resolve_value(
+        "compliance", compliance, env_config, file_config, None, sources
+    )
+
+    validated_mode = _validate_mode(str(resolved_mode))
+    _harness_config = HarnessConfig(
+        mode=validated_mode,
+        verbose=bool(resolved_verbose),
+        budget=cast(Optional[float], resolved_budget),
+        max_tool_calls=cast(Optional[int], resolved_max_tool_calls),
+        max_latency_ms=cast(Optional[float], resolved_max_latency_ms),
+        max_energy=cast(Optional[float], resolved_max_energy),
+        kpi_targets=cast(Optional[dict[str, float]], resolved_kpi_targets),
+        kpi_weights=cast(Optional[dict[str, float]], resolved_kpi_weights),
+        compliance=cast(Optional[str], resolved_compliance),
+    )
+
+    sdk_presence = _detect_sdks()
+    instrumented: list[str] = []
+    detected_but_not_instrumented: list[str] = []
+
+    if validated_mode != "off" and sdk_presence["openai"]:
+        from cascadeflow.harness.instrument import patch_openai
+
+        if patch_openai():
+            instrumented.append("openai")
+    elif validated_mode == "off":
+        from cascadeflow.harness.instrument import is_patched, unpatch_openai
+
+        if is_patched():
+            unpatch_openai()
+    if sdk_presence["anthropic"]:
+        detected_but_not_instrumented.append("anthropic")
+
+    if _is_instrumented:
+        logger.debug("harness init called again; instrumentation remains idempotent")
+    _is_instrumented = True
+
+    logger.info("harness init mode=%s instrumented=%s", validated_mode, instrumented)
+    if detected_but_not_instrumented:
+        logger.info(
+            "harness detected but not instrumented=%s",
+            detected_but_not_instrumented,
+        )
+    if file_path:
+        logger.debug("harness loaded config file=%s", file_path)
+
+    return HarnessInitReport(
+        mode=validated_mode,
+        instrumented=instrumented,
+        detected_but_not_instrumented=detected_but_not_instrumented,
+        config_sources=sources,
+    )
+
+
+def run(
+    *,
+    budget: Optional[float] = None,
+    max_tool_calls: Optional[int] = None,
+    max_latency_ms: Optional[float] = None,
+    max_energy: Optional[float] = None,
+) -> HarnessRunContext:
+    """
+    Create a scoped run context.
+
+    Scope-level values override global init defaults for the scope only.
+    """
+
+    config = get_harness_config()
+    resolved_budget = budget if budget is not None else config.budget
+    resolved_tool_calls = max_tool_calls if max_tool_calls is not None else config.max_tool_calls
+    resolved_latency = max_latency_ms if max_latency_ms is not None else config.max_latency_ms
+    resolved_energy = max_energy if max_energy is not None else config.max_energy
+
+    return HarnessRunContext(
+        mode=config.mode,
+        budget_max=resolved_budget,
+        tool_calls_max=resolved_tool_calls,
+        latency_max_ms=resolved_latency,
+        energy_max=resolved_energy,
+    )
+ """ + + config = get_harness_config() + resolved_budget = budget if budget is not None else config.budget + resolved_tool_calls = max_tool_calls if max_tool_calls is not None else config.max_tool_calls + resolved_latency = max_latency_ms if max_latency_ms is not None else config.max_latency_ms + resolved_energy = max_energy if max_energy is not None else config.max_energy + + return HarnessRunContext( + mode=config.mode, + budget_max=resolved_budget, + tool_calls_max=resolved_tool_calls, + latency_max_ms=resolved_latency, + energy_max=resolved_energy, + ) + + +F = TypeVar("F", bound=Callable[..., Any]) + + +def agent( + *, + budget: Optional[float] = None, + kpi_targets: Optional[dict[str, float]] = None, + kpi_weights: Optional[dict[str, float]] = None, + compliance: Optional[str] = None, +) -> Callable[[F], F]: + """ + Attach policy metadata to an agent function without changing behavior. + """ + + metadata = { + "budget": budget, + "kpi_targets": kpi_targets, + "kpi_weights": kpi_weights, + "compliance": compliance, + } + + def decorator(func: F) -> F: + func.__cascadeflow_agent_policy__ = metadata # type: ignore[attr-defined] + + if inspect.iscoroutinefunction(func): + + async def async_wrapper(*args: Any, **kwargs: Any) -> Any: + return await func(*args, **kwargs) + + async_wrapper.__cascadeflow_agent_policy__ = metadata # type: ignore[attr-defined] + async_wrapper.__name__ = getattr(func, "__name__", "wrapped_agent") + return cast(F, async_wrapper) + + def sync_wrapper(*args: Any, **kwargs: Any) -> Any: + return func(*args, **kwargs) + + sync_wrapper.__cascadeflow_agent_policy__ = metadata # type: ignore[attr-defined] + sync_wrapper.__name__ = getattr(func, "__name__", "wrapped_agent") + return cast(F, sync_wrapper) + + return decorator diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py new file mode 100644 index 00000000..c02200f7 --- /dev/null +++ b/cascadeflow/harness/instrument.py @@ -0,0 +1,576 @@ +"""OpenAI Python 
client auto-instrumentation for cascadeflow harness. + +Patches ``openai.resources.chat.completions.Completions.create`` (sync) and +``AsyncCompletions.create`` (async) to intercept LLM calls for observe/enforce +modes. + +This module is called internally by ``cascadeflow.harness.init()``. Users +should not call ``patch_openai`` / ``unpatch_openai`` directly. + +Implementation notes: + - Patching is class-level (all current and future client instances). + - Patching is idempotent (safe to call multiple times). + - ``unpatch_openai()`` restores the original methods exactly. + - Streaming responses are wrapped to capture usage after completion. + - ``with_raw_response`` is NOT patched in V2 (known limitation). +""" + +from __future__ import annotations + +import functools +import logging +import time +from typing import Any + +logger = logging.getLogger("cascadeflow.harness.instrument") + +# --------------------------------------------------------------------------- +# Module-level state for idempotent patch/unpatch +# --------------------------------------------------------------------------- + +_openai_patched: bool = False +_original_sync_create: Any = None +_original_async_create: Any = None + +# --------------------------------------------------------------------------- +# Pricing table (USD per 1M tokens: input, output) +# --------------------------------------------------------------------------- + +_PRICING: dict[str, tuple[float, float]] = { + "gpt-4o": (2.50, 10.00), + "gpt-4o-mini": (0.15, 0.60), + "gpt-5-mini": (0.20, 0.80), + "gpt-4-turbo": (10.00, 30.00), + "gpt-4": (30.00, 60.00), + "gpt-3.5-turbo": (0.50, 1.50), + "o1": (15.00, 60.00), + "o1-mini": (3.00, 12.00), + "o3-mini": (1.10, 4.40), +} +_DEFAULT_PRICING: tuple[float, float] = (2.50, 10.00) + +# --------------------------------------------------------------------------- +# Energy estimation coefficients (deterministic proxy, not live carbon data) +# energy_units = coefficient * (input_tokens + 
output_tokens * output_weight) +# --------------------------------------------------------------------------- + +_ENERGY_COEFFICIENTS: dict[str, float] = { + "gpt-4o": 1.0, + "gpt-4o-mini": 0.3, + "gpt-5-mini": 0.35, + "gpt-4-turbo": 1.5, + "gpt-4": 1.5, + "gpt-3.5-turbo": 0.2, + "o1": 2.0, + "o1-mini": 0.8, + "o3-mini": 0.5, +} +_DEFAULT_ENERGY_COEFFICIENT: float = 1.0 +_ENERGY_OUTPUT_WEIGHT: float = 1.5 + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _ensure_stream_usage(kwargs: dict[str, Any]) -> dict[str, Any]: + """Inject ``stream_options.include_usage=True`` for streaming requests. + + OpenAI only sends usage data in the final stream chunk when this option + is set. Without it the harness would record zero cost for every + streaming call. + """ + if not kwargs.get("stream", False): + return kwargs + stream_options = kwargs.get("stream_options") or {} + if not stream_options.get("include_usage"): + stream_options = {**stream_options, "include_usage": True} + kwargs = {**kwargs, "stream_options": stream_options} + return kwargs + + +def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: + """Estimate cost in USD from model name and token counts.""" + per_million = _PRICING.get(model, _DEFAULT_PRICING) + input_cost = (prompt_tokens / 1_000_000) * per_million[0] + output_cost = (completion_tokens / 1_000_000) * per_million[1] + return input_cost + output_cost + + +def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: + """Estimate energy units (deterministic proxy, not live carbon).""" + coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) + return coeff * (prompt_tokens + completion_tokens * _ENERGY_OUTPUT_WEIGHT) + + +def _count_tool_calls_in_response(response: Any) -> int: + """Count tool calls in a non-streaming ChatCompletion response.""" + choices = 
getattr(response, "choices", None) + if not choices: + return 0 + message = getattr(choices[0], "message", None) + if message is None: + return 0 + tool_calls = getattr(message, "tool_calls", None) + if tool_calls is None: + return 0 + return len(tool_calls) + + +def _extract_usage(response: Any) -> tuple[int, int]: + """Extract (prompt_tokens, completion_tokens) from a response.""" + usage = getattr(response, "usage", None) + if usage is None: + return 0, 0 + return ( + getattr(usage, "prompt_tokens", 0) or 0, + getattr(usage, "completion_tokens", 0) or 0, + ) + + +def _check_budget_pre_call(ctx: Any) -> None: + """Raise BudgetExceededError in enforce mode if budget is already exhausted.""" + if ctx.mode != "enforce": + return + if ctx.budget_max is not None and ctx.cost >= ctx.budget_max: + from cascadeflow.schema.exceptions import BudgetExceededError + + remaining = ctx.budget_max - ctx.cost + raise BudgetExceededError( + f"Budget exhausted: spent ${ctx.cost:.4f} of ${ctx.budget_max:.4f} max", + remaining=remaining, + ) + + +def _update_context( + ctx: Any, + model: str, + prompt_tokens: int, + completion_tokens: int, + tool_call_count: int, + elapsed_ms: float, +) -> None: + """Update a HarnessRunContext with call metrics.""" + cost = _estimate_cost(model, prompt_tokens, completion_tokens) + energy = _estimate_energy(model, prompt_tokens, completion_tokens) + + ctx.cost += cost + ctx.step_count += 1 + ctx.latency_used_ms += elapsed_ms + ctx.energy_used += energy + ctx.tool_calls += tool_call_count + + if ctx.budget_max is not None: + ctx.budget_remaining = ctx.budget_max - ctx.cost + + ctx.model_used = model + ctx.record(action="allow", reason=ctx.mode, model=model) + + +# --------------------------------------------------------------------------- +# Stream wrappers +# --------------------------------------------------------------------------- + + +class _InstrumentedStream: + """Wraps an OpenAI ``Stream`` to capture usage after all chunks are consumed.""" + + 
__slots__ = ( + "_stream", + "_ctx", + "_model", + "_start_time", + "_usage", + "_tool_call_count", + "_finalized", + ) + + def __init__( + self, + stream: Any, + ctx: Any, + model: str, + start_time: float, + ) -> None: + self._stream = stream + self._ctx = ctx + self._model = model + self._start_time = start_time + self._usage: Any = None + self._tool_call_count: int = 0 + self._finalized: bool = False + + # --- iteration --------------------------------------------------------- + + def __iter__(self) -> _InstrumentedStream: + return self + + def __next__(self) -> Any: + try: + chunk = next(self._stream) + self._inspect_chunk(chunk) + return chunk + except StopIteration: + self._finalize() + raise + + # --- context manager --------------------------------------------------- + + def __enter__(self) -> _InstrumentedStream: + if hasattr(self._stream, "__enter__"): + self._stream.__enter__() + return self + + def __exit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__exit__"): + return self._stream.__exit__(*args) # type: ignore[no-any-return] + return False + + # --- proxied attributes ------------------------------------------------ + + def close(self) -> None: + self._finalize() + if hasattr(self._stream, "close"): + self._stream.close() + + @property + def response(self) -> Any: + return getattr(self._stream, "response", None) + + # --- internals --------------------------------------------------------- + + def _inspect_chunk(self, chunk: Any) -> None: + usage = getattr(chunk, "usage", None) + if usage is not None: + self._usage = usage + + choices = getattr(chunk, "choices", []) + if choices: + delta = getattr(choices[0], "delta", None) + if delta: + tool_calls = getattr(delta, "tool_calls", None) + if tool_calls: + for tc in tool_calls: + # A new tool call has an ``id``; subsequent deltas + # for the same call only have ``index``. 
+ if getattr(tc, "id", None): + self._tool_call_count += 1 + + def _finalize(self) -> None: + if self._finalized: + return + self._finalized = True + + if self._ctx is None: + return + + elapsed_ms = (time.monotonic() - self._start_time) * 1000 + prompt_tokens = 0 + completion_tokens = 0 + if self._usage: + prompt_tokens = getattr(self._usage, "prompt_tokens", 0) or 0 + completion_tokens = getattr(self._usage, "completion_tokens", 0) or 0 + + _update_context( + self._ctx, + self._model, + prompt_tokens, + completion_tokens, + self._tool_call_count, + elapsed_ms, + ) + + +class _InstrumentedAsyncStream: + """Wraps an OpenAI ``AsyncStream`` to capture usage after consumption.""" + + __slots__ = ( + "_stream", + "_ctx", + "_model", + "_start_time", + "_usage", + "_tool_call_count", + "_finalized", + ) + + def __init__( + self, + stream: Any, + ctx: Any, + model: str, + start_time: float, + ) -> None: + self._stream = stream + self._ctx = ctx + self._model = model + self._start_time = start_time + self._usage: Any = None + self._tool_call_count: int = 0 + self._finalized: bool = False + + # --- async iteration --------------------------------------------------- + + def __aiter__(self) -> _InstrumentedAsyncStream: + return self + + async def __anext__(self) -> Any: + try: + chunk = await self._stream.__anext__() + self._inspect_chunk(chunk) + return chunk + except StopAsyncIteration: + self._finalize() + raise + + # --- async context manager --------------------------------------------- + + async def __aenter__(self) -> _InstrumentedAsyncStream: + if hasattr(self._stream, "__aenter__"): + await self._stream.__aenter__() + return self + + async def __aexit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__aexit__"): + return await self._stream.__aexit__(*args) # type: ignore[no-any-return] + return False + + # --- proxied attributes ------------------------------------------------ + + def close(self) -> None: + self._finalize() + if 
hasattr(self._stream, "close"): + self._stream.close() + + @property + def response(self) -> Any: + return getattr(self._stream, "response", None) + + # --- internals --------------------------------------------------------- + + def _inspect_chunk(self, chunk: Any) -> None: + usage = getattr(chunk, "usage", None) + if usage is not None: + self._usage = usage + + choices = getattr(chunk, "choices", []) + if choices: + delta = getattr(choices[0], "delta", None) + if delta: + tool_calls = getattr(delta, "tool_calls", None) + if tool_calls: + for tc in tool_calls: + if getattr(tc, "id", None): + self._tool_call_count += 1 + + def _finalize(self) -> None: + if self._finalized: + return + self._finalized = True + + if self._ctx is None: + return + + elapsed_ms = (time.monotonic() - self._start_time) * 1000 + prompt_tokens = 0 + completion_tokens = 0 + if self._usage: + prompt_tokens = getattr(self._usage, "prompt_tokens", 0) or 0 + completion_tokens = getattr(self._usage, "completion_tokens", 0) or 0 + + _update_context( + self._ctx, + self._model, + prompt_tokens, + completion_tokens, + self._tool_call_count, + elapsed_ms, + ) + + +# --------------------------------------------------------------------------- +# Wrapper factories +# --------------------------------------------------------------------------- + + +def _make_patched_create(original_fn: Any) -> Any: + """Create a patched version of ``Completions.create``.""" + + @functools.wraps(original_fn) + def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + is_stream: bool = bool(kwargs.get("stream", False)) + + if ctx: + _check_budget_pre_call(ctx) + + start_time = time.monotonic() + + kwargs = _ensure_stream_usage(kwargs) + + 
+
+
+def _make_patched_async_create(original_fn: Any) -> Any:
+    """Create a patched version of ``AsyncCompletions.create``."""
+
+    @functools.wraps(original_fn)
+    async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
+        from cascadeflow.harness.api import get_current_run, get_harness_config
+
+        config = get_harness_config()
+        ctx = get_current_run()
+        mode = ctx.mode if ctx else config.mode
+
+        if mode == "off":
+            return await original_fn(self, *args, **kwargs)
+
+        model: str = kwargs.get("model", "unknown")
+        is_stream: bool = bool(kwargs.get("stream", False))
+
+        if ctx:
+            _check_budget_pre_call(ctx)
+
+        start_time = time.monotonic()
+
+        kwargs = _ensure_stream_usage(kwargs)
+
+        logger.debug(
+            "harness intercept async: model=%s stream=%s mode=%s",
+            model,
+            is_stream,
+            mode,
+        )
+
+        response = await original_fn(self, *args, **kwargs)
+
+        if is_stream and ctx:
+            return _InstrumentedAsyncStream(response, ctx, model, start_time)
+        elif not is_stream and ctx:
+            elapsed_ms = (time.monotonic() - start_time) * 1000
+            prompt_tokens, completion_tokens = _extract_usage(response)
+            tool_call_count = _count_tool_calls_in_response(response)
+            _update_context(
+                ctx,
+                model,
+                prompt_tokens,
+                completion_tokens,
+                tool_call_count,
+                elapsed_ms,
+            )
+        else:
+            logger.debug(
+                "harness %s: model=%s (no active run scope, metrics not tracked)",
+                mode,
+                model,
+            )
+
+        return response
+
+    return wrapper
+
+
+# ---------------------------------------------------------------------------
+# Public API (called by cascadeflow.harness.api)
+# ---------------------------------------------------------------------------
+
+
+def patch_openai() -> bool:
+    """Patch the OpenAI Python client for harness instrumentation.
+
+    Returns ``True`` if patching succeeded, ``False`` if openai is not
+    installed. Idempotent: safe to call multiple times.
+    """
+    global _openai_patched, _original_sync_create, _original_async_create
+
+    if _openai_patched:
+        logger.debug("openai already patched, skipping")
+        return True
+
+    try:
+        from openai.resources.chat.completions import AsyncCompletions, Completions
+    except ImportError:
+        logger.debug("openai package not available, skipping instrumentation")
+        return False
+
+    _original_sync_create = Completions.create
+    _original_async_create = AsyncCompletions.create
+
+    Completions.create = _make_patched_create(_original_sync_create)  # type: ignore[assignment]
+    AsyncCompletions.create = _make_patched_async_create(  # type: ignore[assignment]
+        _original_async_create,
+    )
+
+    _openai_patched = True
+    logger.info("openai client instrumented (sync + async)")
+    return True
+
+
+def unpatch_openai() -> None:
+    """Restore original OpenAI client methods.
+
+    Safe to call even if not patched. Used by ``reset()`` and tests.
+    """
+    global _openai_patched, _original_sync_create, _original_async_create
+
+    if not _openai_patched:
+        return
+
+    try:
+        from openai.resources.chat.completions import AsyncCompletions, Completions
+    except ImportError:
+        _openai_patched = False
+        return
+
+    if _original_sync_create is not None:
+        Completions.create = _original_sync_create  # type: ignore[assignment]
+    if _original_async_create is not None:
+        AsyncCompletions.create = _original_async_create  # type: ignore[assignment]
+
+    _original_sync_create = None
+    _original_async_create = None
+    _openai_patched = False
+    logger.info("openai client unpatched")
+
+
+def is_patched() -> bool:
+    """Return whether the OpenAI client is currently patched."""
+    return _openai_patched
diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py
new file mode 100644
index 00000000..7f6cd44b
--- /dev/null
+++ b/cascadeflow/harness/pricing.py
@@ -0,0 +1,133 @@
+"""Shared pricing and energy estimation for harness integrations.
+
+Provides approximate USD-per-1M-token pricing and deterministic energy
+coefficients used by CrewAI, OpenAI Agents, Google ADK, and future
+integration modules.
+
+A future pricing registry will consolidate with ``cascadeflow.pricing``
+and LiteLLM live data. Until then this module is the canonical source
+for harness-level cost/energy estimation.
+"""
+
+from __future__ import annotations
+
+import re as _re
+
+# ---------------------------------------------------------------------------
+# Pricing (USD per 1M tokens: input, output)
+# ---------------------------------------------------------------------------
+
+PRICING_USD_PER_M: dict[str, tuple[float, float]] = {
+    # OpenAI
+    "gpt-4o": (2.50, 10.00),
+    "gpt-4o-mini": (0.15, 0.60),
+    "gpt-5": (1.25, 10.00),
+    "gpt-5-mini": (0.20, 0.80),
+    "gpt-4-turbo": (10.00, 30.00),
+    "gpt-4": (30.00, 60.00),
+    "gpt-3.5-turbo": (0.50, 1.50),
+    "o1": (15.00, 60.00),
+    "o1-mini": (3.00, 12.00),
+    "o3-mini": (1.10, 4.40),
+    # Anthropic
+    "claude-sonnet-4": (3.00, 15.00),
+    "claude-haiku-3.5": (1.00, 5.00),
+    "claude-opus-4.5": (5.00, 25.00),
+    # Google Gemini
+    "gemini-2.5-flash": (0.15, 0.60),
+    "gemini-2.5-pro": (1.25, 10.00),
+    "gemini-2.0-flash": (0.10, 0.40),
+    "gemini-1.5-flash": (0.075, 0.30),
+    "gemini-1.5-pro": (1.25, 5.00),
+}
+DEFAULT_PRICING_USD_PER_M: tuple[float, float] = (2.50, 10.00)
+
+# ---------------------------------------------------------------------------
+# Energy coefficients (deterministic proxy for compute intensity)
+# ---------------------------------------------------------------------------
+
+ENERGY_COEFFICIENTS: dict[str, float] = {
+    # OpenAI
+    "gpt-4o": 1.0,
+    "gpt-4o-mini": 0.3,
+    "gpt-5": 1.2,
+    "gpt-5-mini": 0.35,
+    "gpt-4-turbo": 1.5,
+    "gpt-4": 1.5,
+    "gpt-3.5-turbo": 0.2,
+    "o1": 2.0,
+    "o1-mini": 0.8,
+    "o3-mini": 0.5,
+    # Anthropic
+    "claude-sonnet-4": 1.0,
+    "claude-haiku-3.5": 0.3,
+    "claude-opus-4.5": 1.8,
+    # Google Gemini
+    "gemini-2.5-flash": 0.3,
+    "gemini-2.5-pro": 1.2,
+    "gemini-2.0-flash": 0.25,
+    "gemini-1.5-flash": 0.2,
+    "gemini-1.5-pro": 1.0,
+}
+DEFAULT_ENERGY_COEFFICIENT: float = 1.0
+ENERGY_OUTPUT_WEIGHT: float = 1.5
+
+
+# Pre-compiled pattern for stripping version/preview/date suffixes.
+# Matches: -preview, -preview-05-20, -20250120, -latest, -exp-0827, -it, etc.
+_VERSION_SUFFIX_RE = _re.compile(
+    r"(-preview(?:-\d{2,4}-\d{2})?|-\d{8,}|-latest|-exp(?:-\d+)?|-it)$"
+)
+
+# Cache for resolved model → pricing key lookups.
+_pricing_key_cache: dict[str, str | None] = {}
+
+
+def _resolve_pricing_key(model: str) -> str | None:
+    """Resolve a model name to a known pricing table key.
+
+    Tries exact match first, then strips version/preview/date suffixes,
+    then tries longest-prefix match against known model names.
+    Returns ``None`` when no match is found (caller should use defaults).
+    """
+    if model in _pricing_key_cache:
+        return _pricing_key_cache[model]
+
+    # Exact match
+    if model in PRICING_USD_PER_M:
+        _pricing_key_cache[model] = model
+        return model
+
+    # Strip version suffixes and retry
+    stripped = _VERSION_SUFFIX_RE.sub("", model)
+    if stripped != model and stripped in PRICING_USD_PER_M:
+        _pricing_key_cache[model] = stripped
+        return stripped
+
+    # Longest-prefix match (e.g. "gemini-2.5-flash-8b" → "gemini-2.5-flash")
+    best: str | None = None
+    best_len = 0
+    for known in PRICING_USD_PER_M:
+        if model.startswith(known) and len(known) > best_len:
+            best = known
+            best_len = len(known)
+    if best is not None:
+        _pricing_key_cache[model] = best
+        return best
+
+    _pricing_key_cache[model] = None
+    return None
+
+
+def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
+    """Estimate cost in USD from model name and token counts."""
+    key = _resolve_pricing_key(model)
+    in_price, out_price = PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M
+    return (input_tokens / 1_000_000) * in_price + (output_tokens / 1_000_000) * out_price
+
+
+def estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float:
+    """Estimate energy proxy from model name and token counts."""
+    key = _resolve_pricing_key(model)
+    coeff = ENERGY_COEFFICIENTS.get(key, DEFAULT_ENERGY_COEFFICIENT) if key else DEFAULT_ENERGY_COEFFICIENT
+    return coeff * (input_tokens +
output_tokens * ENERGY_OUTPUT_WEIGHT) diff --git a/cascadeflow/integrations/__init__.py b/cascadeflow/integrations/__init__.py index 9a0dfa4d..61c3ebbd 100644 --- a/cascadeflow/integrations/__init__.py +++ b/cascadeflow/integrations/__init__.py @@ -90,6 +90,25 @@ extract_token_usage = None MODEL_PRICING = None +# Try to import OpenAI Agents SDK integration +try: + from .openai_agents import ( + OPENAI_AGENTS_SDK_AVAILABLE, + CascadeFlowModelProvider, + OpenAIAgentsIntegrationConfig, + create_openai_agents_provider, + is_openai_agents_sdk_available, + ) + + OPENAI_AGENTS_AVAILABLE = OPENAI_AGENTS_SDK_AVAILABLE +except ImportError: + OPENAI_AGENTS_AVAILABLE = False + OPENAI_AGENTS_SDK_AVAILABLE = False + CascadeFlowModelProvider = None + OpenAIAgentsIntegrationConfig = None + create_openai_agents_provider = None + is_openai_agents_sdk_available = None + # OpenClaw integration helpers (no external deps) try: from .openclaw import ( @@ -146,6 +165,48 @@ PaygenticUsageReporter = None PaygenticProxyService = None +# Try to import CrewAI integration +try: + from .crewai import ( + CREWAI_AVAILABLE, + CrewAIHarnessConfig, + enable as crewai_enable, + disable as crewai_disable, + is_available as crewai_is_available, + is_enabled as crewai_is_enabled, + get_config as crewai_get_config, + ) +except ImportError: + CREWAI_AVAILABLE = False + CrewAIHarnessConfig = None + crewai_enable = None + crewai_disable = None + crewai_is_available = None + crewai_is_enabled = None + crewai_get_config = None + +# Try to import Google ADK integration +try: + from .google_adk import ( + GOOGLE_ADK_AVAILABLE, + GoogleADKHarnessConfig, + CascadeFlowADKPlugin, + enable as google_adk_enable, + disable as google_adk_disable, + is_available as google_adk_is_available, + is_enabled as google_adk_is_enabled, + get_config as google_adk_get_config, + ) +except ImportError: + GOOGLE_ADK_AVAILABLE = False + GoogleADKHarnessConfig = None + CascadeFlowADKPlugin = None + google_adk_enable = None + 
google_adk_disable = None + google_adk_is_available = None + google_adk_is_enabled = None + google_adk_get_config = None + __all__ = [] if LITELLM_AVAILABLE: @@ -209,6 +270,17 @@ ] ) +if OPENAI_AGENTS_AVAILABLE: + __all__.extend( + [ + "OPENAI_AGENTS_SDK_AVAILABLE", + "CascadeFlowModelProvider", + "OpenAIAgentsIntegrationConfig", + "create_openai_agents_provider", + "is_openai_agents_sdk_available", + ] + ) + if PAYGENTIC_AVAILABLE: __all__.extend( [ @@ -222,13 +294,43 @@ ] ) +if CREWAI_AVAILABLE: + __all__.extend( + [ + "CREWAI_AVAILABLE", + "CrewAIHarnessConfig", + "crewai_enable", + "crewai_disable", + "crewai_is_available", + "crewai_is_enabled", + "crewai_get_config", + ] + ) + +if GOOGLE_ADK_AVAILABLE: + __all__.extend( + [ + "GOOGLE_ADK_AVAILABLE", + "GoogleADKHarnessConfig", + "CascadeFlowADKPlugin", + "google_adk_enable", + "google_adk_disable", + "google_adk_is_available", + "google_adk_is_enabled", + "google_adk_get_config", + ] + ) + # Integration capabilities INTEGRATION_CAPABILITIES = { "litellm": LITELLM_AVAILABLE, "opentelemetry": OPENTELEMETRY_AVAILABLE, "langchain": LANGCHAIN_AVAILABLE, + "openai_agents": OPENAI_AGENTS_AVAILABLE, "openclaw": OPENCLAW_AVAILABLE, "paygentic": PAYGENTIC_AVAILABLE, + "crewai": CREWAI_AVAILABLE, + "google_adk": GOOGLE_ADK_AVAILABLE, } @@ -250,6 +352,9 @@ def get_integration_info(): "litellm_available": LITELLM_AVAILABLE, "opentelemetry_available": OPENTELEMETRY_AVAILABLE, "langchain_available": LANGCHAIN_AVAILABLE, + "openai_agents_available": OPENAI_AGENTS_AVAILABLE, "openclaw_available": OPENCLAW_AVAILABLE, "paygentic_available": PAYGENTIC_AVAILABLE, + "crewai_available": CREWAI_AVAILABLE, + "google_adk_available": GOOGLE_ADK_AVAILABLE, } diff --git a/cascadeflow/integrations/crewai.py b/cascadeflow/integrations/crewai.py new file mode 100644 index 00000000..7ff765f0 --- /dev/null +++ b/cascadeflow/integrations/crewai.py @@ -0,0 +1,344 @@ +"""CrewAI harness integration for cascadeflow. 
+ +Uses CrewAI's native ``llm_hooks`` system (v1.5+) to intercept all LLM calls +inside Crew executions, feeding metrics into ``cascadeflow.harness`` run +contexts. + +This module is optional — ``pip install cascadeflow[crewai]`` pulls in the +crewai dependency. When crewai is not installed the public helpers return +gracefully and ``CREWAI_AVAILABLE`` is ``False``. + +Integration surface: + - ``enable()``: register before/after LLM-call hooks globally + - ``disable()``: unregister hooks and clean up + - ``CrewAIHarnessConfig``: optional knobs (fail_open, enable_budget_gate) +""" + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass +from importlib.util import find_spec +from typing import Any, Optional + +logger = logging.getLogger("cascadeflow.integrations.crewai") + +CREWAI_AVAILABLE = find_spec("crewai") is not None + +# --------------------------------------------------------------------------- +# Pricing table (USD per 1M tokens: input, output) +# Shared with instrument.py — kept small and self-contained to avoid +# cross-module coupling. A future pricing registry will deduplicate. 
+# --------------------------------------------------------------------------- + +_PRICING: dict[str, tuple[float, float]] = { + "gpt-4o": (2.50, 10.00), + "gpt-4o-mini": (0.15, 0.60), + "gpt-5-mini": (0.20, 0.80), + "gpt-4-turbo": (10.00, 30.00), + "gpt-4": (30.00, 60.00), + "gpt-3.5-turbo": (0.50, 1.50), + "o1": (15.00, 60.00), + "o1-mini": (3.00, 12.00), + "o3-mini": (1.10, 4.40), + "claude-sonnet-4": (3.00, 15.00), + "claude-haiku-3.5": (1.00, 5.00), + "claude-opus-4.5": (5.00, 25.00), +} +_DEFAULT_PRICING: tuple[float, float] = (2.50, 10.00) + +_ENERGY_COEFFICIENTS: dict[str, float] = { + "gpt-4o": 1.0, + "gpt-4o-mini": 0.3, + "gpt-5-mini": 0.35, + "gpt-4-turbo": 1.5, + "gpt-4": 1.5, + "gpt-3.5-turbo": 0.2, + "o1": 2.0, + "o1-mini": 0.8, + "o3-mini": 0.5, +} +_DEFAULT_ENERGY_COEFFICIENT: float = 1.0 +_ENERGY_OUTPUT_WEIGHT: float = 1.5 + + +def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: + per_million = _PRICING.get(model, _DEFAULT_PRICING) + return (prompt_tokens / 1_000_000) * per_million[0] + (completion_tokens / 1_000_000) * per_million[1] + + +def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: + coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) + return coeff * (prompt_tokens + completion_tokens * _ENERGY_OUTPUT_WEIGHT) + + +def _extract_message_content(message: Any) -> str: + """Extract content text from a CrewAI message (dict or object). + + CrewAI hooks pass messages as dicts (``{"role": "...", "content": "..."}``) + but we also handle object-style messages defensively. 
+ """ + if isinstance(message, dict): + return str(message.get("content", "") or "") + return str(getattr(message, "content", "") or "") + + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + + +@dataclass +class CrewAIHarnessConfig: + """Runtime configuration for the CrewAI harness integration. + + fail_open: + If ``True`` (default), errors inside hooks never break the CrewAI + execution — they are logged and swallowed. + enable_budget_gate: + If ``True`` (default), a ``before_llm_call`` hook blocks calls when + the harness run budget is exhausted (enforce mode only). + """ + + fail_open: bool = True + enable_budget_gate: bool = True + + +# --------------------------------------------------------------------------- +# Module-level state +# --------------------------------------------------------------------------- + +_config: CrewAIHarnessConfig = CrewAIHarnessConfig() +_hooks_registered: bool = False +_before_hook_ref: Any = None +_after_hook_ref: Any = None +# Track call start times per thread via a dict keyed by id(context) +_call_start_times: dict[int, float] = {} + + +# --------------------------------------------------------------------------- +# Hook implementations +# --------------------------------------------------------------------------- + + +def _extract_model_name(context: Any) -> str: + """Best-effort extraction of the model name from a LLMCallHookContext.""" + llm = getattr(context, "llm", None) + if llm is None: + return "unknown" + # CrewAI LLM objects have a .model attribute + model = getattr(llm, "model", None) + if isinstance(model, str): + # Strip provider prefix like "openai/gpt-4o" → "gpt-4o" + if "/" in model: + return model.rsplit("/", 1)[-1] + return model + return "unknown" + + +def _before_llm_call_hook(context: Any) -> Optional[bool]: + """Harness before-LLM-call hook registered with CrewAI. 
def _before_llm_call_hook(context: Any) -> Optional[bool]:
    """Harness before-LLM-call hook registered with CrewAI.

    - In enforce mode with budget gate: blocks calls when budget exhausted.
    - Tracks call start time for latency measurement.
    - Returns ``None`` (allow) or ``False`` (block).
    """
    try:
        from cascadeflow.harness.api import get_current_run

        ctx = get_current_run()
        if ctx is None:
            return None

        # Consistency fix: match the Google ADK integration, where
        # mode == "off" is a strict no-op (no gating, no timing state).
        if ctx.mode == "off":
            return None

        # Budget gate in enforce mode — check BEFORE recording start time
        # so blocked calls don't leak entries in _call_start_times.
        if (
            _config.enable_budget_gate
            and ctx.mode == "enforce"
            and ctx.budget_max is not None
            and ctx.cost >= ctx.budget_max
        ):
            logger.warning(
                "crewai hook: blocking LLM call — budget exhausted "
                "(spent $%.4f of $%.4f max)",
                ctx.cost,
                ctx.budget_max,
            )
            ctx.record(action="stop", reason="budget_exhausted", model=_extract_model_name(context))
            return False

        # Record start time for latency tracking (only for allowed calls).
        _call_start_times[id(context)] = time.monotonic()

        return None
    except Exception:
        if _config.fail_open:
            logger.debug("crewai before_llm_call hook error (fail_open)", exc_info=True)
            return None
        raise


def _after_llm_call_hook(context: Any) -> Optional[str]:
    """Harness after-LLM-call hook registered with CrewAI.

    Updates the active HarnessRunContext with:
    - cost (estimated from model + response length)
    - latency
    - energy estimate
    - step count
    - trace record

    Returns ``None`` (keep original response).
    """
    try:
        from cascadeflow.harness.api import get_current_run

        ctx = get_current_run()
        if ctx is None:
            return None

        # Consistency fix: skip accounting entirely in "off" mode (matches
        # the Google ADK integration). Drop any stale start-time entry so
        # _call_start_times cannot grow if the mode changed mid-call.
        if ctx.mode == "off":
            _call_start_times.pop(id(context), None)
            return None

        model = _extract_model_name(context)
        response = getattr(context, "response", None) or ""

        # Estimate tokens from text (rough: 1 token ≈ 4 chars).
        # CrewAI hooks don't expose raw token counts, so we approximate.
        # Messages are typically dicts ({"role": "...", "content": "..."}).
        messages = getattr(context, "messages", [])
        prompt_chars = sum(len(_extract_message_content(m)) for m in messages)
        completion_chars = len(str(response))
        prompt_tokens = max(prompt_chars // 4, 1)
        completion_tokens = max(completion_chars // 4, 1)

        cost = _estimate_cost(model, prompt_tokens, completion_tokens)
        energy = _estimate_energy(model, prompt_tokens, completion_tokens)

        # Latency (start time recorded by the before-hook; 0 if missing).
        start_time = _call_start_times.pop(id(context), None)
        elapsed_ms = (time.monotonic() - start_time) * 1000 if start_time else 0.0

        ctx.cost += cost
        ctx.step_count += 1
        ctx.latency_used_ms += elapsed_ms
        ctx.energy_used += energy

        if ctx.budget_max is not None:
            ctx.budget_remaining = ctx.budget_max - ctx.cost

        ctx.model_used = model
        ctx.record(action="allow", reason=ctx.mode, model=model)

        logger.debug(
            "crewai hook: tracked call model=%s cost=$%.6f latency=%.0fms",
            model,
            cost,
            elapsed_ms,
        )

        return None
    except Exception:
        if _config.fail_open:
            logger.debug("crewai after_llm_call hook error (fail_open)", exc_info=True)
            return None
        raise


def is_available() -> bool:
    """Return whether the crewai package is installed."""
    return CREWAI_AVAILABLE


def is_enabled() -> bool:
    """Return whether harness hooks are currently registered with CrewAI."""
    return _hooks_registered
def enable(config: Optional[CrewAIHarnessConfig] = None) -> bool:
    """Register cascadeflow harness hooks with CrewAI's global hook system.

    Idempotent: safe to call multiple times.

    Args:
        config: Optional configuration overrides.

    Returns:
        ``True`` if hooks were registered, ``False`` if crewai is not
        installed.
    """
    global _config, _hooks_registered, _before_hook_ref, _after_hook_ref

    # Already registered: nothing to do.
    if _hooks_registered:
        logger.debug("crewai harness hooks already registered")
        return True

    if not CREWAI_AVAILABLE:
        logger.debug("crewai not installed, skipping hook registration")
        return False

    if config is not None:
        _config = config

    try:
        from crewai.hooks import (  # noqa: I001
            register_after_llm_call_hook,
            register_before_llm_call_hook,
        )
    except ImportError:
        logger.warning(
            "crewai is installed but hooks module not available "
            "(requires crewai>=1.5); skipping"
        )
        return False

    # Keep references so disable() can unregister the exact same objects.
    _before_hook_ref, _after_hook_ref = _before_llm_call_hook, _after_llm_call_hook

    register_before_llm_call_hook(_before_hook_ref)
    register_after_llm_call_hook(_after_hook_ref)

    _hooks_registered = True
    logger.info("crewai harness hooks registered (before + after llm call)")
    return True
def disable() -> None:
    """Unregister cascadeflow harness hooks from CrewAI.

    Safe to call even if not enabled.
    """
    global _hooks_registered, _before_hook_ref, _after_hook_ref

    if not _hooks_registered:
        return

    try:
        from crewai.hooks import (  # noqa: I001
            unregister_after_llm_call_hook,
            unregister_before_llm_call_hook,
        )

        # Unregister in the same order they were registered.
        for hook_ref, unregister in (
            (_before_hook_ref, unregister_before_llm_call_hook),
            (_after_hook_ref, unregister_after_llm_call_hook),
        ):
            if hook_ref is not None:
                unregister(hook_ref)
    except ImportError:
        # crewai gone at teardown time; just clear local state below.
        pass

    _before_hook_ref = None
    _after_hook_ref = None
    _hooks_registered = False
    _call_start_times.clear()
    logger.info("crewai harness hooks unregistered")


def get_config() -> CrewAIHarnessConfig:
    """Return a copy of the current configuration."""
    return CrewAIHarnessConfig(
        fail_open=_config.fail_open,
        enable_budget_gate=_config.enable_budget_gate,
    )
+ Budget gate via ``before_model_callback`` provides sufficient cost control. +""" + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass +from importlib.util import find_spec +from typing import Any, Optional + +from cascadeflow.harness.api import get_current_run +from cascadeflow.harness.pricing import estimate_cost, estimate_energy + +logger = logging.getLogger("cascadeflow.integrations.google_adk") + +GOOGLE_ADK_AVAILABLE = find_spec("google.adk") is not None + +# Resolve the base class: use ADK's BasePlugin when available, else object. +_ADKBasePlugin: type +if GOOGLE_ADK_AVAILABLE: + try: + from google.adk.plugins import BasePlugin as _ADKBasePlugin # type: ignore[assignment] + except ImportError: + _ADKBasePlugin = object # type: ignore[assignment,misc] + GOOGLE_ADK_AVAILABLE = False +else: + _ADKBasePlugin = object # type: ignore[assignment,misc] + + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + + +@dataclass +class GoogleADKHarnessConfig: + """Runtime configuration for the Google ADK harness integration. + + fail_open: + If ``True`` (default), errors inside callbacks never break ADK + execution — they are logged and swallowed. + enable_budget_gate: + If ``True`` (default), ``before_model_callback`` blocks calls when + the harness run budget is exhausted (enforce mode only). + """ + + fail_open: bool = True + enable_budget_gate: bool = True + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _normalize_model_name(model: str) -> str: + """Strip LiteLlm-style provider prefix (``openai/gpt-4o`` → ``gpt-4o``). + + Also handles ``models/gemini-2.5-flash`` → ``gemini-2.5-flash``. 
+ """ + if "/" in model: + return model.rsplit("/", 1)[-1] + return model + + +def _count_function_calls(content: Any) -> int: + """Count ``function_call`` parts in an ADK LlmResponse content.""" + if content is None: + return 0 + parts = getattr(content, "parts", None) + if not parts: + return 0 + count = 0 + for part in parts: + if getattr(part, "function_call", None) is not None: + count += 1 + return count + + +# --------------------------------------------------------------------------- +# Plugin +# --------------------------------------------------------------------------- + + +class CascadeFlowADKPlugin(_ADKBasePlugin): # type: ignore[misc] + """Google ADK BasePlugin with cascadeflow harness awareness. + + Intercepts every LLM call across all agents in a Runner to provide: + - Budget enforcement (enforce mode: short-circuits with error response) + - Cost, latency, and energy tracking + - Tool call counting + - Full trace recording into HarnessRunContext + """ + + def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: + self._config = config or GoogleADKHarnessConfig() + self._active = True + self._call_seq: int = 0 + # Track call metadata between before/after callbacks. + # Keyed by id(callback_context) to guarantee uniqueness even when + # two concurrent calls share (invocation_id, agent_name). + self._call_start_times: dict[int, float] = {} + self._call_models: dict[int, str] = {} + + @staticmethod + def _callback_key(callback_context: Any) -> int: + """Return a unique key for a callback_context object. + + Uses ``id()`` which is guaranteed unique for the lifetime of the + object — ADK keeps the same CallbackContext alive across the + before/after/error callback sequence for a single LLM call. + """ + return id(callback_context) + + async def before_model_callback( + self, + callback_context: Any, + llm_request: Any, + ) -> Any: + """Budget gate and timing setup. 
    async def before_model_callback(
        self,
        callback_context: Any,
        llm_request: Any,
    ) -> Any:
        """Budget gate and timing setup.

        Returns ``None`` to proceed normally, or an ``LlmResponse`` with
        an error to short-circuit the call when budget is exhausted.
        """
        if not self._active:
            return None

        try:
            ctx = get_current_run()
            if ctx is None:
                return None
            if ctx.mode == "off":
                # "off" is a strict no-op: no gating, no timing state.
                return None

            # Extract model name from request (may be a LiteLlm-style
            # "provider/model" string; normalized to the bare model name).
            model_raw = getattr(llm_request, "model", None) or "unknown"
            model = _normalize_model_name(str(model_raw))

            key = self._callback_key(callback_context)

            # Budget gate in enforce mode. Checked BEFORE storing timing
            # state so a blocked call leaves no entry behind.
            if (
                self._config.enable_budget_gate
                and ctx.mode == "enforce"
                and ctx.budget_max is not None
                and ctx.cost >= ctx.budget_max
            ):
                logger.warning(
                    "google-adk: blocking LLM call — budget exhausted "
                    "(spent $%.4f of $%.4f max)",
                    ctx.cost,
                    ctx.budget_max,
                )
                ctx.record(action="stop", reason="budget_exhausted", model=model)
                return self._make_budget_error_response(ctx)

            # Record start time and model for after_model_callback. The
            # same CallbackContext object (hence the same id()) is reused
            # across the before/after/error sequence of one call.
            self._call_start_times[key] = time.monotonic()
            self._call_models[key] = model

            return None
        except Exception:
            if self._config.fail_open:
                logger.debug(
                    "google-adk before_model_callback error (fail_open)", exc_info=True
                )
                return None
            raise

    async def after_model_callback(
        self,
        callback_context: Any,
        llm_response: Any,
    ) -> Any:
        """Extract tokens, count tool calls, estimate cost/energy, update run context."""
        if not self._active:
            return None

        try:
            ctx = get_current_run()
            if ctx is None:
                return None
            if ctx.mode == "off":
                return None

            key = self._callback_key(callback_context)

            # Recover model name stored during before_model_callback.
            # pop() also cleans up the per-call state.
            model = self._call_models.pop(key, "unknown")

            # Extract token counts from usage_metadata (falls back to a
            # chars/4 estimate inside _extract_tokens).
            input_tokens, output_tokens = self._extract_tokens(llm_response)

            # Count function_call parts in response content.
            content = getattr(llm_response, "content", None)
            tool_calls = _count_function_calls(content)

            # Cost and energy estimation
            cost = estimate_cost(model, input_tokens, output_tokens)
            energy = estimate_energy(model, input_tokens, output_tokens)

            # Latency: 0.0 when the before-hook never stored a start time.
            start_time = self._call_start_times.pop(key, None)
            elapsed_ms = (time.monotonic() - start_time) * 1000 if start_time else 0.0

            # Update run context accumulators.
            ctx.cost += cost
            ctx.step_count += 1
            ctx.latency_used_ms += elapsed_ms
            ctx.energy_used += energy
            ctx.tool_calls += tool_calls

            if ctx.budget_max is not None:
                ctx.budget_remaining = ctx.budget_max - ctx.cost

            ctx.model_used = model
            ctx.record(action="allow", reason=ctx.mode, model=model)

            logger.debug(
                "google-adk: tracked call model=%s cost=$%.6f latency=%.0fms tools=%d",
                model,
                cost,
                elapsed_ms,
                tool_calls,
            )

            return None
        except Exception:
            if self._config.fail_open:
                logger.debug(
                    "google-adk after_model_callback error (fail_open)", exc_info=True
                )
                return None
            raise

    async def on_model_error_callback(
        self,
        callback_context: Any,
        error: Exception,
    ) -> Any:
        """Record error in trace and clean up timing state."""
        if not self._active:
            return None

        try:
            # Always pop per-call state so failed calls don't leak entries.
            key = self._callback_key(callback_context)
            model = self._call_models.pop(key, "unknown")
            self._call_start_times.pop(key, None)

            ctx = get_current_run()
            if ctx is not None:
                error_type = type(error).__name__
                ctx.record(
                    action="error",
                    reason=f"model_error:{error_type}",
                    model=model,
                )

            return None
        except Exception:
            if self._config.fail_open:
                logger.debug(
                    "google-adk on_model_error_callback error (fail_open)", exc_info=True
                )
                return None
            raise

    def deactivate(self) -> None:
        """Make all callbacks no-ops without unregistering from Runner."""
        self._active = False
        self._call_seq = 0
        self._call_start_times.clear()
        self._call_models.clear()
+ + ADK responses carry ``usage_metadata`` with ``prompt_token_count`` + and ``candidates_token_count``. Falls back to estimating from + content text (4 chars ≈ 1 token). + """ + usage = getattr(llm_response, "usage_metadata", None) + if usage is not None: + input_tokens = getattr(usage, "prompt_token_count", 0) or 0 + output_tokens = getattr(usage, "candidates_token_count", 0) or 0 + if input_tokens > 0 or output_tokens > 0: + return int(input_tokens), int(output_tokens) + + # Fallback: estimate from content text + content = getattr(llm_response, "content", None) + if content is not None: + parts = getattr(content, "parts", None) + if parts: + text_chars = sum(len(getattr(p, "text", "") or "") for p in parts) + return 0, max(text_chars // 4, 1) + + return 0, 0 + + @staticmethod + def _make_budget_error_response(ctx: Any) -> Any: + """Build an LlmResponse that short-circuits the LLM call. + + When ADK is available we return a real ``LlmResponse``. When not + (shouldn't happen in practice), we return a sentinel dict. + + The user-facing message is intentionally generic to avoid leaking + internal spend/limit numbers. Exact figures are logged separately. + """ + # Generic message safe for end-user exposure. + msg = "cascadeflow harness budget exceeded" + # Detailed figures for operators only. 
+ logger.warning( + "google-adk: budget exceeded — spent $%.4f of $%.4f max", + ctx.cost, + ctx.budget_max, + ) + if GOOGLE_ADK_AVAILABLE: + try: + from google.adk.models import LlmResponse # type: ignore[import-untyped] + from google.genai.types import Content, Part # type: ignore[import-untyped] + + return LlmResponse( + content=Content(parts=[Part(text=msg)]), + error_code="BUDGET_EXCEEDED", + error_message=msg, + ) + except ImportError: + pass + + return {"error_code": "BUDGET_EXCEEDED", "error_message": msg} + + +# --------------------------------------------------------------------------- +# Module-level state +# --------------------------------------------------------------------------- + +_config: GoogleADKHarnessConfig = GoogleADKHarnessConfig() +_plugin_instance: Optional[CascadeFlowADKPlugin] = None +_enabled: bool = False + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def is_available() -> bool: + """Return whether the google-adk package is installed.""" + return GOOGLE_ADK_AVAILABLE + + +def is_enabled() -> bool: + """Return whether a plugin instance has been created via ``enable()``.""" + return _enabled + + +def get_config() -> GoogleADKHarnessConfig: + """Return a copy of the current configuration.""" + return GoogleADKHarnessConfig( + fail_open=_config.fail_open, + enable_budget_gate=_config.enable_budget_gate, + ) + + +def enable( + config: Optional[GoogleADKHarnessConfig] = None, +) -> CascadeFlowADKPlugin: + """Create a cascadeflow-instrumented ADK plugin instance. + + Unlike CrewAI (global hooks), ADK plugins are per-Runner. Pass the + returned plugin to ``Runner(plugins=[plugin])``. + + Idempotent: returns the same instance on repeated calls unless + ``disable()`` was called in between. + + Args: + config: Optional configuration overrides. 
def enable(
    config: Optional[GoogleADKHarnessConfig] = None,
) -> CascadeFlowADKPlugin:
    """Create a cascadeflow-instrumented ADK plugin instance.

    Unlike CrewAI (global hooks), ADK plugins are per-Runner. Pass the
    returned plugin to ``Runner(plugins=[plugin])``.

    Idempotent: returns the same instance on repeated calls unless
    ``disable()`` was called in between.

    Args:
        config: Optional configuration overrides.

    Returns:
        ``CascadeFlowADKPlugin`` instance ready for ``Runner(plugins=[...])``.
    """
    global _config, _plugin_instance, _enabled

    # Reuse the live instance when one already exists.
    if _enabled and _plugin_instance is not None:
        logger.debug("google-adk plugin already enabled; returning existing instance")
        return _plugin_instance

    if config is not None:
        _config = config

    _plugin_instance = CascadeFlowADKPlugin(config=_config)
    _enabled = True
    logger.info("google-adk harness plugin created")
    return _plugin_instance


def disable() -> None:
    """Deactivate the plugin and clear module state.

    Safe to call even if not enabled.
    """
    global _plugin_instance, _enabled

    if _plugin_instance is not None:
        # Turn callbacks into no-ops; the Runner may still hold a reference.
        _plugin_instance.deactivate()

    _plugin_instance = None
    _enabled = False
    logger.info("google-adk harness plugin disabled")
@dataclass
class OpenAIAgentsIntegrationConfig:
    """
    Runtime behavior for the OpenAI Agents integration.

    model_candidates:
        Optional ordered list of candidate models used when harness decides
        to switch models under pressure (for example low remaining budget).
    enable_tool_gating:
        If enabled, removes tools from a model call when the run already
        exceeded tool-call caps in enforce mode.
    fail_open:
        If True, harness-side integration errors never break the agent call.
    """

    # Ordered fallback candidates for harness-driven model switching.
    model_candidates: Optional[list[str]] = None
    # Strip tools from calls once tool-call caps are exceeded (enforce mode).
    enable_tool_gating: bool = True
    # Swallow and log integration errors instead of propagating them.
    fail_open: bool = True
+_ENERGY_COEFFICIENTS = { + "gpt-4o": 1.0, + "gpt-4o-mini": 0.3, + "gpt-5": 1.2, + "gpt-5-mini": 0.35, + "gpt-4-turbo": 1.5, +} +_DEFAULT_ENERGY_COEFFICIENT = 1.0 +_ENERGY_OUTPUT_WEIGHT = 1.5 + + +def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: + in_price, out_price = _PRICING_USD_PER_M.get(model, _DEFAULT_PRICING_USD_PER_M) + return (input_tokens / 1_000_000.0) * in_price + (output_tokens / 1_000_000.0) * out_price + + +def _estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: + coefficient = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) + return coefficient * (input_tokens + (output_tokens * _ENERGY_OUTPUT_WEIGHT)) + + +def _total_model_price(model: str) -> float: + return sum(_PRICING_USD_PER_M.get(model, _DEFAULT_PRICING_USD_PER_M)) + + +def _extract_usage_tokens(usage: Any) -> tuple[int, int]: + if usage is None: + return 0, 0 + + input_tokens = getattr(usage, "input_tokens", None) + output_tokens = getattr(usage, "output_tokens", None) + + if input_tokens is None: + input_tokens = getattr(usage, "prompt_tokens", 0) + if output_tokens is None: + output_tokens = getattr(usage, "completion_tokens", 0) + + return int(input_tokens or 0), int(output_tokens or 0) + + +def _count_tool_calls(output_items: Any) -> int: + if not output_items: + return 0 + + count = 0 + for item in output_items: + item_type = None + if isinstance(item, dict): + item_type = item.get("type") + else: + item_type = getattr(item, "type", None) + + if item_type in {"function_call", "tool_call"}: + count += 1 + + return count + + +def _safe_record(action: str, reason: str, model: Optional[str]) -> None: + run = get_current_run() + if run is None: + return + run.record(action=action, reason=reason, model=model) + + +def _apply_run_metrics( + *, + model_name: str, + response: Any, + elapsed_ms: float, + pre_action: str, + allow_reason: str, +) -> None: + run = get_current_run() + if run is None: + return + + usage = 
getattr(response, "usage", None) if response is not None else None + input_tokens, output_tokens = _extract_usage_tokens(usage) + tool_calls = _count_tool_calls(getattr(response, "output", None)) if response is not None else 0 + + run.step_count += 1 + run.latency_used_ms += elapsed_ms + run.energy_used += _estimate_energy(model_name, input_tokens, output_tokens) + run.cost += _estimate_cost(model_name, input_tokens, output_tokens) + run.tool_calls += tool_calls + + if run.budget_max is not None: + run.budget_remaining = run.budget_max - run.cost + + if pre_action == "deny_tool": + run.last_action = "deny_tool" + run.model_used = model_name + else: + run.record("allow", allow_reason, model_name) + + if run.mode == "enforce" and run.budget_remaining is not None and run.budget_remaining <= 0: + logger.info("openai-agents step exhausted budget; next step will be blocked") + + +class CascadeFlowModelProvider(ModelProvider): # type: ignore[misc] + """ + OpenAI Agents SDK ModelProvider with cascadeflow harness awareness. + + Works as an integration layer only. It is opt-in and never enabled by + default for existing cascadeflow users. + """ + + def __init__( + self, + *, + base_provider: Optional[Any] = None, + config: Optional[OpenAIAgentsIntegrationConfig] = None, + ) -> None: + self._config = config or OpenAIAgentsIntegrationConfig() + self._base_provider = base_provider or self._create_default_provider() + + def _create_default_provider(self) -> Any: + if not OPENAI_AGENTS_SDK_AVAILABLE: + raise ImportError( + "OpenAI Agents SDK not installed. Install with `pip install cascadeflow[openai-agents]`." + ) + + # Local import keeps this integration optional for users who don't + # install the extra. 
+ from agents.models.openai_provider import OpenAIProvider + + return OpenAIProvider() + + def _initial_model_candidate(self, requested_model: Optional[str]) -> str: + if requested_model: + return requested_model + if self._config.model_candidates: + return self._config.model_candidates[0] + return "gpt-4o-mini" + + def _resolve_model(self, requested_model: Optional[str]) -> str: + candidate = self._initial_model_candidate(requested_model) + + run = get_current_run() + if run is None: + return candidate + if run.mode != "enforce": + return candidate + + if run.budget_remaining is not None and run.budget_remaining <= 0: + run.record("stop", "budget_exceeded", candidate) + raise BudgetExceededError( + "cascadeflow harness budget exceeded", + remaining=run.budget_remaining, + ) + + if not self._config.model_candidates or run.budget_max is None or run.budget_max <= 0: + return candidate + + if run.budget_remaining is None: + return candidate + + # Under budget pressure, switch to the cheapest configured candidate. 
+ if run.budget_remaining / run.budget_max < 0.2: + cheapest = min( + self._config.model_candidates, + key=_total_model_price, + ) + if cheapest != candidate: + run.record("switch_model", "budget_pressure", cheapest) + return cheapest + + return candidate + + def get_model(self, model_name: str | None) -> Model: + fallback_model = self._initial_model_candidate(model_name) + selected_model = fallback_model + + try: + selected_model = self._resolve_model(model_name) + except BudgetExceededError: + raise + except Exception: + if not self._config.fail_open: + raise + logger.exception( + "openai-agents model resolution failed; falling back to requested model (fail-open)" + ) + selected_model = fallback_model + + try: + base_model = self._base_provider.get_model(selected_model) + except Exception: + if not self._config.fail_open: + raise + logger.exception( + "openai-agents provider.get_model failed; retrying with fallback model (fail-open)" + ) + selected_model = fallback_model + base_model = self._base_provider.get_model(selected_model) + + return _CascadeFlowWrappedModel( + base_model=base_model, + model_name=selected_model, + config=self._config, + ) + + async def aclose(self) -> None: + close = getattr(self._base_provider, "aclose", None) + if close is None: + return + await close() + + +class _CascadeFlowWrappedModel(Model): # type: ignore[misc] + def __init__( + self, + *, + base_model: Any, + model_name: str, + config: OpenAIAgentsIntegrationConfig, + ) -> None: + self._base_model = base_model + self._model_name = model_name + self._config = config + + def _gate_tools(self, tools: list[Tool]) -> tuple[list[Tool], str]: + run = get_current_run() + if run is None: + return tools, "allow" + if run.mode != "enforce" or not self._config.enable_tool_gating: + return tools, "allow" + if run.tool_calls_max is None: + return tools, "allow" + if run.tool_calls < run.tool_calls_max: + return tools, "allow" + if not tools: + return tools, "allow" + + run.record("deny_tool", 
"max_tool_calls_reached", self._model_name) + return [], "deny_tool" + + def _update_run_metrics( + self, + *, + response: Any, + elapsed_ms: float, + pre_action: str, + ) -> None: + _apply_run_metrics( + model_name=self._model_name, + response=response, + elapsed_ms=elapsed_ms, + pre_action=pre_action, + allow_reason="openai_agents_step", + ) + + async def get_response( + self, + system_instructions: str | None, + input: str | list[Any], # noqa: A002 - required by OpenAI Agents SDK Model interface + model_settings: ModelSettings, + tools: list[Tool], + output_schema: Any | None, + handoffs: list[Any], + tracing: ModelTracing, + *, + previous_response_id: str | None, + conversation_id: str | None, + prompt: ResponsePromptParam | None, + ) -> ModelResponse: + gated_tools, pre_action = self._gate_tools(tools) + started_at = time.monotonic() + + response = await self._base_model.get_response( + system_instructions=system_instructions, + input=input, + model_settings=model_settings, + tools=gated_tools, + output_schema=output_schema, + handoffs=handoffs, + tracing=tracing, + previous_response_id=previous_response_id, + conversation_id=conversation_id, + prompt=prompt, + ) + + elapsed_ms = (time.monotonic() - started_at) * 1000.0 + + try: + self._update_run_metrics(response=response, elapsed_ms=elapsed_ms, pre_action=pre_action) + except Exception: + if self._config.fail_open: + logger.exception("openai-agents harness metric update failed (fail-open)") + else: + raise + + return response + + def stream_response( + self, + system_instructions: str | None, + input: str | list[Any], # noqa: A002 - required by OpenAI Agents SDK Model interface + model_settings: ModelSettings, + tools: list[Tool], + output_schema: Any | None, + handoffs: list[Any], + tracing: ModelTracing, + *, + previous_response_id: str | None, + conversation_id: str | None, + prompt: ResponsePromptParam | None, + ) -> AsyncIterator[Any]: + gated_tools, pre_action = self._gate_tools(tools) + started_at = 
time.monotonic() + + stream = self._base_model.stream_response( + system_instructions=system_instructions, + input=input, + model_settings=model_settings, + tools=gated_tools, + output_schema=output_schema, + handoffs=handoffs, + tracing=tracing, + previous_response_id=previous_response_id, + conversation_id=conversation_id, + prompt=prompt, + ) + return _CascadeFlowStreamWrapper( + stream=stream, + model_name=self._model_name, + started_at=started_at, + pre_action=pre_action, + fail_open=self._config.fail_open, + ) + + +class _CascadeFlowStreamWrapper: + def __init__( + self, + *, + stream: AsyncIterator[Any], + model_name: str, + started_at: float, + pre_action: str, + fail_open: bool, + ) -> None: + self._stream = stream + self._model_name = model_name + self._started_at = started_at + self._pre_action = pre_action + self._fail_open = fail_open + self._finalized = False + self._last_response = None + + def __aiter__(self) -> _CascadeFlowStreamWrapper: + return self + + async def __anext__(self) -> Any: + try: + event = await self._stream.__anext__() + except StopAsyncIteration: + await self._finalize() + raise + + response = getattr(event, "response", None) + if response is not None: + self._last_response = response + return event + + async def _finalize(self) -> None: + if self._finalized: + return + self._finalized = True + + run = get_current_run() + if run is None: + return + + elapsed_ms = (time.monotonic() - self._started_at) * 1000.0 + response = self._last_response + + try: + _apply_run_metrics( + model_name=self._model_name, + response=response, + elapsed_ms=elapsed_ms, + pre_action=self._pre_action, + allow_reason="openai_agents_stream_step", + ) + except Exception: + if self._fail_open: + logger.exception("openai-agents stream metric update failed (fail-open)") + return + raise + + +def create_openai_agents_provider( + *, + model_candidates: Optional[list[str]] = None, + enable_tool_gating: bool = True, + fail_open: bool = True, +) -> 
CascadeFlowModelProvider: + """ + Convenience factory for OpenAI Agents SDK integration. + """ + + return CascadeFlowModelProvider( + config=OpenAIAgentsIntegrationConfig( + model_candidates=model_candidates, + enable_tool_gating=enable_tool_gating, + fail_open=fail_open, + ) + ) + + +def is_openai_agents_sdk_available() -> bool: + return OPENAI_AGENTS_SDK_AVAILABLE + + +__all__ = [ + "OPENAI_AGENTS_SDK_AVAILABLE", + "OpenAIAgentsIntegrationConfig", + "CascadeFlowModelProvider", + "create_openai_agents_provider", + "is_openai_agents_sdk_available", +] diff --git a/docs/README.md b/docs/README.md index 1972c55f..1238d7f8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -35,10 +35,12 @@ Welcome to cascadeflow documentation! 🌊 - [Custom Validation](guides/custom_validation.md) - Implement custom quality validators - [Edge Device Deployment](guides/edge_device.md) - Deploy cascades on edge devices (Jetson, etc.) - [Browser/Edge Runtime](guides/browser_cascading.md) - Run cascades in browser or edge environments +- [Agent Intelligence V2/V2.1 Plan](strategy/agent-intelligence-v2-plan.md) - Unified strategic and execution plan for in-process agent intelligence harness delivery ### Integrations - [n8n Integration](guides/n8n_integration.md) - Use cascadeflow in n8n workflows - [Paygentic Integration](guides/paygentic_integration.md) - Usage metering and billing lifecycle helpers (opt-in) +- [OpenAI Agents SDK Integration](guides/openai_agents_integration.md) - Harness-aware model provider for existing OpenAI Agents apps ## 📚 Examples diff --git a/docs/guides/google_adk_integration.md b/docs/guides/google_adk_integration.md new file mode 100644 index 00000000..d0d32b3f --- /dev/null +++ b/docs/guides/google_adk_integration.md @@ -0,0 +1,161 @@ +# Google ADK Integration + +Integrate cascadeflow harness with Google's Agent Development Kit (ADK) to get +budget enforcement, cost/latency/energy tracking, tool call counting, and full +trace recording across all agents in an 
ADK Runner. + +--- + +## Design Principles + +- **Plugin-based** — Uses ADK's `BasePlugin` system to intercept every LLM call + across all agents in a Runner. One plugin covers the entire agent graph. +- **Opt-in** — Install `cascadeflow[google-adk]` and create a plugin explicitly. + Never enabled by default. +- **Fail-open** — Integration errors are logged but never break ADK execution + (configurable). +- **No tool gating** — ADK's `tools_dict` is part of agent definition, not + per-call. Budget gate via `before_model_callback` provides sufficient cost + control. This is an intentional difference from the OpenAI Agents integration. + +--- + +## Installation + +```bash +pip install "cascadeflow[google-adk]" +``` + +Requires Python 3.10+ (ADK requirement). + +--- + +## Quick Start + +```python +import asyncio +from google.adk.agents import Agent +from google.adk.runners import Runner +from google.adk.sessions import InMemorySessionService + +from cascadeflow import init, run +from cascadeflow.integrations.google_adk import enable + +# 1. Initialize harness +init(mode="observe", budget=1.0) + +# 2. Create the cascadeflow plugin +plugin = enable() + +# 3. Pass it to the Runner +agent = Agent(name="my_agent", model="gemini-2.5-flash", instruction="Be helpful.") +runner = Runner( + agent=agent, + app_name="my_app", + session_service=InMemorySessionService(), + plugins=[plugin], +) + +# 4. Run within a harness scope +async def main(): + with run(budget=0.5) as session: + # ... run your agent ... + print(f"Cost: ${session.cost:.6f}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + +asyncio.run(main()) +``` + +--- + +## Features + +### Budget Enforcement + +In `enforce` mode, the plugin short-circuits LLM calls when the budget is +exhausted by returning an `LlmResponse` with `error_code="BUDGET_EXCEEDED"`. 
+
+```python
+init(mode="enforce", budget=0.10)  # Hard limit: $0.10
+plugin = enable()
+```
+
+### Cost and Energy Tracking
+
+Every LLM call is tracked with:
+- **Cost** — Estimated from model pricing (USD per 1M tokens)
+- **Energy** — Deterministic proxy coefficient for compute intensity
+- **Latency** — Wall-clock time per call
+- **Tool calls** — Count of `function_call` parts in responses
+
+### Trace Recording
+
+All decisions are recorded in the `HarnessRunContext` trace:
+
+```python
+with run() as session:
+    # ... run agents ...
+    for event in session.trace():
+        print(event)
+        # {"action": "allow", "reason": "observe", "model": "gemini-2.5-flash", ...}
+```
+
+### Configuration
+
+```python
+from cascadeflow.integrations.google_adk import enable, GoogleADKHarnessConfig
+
+plugin = enable(
+    config=GoogleADKHarnessConfig(
+        fail_open=True,           # Default: True. Never break ADK on integration errors.
+        enable_budget_gate=True,  # Default: True. Block calls when budget exhausted.
+    )
+)
+```
+
+---
+
+## Zero-Code Alternative
+
+If you don't need per-agent plugin integration, you can route ADK through a
+cascadeflow LiteLlm proxy instead: use a LiteLlm-format model string and point
+the OpenAI-compatible endpoint at your proxy via the `OPENAI_API_BASE`
+environment variable (or `base_url`, if your setup exposes it):
+
+```python
+# ADK uses LiteLlm under the hood — point it at your cascadeflow proxy
+agent = Agent(
+    name="my_agent",
+    model="openai/gemini-2.5-flash",  # LiteLlm format
+    instruction="...",
+)
+# Set OPENAI_API_BASE=http://localhost:8080/v1 to route through cascadeflow proxy
+```
+
+This gives you cost tracking at the proxy level without a plugin, but it does
+not provide budget enforcement or per-agent trace recording.
+ +--- + +## Supported Gemini Models + +| Model | Input $/1M | Output $/1M | Energy Coefficient | +|-------|-----------|-------------|-------------------| +| gemini-2.5-flash | $0.15 | $0.60 | 0.3 | +| gemini-2.5-pro | $1.25 | $10.00 | 1.2 | +| gemini-2.0-flash | $0.10 | $0.40 | 0.25 | +| gemini-1.5-flash | $0.075 | $0.30 | 0.2 | +| gemini-1.5-pro | $1.25 | $5.00 | 1.0 | + +All OpenAI and Anthropic models from the shared pricing table are also +supported (e.g., when using LiteLlm provider prefixes). + +--- + +## Troubleshooting + +| Symptom | Solution | +|---------|----------| +| `ImportError: google.adk` | `pip install "cascadeflow[google-adk]"` | +| Plugin not tracking calls | Ensure `plugin` is passed to `Runner(plugins=[plugin])` | +| Budget not enforced | Check `init(mode="enforce", ...)` — observe mode never blocks | +| Zero cost reported | Model name may not match pricing table; check for provider prefix stripping | diff --git a/docs/guides/openai_agents_integration.md b/docs/guides/openai_agents_integration.md new file mode 100644 index 00000000..2db6b8b7 --- /dev/null +++ b/docs/guides/openai_agents_integration.md @@ -0,0 +1,73 @@ +# OpenAI Agents SDK Integration + +Use cascadeflow as an explicit, opt-in `ModelProvider` integration for the OpenAI Agents SDK. + +## Design Principles + +- Integration-only: nothing is enabled by default +- Works with existing Agents SDK apps +- Harness behavior is controlled by `cascadeflow.init(...)` and `cascadeflow.run(...)` +- Fail-open integration path: harness integration errors should not break agent execution + +## Install + +```bash +pip install "cascadeflow[openai,openai-agents]" +``` + +## Quickstart + +```python +import asyncio + +from agents import Agent, RunConfig, Runner +from cascadeflow import init, run +from cascadeflow.integrations.openai_agents import ( + CascadeFlowModelProvider, + OpenAIAgentsIntegrationConfig, +) + + +async def main() -> None: + # Global harness defaults. 
+ init(mode="enforce", budget=1.0, max_tool_calls=6) + + provider = CascadeFlowModelProvider( + config=OpenAIAgentsIntegrationConfig( + model_candidates=["gpt-4o", "gpt-4o-mini"], + enable_tool_gating=True, + ) + ) + + agent = Agent( + name="SupportAgent", + instructions="Answer support questions clearly and concisely.", + model="gpt-4o", + ) + + run_config = RunConfig(model_provider=provider) + + # Scoped run accounting for a single user task. + with run(budget=0.5, max_tool_calls=3) as session: + result = await Runner.run(agent, "Reset my account password", run_config=run_config) + print(result.final_output) + print(session.trace()) + + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## What This Integration Adds + +- Harness-aware model switching under budget pressure +- Tool gating when enforce-mode limits are reached +- Run metrics on `cascadeflow.run()` context: + - `cost`, `budget_remaining`, `step_count`, `tool_calls`, `latency_used_ms`, `energy_used` +- Full action trace through `run.trace()` + +## Notes + +- This is a Python integration for OpenAI Agents SDK. +- The SDK remains optional and is only installed via the `openai-agents` extra. +- Existing non-Agents users are unaffected. diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md new file mode 100644 index 00000000..0d815af6 --- /dev/null +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -0,0 +1,1070 @@ +# Agent Intelligence V2 Plan + +Last updated: February 25, 2026 +Status: Planning (no implementation in this document) +Supersedes: agent-intelligence-v1-plan.md + +## 1. Objective + +Make cascadeflow the default **in-process agent intelligence harness** for teams that need runtime control over cost, latency, quality, risk, budget, energy, and business KPIs. + +Not a proxy. Not a hosted dependency. A local-first infrastructure layer that can influence agent decisions during execution. 
+
+### 1.1 Winning Criteria
+
+This plan is successful only if all three pillars are achieved:
+
+1. **Low-friction install**
+   - Time-to-first-value under 15 minutes
+   - Existing apps can activate in 1-3 lines
+   - Explicit opt-in, no breaking changes for current users
+2. **In-loop business KPI control**
+   - Policies can influence step-level decisions and tool usage at runtime
+   - Hard constraints and soft KPI preferences both supported
+   - Decisions are explainable: each records *why* it fired and *what action* was taken
+3. **Reproducible benchmark superiority on realistic workflows**
+   - Quality better than or equal to baseline while improving cost and latency
+   - Results reproducible with pinned configs, prompts, models, and scripts
+   - Agentic benchmarks include tool loops and multi-step workflows (not only static QA)
+
+## 2. Product Thesis (Grounded)
+
+Most routers and gateways optimize at request boundaries. The bigger opportunity is inside agent execution:
+
+- Per-step model decisions based on agent state
+- Per-tool-call gating based on remaining budget
+- Runtime-aware stop/continue/escalate actions
+- Business KPI injection during agent loops
+- Learning from outcomes to improve future routing
+
+This is the moat: **an in-process harness for agent decisions**, not external provider routing.
+
+### What Competitors Already Do (and Why That Is Not Enough)
+
+- External routers/gateways already do strong request-level routing, fallback, and policy checks.
+- Agent frameworks already expose hook systems and guardrails.
+
+The remaining gap is **cross-framework, local-first, step-level optimization with shared policy semantics**:
+- One policy model across different agent stacks
+- One observability model across direct SDK use and frameworks
+- One enforcement model across tool loops and sub-agent calls
+
+### Why External Proxies Stay Structurally Limited
+
+A proxy sees: `POST /v1/chat/completions { model, messages, tools }`.
+ +cascadeflow's harness sees: agent state, step count, budget consumed, tool call history, error context, quality scores on intermediate results, domain, complexity, conversation depth, and any user-defined business context. + +This information asymmetry is structural and permanent. Replicating in-process agent state awareness from an external proxy requires fundamental architectural changes — not a feature addition. + +## 3. Target Users and Segments + +- Startups shipping AI agents in existing products +- Platform teams standardizing agent behavior across products and tenants +- Individual developers are supported, but V2 optimization is for teams with production constraints + +Primary constraints (hard): +- Max cost, max latency, max tool calls, risk/compliance gates, max energy + +Secondary constraints (soft): +- Weighted KPI preferences that influence model/tool decisions when hard limits are not violated + +## 4. V2/V2.1 Release Contract (Single Plan) + +This document contains both releases in one plan with explicit boundaries: + +| Area | V2 (Python-first) | V2.1 | +|---|---|---| +| Core harness API (`init`, `run`, `@agent`) | Python | TypeScript parity | +| Auto-instrumentation | OpenAI Python client | Anthropic Python + OpenAI/Anthropic TS clients | +| Integrations | OpenAI Agents SDK, CrewAI, LangChain (Python) + regression checks for existing integrations | TS integration parity + deeper framework convergence | +| Policy semantics | Defined and validated in Python | Same semantics validated in TS parity fixtures | +| Launch target | Production-ready Python harness + reproducible benchmarks | Cross-language parity release | + +## 5. 
V2 Product Definition + +V2 ships an **agent harness** as an optional, integration-first intelligence layer: + +- Not enabled by default +- No cloud dependency required +- Works in existing apps/agents with minimal code changes (target: 1-3 lines) +- Default behavior remains unchanged unless explicitly enabled +- All framework-specific integrations are separate packages (not bundled with core) + +### Harness Modes + +- `off`: No harness evaluation (default for all existing users) +- `observe`: Evaluate + emit decisions, no behavior change (safe production rollout) +- `enforce`: Apply harness actions at runtime + +### Recommended Rollout for Users + +1. Start with `observe` in production +2. Validate traces + false positives + overhead +3. Enable `enforce` for selected tenants/channels + +## 5.1 Low-Friction DX Contract (Must-Haves) + +- Explicit activation only: no hidden patching. +- Existing code path preserved if harness is `off`. +- If auto-instrumentation is not safe in a runtime, users can use explicit adapter hooks (fallback mode). +- Quickstarts prioritize existing applications first, greenfield second. + +## 5.2 DX Philosophy + +### Principle: Invisible infrastructure, not wrappers + +The gold standard DX is Sentry, DataDog, OpenTelemetry — you activate it, your existing code doesn't change. + +cascadeflow targets this with **auto-instrumentation where safe**, plus **framework-native hooks** in optional integration packages. + +> **Note**: The APIs shown below (`cascadeflow.init()`, `cascadeflow.run()`, `@cascadeflow.agent()`) are the **target V2 API design**. They do not exist today. Current API is `CascadeAgent(models).run(query)`. Building these APIs is the V2 deliverable. + +### Tier 1: Zero-change activation (core, target API) + +```python +import cascadeflow + +cascadeflow.init(mode="observe") +# Every openai call in your app is now observed. +# No code changes. No wrappers. 
+# Example startup diagnostics: +# [cascadeflow] instrumented: openai +# [cascadeflow] detected but not instrumented in V2: anthropic (planned V2.1) + +cascadeflow.init(mode="enforce") +# Now actively cascading, routing, and enforcing budgets. +``` + +How it works: `init()` patches LLM client libraries at the call level. This is the same proven pattern used by Sentry, DataDog APM, and OpenTelemetry auto-instrumentation. + +V2 scope: `openai` Python client patching only. `anthropic` client patching follows in V2.1. Auto-instrumentation covers code that calls the `openai` SDK directly. Frameworks that abstract over the SDK (LangChain's `ChatOpenAI`, CrewAI via LiteLLM) require their respective integration packages for full coverage. + +### Tier 2: Agent-scoped harness (core, target API) + +```python +async with cascadeflow.run(budget=0.50, max_tool_calls=10) as run: + # Your existing agent code + result = await my_agent.invoke({"task": "Fix the login bug"}) + + print(run.cost) # $0.12 + print(run.savings) # 68% + print(run.tool_calls) # 4 of 10 budget used +``` + +A context manager scopes budget tracking and harness decisions to an agent run. No restructuring of agent code required. + +### Tier 3: Decorated agent with KPIs (core, target API) + +```python +import openai + +@cascadeflow.agent( + budget=0.50, + kpi_targets={"quality_min": 0.90, "latency_ms_max": 3000}, + kpi_weights={"cost": 0.4, "quality": 0.3, "latency": 0.2, "energy": 0.1}, + compliance="gdpr", +) +async def customer_support_agent(task: str): + client = openai.AsyncOpenAI() + response = await client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": task}], + ) + return response.choices[0].message.content +``` + +A decorator adds metadata. The function body doesn't change. + +### Tier 4: Framework-specific deep integration (integration packages) + +```python +# Requires separate install — not bundled with core. 
+# These extras do not exist in pyproject.toml today and must be added in Phase D. + +# pip install cascadeflow[langchain] +from cascadeflow.integrations.langchain import CascadeFlowCallbackHandler + +# pip install cascadeflow[openai-agents] +from cascadeflow.integrations.openai_agents import CascadeFlowModelProvider + +# pip install cascadeflow[crewai] +from cascadeflow.integrations.crewai import CascadeFlowHooks +``` + +Framework-specific packages provide deeper integration (state extraction, middleware hooks, framework-native telemetry). These are optional — Tier 1-3 work without them for code that calls the `openai` SDK directly. + +### TypeScript Equivalent + +```typescript +// Target API — does not exist in @cascadeflow/core today. +// TS parity is a V2.1 deliverable (see Section 16, Phase F). + +import { cascadeflow } from '@cascadeflow/core'; + +// Tier 1: Auto-instrument +cascadeflow.init({ mode: 'enforce' }); + +// Tier 2: Scoped run +const result = await cascadeflow.run({ budget: 0.50 }, async (run) => { + return await myAgent.invoke({ task: 'Fix the login bug' }); +}); + +// Tier 4: Framework packages +// npm install @cascadeflow/langchain +// npm install @cascadeflow/openai-agents +// npm install @cascadeflow/vercel-ai (already exists) +// npm install @cascadeflow/n8n (already exists) +``` + +## 5.3 DX Execution Contracts (Required) + +These contracts remove ambiguity for production teams: + +1. **`init()` instrumentation diagnostics** + - `init()` emits a startup summary of what was instrumented and what was detected but not instrumented in the current version. + - V2 example: OpenAI instrumented, Anthropic detected-but-not-instrumented warning. +2. **`init()` + `run()` scope composition** + - `init()` defines global defaults for calls outside any scoped run. + - `run()` creates an isolated child scope. + - Inside a `run()` scope, run-level settings override global defaults for that scope only. 
+ - Nested `run()` scopes are isolated; inner scope does not mutate outer scope. +3. **Existing `CascadeAgent` migration behavior** + - `cascadeflow.init()` does not rewrite `CascadeAgent`'s core cascade behavior. + - `CascadeAgent` can execute inside `cascadeflow.run()` to contribute to run-level budget/trace accounting. +4. **Configuration precedence** + - Effective config resolution order: explicit code kwargs > environment variables > config file (`cascadeflow.yaml` / JSON) > library defaults. + - `init()` without kwargs may resolve from env/file for platform deployments. + +## 6. Scope (V2) + +### In Scope + +- Harness engine in core (init, run context, decorator, action evaluation) +- Auto-instrumentation of `openai` Python client library (V2 scope; `anthropic` client and TS parity in V2.1) +- Harness modes: `off | observe | enforce` +- Action vocabulary: `allow | switch_model | deny_tool | stop` +- Config precedence support for harness init (code kwargs > env vars > config file > defaults) +- Hard controls: max cost, max latency, max tool calls, risk gates, max energy +- Soft controls: weighted KPI preferences +- Step-level and tool-level harness hooks +- Energy dimension (optional, in core) +- Parity fixtures/spec for TS implementation in V2.1 (Python implementation ships in V2) +- Integration packages (separate install, not bundled with core): + - `cascadeflow[openai-agents]` — OpenAI Agents SDK (NEW — extra must be added to pyproject.toml) + - `cascadeflow[crewai]` — CrewAI via LLM hooks (NEW — extra must be added to pyproject.toml) + - `cascadeflow[langchain]` — LangChain/LangGraph (EXISTS as code, extra must be added to pyproject.toml) + - Existing integrations verified: Vercel AI SDK, n8n +- Named benchmark suite with acceptance gates + +### Out of Scope (V2) + +- Hosted control plane / Studio (future product) +- Mandatory migration for existing users +- Autonomous learning loop with remote training (future phase) +- Speculative agent execution (future 
phase) +- Carbon API integration (future; energy estimate is V2, live carbon data is not) +- MCP tool call interception (future phase) +- Google ADK integration (on demand) + +## 7. Non-Negotiable Constraints + +- Backward compatible: existing users see zero behavior change +- Opt-in only: `off` by default +- No default latency regression for non-harness users +- Harness decision overhead target: **<5ms p95** +- Cascade execution overhead: documented and expected (extra LLM call for verification) +- Preserve existing DX simplicity for non-harness users +- Framework integrations are never auto-installed with core +- Auto-instrumentation is explicit (`cascadeflow.init()`) — never hidden + +## 8. Architecture + +### 8.1 Package Boundaries + +``` +cascadeflow (core) +├── cascadeflow.harness # Harness engine (NEW) +│ ├── init() # Auto-instrumentation entry point +│ ├── run() # Context manager for scoped runs +│ ├── agent() # Decorator for KPI-annotated agents +│ ├── actions # allow, switch_model, deny_tool, stop +│ ├── context # HarnessContext (runtime state) +│ └── instrument # LLM client patching (openai, anthropic) +├── cascadeflow.rules # Rule engine (EXISTS, extended) +├── cascadeflow.quality # Quality validation (EXISTS) +├── cascadeflow.routing # Routing (EXISTS) +├── cascadeflow.core.cascade # Speculative cascade (EXISTS) +├── cascadeflow.telemetry # Cost tracking + metrics (EXISTS) +└── cascadeflow.providers # LLM providers (EXISTS) + +cascadeflow[openai-agents] # Integration package (NEW) +├── CascadeFlowModelProvider # OpenAI Agents SDK ModelProvider +├── tool_guard # Tool call gating via Agents SDK hooks +└── trace_adapter # Map Agents SDK traces to harness context + +cascadeflow[crewai] # Integration package (NEW) +├── CascadeFlowHooks # CrewAI LLM call hooks +├── crew_context # Extract crew/agent/task state +└── step_callback # Budget tracking per crew step + +cascadeflow[langchain] # Integration package (EXISTS, extended) +├── CascadeFlow(BaseChatModel) # 
Existing LangChain wrapper +├── harness_callback # NEW: LangGraph middleware for harness +└── state_extractor # NEW: Extract LangGraph state for context +``` + +### 8.2 Core Harness Layer + +Extend current rule context with runtime/loop state: + +```python +@dataclass +class HarnessContext: + # Identification + agent_id: Optional[str] = None + run_id: str = field(default_factory=lambda: uuid4().hex[:12]) + + # Budget tracking (hard controls) + budget_max: Optional[float] = None + budget_used: float = 0.0 + tool_calls_max: Optional[int] = None + tool_calls_used: int = 0 + latency_max_ms: Optional[float] = None + latency_used_ms: float = 0.0 + energy_max: Optional[float] = None + energy_used: float = 0.0 + + # Agent state + step_count: int = 0 + tool_history: list[str] = field(default_factory=list) + error_history: list[str] = field(default_factory=list) + prior_actions: list[str] = field(default_factory=list) + cascade_active: bool = False + draft_model: Optional[str] = None + verifier_model: Optional[str] = None + draft_accepted: Optional[bool] = None + + # Soft controls (KPI weights, sum to 1.0) + kpi_weights: Optional[dict[str, float]] = None + + # Compliance + compliance_tags: list[str] = field(default_factory=list) + + # Harness mode + mode: Literal["off", "observe", "enforce"] = "off" +``` + +### 8.3 Harness Action Surface + +Actions the harness can take: + +| Action | Description | When | +|---|---|---| +| `allow` | Proceed normally (default) | Hard limits not violated | +| `switch_model` | Use a different model for this call | Cost/quality/latency optimization | +| `deny_tool` | Block a tool call | Budget exhausted, risk gate, compliance | +| `stop` | Terminate the agent run | Hard budget exceeded, safety gate | + +These actions are evaluated at three hook points: + +- **Pre-LLM-call**: Before each model invocation (model selection, budget check) +- **Pre-tool-call**: Before each tool execution (tool gating, budget check) +- **Post-LLM-call**: After each 
model response (quality validation, state update) + +In `observe` mode: actions are computed and logged but not applied. +In `enforce` mode: actions are computed, logged, and applied. + +### 8.3.1 `switch_model` Resolution Path + +`switch_model` is not a simple fallback list. It uses existing cascadeflow intelligence: + +1. Rule constraints (tenant/channel/KPI/tier/workflow context) +2. Complexity + domain signals +3. Model capability and safety constraints (tool support, risk/compliance requirements) +4. Cost/latency/quality scoring over remaining candidate models + +The selected model and reason are always included in the decision trace. + +### 8.3.2 `deny_tool` Contract (Default) + +Default behavior in V2: + +1. **Prevention path (preferred):** if a tool is disallowed before model execution, the tool is removed/blocked from the callable set for that step. +2. **Interception path:** if a disallowed tool call is still emitted, return a synthetic structured tool result: + - `{"error":"tool_denied","reason":"budget_exceeded","action":"deny_tool"}` +3. Continue the loop with the denial result in context so the agent can recover or stop. + +Integrations may map this to framework-native interruption semantics, but the default contract remains structured and non-crashing. + +### 8.4 Auto-Instrumentation Layer + +Core patches LLM client libraries to intercept calls: + +```python +# V2 scope — core auto-instrumentation: +# - openai (Python) — already an optional dep in pyproject.toml + +# V2.1 scope: +# - anthropic (Python) — already an optional dep +# - openai (TypeScript) — in @cascadeflow/core + +# Supported via integration packages (separate install): +# - litellm (existing integration module; optional dependency) +# - langchain ChatModels (via cascadeflow[langchain]) +# - crewai LLM (via cascadeflow[crewai]) +``` + +The patch intercepts `create()` / `acreate()` calls and: +1. 
Reads the current `HarnessContext` (from context manager or `contextvars`, not thread-local) +2. Evaluates harness rules (complexity, domain, budget state) +3. In `observe`: logs the decision, passes through unchanged +4. In `enforce`: applies action (switch model, cascade, deny) +5. Updates context (cost, latency, step count) + +Implementation contract: +- Patch registration is idempotent (multiple `init()` calls are safe). +- Scoped runs use isolated contextvar state (including nested runs). +- A clean unpatch/reset path exists for tests and controlled shutdown. + +### 8.5 Integration Layer + +Ship as optional integration packages, same pattern as existing integrations: + +- Explicit install (`pip install cascadeflow[crewai]`) +- Explicit enable/config +- No hidden activation from core install +- Try/except imports with `AVAILABLE` flags +- Graceful degradation when not installed + +Each integration provides: +1. **State extraction**: Pull agent/framework state into `HarnessContext` +2. **Native hooks**: Use the framework's own extension points (not custom wrappers) +3. **Telemetry bridge**: Map framework traces to harness telemetry + +| Integration | Framework Extension Point | What It Adds | +|---|---|---| +| `openai-agents` | `ModelProvider` at `Runner.run` level | Model routing, tool gating | +| `crewai` | `llm_hooks` (native CrewAI feature) | LLM call interception, crew state | +| `langchain` | `BaseChatModel` (existing) + LangGraph middleware | State extraction, callbacks | +| `vercel-ai` | Existing `@cascadeflow/vercel-ai` | Extend with harness config | +| `n8n` | Existing `@cascadeflow/n8n-nodes-cascadeflow` | Extend with harness node params | + +## 9. 
Hard vs Soft Controls + +### 9.0 KPI Input Schema + +To avoid ambiguity, harness KPI config is split into two explicit inputs: + +- `kpi_targets`: absolute goals/limits (for example `quality_min`, `latency_ms_max`) +- `kpi_weights`: optimization preferences used for scoring when hard limits are not violated + +### Hard Controls (enforced when enabled) + +| Control | Config | Action on Violation | +|---|---|---| +| Max cost per run | `budget=0.50` | `switch_model` (downgrade) or `stop` | +| Max tool calls | `max_tool_calls=10` | `deny_tool` | +| Max latency per run | `max_latency_ms=5000` | `switch_model` (faster) or `stop` | +| Risk/compliance gate | `compliance="gdpr"` | Route to compliant model or `deny_tool` | +| Max energy estimate | `max_energy=0.01` | `switch_model` (lighter) or `stop` | + +### Soft Controls (influence, don't enforce) + +Weighted KPI preferences that influence model/tool decisions when hard limits are not violated: + +```python +cascadeflow.init( + mode="enforce", + kpi_weights={ + "cost": 0.4, # 40% weight on cost optimization + "quality": 0.3, # 30% weight on quality + "latency": 0.2, # 20% weight on latency + "energy": 0.1, # 10% weight on energy efficiency + } +) +``` + +Soft controls affect model scoring in the cascade routing decision. They do not trigger `deny_tool` or `stop`. + +### 9.1 Prompt Caching Strategy + +Prompt caching is complementary to cascading and budget enforcement. + +V2: +- Capture cache-related usage signals where available (e.g., cached tokens) in telemetry. +- Expose cache metrics in traces and benchmark artifacts. +- Do not make cache-hit optimization a hard routing objective yet. + +V2.1: +- Optional cache-aware scoring bias for compatible providers/models. +- Validate that cache-aware routing improves net economics without quality regressions. 
+ +### 9.2 Energy Estimation Specification (V2) + +V2 uses a deterministic proxy estimate (not real-time grid carbon): + +- `energy_units = model_coefficient * (input_tokens + output_tokens * output_weight)` +- `model_coefficient` comes from a versioned local mapping (fallback to default when unknown). +- `output_weight` defaults to >1 to reflect higher generation compute cost. + +This keeps energy scoring deterministic, reproducible, and local-first. Live carbon-intensity routing remains post-V2. + +## 10. TS/Python Parity Requirements + +Parity means same core semantics, not necessarily identical APIs. + +V2 ships Python first. TS parity is a V2.1 deliverable (Phase F). Parity fixtures are written in V2 Phase A as the TS implementation spec. + +Target parity (V2.1): +- Same harness modes: `off | observe | enforce` +- Same action vocabulary: `allow | switch_model | deny_tool | stop` +- Same `HarnessContext` fields for budget/latency/energy/tool-depth +- Same fallback behavior when harness is disabled +- Same hook points: pre-LLM-call, pre-tool-call, post-LLM-call +- Comparable telemetry fields for analysis +- Shared parity test fixtures (written in V2, validated in V2.1) + +## 11. Framework Integrations (V2) + +### 11.1 OpenAI Agents SDK (`cascadeflow[openai-agents]`) + +Required as official integration coverage in V2. 
+ +Integration approach: +- Use `ModelProvider` at `Runner.run` level (framework's native extension) +- NOT a custom wrapper around the SDK +- Harness evaluates at each agent step via the model provider +- Tool gating via tool-call inspection in model responses + +Minimum capabilities: +- Harness runs in `observe` and `enforce` modes +- Tool-call gating (deny on harness action) +- Model recommendation/switch based on harness decision +- Budget tracking across multi-step agent runs +- No hard dependency forced onto all cascadeflow users + +### 11.2 CrewAI (`cascadeflow[crewai]`) + +Integration approach: +- Use CrewAI's native `llm_hooks` (before/after LLM calls) +- Extract crew/agent/task state into `HarnessContext` +- Budget tracking via `step_callback` + +### 11.3 LangChain/LangGraph (`cascadeflow[langchain]`) + +Integration approach: +- Extend existing `CascadeFlow(BaseChatModel)` wrapper +- Add LangGraph-specific middleware for state extraction +- Add harness-aware callback handler +- Preserve existing DX for current LangChain users + +### 11.4 Existing Integrations + +Verify and extend (no breaking changes): +- `@cascadeflow/vercel-ai`: Add harness config pass-through +- `@cascadeflow/n8n-nodes-cascadeflow`: Add harness mode parameter to nodes +- `cascadeflow.integrations.litellm`: Verify harness compatibility +- `cascadeflow.integrations.openclaw`: Verify harness compatibility + +## 12. Transparency and Debugging + +Auto-instrumentation must not be magic. 
Every harness decision is visible: + +- `cascadeflow.init(mode="observe")`: Logs every decision (what it *would* do) +- `cascadeflow.init(mode="enforce", verbose=True)`: Rich console output showing cascade path +- Harness metadata is accessible via two paths depending on usage mode: + - **Library mode** (in-process): Metadata on `HarnessContext` / `run` object — `run.last_action`, `run.model_used`, `run.draft_accepted`, `run.budget_remaining`, `run.run_id` + - **Proxy mode** (HTTP gateway): `x-cascadeflow-*` response headers (existing proxy behavior, unchanged) +- `run.trace()` returns full decision log for a scoped run +- Harness decisions are emitted via existing `CallbackManager` events +- All decisions include: action taken, reason, model used, budget state, run_id for correlation + +Default logging destination: +- Logger name: `cascadeflow.harness` +- `DEBUG`: per-step decisions and action reasons +- `INFO`: per-run summaries in `run()` scope +- `verbose=True`: adds rich console rendering on top of logger output (does not replace structured logging) + +### 12.1 Run Object Surface (V2 Target API) + +```python +run.cost # float: total cost in scoped run +run.savings # float: savings percentage vs selected baseline +run.tool_calls # int: tool calls used +run.budget_remaining # float|None: remaining budget if configured +run.model_used # str|None: most recent selected model +run.last_action # str: allow|switch_model|deny_tool|stop +run.draft_accepted # bool|None: draft acceptance for last cascade decision +run.run_id # str: correlation id +run.trace() # list[dict]: full decision timeline +``` + +## 13. Benchmark and Validation Plan + +Use live API runs and keep comparability with prior benchmark set. Winning claims require reproducible, public methodology. 
+ +### 13.1 Benchmark Families + +- Baseline language/reasoning: MT-Bench, TruthfulQA +- Code correctness: HumanEval, SWE-bench Lite slices +- Classification/structured output: Banking77 +- Tool use and agent loops: BFCL-style tool/function scenarios + internal loop tests +- Product realism: customer-support and multi-agent delegation scenarios already aligned with cascadeflow usage + +### 13.2 Realistic Workflow Suite (Required) + +Each benchmark run must include at least these workload types: + +- Existing app integration flow (OpenAI SDK direct calls) +- Existing agent framework flow (OpenAI Agents SDK, LangChain/LangGraph, CrewAI) +- Tool-heavy flow (5+ loop steps, mixed tool success/failure) +- Budget-constrained flow (mid-run budget pressure) +- Risk/compliance-constrained flow (policy escalation and tool deny paths) + +### 13.3 Reproducibility Protocol (Non-Negotiable) + +- Pin exact git SHA, benchmark script version, model names, and provider endpoints. +- Store raw per-case outputs (JSON/JSONL), not only aggregate summaries. +- Record both quality metrics and economics metrics per case: + - accepted/rejected, + - draft acceptance, + - total cost, + - latency, + - selected model path, + - policy action path. +- Publish confidence intervals and sample sizes for reported improvements. +- Re-run on at least two separate days before public claims. + +### 13.4 Superiority Criteria (Grounded) + +To claim “winning” in go-to-market material: + +- Quality: non-inferior to baseline on core tasks with agreed margin. +- Cost: statistically significant reduction on realistic agent workflows. +- Latency: no material regression for non-harness users; harness overhead p95 <5ms. +- Policy safety: false-positive enforcement rate under agreed threshold. +- DX: time-to-first-value within target and successful quickstart completion by external testers. + +### 13.5 Launch Gates + +- Observe mode must be behavior-identical to baseline (output parity checks). 
+- Enforce mode must show measurable value on at least three realistic workflow families. +- Benchmark scripts and result artifacts must be executable by third parties with documented setup. + +## 14. Competitive Positioning + +### 14.1 Ecosystem Baseline Capabilities + +- Provider/model fallback and load balancing +- Request-level cost optimization (model selection) +- Cross-provider unified API access +- Low integration friction (URL change) +- Framework middleware/hooks, guardrails, and tracing + +### 14.2 What Remains Unresolved Across These Tools + +- No shared cross-framework policy semantics for business KPIs. +- Limited consistent in-loop controls across model/tool/agent-step decisions. +- Weak portability of optimization behavior across direct SDK use and framework use. +- Economic claims are often hard to reproduce end-to-end on realistic workflows. + +### 14.3 Remaining Gap We Target + +- Cross-framework policy semantics (one control model across stacks). +- In-loop optimization that combines cost, latency, quality, risk, and business KPIs. +- Local-first deployment without mandatory cloud control plane. +- Reproducible economic + quality gains under realistic agent workflows. 
+ +### 14.4 Positioning Against Current Market + +| Category | Examples | Their Strength | cascadeflow Differentiator | +|---|---|---|---| +| Budget-only enforcement | AgentBudget, custom budget middleware | Fast setup for spend caps and loop stops | Multi-dimensional optimization: cost + quality + latency + KPI + energy + cascade validation | +| Proxy cost-control + observability | Helicone, similar gateway observability stacks | Fast request-level analytics/caching/rules without code-level harness changes | In-process agent-state decisions and step/tool-level policy enforcement inside loops | +| External routers/gateways | OpenRouter, Portkey, NotDiamond | Provider/routing control at API boundary | In-loop action control with agent state and policy context | +| Framework-native orchestration | OpenAI Agents SDK, LangGraph, CrewAI | Rich framework-specific hooks and orchestration | Cross-framework policy layer + unified KPI semantics | +| Single-provider optimization | Provider-native routing features | Tight provider integration and defaults | Multi-provider, user-economics-first optimization | + +## 15. Risks and Mitigations + +- **Risk**: Over-complex harness UX + Mitigation: Default `off`, `observe` before `enforce`, 1-3 lines to activate. Progressive complexity. + +- **Risk**: Auto-instrumentation surprises (patching library internals) + Mitigation: Explicit `init()` required. Never hidden. `observe` mode first. Verbose logging available. Metadata on every response. + +- **Risk**: "Always verifier" behavior in sensitive benchmarks + Mitigation: Explicit harness reasons + scenario tests + calibrated hard/soft boundaries. + +- **Risk**: TS/Python drift + Mitigation: Shared parity fixtures and decision test cases. + +- **Risk**: Integration sprawl + Mitigation: One harness core, thin adapters per integration. Auto-instrumentation plus explicit adapter mode for hard runtimes. + +- **Risk**: Framework API instability (breaking changes in LangGraph, CrewAI, etc.) 
+ Mitigation: Integrations are thin adapters (<500 lines). Core harness works via LLM client patching regardless of framework changes. + +- **Risk**: LangChain/OpenAI build competing harness features + Mitigation: Ship fast, position as complementary (not competing), framework-agnostic is the moat. LangChain's Deep Agents is LangChain-only. cascadeflow works with everything. + +- **Risk**: LLM provider builds internal routing (GPT-5 internal router) + Mitigation: Provider routing is single-provider and optimizes for provider economics. cascadeflow is multi-provider and optimizes for user economics/KPIs. Re-evaluate this risk quarterly with a documented competitive capability review. + +- **Risk**: Harness decision overhead exceeds target + Mitigation: Rule evaluation is CPU-only (no network calls). Benchmark continuously. Degrade gracefully (skip harness if overhead budget exceeded). + +- **Risk**: Low-friction promise fails in real teams + Mitigation: Track time-to-first-value in external pilot tests; gate launch on quickstart completion metrics. + +- **Risk**: Benchmark claims are not trusted externally + Mitigation: Publish reproducibility protocol, scripts, and raw artifacts for independent reruns. + +## 16. 
Release Plan (Phased) + +### Phase A: Harness Core Definition (2-3 weeks) + +- Finalize `HarnessContext` schema +- Finalize action vocabulary and hook points +- Define `off | observe | enforce` mode behavior +- Write parity fixtures (Python first, TS fixtures as spec — TS implementation in V2.1) +- Design auto-instrumentation for `openai` Python client +- Add new extras to `pyproject.toml`: `langchain`, `openai-agents`, `crewai` + +Exit criteria: +- Schema frozen +- Python parity fixture tests green +- Auto-instrumentation prototype patching `openai` Python client +- pyproject.toml extras defined (even if integration code is not yet complete) + +### Phase B: Observe Mode (3-4 weeks) + +- Implement `cascadeflow.init(mode="observe")` (NEW top-level API) +- Auto-instrument `openai` Python client (sync + async + streaming + tool calling) +- Emit startup instrumentation diagnostics (instrumented vs detected-but-not-instrumented SDKs) +- Implement `cascadeflow.run()` context manager +- Emit decision traces via `CallbackManager` +- Integrate with existing `RuleEngine` (extended with `HarnessContext`) +- Harness metadata on `HarnessContext` / `run` object + +Note: Auto-instrumentation of `openai` client is the highest-risk engineering task. Patching async streaming, tool calling, retries, and `with_raw_response` requires exhaustive edge-case testing. 
+ +Exit criteria: +- `observe` mode produces zero behavior change (validated by benchmark) +- Decision traces are accurate and complete +- Overhead within <5ms p95 target +- All existing tests still pass (backward compatibility) +- Edge cases validated: streaming, async, tool calling, parallel tool calls, retries + +### Phase C: Enforce Mode (3-4 weeks) + +- Activate `switch_model`, `deny_tool`, `stop` actions +- Implement hard controls (budget, tool-call cap, latency, energy) +- Implement soft controls (KPI-weighted model scoring) +- Add safety fallbacks (graceful degradation on harness error) +- Implement `@cascadeflow.agent()` decorator + +Exit criteria: +- Enforced behavior matches harness intent +- No critical regressions in benchmark suite +- Hard controls reliably enforced (100% of violations caught) +- Harness errors never crash the agent (fail-open) + +### Phase D: Integration Packages (3-5 weeks, parallelizable with Phase C) + +- `cascadeflow[openai-agents]`: ModelProvider + tool gating +- `cascadeflow[crewai]`: LLM hooks + crew state extraction +- `cascadeflow[langchain]`: Extend existing with harness callbacks +- Verify existing integrations: Vercel AI SDK, n8n, LiteLLM, OpenClaw +- Docs + quickstarts + examples for each integration + +Exit criteria: +- Install and quickstart verified end-to-end for each integration +- CI and integration tests green +- Each integration <500 lines of framework-specific code + +### Phase E: Benchmarks + Public Launch (2-3 weeks) + +- Run full benchmark suite (baseline + agentic + harness scenarios) +- Publish reproducible benchmark results +- Write launch content (blog post, integration cookbooks) +- Go/No-Go checklist validated + +Exit criteria: +- All acceptance gates met +- Benchmark results published and reproducible +- DX quickstart works for existing app/agent users with 1-3 lines of code + +### Total V2 Timeline (Python): 14-18 weeks + +This is the realistic timeline for Python-first delivery with one primary 
contributor. Phases C and D can overlap (integration packages start once enforce mode core is stable). + +### V2 Success Scorecard (Must Pass Before Launch) + +- **Low-friction install** + - 80%+ of pilot users complete quickstart without maintainer help. + - Median time-to-first-value under 15 minutes. +- **In-loop KPI control** + - Policy actions (`switch_model`, `deny_tool`, `stop`) triggered and logged correctly in scenario tests. + - Observe→enforce rollout shows no unexpected behavior in pilot tenants. +- **Benchmark superiority** + - Quality non-inferior vs baseline on agreed benchmark set. + - Statistically significant cost reduction on realistic agent workflows. + - Harness overhead p95 under 5ms for decision path. + +### Phase F: TypeScript Parity (V2.1, post-V2 launch) + +- Port `cascadeflow.init()` / `run()` to `@cascadeflow/core` +- Auto-instrument `openai` TypeScript client (OpenAI Node SDK) +- Port `HarnessContext`, action evaluation, harness modes +- TS parity fixture tests green +- Extend `@cascadeflow/vercel-ai` and `@cascadeflow/n8n` with harness support + +Estimated: 6-8 weeks after V2 Python launch. + +### Phase G: Anthropic Client Instrumentation (V2.1) + +- Auto-instrument `anthropic` Python client +- Auto-instrument `@anthropic-ai/sdk` TypeScript client +- Validate with Claude-based agent workflows + +Estimated: 3-4 weeks (can parallel with Phase F). + +### 16.1 Parallel Branch Workboard (Tick-Off) + +Use this section as the single coordination board for parallel execution. + +Branching model: +- Keep `main` always releasable. +- Use one integration branch for this program: `feature/agent-intelligence-v2-integration`. +- Contributors build on short-lived feature branches and merge to the integration branch first. +- Merge to `main` only after integration branch CI + benchmark gates are green. 
+
+Claim checklist (one owner per branch at a time):
+- [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed`
+- [ ] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress`
+- [ ] `feat/v2-enforce-actions` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged`
+- [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress`
+- [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged`
+- [ ] `feat/v2-langchain-harness-extension` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged`
+- [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged`
+- [ ] `feat/v2-bench-repro-pipeline` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged`
+- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged`
+
+Merge gates per feature branch:
+- [ ] Unit/integration tests green for touched scope
+- [ ] Docs/examples updated for any API or behavior change
+- [ ] Backward compatibility verified (`off` mode unchanged)
+- [ ] Bench impact assessed (if runtime behavior changed)
+
+Integration-branch promotion gates:
+- [ ] Core + integration CI green
+- [ ] Full benchmark suite rerun with reproducibility artifacts
+- [ ] Quickstart verification for existing app and framework paths
+- [ ] Go/No-Go checklist in Section 18 satisfied before merging to `main`
+
+## 17. Future Phases (Post-V2, Not in Scope)
+
+For roadmap visibility. These inform V2 telemetry design but are not V2 deliverables.
+ +### Future: Speculative Agent Execution +- Extend speculative cascade from model-level to agent-step-level +- Speculative next-step execution with cheap models, rollback on validation failure +- Selective verification (not every step needs verification) +- Validated by: Sherlock (Microsoft, 2025), Speculative Actions (2025) + +### Future: Adaptive Learning Engine +- Contextual bandit routing (replace/augment static rules) +- Per-agent, per-task performance tracking +- Online learning from outcomes, no offline training needed +- Cold-start with aggregated anonymous routing telemetry (opt-in) +- Validated by: EMNLP 2025 bandit routing papers, BATS (Google, 2025) + +### Future: cascadeflow Studio (Cloud Product) +- Dashboard: real-time visualization of all dimensions +- Fleet suggestions: auto-recommend optimal model combinations +- Learning flywheel: shared (anonymized) routing data improves routing for all users +- A/B testing: compare routing strategies in production +- Custom KPI builder: visual interface for defining business dimensions +- V2 telemetry fields are designed to support Studio without breaking changes + +### Future: MCP Integration +- Intercept MCP tool calls (not just function-calling) +- Apply harness logic to MCP server interactions +- Track MCP server latency/reliability as routing dimensions + +### Future: Additional Dimensions +- Carbon-aware routing with live grid carbon intensity data +- Data residency / compliance-aware model selection +- Custom business KPI plugins (user-defined scoring functions) + +## 18. 
Go/No-Go Checklist + +Go when all are true (V2 Python launch): + +- [ ] Harness layer is opt-in and backward compatible +- [ ] `cascadeflow.init()` auto-instruments `openai` Python client +- [ ] `observe` mode produces zero behavior change (benchmark-validated) +- [ ] `enforce` mode actions work correctly (switch_model, deny_tool, stop) +- [ ] Harness decision overhead <5ms p95 +- [ ] Python parity fixture tests pass +- [ ] Core + integration CI green +- [ ] Benchmark comparison acceptable vs latest baseline +- [ ] OpenAI Agents SDK integration documented and validated +- [ ] CrewAI integration documented and validated +- [ ] LangChain integration extended and validated +- [ ] Existing integrations (Vercel AI, n8n) verified compatible (no regressions) +- [ ] DX quickstart works for existing app/agent users with 1-3 lines of code change +- [ ] External pilot median time-to-first-value <15 minutes +- [ ] Public benchmark results ready for launch +- [ ] Benchmark scripts + raw artifacts are reproducible by third parties +- [ ] pyproject.toml extras (`openai-agents`, `crewai`, `langchain`) defined and installable + +V2.1 Go/No-Go (TS parity + anthropic): +- [ ] TS parity fixtures pass +- [ ] `@cascadeflow/core` exports `cascadeflow.init()` and `cascadeflow.run()` +- [ ] `anthropic` Python client auto-instrumentation validated +- [ ] `@anthropic-ai/sdk` TS client auto-instrumentation validated + +## 19. 
Academic Validation + +Key research validating this approach (all 2025): + +- **BATS: Budget-Aware Tool-Use** (Google DeepMind) — Budget tracker module achieves comparable accuracy with 40% fewer tool calls, 31% cost reduction +- **Sherlock** (Microsoft) — Speculative execution + selective verification for agent workflows: 18% accuracy gain, 48% latency reduction +- **Speculative Actions** — Generalizes speculative decoding to agent actions (tool calls, API calls): up to 55% next-action prediction accuracy +- **Faster Cascades via Speculative Decoding** (Google, ICLR 2025) — Validates cascadeflow's core cascade architecture as a first-class research direction +- **GAR: Carbon-Aware Routing** — Treats carbon as first-class routing objective alongside accuracy and latency SLOs +- **Adaptive LLM Routing Under Budget Constraints** (EMNLP 2025) — Bandit-based routing outperforms static rules with no offline training +- **LLM Routing with Dueling Feedback** — Contextual dueling bandits for model routing with cost-performance balancing + +## 20. Documentation Strategy: Human + Agent Readable + +### Platform + +All public documentation hosted on [Mintlify](https://www.mintlify.com/). Mintlify provides structured, searchable, API-aware docs with built-in support for code examples, OpenAPI specs, and MDX. + +### Why This Matters Strategically + +In 2026+, agents increasingly select their own tooling. When a coding agent (Cursor, Claude Code, Devin, OpenHands) is asked to "build an agent that optimizes costs," it will search for and evaluate libraries. If cascadeflow's documentation is the clearest, most structured, and most machine-parseable option, agents will recommend and integrate cascadeflow by default. 
+ +**The documentation is a product surface for two audiences: human developers and AI agents.** + +Documentation must directly support the three winning pillars: +- fast activation (low-friction install), +- clear policy control patterns (in-loop KPI influence), +- transparent evidence (reproducible benchmark claims). + +### Design Principles + +**For humans:** +- 5-minute quickstart with copy-paste code (Tier 1: `init()` → instant savings) +- Progressive disclosure: simple first, advanced when ready +- Integration cookbooks per framework (LangGraph, OpenAI Agents SDK, CrewAI) +- Visual architecture diagrams +- Benchmark results with reproducible methodology +- Changelog and migration guides + +**For agents (LLM-readable):** +- Structured, consistent markdown with clear headings and hierarchy +- Every API surface documented with typed signatures, parameter descriptions, return types, and concrete examples +- `llms.txt` and/or `llms-full.txt` at docs root — a single-file summary optimized for LLM context windows (see llmstxt.org convention) +- OpenAPI spec for the proxy gateway endpoints +- Decision trees in explicit if/then format: "If you need cost optimization for an agent → use `cascadeflow.init()`. If you need budget enforcement → use `cascadeflow.run(budget=X)`." 
+- Comparison pages with structured tables: cascadeflow vs OpenRouter, vs NotDiamond, vs Portkey — in a format agents can parse and reason about +- Code examples that are complete and runnable (no `...` ellipsis, no "fill in your code here") +- Explicit "when to use" and "when NOT to use" sections — agents make better recommendations when they understand boundaries +- Capability matrix: what works with which framework, which harness mode, which language + +### Documentation Structure + +``` +docs/ +├── quickstart.mdx # 5-minute setup, Tier 1 (init) +├── concepts/ +│ ├── how-cascading-works.mdx # Core cascade explained +│ ├── harness-modes.mdx # off / observe / enforce +│ ├── hard-soft-controls.mdx # Budget, KPIs, compliance +│ └── actions.mdx # allow, switch_model, deny_tool, stop +├── guides/ +│ ├── budget-enforcement.mdx # Tier 2: run() context manager +│ ├── kpi-injection.mdx # Tier 3: @agent() decorator +│ ├── observe-first.mdx # Production rollout guide +│ └── benchmarks.mdx # How to reproduce our benchmarks +├── integrations/ +│ ├── openai-agents-sdk.mdx # Cookbook + full example +│ ├── crewai.mdx # Cookbook + full example +│ ├── langchain.mdx # Cookbook + full example +│ ├── vercel-ai-sdk.mdx # Existing, extended +│ ├── n8n.mdx # Existing, extended +│ └── litellm.mdx # Existing +├── api-reference/ +│ ├── python/ # Auto-generated from docstrings +│ └── typescript/ # Auto-generated from TSDoc +├── comparisons/ +│ ├── vs-openrouter.mdx # Structured comparison +│ ├── vs-notdiamond.mdx +│ ├── vs-portkey.mdx +│ └── vs-litellm.mdx +├── llms.txt # Single-file LLM-optimized summary +└── llms-full.txt # Complete API reference for agent context +``` + +### `llms.txt` Specification + +A concise, structured file at the docs root that gives any LLM/agent everything it needs to evaluate and use cascadeflow: + +``` +# cascadeflow + +> Agent intelligence harness for cost, quality, latency, and business KPI optimization. +> In-process library (not a proxy). 
Works inside agent loops with full state awareness. + +## Install +pip install cascadeflow + +## Quickstart (3 lines) +import cascadeflow +cascadeflow.init(mode="enforce") +# All openai SDK calls are now cascaded. 40-80% cost savings. + +## When to use cascadeflow +- You run AI agents (LangGraph, CrewAI, OpenAI Agents SDK, or custom) +- You want to reduce LLM costs without changing agent code +- You need budget enforcement across multi-step agent runs +- You need to inject business KPIs into agent decisions + +## When NOT to use cascadeflow +- Single one-off LLM calls (overhead not justified) +- You only use one model and don't want routing + +## Key APIs +- cascadeflow.init(mode) — activate harness globally +- cascadeflow.run(budget, max_tool_calls) — scoped agent run with budget +- @cascadeflow.agent(budget, kpis) — annotate agent functions + +## Integrations +- pip install cascadeflow[langchain] +- pip install cascadeflow[openai-agents] +- pip install cascadeflow[crewai] + +## Docs: https://docs.cascadeflow.ai +``` + +### Timeline + +Documentation is not a post-launch task. It ships with each phase: + +- Phase A: `llms.txt`, concepts pages, API reference stubs +- Phase B: Quickstart (observe mode), `llms-full.txt` +- Phase C: Budget enforcement guide, KPI injection guide +- Phase D: Integration cookbooks (one per framework) +- Phase E: Comparison pages, benchmark results, launch blog post + +## 21. 
Document Owners + +- Product strategy: cascadeflow maintainers +- Technical design owner: core/runtime maintainers +- Integration owners: per package maintainer (same pattern as existing integrations) +- Documentation: maintained alongside code — every PR that changes API must update docs diff --git a/examples/integrations/README.md b/examples/integrations/README.md index f8728e21..e7e7906a 100644 --- a/examples/integrations/README.md +++ b/examples/integrations/README.md @@ -5,6 +5,7 @@ This directory contains production-ready integration examples for cascadeflow wi ## 📋 Table of Contents - [LiteLLM Integration](#-litellm-integration) - Access 10+ providers with automatic cost tracking +- [OpenAI Agents SDK Integration](#-openai-agents-sdk-integration) - Harness-aware ModelProvider for existing agent apps - [Paygentic Integration](#-paygentic-integration) - Usage event reporting and billing lifecycle helpers - [Local Providers](#-local-providers-setup) - Ollama and vLLM configuration examples - [OpenTelemetry & Grafana](#-opentelemetry--grafana) - Production observability and metrics @@ -138,6 +139,27 @@ export HF_TOKEN="..." --- +## 🤖 OpenAI Agents SDK Integration + +**File:** [`openai_agents_harness.py`](openai_agents_harness.py) + +Use cascadeflow as an explicit `ModelProvider` integration in the OpenAI Agents SDK. 
+ +### Quick Start + +```bash +pip install "cascadeflow[openai,openai-agents]" +python examples/integrations/openai_agents_harness.py +``` + +### What It Shows + +- Harness-aware model switching with candidate models +- Tool gating when enforce-mode caps are reached +- Run-scoped metrics and trace inspection via `cascadeflow.run(...)` + +--- + ## 💳 Paygentic Integration **File:** [`paygentic_usage.py`](paygentic_usage.py) diff --git a/examples/integrations/google_adk_harness.py b/examples/integrations/google_adk_harness.py new file mode 100644 index 00000000..0315dc90 --- /dev/null +++ b/examples/integrations/google_adk_harness.py @@ -0,0 +1,89 @@ +""" +Google ADK + cascadeflow harness integration example. + +Run: + pip install "cascadeflow[google-adk]" + export GOOGLE_API_KEY="your-key" + python examples/integrations/google_adk_harness.py +""" + +from __future__ import annotations + +import asyncio + + +async def main() -> None: + try: + from google.adk.agents import Agent + from google.adk.runners import Runner + from google.adk.sessions import InMemorySessionService + except ImportError as exc: + raise SystemExit( + "Google ADK is not installed. " + 'Install with: pip install "cascadeflow[google-adk]"' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.google_adk import enable, GoogleADKHarnessConfig + + # 1. Initialize harness globally + init(mode="observe", budget=1.0) + + # 2. Create the cascadeflow ADK plugin + plugin = enable( + config=GoogleADKHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) + ) + + # 3. Define an ADK agent + agent = Agent( + name="demo_agent", + model="gemini-2.5-flash", + instruction="You are a helpful assistant. Answer concisely.", + ) + + # 4. 
Create a Runner with the cascadeflow plugin + session_service = InMemorySessionService() + runner = Runner( + agent=agent, + app_name="cascadeflow_demo", + session_service=session_service, + plugins=[plugin], # cascadeflow hooks into all LLM calls here + ) + + # 5. Run within a harness scope + with run(budget=0.5) as session: + user_session = await session_service.create_session( + app_name="cascadeflow_demo", + user_id="demo-user", + ) + + from google.genai.types import Content, Part + + async for event in runner.run_async( + user_id="demo-user", + session_id=user_session.id, + new_message=Content(parts=[Part(text="What is model routing?")]), + ): + if event.content and event.content.parts: + for part in event.content.parts: + if part.text: + print(part.text, end="") + print() + + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Energy: {session.energy_used:.1f}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/integrations/openai_agents_harness.py b/examples/integrations/openai_agents_harness.py new file mode 100644 index 00000000..69ea6bcd --- /dev/null +++ b/examples/integrations/openai_agents_harness.py @@ -0,0 +1,62 @@ +""" +OpenAI Agents SDK + cascadeflow harness integration example. + +Run: + pip install "cascadeflow[openai,openai-agents]" + python examples/integrations/openai_agents_harness.py +""" + +from __future__ import annotations + +import asyncio + + +async def main() -> None: + try: + from agents import Agent, RunConfig, Runner + except ImportError as exc: + raise SystemExit( + "OpenAI Agents SDK is not installed. 
" + "Install with: pip install \"cascadeflow[openai,openai-agents]\"" + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.openai_agents import ( + CascadeFlowModelProvider, + OpenAIAgentsIntegrationConfig, + ) + + init(mode="observe", budget=1.0, max_tool_calls=5) + + provider = CascadeFlowModelProvider( + config=OpenAIAgentsIntegrationConfig( + model_candidates=["gpt-4o", "gpt-4o-mini"], + enable_tool_gating=True, + ) + ) + + agent = Agent( + name="RouteAwareAgent", + instructions="Respond clearly and include a short reasoning summary.", + model="gpt-4o", + ) + + run_config = RunConfig(model_provider=provider) + + with run(budget=0.5, max_tool_calls=3) as session: + result = await Runner.run(agent, "Summarize why model routing helps agent budgets.", run_config=run_config) + + print("=== Result ===") + print(result.final_output) + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index 0d488faa..8f11ae44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,6 +92,18 @@ semantic = [ # OpenClaw integration (auto-enables FastEmbed for semantic routing) openclaw = ["fastembed>=0.7.0"] +# CrewAI harness integration (opt-in) +crewai = ["crewai>=1.5.0"] + +# OpenAI Agents SDK integration (opt-in) +openai-agents = [ + "openai-agents>=0.8.4; python_version < '3.10'", + "openai-agents>=0.9.0; python_version >= '3.10'", +] + +# Google ADK integration (opt-in, requires Python 3.10+) +google-adk = ["google-adk>=1.0.0; python_version >= '3.10'"] + # Development tools (includes rich for terminal output) dev = [ "pytest>=7.4.0", diff --git a/tests/test_crewai_integration.py b/tests/test_crewai_integration.py 
new file mode 100644 index 00000000..622f4b4b --- /dev/null +++ b/tests/test_crewai_integration.py @@ -0,0 +1,508 @@ +"""Tests for cascadeflow.integrations.crewai harness integration. + +crewai is not installed in test environments, so we mock the hooks module +and test the integration logic directly against HarnessRunContext. +""" + +from __future__ import annotations + +import types +from unittest.mock import patch + +import pytest + +from cascadeflow.harness import init, reset, run + +# Import the module directly — it does not require crewai at import time +# (CREWAI_AVAILABLE will be False, but all functions/classes are still defined). +import cascadeflow.integrations.crewai as crewai_mod + + +@pytest.fixture(autouse=True) +def _reset_crewai_state(): + """Reset harness and crewai module state before every test.""" + reset() + crewai_mod._hooks_registered = False + crewai_mod._before_hook_ref = None + crewai_mod._after_hook_ref = None + crewai_mod._config = crewai_mod.CrewAIHarnessConfig() + crewai_mod._call_start_times.clear() + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +class FakeLLM: + """Minimal stand-in for a CrewAI LLM object.""" + + def __init__(self, model: str = "gpt-4o"): + self.model = model + + +class FakeHookContext: + """Minimal stand-in for crewai's LLMCallHookContext.""" + + def __init__( + self, + *, + llm: FakeLLM | None = None, + messages: list | None = None, + response: str | None = None, + ): + self.llm = llm or FakeLLM() + self.messages = messages or [] + self.response = response + + +def _make_fake_hooks_module(): + """Build a fake crewai.hooks module with recording registration helpers.""" + mod = types.ModuleType("crewai.hooks") + mod._before_hooks = [] + mod._after_hooks = [] + mod.register_before_llm_call_hook = lambda fn: mod._before_hooks.append(fn) + mod.register_after_llm_call_hook = lambda fn: 
mod._after_hooks.append(fn) + mod.unregister_before_llm_call_hook = lambda fn: ( + mod._before_hooks.remove(fn) if fn in mod._before_hooks else None + ) + mod.unregister_after_llm_call_hook = lambda fn: ( + mod._after_hooks.remove(fn) if fn in mod._after_hooks else None + ) + return mod + + +# --------------------------------------------------------------------------- +# _extract_message_content +# --------------------------------------------------------------------------- + + +class TestExtractMessageContent: + def test_dict_message(self): + msg = {"role": "user", "content": "Hello world"} + assert crewai_mod._extract_message_content(msg) == "Hello world" + + def test_dict_message_missing_content(self): + msg = {"role": "system"} + assert crewai_mod._extract_message_content(msg) == "" + + def test_dict_message_none_content(self): + msg = {"role": "assistant", "content": None} + assert crewai_mod._extract_message_content(msg) == "" + + def test_object_message(self): + class Msg: + content = "from object" + + assert crewai_mod._extract_message_content(Msg()) == "from object" + + def test_object_message_no_content(self): + assert crewai_mod._extract_message_content(object()) == "" + + +# --------------------------------------------------------------------------- +# _extract_model_name +# --------------------------------------------------------------------------- + + +class TestExtractModelName: + def test_extracts_plain_model(self): + ctx = FakeHookContext(llm=FakeLLM("gpt-4o")) + assert crewai_mod._extract_model_name(ctx) == "gpt-4o" + + def test_strips_provider_prefix(self): + ctx = FakeHookContext(llm=FakeLLM("openai/gpt-4o-mini")) + assert crewai_mod._extract_model_name(ctx) == "gpt-4o-mini" + + def test_no_llm_returns_unknown(self): + ctx = FakeHookContext() + ctx.llm = None + assert crewai_mod._extract_model_name(ctx) == "unknown" + + def test_no_model_attr_returns_unknown(self): + ctx = FakeHookContext() + ctx.llm = object() # no .model attribute + assert 
crewai_mod._extract_model_name(ctx) == "unknown" + + def test_non_string_model_returns_unknown(self): + ctx = FakeHookContext() + ctx.llm = FakeLLM("gpt-4o") + ctx.llm.model = 42 # not a string + assert crewai_mod._extract_model_name(ctx) == "unknown" + + +# --------------------------------------------------------------------------- +# Cost / energy estimation +# --------------------------------------------------------------------------- + + +class TestEstimation: + def test_estimate_cost_known_model(self): + cost = crewai_mod._estimate_cost("gpt-4o-mini", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.15 + 0.60) + + def test_estimate_cost_unknown_model_uses_default(self): + cost = crewai_mod._estimate_cost("unknown-model", 1_000_000, 0) + assert cost == pytest.approx(2.50) + + def test_estimate_energy_known_model(self): + energy = crewai_mod._estimate_energy("gpt-4o", 100, 100) + # coeff=1.0, output_weight=1.5 + assert energy == pytest.approx(1.0 * (100 + 100 * 1.5)) + + def test_estimate_energy_unknown_model(self): + energy = crewai_mod._estimate_energy("unknown-model", 100, 100) + assert energy == pytest.approx(1.0 * (100 + 100 * 1.5)) + + +# --------------------------------------------------------------------------- +# before_llm_call_hook +# --------------------------------------------------------------------------- + + +class TestBeforeHook: + def test_no_run_context_returns_none(self): + ctx = FakeHookContext() + result = crewai_mod._before_llm_call_hook(ctx) + assert result is None + + def test_observe_mode_allows(self): + init(mode="observe", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 # over budget + hook_ctx = FakeHookContext() + result = crewai_mod._before_llm_call_hook(hook_ctx) + # observe mode never blocks + assert result is None + + def test_enforce_blocks_when_budget_exhausted(self): + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.001 # exactly at budget + hook_ctx = 
FakeHookContext(llm=FakeLLM("gpt-4o")) + result = crewai_mod._before_llm_call_hook(hook_ctx) + assert result is False + assert run_ctx.last_action == "stop" + trace = run_ctx.trace() + assert trace[-1]["reason"] == "budget_exhausted" + + def test_enforce_blocked_call_does_not_leak_start_time(self): + """Blocked calls must not leave stale entries in _call_start_times.""" + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.001 + hook_ctx = FakeHookContext(llm=FakeLLM("gpt-4o")) + crewai_mod._before_llm_call_hook(hook_ctx) + assert id(hook_ctx) not in crewai_mod._call_start_times + + def test_enforce_allows_when_under_budget(self): + init(mode="enforce", budget=1.0) + with run(budget=1.0) as run_ctx: + run_ctx.cost = 0.5 + hook_ctx = FakeHookContext() + result = crewai_mod._before_llm_call_hook(hook_ctx) + assert result is None + + def test_records_start_time(self): + init(mode="observe") + with run(): + hook_ctx = FakeHookContext() + crewai_mod._before_llm_call_hook(hook_ctx) + assert id(hook_ctx) in crewai_mod._call_start_times + + def test_budget_gate_disabled_in_config(self): + crewai_mod._config = crewai_mod.CrewAIHarnessConfig(enable_budget_gate=False) + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + hook_ctx = FakeHookContext() + result = crewai_mod._before_llm_call_hook(hook_ctx) + assert result is None # gate disabled, not blocked + + def test_fail_open_swallows_errors(self): + crewai_mod._config = crewai_mod.CrewAIHarnessConfig(fail_open=True) + init(mode="enforce") + with run(): + hook_ctx = FakeHookContext() + with patch( + "cascadeflow.harness.api.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = crewai_mod._before_llm_call_hook(hook_ctx) + assert result is None # fail_open returns None + + def test_fail_closed_raises_errors(self): + crewai_mod._config = crewai_mod.CrewAIHarnessConfig(fail_open=False) + init(mode="enforce") + with run(): + 
hook_ctx = FakeHookContext() + with patch( + "cascadeflow.harness.api.get_current_run", + side_effect=RuntimeError("boom"), + ): + with pytest.raises(RuntimeError, match="boom"): + crewai_mod._before_llm_call_hook(hook_ctx) + + +# --------------------------------------------------------------------------- +# after_llm_call_hook +# --------------------------------------------------------------------------- + + +class TestAfterHook: + def test_no_run_context_returns_none(self): + ctx = FakeHookContext(response="hello") + result = crewai_mod._after_llm_call_hook(ctx) + assert result is None + + def test_updates_run_metrics_with_dict_messages(self): + """CrewAI passes messages as dicts — verify cost is nonzero.""" + init(mode="observe") + with run(budget=1.0) as run_ctx: + hook_ctx = FakeHookContext( + llm=FakeLLM("gpt-4o-mini"), + messages=[{"role": "user", "content": "What is 2+2?"}], + response="The answer is 4.", + ) + crewai_mod._call_start_times[id(hook_ctx)] = __import__("time").monotonic() - 0.1 + + crewai_mod._after_llm_call_hook(hook_ctx) + + assert run_ctx.step_count == 1 + assert run_ctx.cost > 0 + assert run_ctx.energy_used > 0 + assert run_ctx.latency_used_ms > 0 + assert run_ctx.model_used == "gpt-4o-mini" + assert run_ctx.last_action == "allow" + + def test_updates_run_metrics_with_object_messages(self): + """Also support object-style messages (defensive).""" + init(mode="observe") + + class ObjMsg: + content = "What is 2+2?" 
+ + with run(budget=1.0) as run_ctx: + hook_ctx = FakeHookContext( + llm=FakeLLM("gpt-4o-mini"), + messages=[ObjMsg()], + response="The answer is 4.", + ) + crewai_mod._after_llm_call_hook(hook_ctx) + assert run_ctx.cost > 0 + + def test_updates_budget_remaining(self): + init(mode="enforce", budget=1.0) + with run(budget=1.0) as run_ctx: + hook_ctx = FakeHookContext( + llm=FakeLLM("gpt-4o"), + messages=[{"role": "user", "content": "test"}], + response="response", + ) + crewai_mod._after_llm_call_hook(hook_ctx) + assert run_ctx.budget_remaining is not None + assert run_ctx.budget_remaining == pytest.approx(1.0 - run_ctx.cost) + + def test_trace_records_mode(self): + init(mode="enforce") + with run() as run_ctx: + hook_ctx = FakeHookContext( + llm=FakeLLM("gpt-4o"), + messages=[{"role": "user", "content": "test"}], + response="done", + ) + crewai_mod._after_llm_call_hook(hook_ctx) + trace = run_ctx.trace() + assert len(trace) == 1 + assert trace[0]["reason"] == "enforce" + assert trace[0]["model"] == "gpt-4o" + + def test_no_start_time_records_zero_latency(self): + init(mode="observe") + with run() as run_ctx: + hook_ctx = FakeHookContext( + llm=FakeLLM("gpt-4o"), + messages=[], + response="ok", + ) + # Don't set start time + crewai_mod._after_llm_call_hook(hook_ctx) + assert run_ctx.latency_used_ms == 0.0 + + def test_token_estimation_from_dict_messages(self): + """Verify token estimation works with dict messages (real CrewAI shape).""" + init(mode="observe") + with run() as run_ctx: + # 400 chars in messages → 100 prompt tokens + # 80 chars in response → 20 completion tokens + messages = [{"role": "user", "content": "x" * 400}] + hook_ctx = FakeHookContext( + llm=FakeLLM("gpt-4o"), + messages=messages, + response="y" * 80, + ) + crewai_mod._after_llm_call_hook(hook_ctx) + # gpt-4o: $2.50/1M in, $10.00/1M out + expected_cost = (100 / 1_000_000) * 2.50 + (20 / 1_000_000) * 10.00 + assert run_ctx.cost == pytest.approx(expected_cost) + + def 
test_fail_open_swallows_errors(self): + crewai_mod._config = crewai_mod.CrewAIHarnessConfig(fail_open=True) + init(mode="observe") + with run(): + hook_ctx = FakeHookContext(response="ok") + with patch( + "cascadeflow.harness.api.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = crewai_mod._after_llm_call_hook(hook_ctx) + assert result is None + + +# --------------------------------------------------------------------------- +# enable / disable lifecycle +# --------------------------------------------------------------------------- + + +class TestEnableDisable: + def test_enable_returns_false_when_crewai_not_available(self): + with patch.object(crewai_mod, "CREWAI_AVAILABLE", False): + result = crewai_mod.enable() + assert result is False + assert not crewai_mod.is_enabled() + + def test_enable_registers_hooks(self, monkeypatch): + fake_hooks = _make_fake_hooks_module() + monkeypatch.setattr(crewai_mod, "CREWAI_AVAILABLE", True) + + import sys + + monkeypatch.setitem(sys.modules, "crewai.hooks", fake_hooks) + + result = crewai_mod.enable() + assert result is True + assert crewai_mod.is_enabled() + assert len(fake_hooks._before_hooks) == 1 + assert len(fake_hooks._after_hooks) == 1 + + def test_enable_is_idempotent(self, monkeypatch): + fake_hooks = _make_fake_hooks_module() + monkeypatch.setattr(crewai_mod, "CREWAI_AVAILABLE", True) + + import sys + + monkeypatch.setitem(sys.modules, "crewai.hooks", fake_hooks) + + crewai_mod.enable() + crewai_mod.enable() # second call + assert len(fake_hooks._before_hooks) == 1 # still just one + + def test_enable_applies_config(self, monkeypatch): + fake_hooks = _make_fake_hooks_module() + monkeypatch.setattr(crewai_mod, "CREWAI_AVAILABLE", True) + + import sys + + monkeypatch.setitem(sys.modules, "crewai.hooks", fake_hooks) + + custom_config = crewai_mod.CrewAIHarnessConfig(fail_open=False, enable_budget_gate=False) + crewai_mod.enable(config=custom_config) + + cfg = crewai_mod.get_config() + assert 
cfg.fail_open is False + assert cfg.enable_budget_gate is False + + def test_disable_unregisters_hooks(self, monkeypatch): + fake_hooks = _make_fake_hooks_module() + monkeypatch.setattr(crewai_mod, "CREWAI_AVAILABLE", True) + + import sys + + monkeypatch.setitem(sys.modules, "crewai.hooks", fake_hooks) + + crewai_mod.enable() + assert crewai_mod.is_enabled() + assert len(fake_hooks._before_hooks) == 1 + + crewai_mod.disable() + assert not crewai_mod.is_enabled() + assert len(fake_hooks._before_hooks) == 0 + assert len(fake_hooks._after_hooks) == 0 + + def test_disable_when_not_enabled_is_safe(self): + crewai_mod.disable() # should not raise + assert not crewai_mod.is_enabled() + + def test_disable_clears_call_start_times(self, monkeypatch): + fake_hooks = _make_fake_hooks_module() + monkeypatch.setattr(crewai_mod, "CREWAI_AVAILABLE", True) + + import sys + + monkeypatch.setitem(sys.modules, "crewai.hooks", fake_hooks) + + crewai_mod.enable() + crewai_mod._call_start_times[123] = 1.0 + crewai_mod.disable() + assert len(crewai_mod._call_start_times) == 0 + + def test_enable_returns_false_for_old_crewai(self, monkeypatch): + """When crewai is installed but lacks hooks module (< v1.5).""" + monkeypatch.setattr(crewai_mod, "CREWAI_AVAILABLE", True) + + import sys + + # Remove crewai.hooks from modules so import fails + monkeypatch.delitem(sys.modules, "crewai.hooks", raising=False) + + original_import = __builtins__.__import__ if hasattr(__builtins__, "__import__") else __import__ + + def fake_import(name, *args, **kwargs): + if name == "crewai.hooks": + raise ImportError("no hooks") + return original_import(name, *args, **kwargs) + + monkeypatch.setattr("builtins.__import__", fake_import) + + result = crewai_mod.enable() + assert result is False + + +# --------------------------------------------------------------------------- +# Public API helpers +# --------------------------------------------------------------------------- + + +class TestPublicAPI: + def 
test_is_available_reflects_module_flag(self): + # crewai is not installed in test env + assert crewai_mod.is_available() == crewai_mod.CREWAI_AVAILABLE + + def test_is_enabled_default_false(self): + assert crewai_mod.is_enabled() is False + + def test_get_config_returns_copy(self): + cfg = crewai_mod.get_config() + assert isinstance(cfg, crewai_mod.CrewAIHarnessConfig) + assert cfg.fail_open is True + assert cfg.enable_budget_gate is True + # Modifying the copy doesn't affect the module state + cfg.fail_open = False + assert crewai_mod.get_config().fail_open is True + + +# --------------------------------------------------------------------------- +# CrewAIHarnessConfig +# --------------------------------------------------------------------------- + + +class TestConfig: + def test_defaults(self): + cfg = crewai_mod.CrewAIHarnessConfig() + assert cfg.fail_open is True + assert cfg.enable_budget_gate is True + + def test_custom_values(self): + cfg = crewai_mod.CrewAIHarnessConfig(fail_open=False, enable_budget_gate=False) + assert cfg.fail_open is False + assert cfg.enable_budget_gate is False diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py new file mode 100644 index 00000000..e68edcaf --- /dev/null +++ b/tests/test_google_adk_integration.py @@ -0,0 +1,734 @@ +"""Tests for cascadeflow.integrations.google_adk harness integration. + +google-adk is not installed in test environments, so we use fake ADK types +and test the integration logic directly against HarnessRunContext. +""" + +from __future__ import annotations + +import time +from unittest.mock import patch + +import pytest + +from cascadeflow.harness import init, reset, run + +# Import the module directly — it does not require google-adk at import time +# (GOOGLE_ADK_AVAILABLE will be False, but all functions/classes are still defined). 
+import cascadeflow.integrations.google_adk as adk_mod + + +# --------------------------------------------------------------------------- +# Fake ADK types +# --------------------------------------------------------------------------- + + +class FakeUsageMetadata: + """Stand-in for google.genai.types.GenerateContentResponseUsageMetadata.""" + + def __init__( + self, + prompt_token_count: int = 0, + candidates_token_count: int = 0, + ): + self.prompt_token_count = prompt_token_count + self.candidates_token_count = candidates_token_count + + +class FakePart: + """Stand-in for google.genai.types.Part.""" + + def __init__(self, *, text: str | None = None, function_call: object | None = None): + self.text = text + self.function_call = function_call + + +class FakeContent: + """Stand-in for google.genai.types.Content.""" + + def __init__(self, parts: list | None = None): + self.parts = parts or [] + + +class FakeLlmResponse: + """Stand-in for google.adk.models.LlmResponse.""" + + def __init__( + self, + *, + content: FakeContent | None = None, + usage_metadata: FakeUsageMetadata | None = None, + ): + self.content = content + self.usage_metadata = usage_metadata + + +class FakeLlmRequest: + """Stand-in for google.adk.models.LlmRequest.""" + + def __init__(self, model: str = "gemini-2.5-flash"): + self.model = model + + +class FakeCallbackContext: + """Stand-in for google.adk.agents.CallbackContext.""" + + def __init__( + self, + invocation_id: str = "inv-001", + agent_name: str = "test-agent", + ): + self.invocation_id = invocation_id + self.agent_name = agent_name + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _reset_adk_state(): + """Reset harness and ADK module state before every test.""" + reset() + adk_mod._config = adk_mod.GoogleADKHarnessConfig() + adk_mod._plugin_instance = None + adk_mod._enabled = 
False + + +# --------------------------------------------------------------------------- +# _normalize_model_name +# --------------------------------------------------------------------------- + + +class TestNormalizeModelName: + def test_plain_model(self): + assert adk_mod._normalize_model_name("gemini-2.5-flash") == "gemini-2.5-flash" + + def test_strips_provider_prefix(self): + assert adk_mod._normalize_model_name("openai/gpt-4o") == "gpt-4o" + + def test_strips_models_prefix(self): + assert adk_mod._normalize_model_name("models/gemini-2.5-flash") == "gemini-2.5-flash" + + def test_strips_litellm_prefix(self): + assert adk_mod._normalize_model_name("vertex_ai/gemini-2.5-pro") == "gemini-2.5-pro" + + def test_no_slash_passthrough(self): + assert adk_mod._normalize_model_name("gpt-4o-mini") == "gpt-4o-mini" + + +# --------------------------------------------------------------------------- +# _count_function_calls +# --------------------------------------------------------------------------- + + +class TestCountFunctionCalls: + def test_no_content(self): + assert adk_mod._count_function_calls(None) == 0 + + def test_no_parts(self): + content = FakeContent(parts=[]) + assert adk_mod._count_function_calls(content) == 0 + + def test_text_only(self): + content = FakeContent(parts=[FakePart(text="hello")]) + assert adk_mod._count_function_calls(content) == 0 + + def test_counts_function_calls(self): + content = FakeContent( + parts=[ + FakePart(text="thinking..."), + FakePart(function_call={"name": "search", "args": {}}), + FakePart(function_call={"name": "calculate", "args": {}}), + ] + ) + assert adk_mod._count_function_calls(content) == 2 + + +# --------------------------------------------------------------------------- +# Cost / energy estimation (via shared pricing) +# --------------------------------------------------------------------------- + + +class TestEstimation: + def test_estimate_cost_known_model(self): + from cascadeflow.harness.pricing import 
    # NOTE(review): the two lines below finish a pricing test whose `def` and
    # `from cascadeflow.harness.pricing import estimate_cost` line start in the
    # previous chunk; reproduced as-is at method-body indentation.
        cost = estimate_cost("gemini-2.5-flash", 1_000_000, 1_000_000)
        assert cost == pytest.approx(0.15 + 0.60)

    # NOTE(review): the following three methods continue the enclosing pricing
    # test class whose header lies before this chunk.
    def test_estimate_cost_unknown_model_uses_default(self):
        from cascadeflow.harness.pricing import estimate_cost

        cost = estimate_cost("unknown-model", 1_000_000, 0)
        assert cost == pytest.approx(2.50)

    def test_estimate_energy_known_model(self):
        from cascadeflow.harness.pricing import estimate_energy

        energy = estimate_energy("gemini-2.5-flash", 100, 100)
        # coeff=0.3, output_weight=1.5
        assert energy == pytest.approx(0.3 * (100 + 100 * 1.5))

    def test_estimate_energy_unknown_model(self):
        from cascadeflow.harness.pricing import estimate_energy

        energy = estimate_energy("unknown-model", 100, 100)
        # default coeff=1.0
        assert energy == pytest.approx(1.0 * (100 + 100 * 1.5))


# ---------------------------------------------------------------------------
# before_model_callback
# ---------------------------------------------------------------------------


class TestBeforeModelCallback:
    """Budget-gating behavior of the plugin's before_model_callback hook."""

    @pytest.fixture
    def plugin(self):
        # Fresh plugin per test so per-call tracking dicts start empty.
        return adk_mod.CascadeFlowADKPlugin()

    async def test_no_run_context_returns_none(self, plugin):
        # Outside any run() scope the callback is a no-op.
        ctx = FakeCallbackContext()
        req = FakeLlmRequest()
        result = await plugin.before_model_callback(ctx, req)
        assert result is None

    async def test_observe_mode_allows_over_budget(self, plugin):
        init(mode="observe", budget=0.001)
        with run(budget=0.001) as run_ctx:
            run_ctx.cost = 0.002
            result = await plugin.before_model_callback(
                FakeCallbackContext(), FakeLlmRequest()
            )
            assert result is None  # observe never blocks

    async def test_enforce_blocks_when_budget_exhausted(self, plugin):
        init(mode="enforce", budget=0.001)
        with run(budget=0.001) as run_ctx:
            run_ctx.cost = 0.001
            result = await plugin.before_model_callback(
                FakeCallbackContext(), FakeLlmRequest("gemini-2.5-flash")
            )
            assert result is not None  # short-circuit response
            assert run_ctx.last_action == "stop"
            trace = run_ctx.trace()
            assert trace[-1]["reason"] == "budget_exhausted"

    async def test_enforce_blocked_call_does_not_leak_state(self, plugin):
        # A blocked call must not leave timing/model entries behind.
        init(mode="enforce", budget=0.001)
        with run(budget=0.001) as run_ctx:
            run_ctx.cost = 0.001
            cb_ctx = FakeCallbackContext()
            await plugin.before_model_callback(cb_ctx, FakeLlmRequest())
            key = plugin._callback_key(cb_ctx)
            assert key not in plugin._call_start_times
            assert key not in plugin._call_models

    async def test_enforce_allows_under_budget(self, plugin):
        init(mode="enforce", budget=1.0)
        with run(budget=1.0) as run_ctx:
            run_ctx.cost = 0.5
            result = await plugin.before_model_callback(
                FakeCallbackContext(), FakeLlmRequest()
            )
            assert result is None

    async def test_records_start_time_and_model(self, plugin):
        init(mode="observe")
        with run():
            cb_ctx = FakeCallbackContext()
            await plugin.before_model_callback(cb_ctx, FakeLlmRequest("gpt-4o"))
            key = plugin._callback_key(cb_ctx)
            assert key in plugin._call_start_times
            assert plugin._call_models[key] == "gpt-4o"

    async def test_normalizes_model_name(self, plugin):
        # Provider prefixes like "openai/" are stripped before tracking.
        init(mode="observe")
        with run():
            cb_ctx = FakeCallbackContext()
            await plugin.before_model_callback(cb_ctx, FakeLlmRequest("openai/gpt-4o"))
            key = plugin._callback_key(cb_ctx)
            assert plugin._call_models[key] == "gpt-4o"

    async def test_budget_gate_disabled_in_config(self):
        plugin = adk_mod.CascadeFlowADKPlugin(
            config=adk_mod.GoogleADKHarnessConfig(enable_budget_gate=False)
        )
        init(mode="enforce", budget=0.001)
        with run(budget=0.001) as run_ctx:
            run_ctx.cost = 0.002
            result = await plugin.before_model_callback(
                FakeCallbackContext(), FakeLlmRequest()
            )
            assert result is None  # gate disabled

    async def test_fail_open_swallows_errors(self, plugin):
        # Internal harness errors must never break the user's model call.
        init(mode="enforce")
        with run():
            with patch(
                "cascadeflow.integrations.google_adk.get_current_run",
                side_effect=RuntimeError("boom"),
            ):
                result = await plugin.before_model_callback(
                    FakeCallbackContext(), FakeLlmRequest()
                )
            assert result is None


# ---------------------------------------------------------------------------
# after_model_callback
# ---------------------------------------------------------------------------


class TestAfterModelCallback:
    """Metric accounting performed by after_model_callback."""

    @pytest.fixture
    def plugin(self):
        return adk_mod.CascadeFlowADKPlugin()

    async def test_no_run_context_returns_none(self, plugin):
        result = await plugin.after_model_callback(
            FakeCallbackContext(),
            FakeLlmResponse(),
        )
        assert result is None

    async def test_updates_run_metrics_with_usage_metadata(self, plugin):
        init(mode="observe")
        with run(budget=1.0) as run_ctx:
            cb_ctx = FakeCallbackContext()
            key = plugin._callback_key(cb_ctx)
            # Pre-seed start time 100ms in the past so latency is non-zero.
            plugin._call_start_times[key] = time.monotonic() - 0.1
            plugin._call_models[key] = "gemini-2.5-flash"

            response = FakeLlmResponse(
                usage_metadata=FakeUsageMetadata(
                    prompt_token_count=100,
                    candidates_token_count=50,
                ),
                content=FakeContent(parts=[FakePart(text="done")]),
            )
            await plugin.after_model_callback(cb_ctx, response)

            assert run_ctx.step_count == 1
            assert run_ctx.cost > 0
            assert run_ctx.energy_used > 0
            assert run_ctx.latency_used_ms > 0
            assert run_ctx.model_used == "gemini-2.5-flash"
            assert run_ctx.last_action == "allow"

    async def test_fallback_token_estimation(self, plugin):
        """When usage_metadata is missing, estimate from content text."""
        init(mode="observe")
        with run() as run_ctx:
            cb_ctx = FakeCallbackContext()
            key = plugin._callback_key(cb_ctx)
            plugin._call_models[key] = "gemini-2.5-flash"

            response = FakeLlmResponse(
                content=FakeContent(parts=[FakePart(text="x" * 400)]),
            )
            await plugin.after_model_callback(cb_ctx, response)

            assert run_ctx.cost > 0
            assert run_ctx.step_count == 1

    async def test_counts_tool_calls(self, plugin):
        init(mode="observe")
        with run() as run_ctx:
            cb_ctx = FakeCallbackContext()
            key = plugin._callback_key(cb_ctx)
            plugin._call_models[key] = "gemini-2.5-flash"

            response = FakeLlmResponse(
                usage_metadata=FakeUsageMetadata(100, 50),
                content=FakeContent(
                    parts=[
                        FakePart(function_call={"name": "search"}),
                        FakePart(function_call={"name": "calc"}),
                    ]
                ),
            )
            await plugin.after_model_callback(cb_ctx, response)
            assert run_ctx.tool_calls == 2

    async def test_updates_budget_remaining(self, plugin):
        init(mode="enforce", budget=1.0)
        with run(budget=1.0) as run_ctx:
            cb_ctx = FakeCallbackContext()
            key = plugin._callback_key(cb_ctx)
            plugin._call_models[key] = "gemini-2.5-flash"

            response = FakeLlmResponse(
                usage_metadata=FakeUsageMetadata(100, 50),
            )
            await plugin.after_model_callback(cb_ctx, response)
            assert run_ctx.budget_remaining is not None
            assert run_ctx.budget_remaining == pytest.approx(1.0 - run_ctx.cost)

    async def test_trace_records_mode(self, plugin):
        init(mode="enforce")
        with run() as run_ctx:
            cb_ctx = FakeCallbackContext()
            key = plugin._callback_key(cb_ctx)
            plugin._call_models[key] = "gpt-4o"

            response = FakeLlmResponse(
                usage_metadata=FakeUsageMetadata(10, 10),
            )
            await plugin.after_model_callback(cb_ctx, response)
            trace = run_ctx.trace()
            assert len(trace) == 1
            assert trace[0]["reason"] == "enforce"
            assert trace[0]["model"] == "gpt-4o"

    async def test_no_start_time_records_zero_latency(self, plugin):
        init(mode="observe")
        with run() as run_ctx:
            cb_ctx = FakeCallbackContext()
            key = plugin._callback_key(cb_ctx)
            plugin._call_models[key] = "gpt-4o"
            # Don't set start time

            response = FakeLlmResponse(
                usage_metadata=FakeUsageMetadata(10, 10),
            )
            await plugin.after_model_callback(cb_ctx, response)
            assert run_ctx.latency_used_ms == 0.0

    async def test_fail_open_swallows_errors(self, plugin):
        init(mode="observe")
        with run():
            with patch(
                "cascadeflow.integrations.google_adk.get_current_run",
                side_effect=RuntimeError("boom"),
            ):
                result = await plugin.after_model_callback(
                    FakeCallbackContext(),
                    FakeLlmResponse(),
                )
            assert result is None


# ---------------------------------------------------------------------------
# on_model_error_callback
# ---------------------------------------------------------------------------


class TestOnModelErrorCallback:
    """Error-path bookkeeping: trace entries and state cleanup."""

    @pytest.fixture
    def plugin(self):
        return adk_mod.CascadeFlowADKPlugin()

    async def test_records_error_in_trace(self, plugin):
        init(mode="observe")
        with run() as run_ctx:
            cb_ctx = FakeCallbackContext()
            key = plugin._callback_key(cb_ctx)
            plugin._call_models[key] = "gemini-2.5-flash"
            plugin._call_start_times[key] = time.monotonic()

            await plugin.on_model_error_callback(cb_ctx, ValueError("bad input"))

            trace = run_ctx.trace()
            assert len(trace) == 1
            assert trace[0]["action"] == "error"
            assert "ValueError" in trace[0]["reason"]
            assert trace[0]["model"] == "gemini-2.5-flash"

    async def test_cleans_up_timing_state(self, plugin):
        init(mode="observe")
        with run():
            cb_ctx = FakeCallbackContext()
            key = plugin._callback_key(cb_ctx)
            plugin._call_models[key] = "gemini-2.5-flash"
            plugin._call_start_times[key] = time.monotonic()

            await plugin.on_model_error_callback(cb_ctx, RuntimeError("oops"))

            assert key not in plugin._call_models
            assert key not in plugin._call_start_times

    async def test_fail_open_swallows_errors(self, plugin):
        init(mode="observe")
        with run():
            with patch(
                "cascadeflow.integrations.google_adk.get_current_run",
                side_effect=RuntimeError("boom"),
            ):
                result = await plugin.on_model_error_callback(
                    FakeCallbackContext(),
                    ValueError("test"),
                )
            assert result is None


# ---------------------------------------------------------------------------
# enable / disable lifecycle
# ---------------------------------------------------------------------------


class TestEnableDisable:
    """Module-level enable()/disable() singleton lifecycle."""

    def test_enable_returns_plugin_instance(self):
        plugin = adk_mod.enable()
        assert isinstance(plugin, adk_mod.CascadeFlowADKPlugin)
        assert adk_mod.is_enabled()

    def test_enable_is_idempotent(self):
        p1 = adk_mod.enable()
        p2 = adk_mod.enable()
        assert p1 is p2  # same instance

    def test_enable_applies_config(self):
        config = adk_mod.GoogleADKHarnessConfig(fail_open=False, enable_budget_gate=False)
        plugin = adk_mod.enable(config=config)
        assert plugin._config.fail_open is False
        assert plugin._config.enable_budget_gate is False

    def test_disable_deactivates_plugin(self):
        plugin = adk_mod.enable()
        assert plugin._active is True
        adk_mod.disable()
        assert not adk_mod.is_enabled()
        assert plugin._active is False

    def test_disable_when_not_enabled_is_safe(self):
        adk_mod.disable()  # should not raise
        assert not adk_mod.is_enabled()


# ---------------------------------------------------------------------------
# Public API helpers
# ---------------------------------------------------------------------------


class TestPublicAPI:
    """Top-level helper functions exposed by the integration module."""

    def test_is_available_reflects_module_flag(self):
        assert adk_mod.is_available() == adk_mod.GOOGLE_ADK_AVAILABLE

    def test_is_enabled_default_false(self):
        assert adk_mod.is_enabled() is False

    def test_get_config_returns_copy(self):
        cfg = adk_mod.get_config()
        assert isinstance(cfg, adk_mod.GoogleADKHarnessConfig)
        assert cfg.fail_open is True
        assert cfg.enable_budget_gate is True
        # Modifying the copy doesn't affect module state
        cfg.fail_open = False
        assert adk_mod.get_config().fail_open is True


# ---------------------------------------------------------------------------
# GoogleADKHarnessConfig
# ---------------------------------------------------------------------------


class TestConfig:
    """Defaults and overrides for GoogleADKHarnessConfig."""

    def test_defaults(self):
        cfg = adk_mod.GoogleADKHarnessConfig()
        assert cfg.fail_open is True
        assert cfg.enable_budget_gate is True

    def test_custom_values(self):
        cfg = adk_mod.GoogleADKHarnessConfig(fail_open=False, enable_budget_gate=False)
        assert cfg.fail_open is False
        assert cfg.enable_budget_gate is False


# ---------------------------------------------------------------------------
# Plugin deactivate
# ---------------------------------------------------------------------------


class TestDeactivate:
    """A deactivated plugin becomes a pure no-op and drops cached state."""

    async def test_deactivated_plugin_skips_callbacks(self):
        plugin = adk_mod.CascadeFlowADKPlugin()
        plugin.deactivate()

        init(mode="enforce", budget=0.001)
        with run(budget=0.001) as run_ctx:
            run_ctx.cost = 0.002
            result = await plugin.before_model_callback(
                FakeCallbackContext(), FakeLlmRequest()
            )
            assert result is None  # no-op, not blocked

    async def test_deactivate_clears_state(self):
        plugin = adk_mod.CascadeFlowADKPlugin()
        plugin._call_start_times[12345] = 1.0
        plugin._call_models[12345] = "test"
        plugin.deactivate()
        assert len(plugin._call_start_times) == 0
        assert len(plugin._call_models) == 0


# ---------------------------------------------------------------------------
# _extract_tokens
# ---------------------------------------------------------------------------


class TestExtractTokens:
    """Token extraction: usage_metadata first, content-length fallback."""

    def test_from_usage_metadata(self):
        response = FakeLlmResponse(
            usage_metadata=FakeUsageMetadata(100, 200),
        )
        assert adk_mod.CascadeFlowADKPlugin._extract_tokens(response) == (100, 200)

    def test_zero_usage_falls_back_to_content(self):
        response = FakeLlmResponse(
            usage_metadata=FakeUsageMetadata(0, 0),
            content=FakeContent(parts=[FakePart(text="x" * 80)]),
        )
        inp, out = adk_mod.CascadeFlowADKPlugin._extract_tokens(response)
        assert inp == 0
        assert out == 20  # 80 / 4

    def test_no_usage_no_content(self):
        response = FakeLlmResponse()
        assert adk_mod.CascadeFlowADKPlugin._extract_tokens(response) == (0, 0)

    def test_content_with_no_text(self):
        response = FakeLlmResponse(
            content=FakeContent(parts=[FakePart(function_call={"name": "f"})]),
        )
        inp, out = adk_mod.CascadeFlowADKPlugin._extract_tokens(response)
        assert inp == 0
        assert out == 1  # max(0//4, 1)


class TestCallbackKeyCollision:
    """Verify _callback_key uses id() for per-object uniqueness."""

    def test_distinct_keys_for_different_objects(self):
        """Two distinct context objects always produce distinct keys."""
        ctx_a = FakeCallbackContext(invocation_id="inv-1", agent_name="agent-a")
        ctx_b = FakeCallbackContext(invocation_id="inv-1", agent_name="agent-a")
        key_a = adk_mod.CascadeFlowADKPlugin._callback_key(ctx_a)
        key_b = adk_mod.CascadeFlowADKPlugin._callback_key(ctx_b)
        assert key_a != key_b, "Same IDs on different objects must produce distinct keys"

    def test_key_stable_for_same_object(self):
        """Same context object always produces the same key."""
        ctx = FakeCallbackContext()
        key1 = adk_mod.CascadeFlowADKPlugin._callback_key(ctx)
        key2 = adk_mod.CascadeFlowADKPlugin._callback_key(ctx)
        assert key1 == key2

    def test_key_is_int(self):
        """Key type is int (object id)."""
        ctx = FakeCallbackContext()
        assert isinstance(adk_mod.CascadeFlowADKPlugin._callback_key(ctx), int)

    @pytest.mark.asyncio
    async def test_concurrent_same_ids_track_independently(self):
        """Two concurrent calls with same invocation_id+agent_name don't corrupt."""
        init(mode="observe")
        with run(budget=1.0) as harness_ctx:
            plugin = adk_mod.CascadeFlowADKPlugin()
            # Same IDs — previously would collide
            ctx_a = FakeCallbackContext(invocation_id="inv-1", agent_name="agent")
            ctx_b = FakeCallbackContext(invocation_id="inv-1", agent_name="agent")

            req_a = FakeLlmRequest(model="gpt-4o")
            req_b = FakeLlmRequest(model="gpt-4o-mini")

            await plugin.before_model_callback(ctx_a, req_a)
            await plugin.before_model_callback(ctx_b, req_b)

            resp_b = FakeLlmResponse(usage_metadata=FakeUsageMetadata(50, 25))
            resp_a = FakeLlmResponse(usage_metadata=FakeUsageMetadata(100, 50))
            await plugin.after_model_callback(ctx_b, resp_b)
            await plugin.after_model_callback(ctx_a, resp_a)

            assert harness_ctx.step_count == 2
            assert len(plugin._call_start_times) == 0
            assert len(plugin._call_models) == 0


# ---------------------------------------------------------------------------
# Off-mode behavior
# ---------------------------------------------------------------------------


class TestOffMode:
    """mode='off' must not track metrics or update run context."""

    @pytest.mark.asyncio
    async def test_off_mode_before_callback_returns_none(self):
        init(mode="off")
        plugin = adk_mod.CascadeFlowADKPlugin()
        with run() as run_ctx:
            result = await plugin.before_model_callback(
                FakeCallbackContext(), FakeLlmRequest()
            )
            assert result is None
            assert len(plugin._call_start_times) == 0

    @pytest.mark.asyncio
    async def test_off_mode_after_callback_does_not_track(self):
        init(mode="off")
        plugin = adk_mod.CascadeFlowADKPlugin()
        with run() as run_ctx:
            await plugin.after_model_callback(
                FakeCallbackContext(),
                FakeLlmResponse(usage_metadata=FakeUsageMetadata(1000, 500)),
            )
            assert run_ctx.step_count == 0
            assert run_ctx.cost == 0.0
            assert run_ctx.energy_used == 0.0
            assert len(run_ctx.trace()) == 0


# ---------------------------------------------------------------------------
# Versioned model name resolution
# ---------------------------------------------------------------------------


class TestVersionedModelPricing:
    """Versioned model IDs must resolve to correct pricing, not default."""

    def test_versioned_gemini_flash(self):
        from cascadeflow.harness.pricing import estimate_cost

        # Should resolve to gemini-2.5-flash pricing ($0.15/$0.60)
        cost = estimate_cost("gemini-2.5-flash-preview-05-20", 1_000_000, 1_000_000)
        assert cost == pytest.approx(0.75, abs=0.01)

    def test_versioned_gemini_pro(self):
        from cascadeflow.harness.pricing import estimate_cost

        cost = estimate_cost("gemini-2.5-pro-preview-05-06", 1_000_000, 1_000_000)
        assert cost == pytest.approx(11.25, abs=0.01)

    def test_dated_model_suffix(self):
        from cascadeflow.harness.pricing import estimate_cost

        cost = estimate_cost("gemini-2.5-flash-20250120", 1_000_000, 1_000_000)
        assert cost == pytest.approx(0.75, abs=0.01)

    def test_latest_suffix(self):
        from cascadeflow.harness.pricing import estimate_cost

        cost = estimate_cost("gemini-2.5-flash-latest", 1_000_000, 1_000_000)
        assert cost == pytest.approx(0.75, abs=0.01)

    def test_unknown_model_still_uses_default(self):
        from cascadeflow.harness.pricing import estimate_cost

        cost = estimate_cost("totally-unknown-model", 1_000_000, 0)
        assert cost == pytest.approx(2.50)

    def test_exact_match_still_works(self):
        from cascadeflow.harness.pricing import estimate_cost

        cost = estimate_cost("gemini-2.5-flash", 1_000_000, 1_000_000)
        assert cost == pytest.approx(0.75, abs=0.01)

    def test_prefix_match_variant(self):
        """A variant like gemini-2.5-flash-8b matches the base model."""
        from cascadeflow.harness.pricing import estimate_cost

        cost = estimate_cost("gemini-2.5-flash-8b", 1_000_000, 1_000_000)
        assert cost == pytest.approx(0.75, abs=0.01)


# === diff boundary: new file tests/test_harness_api.py (from the original patch) ===

import sys

import pytest

import cascadeflow
import cascadeflow.harness.api as harness_api
from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run


def setup_function() -> None:
    # pytest per-test hook: clear global harness state between tests.
    reset()


def test_init_sets_mode_and_returns_report():
    report = init(mode="observe", budget=1.5, max_tool_calls=7)

    cfg = get_harness_config()
    assert cfg.mode == "observe"
    assert cfg.budget == 1.5
    assert cfg.max_tool_calls == 7
    assert report.mode == "observe"
    assert isinstance(report.instrumented, list)
    assert isinstance(report.detected_but_not_instrumented, list)
    assert report.config_sources["mode"] == "code"


def test_init_rejects_invalid_mode():
    with pytest.raises(ValueError):
        init(mode="invalid")  # type: ignore[arg-type]
def test_init_idempotent_logs(monkeypatch, caplog):
    # Second identical init() should be a logged no-op, not a reconfigure.
    monkeypatch.setattr(harness_api, "find_spec", lambda _: None)
    with caplog.at_level("DEBUG", logger="cascadeflow.harness"):
        init(mode="observe")
        init(mode="observe")
    assert any("idempotent" in rec.message for rec in caplog.records)


def test_env_aliases_and_false_bool(monkeypatch):
    # Short aliases (CASCADEFLOW_MODE/BUDGET) and "off" as a falsy bool value.
    monkeypatch.setenv("CASCADEFLOW_MODE", "observe")
    monkeypatch.setenv("CASCADEFLOW_BUDGET", "0.33")
    monkeypatch.setenv("CASCADEFLOW_HARNESS_VERBOSE", "off")
    monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_TOOL_CALLS", "4")
    monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_LATENCY_MS", "1200")
    monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_ENERGY", "0.01")
    monkeypatch.setenv("CASCADEFLOW_HARNESS_COMPLIANCE", "gdpr")

    report = init()
    cfg = get_harness_config()

    assert report.mode == "observe"
    assert cfg.mode == "observe"
    assert cfg.budget == 0.33
    assert cfg.verbose is False
    assert cfg.max_tool_calls == 4
    assert cfg.max_latency_ms == 1200
    assert cfg.max_energy == 0.01
    assert cfg.compliance == "gdpr"


def test_init_invalid_json_env_raises(monkeypatch):
    # KPI weights must be a JSON object; a JSON array is rejected.
    monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_WEIGHTS", "[1,2,3]")
    with pytest.raises(ValueError):
        init()


def test_init_non_numeric_env_raises(monkeypatch):
    monkeypatch.setenv("CASCADEFLOW_HARNESS_BUDGET", "abc")
    with pytest.raises(ValueError):
        init()


def test_run_uses_global_defaults_and_overrides():
    init(mode="enforce", budget=2.0, max_tool_calls=5)

    default_ctx = run()
    assert default_ctx.mode == "enforce"
    assert default_ctx.budget_max == 2.0
    assert default_ctx.tool_calls_max == 5
    assert default_ctx.budget_remaining == 2.0

    override_ctx = run(budget=0.5, max_tool_calls=3)
    assert override_ctx.budget_max == 0.5
    assert override_ctx.tool_calls_max == 3
    assert override_ctx.budget_remaining == 0.5


def test_run_without_enter_exit_is_safe():
    # Calling __exit__ on a context that was never entered must not raise.
    ctx = run()
    ctx.__exit__(None, None, None)


@pytest.mark.asyncio
async def test_nested_run_context_is_isolated():
    init(mode="enforce", budget=1.0)

    async with run(budget=0.7) as outer:
        assert get_current_run() is outer
        assert outer.budget_max == 0.7

        async with run(budget=0.2) as inner:
            assert get_current_run() is inner
            assert inner.budget_max == 0.2

        # Inner scope exit restores the outer context.
        assert get_current_run() is outer

    assert get_current_run() is None


def test_sync_run_context_isolated():
    init(mode="enforce", budget=1.0)
    with run(budget=0.6) as outer:
        assert get_current_run() is outer
        with run(budget=0.1) as inner:
            assert get_current_run() is inner
            assert inner.budget_max == 0.1
        assert get_current_run() is outer
    assert get_current_run() is None


def test_agent_decorator_keeps_sync_behavior_and_attaches_metadata():
    @agent(
        budget=0.9,
        kpi_targets={"quality_min": 0.9},
        kpi_weights={"cost": 0.5, "quality": 0.5},
        compliance="gdpr",
    )
    def fn(x: int) -> int:
        return x + 1

    assert fn(2) == 3
    policy = fn.__cascadeflow_agent_policy__
    assert policy["budget"] == 0.9
    assert policy["kpi_targets"] == {"quality_min": 0.9}
    assert policy["compliance"] == "gdpr"


@pytest.mark.asyncio
async def test_agent_decorator_keeps_async_behavior_and_attaches_metadata():
    @agent(budget=0.4, kpi_weights={"cost": 1.0})
    async def fn(x: int) -> int:
        return x * 2

    assert await fn(4) == 8
    policy = fn.__cascadeflow_agent_policy__
    assert policy["budget"] == 0.4
    assert policy["kpi_weights"] == {"cost": 1.0}


def test_top_level_exports_exist():
    assert callable(cascadeflow.init)
    assert callable(cascadeflow.reset)
    assert callable(cascadeflow.run)
    # harness.agent is intentionally NOT re-exported at top level because it
    # would shadow the cascadeflow.agent module. Import from submodule:
    assert callable(agent)  # imported from cascadeflow.harness
    report = cascadeflow.init(mode="off")
    assert report.mode == "off"


def test_run_record_and_trace_copy():
    # trace() must return a defensive copy, not the live list.
    ctx = run(budget=1.0)
    ctx.record(action="switch_model", reason="cost_pressure", model="gpt-4o-mini")
    trace_a = ctx.trace()
    trace_b = ctx.trace()
    assert trace_a == trace_b
    assert trace_a[0]["action"] == "switch_model"
    trace_a.append({"action": "mutated"})
    assert len(ctx.trace()) == 1


def test_init_reads_from_env(monkeypatch):
    monkeypatch.setenv("CASCADEFLOW_HARNESS_MODE", "observe")
    monkeypatch.setenv("CASCADEFLOW_HARNESS_BUDGET", "0.25")
    monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_TARGETS", '{"quality_min": 0.9}')
    monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_WEIGHTS", '{"cost": 1.0}')

    report = init()
    cfg = get_harness_config()

    assert report.mode == "observe"
    assert cfg.mode == "observe"
    assert cfg.budget == 0.25
    assert cfg.kpi_targets == {"quality_min": 0.9}
    assert cfg.kpi_weights == {"cost": 1.0}
    assert report.config_sources["mode"] == "env"
    assert report.config_sources["budget"] == "env"


def test_init_reads_from_config_file(tmp_path, monkeypatch):
    config = tmp_path / "cascadeflow.json"
    config.write_text(
        '{"harness":{"mode":"observe","budget":0.75,"max_tool_calls":11,"kpi_targets":{"quality_min":0.9}}}'
    )
    monkeypatch.setenv("CASCADEFLOW_CONFIG", str(config))

    report = init()
    cfg = get_harness_config()

    assert cfg.mode == "observe"
    assert cfg.budget == 0.75
    assert cfg.max_tool_calls == 11
    assert cfg.kpi_targets == {"quality_min": 0.9}
    assert report.config_sources["mode"] == "file"
    assert report.config_sources["budget"] == "file"


def test_init_reads_top_level_config_file_keys(tmp_path, monkeypatch):
    # Keys may appear either under "harness" or at the file's top level.
    config = tmp_path / "cascadeflow.json"
    config.write_text('{"mode":"observe","budget":0.4,"max_tool_calls":2}')
    monkeypatch.setenv("CASCADEFLOW_CONFIG", str(config))

    report = init()
    cfg = get_harness_config()

    assert cfg.mode == "observe"
    assert cfg.budget == 0.4
    assert cfg.max_tool_calls == 2
    assert report.config_sources["mode"] == "file"


def test_init_non_dict_config_file_ignored(tmp_path, monkeypatch):
    config = tmp_path / "cascadeflow.json"
    config.write_text('["not-a-dict"]')
    monkeypatch.setenv("CASCADEFLOW_CONFIG", str(config))

    report = init()
    cfg = get_harness_config()

    assert cfg.mode == "off"
    assert cfg.budget is None
    assert report.config_sources["mode"] == "default"


def test_init_file_loader_exception_falls_back_defaults(monkeypatch):
    import cascadeflow.config_loader as cl

    monkeypatch.setattr(cl, "find_config", lambda: "broken.json")

    def _raise(_path):
        raise RuntimeError("boom")

    monkeypatch.setattr(cl, "load_config", _raise)

    report = init()
    cfg = get_harness_config()
    assert cfg.mode == "off"
    assert report.config_sources["mode"] == "default"


def test_init_config_loader_import_failure_falls_back(monkeypatch):
    # Replace the loader module with a bare object so attribute access fails.
    monkeypatch.setitem(sys.modules, "cascadeflow.config_loader", object())
    report = init(mode="observe")
    assert report.mode == "observe"
    assert report.config_sources["mode"] == "code"


def test_precedence_code_over_env_over_file(tmp_path, monkeypatch):
    config = tmp_path / "cascadeflow.json"
    config.write_text('{"harness":{"mode":"off","budget":9.9}}')
    monkeypatch.setenv("CASCADEFLOW_CONFIG", str(config))
    monkeypatch.setenv("CASCADEFLOW_HARNESS_MODE", "observe")
    monkeypatch.setenv("CASCADEFLOW_HARNESS_BUDGET", "0.5")

    # env overrides file
    report_env = init()
    cfg_env = get_harness_config()
    assert cfg_env.mode == "observe"
    assert cfg_env.budget == 0.5
    assert report_env.config_sources["mode"] == "env"
    assert report_env.config_sources["budget"] == "env"

    # code overrides env
    report_code = init(mode="enforce", budget=0.2)
    cfg_code = get_harness_config()
    assert cfg_code.mode == "enforce"
    assert cfg_code.budget == 0.2
    assert report_code.config_sources["mode"] == "code"
    assert report_code.config_sources["budget"] == "code"


def test_reset_clears_state():
    init(mode="enforce", budget=0.9)
    with run() as ctx:
        assert get_current_run() is ctx
        reset()
    cfg = get_harness_config()
    assert cfg.mode == "off"
    assert cfg.budget is None
    assert get_current_run() is None


def test_init_without_detected_sdks(monkeypatch):
    monkeypatch.setattr(harness_api, "find_spec", lambda _: None)
    report = init(mode="observe")
    assert report.instrumented == []
    assert report.detected_but_not_instrumented == []


def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch):
    monkeypatch.setattr(
        harness_api,
        "find_spec",
        lambda name: object() if name == "openai" else None,
    )

    import cascadeflow.harness.instrument as instrument

    monkeypatch.setattr(instrument, "patch_openai", lambda: True)
    report = init(mode="observe")
    assert report.instrumented == ["openai"]


# === diff boundary: new file tests/test_harness_instrument.py (from the original patch) ===

"""Tests for cascadeflow.harness.instrument — OpenAI auto-instrumentation."""

from __future__ import annotations

import time
from typing import Optional
from unittest.mock import AsyncMock, MagicMock

import pytest

pytest.importorskip("openai", reason="openai package required for instrumentation tests")

from cascadeflow.harness import init, reset, run
from cascadeflow.harness.instrument import (
    _InstrumentedAsyncStream,
    _InstrumentedStream,
    _estimate_cost,
    _estimate_energy,
    _make_patched_async_create,
    _make_patched_create,
    is_patched,
    patch_openai,
    unpatch_openai,
)


@pytest.fixture(autouse=True)
# NOTE(review): generator fixture annotated `-> None`; `Iterator[None]` would
# be the accurate annotation — kept as-is (the `# type: ignore[misc]` below
# papers over it).
def _reset_harness() -> None:
    # Ensure every test starts and ends with pristine global harness state.
    reset()
    yield  # type: ignore[misc]
    reset()
# ---------------------------------------------------------------------------
# Mock helpers
# ---------------------------------------------------------------------------


def _mock_usage(prompt_tokens: int = 100, completion_tokens: int = 50) -> MagicMock:
    # Minimal stand-in for an OpenAI usage object.
    u = MagicMock()
    u.prompt_tokens = prompt_tokens
    u.completion_tokens = completion_tokens
    return u


def _mock_completion(
    prompt_tokens: int = 100,
    completion_tokens: int = 50,
    tool_calls: Optional[list] = None,
) -> MagicMock:
    # Stand-in for a non-streaming chat completion response:
    # .usage plus .choices[0].message.tool_calls.
    msg = MagicMock()
    msg.tool_calls = tool_calls
    choice = MagicMock()
    choice.message = msg
    resp = MagicMock()
    resp.usage = _mock_usage(prompt_tokens, completion_tokens)
    resp.choices = [choice]
    return resp


def _mock_tool_call(tc_id: str) -> MagicMock:
    tc = MagicMock()
    tc.id = tc_id
    return tc


def _mock_stream_chunk(
    content: str = "hi",
    usage: Optional[MagicMock] = None,
    tool_calls: Optional[list] = None,
) -> MagicMock:
    # Stand-in for one streaming chunk: .choices[0].delta plus optional .usage
    # (usage typically arrives only on the final chunk).
    delta = MagicMock()
    delta.content = content
    delta.tool_calls = tool_calls
    choice = MagicMock()
    choice.delta = delta
    chunk = MagicMock()
    chunk.choices = [choice]
    chunk.usage = usage
    return chunk


# ---------------------------------------------------------------------------
# Patch lifecycle
# ---------------------------------------------------------------------------


class TestPatchLifecycle:
    """patch_openai()/unpatch_openai() and their wiring into init()/reset()."""

    def test_patch_and_unpatch(self) -> None:
        assert not is_patched()
        result = patch_openai()
        assert result is True
        assert is_patched()
        unpatch_openai()
        assert not is_patched()

    def test_idempotent_patching(self) -> None:
        patch_openai()
        patch_openai()
        assert is_patched()
        unpatch_openai()
        assert not is_patched()

    def test_unpatch_without_prior_patch(self) -> None:
        unpatch_openai()  # should not raise

    def test_init_observe_patches(self) -> None:
        report = init(mode="observe")
        assert "openai" in report.instrumented
        assert is_patched()

    def test_init_enforce_patches(self) -> None:
        report = init(mode="enforce")
        assert "openai" in report.instrumented
        assert is_patched()

    def test_init_off_does_not_patch(self) -> None:
        init(mode="off")
        assert not is_patched()

    def test_reset_unpatches(self) -> None:
        init(mode="observe")
        assert is_patched()
        reset()
        assert not is_patched()

    def test_class_method_actually_replaced(self) -> None:
        from openai.resources.chat.completions import Completions

        original = Completions.create
        patch_openai()
        assert Completions.create is not original
        unpatch_openai()
        assert Completions.create is original


# ---------------------------------------------------------------------------
# Sync wrapper
# ---------------------------------------------------------------------------


class TestSyncWrapper:
    """Behavior of the patched synchronous Completions.create wrapper."""

    def test_observe_passes_through_response(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion()
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run(budget=1.0) as ctx:
            result = wrapper(MagicMock(), model="gpt-4o-mini")

        assert result is mock_resp
        original.assert_called_once()

    def test_observe_tracks_cost(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000)
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run(budget=10.0) as ctx:
            wrapper(MagicMock(), model="gpt-4o-mini")

        # gpt-4o-mini: $0.15/1M in + $0.60/1M out = $0.75
        assert ctx.cost == pytest.approx(0.75, abs=0.01)

    def test_observe_tracks_step_count(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion()
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run(budget=1.0) as ctx:
            wrapper(MagicMock(), model="gpt-4o-mini")
            wrapper(MagicMock(), model="gpt-4o-mini")

        assert ctx.step_count == 2

    def test_observe_tracks_tool_calls(self) -> None:
        init(mode="observe")
        tc1 = _mock_tool_call("tc_1")
        tc2 = _mock_tool_call("tc_2")
        mock_resp = _mock_completion(tool_calls=[tc1, tc2])
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run(budget=1.0) as ctx:
            wrapper(MagicMock(), model="gpt-4o")

        assert ctx.tool_calls == 2

    def test_observe_tracks_energy(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion(prompt_tokens=1000, completion_tokens=500)
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run(budget=1.0) as ctx:
            wrapper(MagicMock(), model="gpt-4o-mini")

        # gpt-4o-mini coefficient=0.3, output_weight=1.5
        # energy = 0.3 * (1000 + 500 * 1.5) = 0.3 * 1750 = 525.0
        assert ctx.energy_used == pytest.approx(525.0)

    def test_observe_tracks_latency(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion()
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run(budget=1.0) as ctx:
            wrapper(MagicMock(), model="gpt-4o-mini")

        assert ctx.latency_used_ms > 0

    def test_budget_remaining_decreases(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000)
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run(budget=10.0) as ctx:
            wrapper(MagicMock(), model="gpt-4o-mini")

        assert ctx.budget_remaining is not None
        assert ctx.budget_remaining < 10.0
        assert ctx.budget_remaining == pytest.approx(10.0 - 0.75, abs=0.01)

    def test_model_used_and_trace(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion()
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run(budget=1.0) as ctx:
            wrapper(MagicMock(), model="gpt-4o")

        assert ctx.model_used == "gpt-4o"
        trace = ctx.trace()
        assert len(trace) == 1
        assert trace[0]["action"] == "allow"
        assert trace[0]["reason"] == "observe"
        assert trace[0]["model"] == "gpt-4o"

    def test_off_mode_passthrough_no_tracking(self) -> None:
        init(mode="off")
        mock_resp = _mock_completion()
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run() as ctx:
            result = wrapper(MagicMock(), model="gpt-4o")

        assert result is mock_resp
        assert ctx.cost == 0.0
        assert ctx.step_count == 0

    def test_no_run_scope_logs_but_does_not_track(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion()
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        # Call outside any run() scope
        result = wrapper(MagicMock(), model="gpt-4o")
        assert result is mock_resp

    def test_multiple_calls_accumulate(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000)
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run(budget=10.0) as ctx:
            wrapper(MagicMock(), model="gpt-4o-mini")
            wrapper(MagicMock(), model="gpt-4o-mini")

        assert ctx.cost == pytest.approx(1.50, abs=0.01)
        assert ctx.step_count == 2
        assert len(ctx.trace()) == 2


# ---------------------------------------------------------------------------
# Async wrapper
# ---------------------------------------------------------------------------


class TestAsyncWrapper:
    """Behavior of the patched AsyncCompletions.create wrapper."""

    @pytest.mark.asyncio
    async def test_observe_passes_through_response(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion()
        original = AsyncMock(return_value=mock_resp)
        wrapper = _make_patched_async_create(original)

        async with run(budget=1.0) as ctx:
            result = await wrapper(MagicMock(), model="gpt-4o")

        assert result is mock_resp

    @pytest.mark.asyncio
    async def test_observe_tracks_cost(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000)
        original = AsyncMock(return_value=mock_resp)
        wrapper = _make_patched_async_create(original)

        async with run(budget=10.0) as ctx:
            await wrapper(MagicMock(), model="gpt-4o-mini")

        assert ctx.cost == pytest.approx(0.75, abs=0.01)
        assert ctx.step_count == 1

    @pytest.mark.asyncio
    async def test_off_mode_passthrough(self) -> None:
        init(mode="off")
        mock_resp = _mock_completion()
        original = AsyncMock(return_value=mock_resp)
        wrapper = _make_patched_async_create(original)

        async with run() as ctx:
            result = await wrapper(MagicMock(), model="gpt-4o")

        assert result is mock_resp
        assert ctx.cost == 0.0


# ---------------------------------------------------------------------------
# Sync stream wrapper
# ---------------------------------------------------------------------------


class TestSyncStreamWrapper:
    """_InstrumentedStream: pass-through iteration plus deferred accounting."""

    def test_stream_yields_all_chunks(self) -> None:
        init(mode="observe")
        chunk1 = _mock_stream_chunk("Hello")
        chunk2 = _mock_stream_chunk(" world", usage=_mock_usage(100, 50))
        mock_stream = iter([chunk1, chunk2])

        with run(budget=1.0) as ctx:
            wrapped = _InstrumentedStream(mock_stream, ctx, "gpt-4o-mini", time.monotonic())
            chunks = list(wrapped)

        assert len(chunks) == 2
        assert chunks[0] is chunk1
        assert chunks[1] is chunk2

    def test_stream_tracks_cost_after_consumption(self) -> None:
        init(mode="observe")
        chunk1 = _mock_stream_chunk("Hello")
        chunk2 = _mock_stream_chunk(" world", usage=_mock_usage(1_000_000, 1_000_000))
        mock_stream = iter([chunk1, chunk2])

        with run(budget=10.0) as ctx:
            wrapped = _InstrumentedStream(mock_stream, ctx, "gpt-4o-mini", time.monotonic())
            list(wrapped)

        assert ctx.cost == pytest.approx(0.75, abs=0.01)
        assert ctx.step_count == 1

    def test_stream_tracks_tool_calls(self) -> None:
        init(mode="observe")
        tc = _mock_tool_call("tc_1")
        chunk1 = _mock_stream_chunk("", tool_calls=[tc])
        chunk2 = _mock_stream_chunk("", usage=_mock_usage(100, 50))
        mock_stream = iter([chunk1, chunk2])

        with run(budget=1.0) as ctx:
            wrapped = _InstrumentedStream(mock_stream, ctx, "gpt-4o", time.monotonic())
            list(wrapped)

        assert ctx.tool_calls == 1

    def test_stream_context_manager(self) -> None:
        # The wrapper must forward the inner stream's context-manager protocol.
        init(mode="observe")
        chunk1 = _mock_stream_chunk("data", usage=_mock_usage(100, 50))
        mock_inner = MagicMock()
        mock_inner.__iter__ = MagicMock(return_value=iter([chunk1]))
        mock_inner.__next__ = MagicMock(side_effect=[chunk1, StopIteration])
        mock_inner.__enter__ = MagicMock(return_value=mock_inner)
        mock_inner.__exit__ = MagicMock(return_value=False)

        with run(budget=1.0) as ctx:
            with _InstrumentedStream(mock_inner, ctx, "gpt-4o-mini", time.monotonic()) as stream:
                for _ in stream:
                    pass

        assert ctx.step_count == 1

    def test_stream_finalize_is_idempotent(self) -> None:
        init(mode="observe")
        chunk1 = _mock_stream_chunk("data", usage=_mock_usage(100, 50))
        mock_stream = iter([chunk1])

        with run(budget=1.0) as ctx:
            wrapped = _InstrumentedStream(mock_stream, ctx, "gpt-4o-mini", time.monotonic())
            list(wrapped)
            # Force finalize again
            wrapped._finalize()

        assert ctx.step_count == 1  # Should not double-count

    def test_stream_wrapper_via_patched_create(self) -> None:
        """Verify that stream=True in the wrapper returns an _InstrumentedStream."""
        init(mode="observe")
        chunk = _mock_stream_chunk("hi", usage=_mock_usage(50, 25))
        mock_stream = iter([chunk])
        original = MagicMock(return_value=mock_stream)
        wrapper = _make_patched_create(original)

        with run(budget=1.0) as ctx:
            result = wrapper(MagicMock(), model="gpt-4o-mini", stream=True)
            assert isinstance(result, _InstrumentedStream)
            list(result)

        assert ctx.step_count == 1


# ---------------------------------------------------------------------------
# Async stream wrapper
# ---------------------------------------------------------------------------

# NOTE(review): class TestAsyncStreamWrapper begins here in the original patch
# but its first test is truncated at this chunk's boundary; the remainder lies
# in the next chunk and is not reproduced here.
init(mode="observe") + chunk1 = _mock_stream_chunk("Hello") + chunk2 = _mock_stream_chunk(" world", usage=_mock_usage(100, 50)) + + async def _async_iter(): + yield chunk1 + yield chunk2 + + mock_stream = _async_iter() + + async with run(budget=1.0) as ctx: + wrapped = _InstrumentedAsyncStream(mock_stream, ctx, "gpt-4o-mini", time.monotonic()) + chunks = [c async for c in wrapped] + + assert len(chunks) == 2 + assert ctx.cost > 0 + assert ctx.step_count == 1 + + @pytest.mark.asyncio + async def test_async_stream_via_patched_create(self) -> None: + """Verify that stream=True in async wrapper returns an _InstrumentedAsyncStream.""" + init(mode="observe") + chunk = _mock_stream_chunk("hi", usage=_mock_usage(50, 25)) + + async def _async_iter(): + yield chunk + + mock_stream = _async_iter() + original = AsyncMock(return_value=mock_stream) + wrapper = _make_patched_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="gpt-4o-mini", stream=True) + assert isinstance(result, _InstrumentedAsyncStream) + _ = [c async for c in result] + + assert ctx.step_count == 1 + + +# --------------------------------------------------------------------------- +# Cost and energy estimation +# --------------------------------------------------------------------------- + + +class TestEstimation: + def test_cost_known_model(self) -> None: + cost = _estimate_cost("gpt-4o-mini", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.15 + 0.60) + + def test_cost_unknown_model_uses_default(self) -> None: + cost = _estimate_cost("my-custom-model", 1_000_000, 1_000_000) + # default pricing: $2.50/$10.00 + assert cost == pytest.approx(2.50 + 10.00) + + def test_cost_zero_tokens(self) -> None: + cost = _estimate_cost("gpt-4o", 0, 0) + assert cost == 0.0 + + def test_energy_known_model(self) -> None: + energy = _estimate_energy("gpt-4o-mini", 1000, 500) + # coeff=0.3, output_weight=1.5 + # energy = 0.3 * (1000 + 500 * 1.5) = 0.3 * 1750 = 525.0 + 
assert energy == pytest.approx(525.0) + + def test_energy_unknown_model_uses_default(self) -> None: + energy = _estimate_energy("custom-model", 1000, 500) + # default coeff=1.0 + # energy = 1.0 * (1000 + 500 * 1.5) = 1750.0 + assert energy == pytest.approx(1750.0) + + +# --------------------------------------------------------------------------- +# Nested run isolation +# --------------------------------------------------------------------------- + + +class TestNestedRuns: + def test_inner_run_does_not_affect_outer(self) -> None: + init(mode="observe") + mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=10.0) as outer: + wrapper(MagicMock(), model="gpt-4o-mini") # $0.75 to outer + outer_cost_before_inner = outer.cost + + with run(budget=5.0) as inner: + wrapper(MagicMock(), model="gpt-4o-mini") # $0.75 to inner + + # Outer cost should be unchanged after inner scope exits + assert outer.cost == pytest.approx(outer_cost_before_inner) + assert inner.cost == pytest.approx(0.75, abs=0.01) + + +# --------------------------------------------------------------------------- +# Edge cases +# --------------------------------------------------------------------------- + + +class TestEdgeCases: + def test_response_without_usage(self) -> None: + init(mode="observe") + mock_resp = MagicMock() + mock_resp.usage = None + mock_resp.choices = [] + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="gpt-4o") + + assert ctx.cost == 0.0 + assert ctx.step_count == 1 + + def test_response_without_choices(self) -> None: + init(mode="observe") + mock_resp = MagicMock() + mock_resp.usage = _mock_usage(100, 50) + mock_resp.choices = [] + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=1.0) as 
ctx: + wrapper(MagicMock(), model="gpt-4o") + + assert ctx.tool_calls == 0 + assert ctx.cost > 0 + + def test_stream_without_usage_in_any_chunk(self) -> None: + init(mode="observe") + chunk1 = _mock_stream_chunk("Hello") + chunk2 = _mock_stream_chunk(" world") + mock_stream = iter([chunk1, chunk2]) + + with run(budget=1.0) as ctx: + wrapped = _InstrumentedStream(mock_stream, ctx, "gpt-4o-mini", time.monotonic()) + list(wrapped) + + assert ctx.cost == 0.0 # No usage data available + assert ctx.step_count == 1 # Step still counted + + +# --------------------------------------------------------------------------- +# Fix: init(mode="off") unpatches previously patched client +# --------------------------------------------------------------------------- + + +class TestInitOffUnpatches: + def test_init_off_after_observe_unpatches(self) -> None: + init(mode="observe") + assert is_patched() + init(mode="off") + assert not is_patched() + + def test_init_off_when_not_patched_is_safe(self) -> None: + init(mode="off") + assert not is_patched() + + +# --------------------------------------------------------------------------- +# Fix: enforce mode — budget gate and correct trace reason +# --------------------------------------------------------------------------- + + +class TestEnforceMode: + def test_enforce_trace_records_enforce_reason(self) -> None: + init(mode="enforce") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=10.0) as ctx: + wrapper(MagicMock(), model="gpt-4o") + + trace = ctx.trace() + assert trace[0]["reason"] == "enforce" + + def test_observe_trace_records_observe_reason(self) -> None: + init(mode="observe") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=10.0) as ctx: + wrapper(MagicMock(), model="gpt-4o") + + trace = ctx.trace() + assert trace[0]["reason"] == "observe" + + 
# NOTE(review): the methods below belong to ``class TestEnforceMode`` whose
# header lies above this span.
    def test_enforce_raises_on_budget_exhausted(self) -> None:
        from cascadeflow.schema.exceptions import BudgetExceededError

        init(mode="enforce")
        mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000)
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run(budget=0.001) as ctx:
            # First call uses the tiny budget
            wrapper(MagicMock(), model="gpt-4o")
            # Second call should raise — budget exhausted
            with pytest.raises(BudgetExceededError):
                wrapper(MagicMock(), model="gpt-4o")

    def test_observe_does_not_raise_on_budget_exhausted(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000)
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run(budget=0.001) as ctx:
            wrapper(MagicMock(), model="gpt-4o")
            # Second call should NOT raise — observe mode is permissive
            wrapper(MagicMock(), model="gpt-4o")

        assert ctx.cost > ctx.budget_max  # type: ignore[operator]

    @pytest.mark.asyncio
    async def test_enforce_raises_on_budget_exhausted_async(self) -> None:
        from cascadeflow.schema.exceptions import BudgetExceededError

        init(mode="enforce")
        mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000)
        original = AsyncMock(return_value=mock_resp)
        wrapper = _make_patched_async_create(original)

        async with run(budget=0.001) as ctx:
            await wrapper(MagicMock(), model="gpt-4o")
            with pytest.raises(BudgetExceededError):
                await wrapper(MagicMock(), model="gpt-4o")


# ---------------------------------------------------------------------------
# Fix: stream_options.include_usage auto-injection
# ---------------------------------------------------------------------------


class TestStreamUsageInjection:
    """The wrapper must inject ``stream_options={"include_usage": True}``
    on streaming calls (so usage arrives in the final chunk), preserve a
    caller-supplied stream_options, and leave non-streaming calls alone."""

    def test_stream_injects_include_usage(self) -> None:
        init(mode="observe")
        mock_stream = iter([_mock_stream_chunk("hi", usage=_mock_usage(50, 25))])
        original = MagicMock(return_value=mock_stream)
        wrapper = _make_patched_create(original)

        with run(budget=1.0) as ctx:
            result = wrapper(MagicMock(), model="gpt-4o-mini", stream=True)
            list(result)

        # Check the original was called with stream_options injected
        call_kwargs = original.call_args[1]
        assert call_kwargs.get("stream_options", {}).get("include_usage") is True

    def test_stream_preserves_existing_stream_options(self) -> None:
        init(mode="observe")
        mock_stream = iter([_mock_stream_chunk("hi", usage=_mock_usage(50, 25))])
        original = MagicMock(return_value=mock_stream)
        wrapper = _make_patched_create(original)

        with run(budget=1.0) as ctx:
            result = wrapper(
                MagicMock(),
                model="gpt-4o-mini",
                stream=True,
                stream_options={"include_usage": True},
            )
            list(result)

        call_kwargs = original.call_args[1]
        assert call_kwargs["stream_options"]["include_usage"] is True

    def test_non_stream_does_not_inject_stream_options(self) -> None:
        init(mode="observe")
        mock_resp = _mock_completion()
        original = MagicMock(return_value=mock_resp)
        wrapper = _make_patched_create(original)

        with run(budget=1.0) as ctx:
            wrapper(MagicMock(), model="gpt-4o-mini")

        call_kwargs = original.call_args[1]
        assert "stream_options" not in call_kwargs
diff --git a/tests/test_openai_agents_integration.py b/tests/test_openai_agents_integration.py
new file mode 100644
index 00000000..b2644036
--- /dev/null
+++ b/tests/test_openai_agents_integration.py
@@ -0,0 +1,207 @@
import pytest

from cascadeflow.harness import init, reset, run
import cascadeflow.integrations.openai_agents as openai_agents_integration
from cascadeflow.integrations.openai_agents import (
    CascadeFlowModelProvider,
    OpenAIAgentsIntegrationConfig,
)
from cascadeflow.schema.exceptions import BudgetExceededError


def setup_function() -> None:
    # Reset global harness state between tests (pytest setup hook).
    reset()


def test_requires_sdk_for_default_provider(monkeypatch):
    # Without the openai-agents SDK, constructing the default provider
    # must fail loudly rather than degrade silently.
    monkeypatch.setattr(openai_agents_integration, "OPENAI_AGENTS_SDK_AVAILABLE", False)
    with pytest.raises(ImportError):
        CascadeFlowModelProvider()


class _FakeUsage:
    """Minimal stand-in for the SDK usage object (token counts only)."""

    def __init__(self, input_tokens: int, output_tokens: int) -> None:
        self.input_tokens = input_tokens
        self.output_tokens = output_tokens


class _FakeResponse:
    """Minimal stand-in for the SDK response: usage plus output items."""

    def __init__(self, input_tokens: int = 0, output_tokens: int = 0, output=None) -> None:
        self.usage = _FakeUsage(input_tokens=input_tokens, output_tokens=output_tokens)
        self.output = output or []


class _FakeEvent:
    """Stream event carrying an optional final response."""

    def __init__(self, response=None) -> None:
        self.response = response


class _FakeAsyncStream:
    """Async iterator over a fixed list of events."""

    def __init__(self, events) -> None:
        self._events = list(events)
        self._index = 0

    def __aiter__(self):
        return self

    async def __anext__(self):
        if self._index >= len(self._events):
            raise StopAsyncIteration
        event = self._events[self._index]
        self._index += 1
        return event


class _FakeModel:
    """Fake model recording the kwargs of the last call for assertions."""

    def __init__(self, response: _FakeResponse, stream_events=None) -> None:
        self._response = response
        self._stream_events = stream_events or []
        self.last_kwargs = None

    async def get_response(self, **kwargs):
        self.last_kwargs = kwargs
        return self._response

    def stream_response(self, **kwargs):
        self.last_kwargs = kwargs
        return _FakeAsyncStream(self._stream_events)


class _FakeBaseProvider:
    """Fake provider recording every model name it is asked to resolve."""

    def __init__(self, model: _FakeModel) -> None:
        self._model = model
        self.requested_models = []

    def get_model(self, model_name):
        self.requested_models.append(model_name)
        return self._model


def _response_call_kwargs():
    """Full keyword set expected by the SDK's ``get_response`` signature."""
    return {
        "system_instructions": None,
        "input": "hello",
        "model_settings": None,
        "tools": [],
        "output_schema": None,
        "handoffs": [],
        "tracing": None,
        "previous_response_id": None,
        "conversation_id": None,
        "prompt": None,
    }


@pytest.mark.asyncio
async def test_metrics_updated_from_get_response():
    init(mode="observe", budget=2.0)

    output = [{"type": "function_call", "name": "lookup"}]
    response = _FakeResponse(input_tokens=200, output_tokens=100, output=output)
    model = _FakeModel(response=response)
    provider = CascadeFlowModelProvider(base_provider=_FakeBaseProvider(model))

    wrapped = provider.get_model("gpt-4o")

    with run(budget=2.0) as ctx:
        await wrapped.get_response(**_response_call_kwargs())
        assert model.last_kwargs is not None
        assert model.last_kwargs["input"] == "hello"
        assert ctx.step_count == 1
        assert ctx.tool_calls == 1
        assert ctx.cost > 0
        assert ctx.energy_used > 0
        assert ctx.budget_remaining is not None
        assert ctx.budget_remaining < 2.0
        assert ctx.model_used == "gpt-4o"


@pytest.mark.asyncio
async def test_tool_gating_enforced_when_limit_reached():
    init(mode="enforce", max_tool_calls=0, budget=1.0)

    response = _FakeResponse(input_tokens=10, output_tokens=5)
    model = _FakeModel(response=response)
    provider = CascadeFlowModelProvider(base_provider=_FakeBaseProvider(model))
    wrapped = provider.get_model("gpt-4o-mini")

    kwargs = _response_call_kwargs()
    kwargs["tools"] = [{"name": "lookup"}]

    with run(max_tool_calls=0, budget=1.0) as ctx:
        await wrapped.get_response(**kwargs)
        # With the tool limit at 0, the tools list is stripped before the
        # underlying model sees it, and the denial is recorded.
        assert model.last_kwargs is not None
        assert model.last_kwargs["tools"] == []
        assert ctx.last_action == "deny_tool"


def test_switches_to_cheapest_candidate_under_budget_pressure():
    init(mode="enforce", budget=1.0)

    response = _FakeResponse()
    model = _FakeModel(response=response)
    base_provider = _FakeBaseProvider(model)
    config = OpenAIAgentsIntegrationConfig(model_candidates=["gpt-4o", "gpt-4o-mini"])
    provider = CascadeFlowModelProvider(base_provider=base_provider, config=config)

    with run(budget=1.0) as ctx:
        # Simulate near-exhausted budget directly on the run context.
        ctx.cost = 0.9
        ctx.budget_remaining = 0.1
        provider.get_model("gpt-4o")
        assert base_provider.requested_models[-1] == "gpt-4o-mini"
        assert ctx.last_action == "switch_model"


def test_budget_exceeded_raises_cascadeflow_budget_error():
    init(mode="enforce", budget=1.0)

    response = _FakeResponse()
    model = _FakeModel(response=response)
    provider = CascadeFlowModelProvider(base_provider=_FakeBaseProvider(model))

    with run(budget=1.0) as ctx:
        ctx.budget_remaining = 0.0
        with pytest.raises(BudgetExceededError):
            provider.get_model("gpt-4o-mini")


def test_fail_open_falls_back_when_model_resolution_errors(monkeypatch):
    response = _FakeResponse()
    model = _FakeModel(response=response)
    base_provider = _FakeBaseProvider(model)
    provider = CascadeFlowModelProvider(base_provider=base_provider)

    def _boom(_: object) -> str:
        raise ValueError("resolution failed")

    # If internal resolution blows up, the provider must fail open and
    # pass the requested model through unchanged.
    monkeypatch.setattr(provider, "_resolve_model", _boom)
    wrapped = provider.get_model("gpt-4o")

    assert wrapped is not None
    assert base_provider.requested_models[-1] == "gpt-4o"


@pytest.mark.asyncio
async def test_stream_response_updates_metrics():
    init(mode="observe", budget=3.0)

    final_response = _FakeResponse(
        input_tokens=120,
        output_tokens=60,
        output=[{"type": "function_call", "name": "tool_a"}],
    )
    stream_events = [_FakeEvent(response=final_response)]
    model = _FakeModel(response=final_response, stream_events=stream_events)
    provider = CascadeFlowModelProvider(base_provider=_FakeBaseProvider(model))
    wrapped = provider.get_model("gpt-4o-mini")

    with run(budget=3.0) as ctx:
        async for _ in wrapped.stream_response(**_response_call_kwargs()):
            pass

    assert model.last_kwargs is not None
    assert model.last_kwargs["input"] == "hello"
    assert ctx.step_count == 1
    assert ctx.tool_calls == 1
    assert ctx.cost > 0
    assert ctx.model_used == "gpt-4o-mini"