From eb3df8952d7c494a2b4f30b12c286f57a0532d94 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 25 Feb 2026 22:30:36 +0100 Subject: [PATCH 01/49] Add core harness API scaffold with context-scoped runtime --- cascadeflow/__init__.py | 4 + cascadeflow/harness/api.py | 78 +++------- docs/strategy/agent-intelligence-v2-plan.md | 10 +- tests/test_harness_api.py | 161 +------------------- 4 files changed, 31 insertions(+), 222 deletions(-) diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py index 1b61a9f3..d49eb644 100644 --- a/cascadeflow/__init__.py +++ b/cascadeflow/__init__.py @@ -401,7 +401,11 @@ "init", "reset", "run", +<<<<<<< HEAD "harness_agent", +======= + "agent", +>>>>>>> 1aba349 (Add core harness API scaffold with context-scoped runtime) "get_harness_config", "get_current_run", # ===== PROVIDERS ===== diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index a71d5f5a..b2bb1033 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -45,24 +45,16 @@ class HarnessRunContext: tool_calls_max: Optional[int] = None latency_max_ms: Optional[float] = None energy_max: Optional[float] = None - kpi_targets: Optional[dict[str, float]] = None - kpi_weights: Optional[dict[str, float]] = None - compliance: Optional[str] = None cost: float = 0.0 savings: float = 0.0 tool_calls: int = 0 - step_count: int = 0 - latency_used_ms: float = 0.0 - energy_used: float = 0.0 budget_remaining: Optional[float] = None model_used: Optional[str] = None last_action: str = "allow" draft_accepted: Optional[bool] = None _trace: list[dict[str, Any]] = field(default_factory=list) - _token: Optional[Token[Optional[HarnessRunContext]]] = field( - default=None, init=False, repr=False - ) + _token: Optional[Token[Optional[HarnessRunContext]]] = field(default=None, init=False, repr=False) def __post_init__(self) -> None: if self.budget_max is not None and self.budget_remaining is None: @@ -86,34 +78,21 @@ async def __aexit__(self, exc_type: Any, 
exc: Any, tb: Any) -> None: def trace(self) -> list[dict[str, Any]]: return list(self._trace) - def record( - self, - action: str, - reason: str, - model: Optional[str] = None, - *, - applied: Optional[bool] = None, - decision_mode: Optional[str] = None, - ) -> None: + def record(self, action: str, reason: str, model: Optional[str] = None) -> None: self.last_action = action self.model_used = model - entry: dict[str, Any] = { - "action": action, - "reason": reason, - "model": model, - "run_id": self.run_id, - } - if applied is not None: - entry["applied"] = applied - if decision_mode is not None: - entry["decision_mode"] = decision_mode - self._trace.append(entry) + self._trace.append( + { + "action": action, + "reason": reason, + "model": model, + "run_id": self.run_id, + } + ) _harness_config: HarnessConfig = HarnessConfig() -_current_run: ContextVar[Optional[HarnessRunContext]] = ContextVar( - "cascadeflow_harness_run", default=None -) +_current_run: ContextVar[Optional[HarnessRunContext]] = ContextVar("cascadeflow_harness_run", default=None) _is_instrumented: bool = False _UNSET = object() @@ -141,17 +120,13 @@ def get_current_run() -> Optional[HarnessRunContext]: def reset() -> None: """ - Reset harness global state and unpatch instrumented clients. + Reset harness global state. Intended for tests and controlled shutdown paths. """ global _harness_config global _is_instrumented - - from cascadeflow.harness.instrument import unpatch_openai - - unpatch_openai() _harness_config = HarnessConfig() _is_instrumented = False _current_run.set(None) @@ -307,7 +282,9 @@ def init( compliance: Optional[str] | object = _UNSET, ) -> HarnessInitReport: """ - Initialize global harness settings and instrument detected SDK clients. + Initialize global harness settings. + + This is a scaffold API for V2 work and intentionally performs no request patching yet. 
""" global _harness_config @@ -326,9 +303,7 @@ def init( resolved_max_latency_ms = _resolve_value( "max_latency_ms", max_latency_ms, env_config, file_config, None, sources ) - resolved_max_energy = _resolve_value( - "max_energy", max_energy, env_config, file_config, None, sources - ) + resolved_max_energy = _resolve_value("max_energy", max_energy, env_config, file_config, None, sources) resolved_kpi_targets = _resolve_value( "kpi_targets", kpi_targets, env_config, file_config, None, sources ) @@ -356,16 +331,8 @@ def init( instrumented: list[str] = [] detected_but_not_instrumented: list[str] = [] - if validated_mode != "off" and sdk_presence["openai"]: - from cascadeflow.harness.instrument import patch_openai - - if patch_openai(): - instrumented.append("openai") - elif validated_mode == "off": - from cascadeflow.harness.instrument import is_patched, unpatch_openai - - if is_patched(): - unpatch_openai() + if sdk_presence["openai"]: + instrumented.append("openai") if sdk_presence["anthropic"]: detected_but_not_instrumented.append("anthropic") @@ -396,9 +363,6 @@ def run( max_tool_calls: Optional[int] = None, max_latency_ms: Optional[float] = None, max_energy: Optional[float] = None, - kpi_targets: Optional[dict[str, float]] = None, - kpi_weights: Optional[dict[str, float]] = None, - compliance: Optional[str] = None, ) -> HarnessRunContext: """ Create a scoped run context. 
@@ -411,9 +375,6 @@ def run( resolved_tool_calls = max_tool_calls if max_tool_calls is not None else config.max_tool_calls resolved_latency = max_latency_ms if max_latency_ms is not None else config.max_latency_ms resolved_energy = max_energy if max_energy is not None else config.max_energy - resolved_kpi_targets = kpi_targets if kpi_targets is not None else config.kpi_targets - resolved_kpi_weights = kpi_weights if kpi_weights is not None else config.kpi_weights - resolved_compliance = compliance if compliance is not None else config.compliance return HarnessRunContext( mode=config.mode, @@ -421,9 +382,6 @@ def run( tool_calls_max=resolved_tool_calls, latency_max_ms=resolved_latency, energy_max=resolved_energy, - kpi_targets=resolved_kpi_targets, - kpi_weights=resolved_kpi_weights, - compliance=resolved_compliance, ) diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 267ddc69..787bab32 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -843,13 +843,13 @@ Branching model: Claim checklist (one owner per branch at a time): - [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` -- [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress` -- [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` -- [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` +- [ ] `feat/v2-openai-auto-instrumentation` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [ ] `feat/v2-enforce-actions` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [ ] `feat/v2-openai-agents-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] 
`feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` +- [ ] `feat/v2-langchain-harness-extension` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [x] `feat/v2-bench-repro-pipeline` — Owner: `@codex` — PR: `#163` — Status: `review` +- [ ] `feat/v2-bench-repro-pipeline` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` Merge gates per feature branch: diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 5669e845..2d8ffcfc 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -1,9 +1,6 @@ -import sys - import pytest import cascadeflow -import cascadeflow.harness.api as harness_api from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run @@ -29,84 +26,19 @@ def test_init_rejects_invalid_mode(): init(mode="invalid") # type: ignore[arg-type] -def test_init_idempotent_logs(monkeypatch, caplog): - monkeypatch.setattr(harness_api, "find_spec", lambda _: None) - with caplog.at_level("DEBUG", logger="cascadeflow.harness"): - init(mode="observe") - init(mode="observe") - assert any("idempotent" in rec.message for rec in caplog.records) - - -def test_env_aliases_and_false_bool(monkeypatch): - monkeypatch.setenv("CASCADEFLOW_MODE", "observe") - monkeypatch.setenv("CASCADEFLOW_BUDGET", "0.33") - monkeypatch.setenv("CASCADEFLOW_HARNESS_VERBOSE", "off") - monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_TOOL_CALLS", "4") - monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_LATENCY_MS", "1200") - monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_ENERGY", "0.01") - monkeypatch.setenv("CASCADEFLOW_HARNESS_COMPLIANCE", "gdpr") - - report = init() - cfg = get_harness_config() - - assert report.mode == "observe" - assert cfg.mode == 
"observe" - assert cfg.budget == 0.33 - assert cfg.verbose is False - assert cfg.max_tool_calls == 4 - assert cfg.max_latency_ms == 1200 - assert cfg.max_energy == 0.01 - assert cfg.compliance == "gdpr" - - -def test_init_invalid_json_env_raises(monkeypatch): - monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_WEIGHTS", "[1,2,3]") - with pytest.raises(ValueError): - init() - - -def test_init_non_numeric_env_raises(monkeypatch): - monkeypatch.setenv("CASCADEFLOW_HARNESS_BUDGET", "abc") - with pytest.raises(ValueError): - init() - - def test_run_uses_global_defaults_and_overrides(): - init( - mode="enforce", - budget=2.0, - max_tool_calls=5, - kpi_targets={"quality_min": 0.9}, - kpi_weights={"cost": 0.7, "quality": 0.3}, - compliance="gdpr", - ) + init(mode="enforce", budget=2.0, max_tool_calls=5) default_ctx = run() assert default_ctx.mode == "enforce" assert default_ctx.budget_max == 2.0 assert default_ctx.tool_calls_max == 5 assert default_ctx.budget_remaining == 2.0 - assert default_ctx.kpi_targets == {"quality_min": 0.9} - assert default_ctx.kpi_weights == {"cost": 0.7, "quality": 0.3} - assert default_ctx.compliance == "gdpr" - - override_ctx = run( - budget=0.5, - max_tool_calls=3, - kpi_weights={"quality": 1.0}, - compliance="strict", - ) + + override_ctx = run(budget=0.5, max_tool_calls=3) assert override_ctx.budget_max == 0.5 assert override_ctx.tool_calls_max == 3 assert override_ctx.budget_remaining == 0.5 - assert override_ctx.kpi_targets == {"quality_min": 0.9} - assert override_ctx.kpi_weights == {"quality": 1.0} - assert override_ctx.compliance == "strict" - - -def test_run_without_enter_exit_is_safe(): - ctx = run() - ctx.__exit__(None, None, None) @pytest.mark.asyncio @@ -126,17 +58,6 @@ async def test_nested_run_context_is_isolated(): assert get_current_run() is None -def test_sync_run_context_isolated(): - init(mode="enforce", budget=1.0) - with run(budget=0.6) as outer: - assert get_current_run() is outer - with run(budget=0.1) as inner: - assert 
get_current_run() is inner - assert inner.budget_max == 0.1 - assert get_current_run() is outer - assert get_current_run() is None - - def test_agent_decorator_keeps_sync_behavior_and_attaches_metadata(): @agent( budget=0.9, @@ -170,8 +91,7 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.init) assert callable(cascadeflow.reset) assert callable(cascadeflow.run) - assert callable(cascadeflow.harness_agent) - assert hasattr(cascadeflow.agent, "PROVIDER_REGISTRY") + assert callable(cascadeflow.agent) report = cascadeflow.init(mode="off") assert report.mode == "off" @@ -190,7 +110,6 @@ def test_run_record_and_trace_copy(): def test_init_reads_from_env(monkeypatch): monkeypatch.setenv("CASCADEFLOW_HARNESS_MODE", "observe") monkeypatch.setenv("CASCADEFLOW_HARNESS_BUDGET", "0.25") - monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_TARGETS", '{"quality_min": 0.9}') monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_WEIGHTS", '{"cost": 1.0}') report = init() @@ -199,7 +118,6 @@ def test_init_reads_from_env(monkeypatch): assert report.mode == "observe" assert cfg.mode == "observe" assert cfg.budget == 0.25 - assert cfg.kpi_targets == {"quality_min": 0.9} assert cfg.kpi_weights == {"cost": 1.0} assert report.config_sources["mode"] == "env" assert report.config_sources["budget"] == "env" @@ -223,56 +141,6 @@ def test_init_reads_from_config_file(tmp_path, monkeypatch): assert report.config_sources["budget"] == "file" -def test_init_reads_top_level_config_file_keys(tmp_path, monkeypatch): - config = tmp_path / "cascadeflow.json" - config.write_text('{"mode":"observe","budget":0.4,"max_tool_calls":2}') - monkeypatch.setenv("CASCADEFLOW_CONFIG", str(config)) - - report = init() - cfg = get_harness_config() - - assert cfg.mode == "observe" - assert cfg.budget == 0.4 - assert cfg.max_tool_calls == 2 - assert report.config_sources["mode"] == "file" - - -def test_init_non_dict_config_file_ignored(tmp_path, monkeypatch): - config = tmp_path / "cascadeflow.json" - 
config.write_text('["not-a-dict"]') - monkeypatch.setenv("CASCADEFLOW_CONFIG", str(config)) - - report = init() - cfg = get_harness_config() - - assert cfg.mode == "off" - assert cfg.budget is None - assert report.config_sources["mode"] == "default" - - -def test_init_file_loader_exception_falls_back_defaults(monkeypatch): - import cascadeflow.config_loader as cl - - monkeypatch.setattr(cl, "find_config", lambda: "broken.json") - - def _raise(_path): - raise RuntimeError("boom") - - monkeypatch.setattr(cl, "load_config", _raise) - - report = init() - cfg = get_harness_config() - assert cfg.mode == "off" - assert report.config_sources["mode"] == "default" - - -def test_init_config_loader_import_failure_falls_back(monkeypatch): - monkeypatch.setitem(sys.modules, "cascadeflow.config_loader", object()) - report = init(mode="observe") - assert report.mode == "observe" - assert report.config_sources["mode"] == "code" - - def test_precedence_code_over_env_over_file(tmp_path, monkeypatch): config = tmp_path / "cascadeflow.json" config.write_text('{"harness":{"mode":"off","budget":9.9}}') @@ -306,24 +174,3 @@ def test_reset_clears_state(): assert cfg.mode == "off" assert cfg.budget is None assert get_current_run() is None - - -def test_init_without_detected_sdks(monkeypatch): - monkeypatch.setattr(harness_api, "find_spec", lambda _: None) - report = init(mode="observe") - assert report.instrumented == [] - assert report.detected_but_not_instrumented == [] - - -def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch): - monkeypatch.setattr( - harness_api, - "find_spec", - lambda name: object() if name == "openai" else None, - ) - - import cascadeflow.harness.instrument as instrument - - monkeypatch.setattr(instrument, "patch_openai", lambda: True) - report = init(mode="observe") - assert report.instrumented == ["openai"] From 8b0d2e01740bafa94674d88f775337b3b5234924 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 25 Feb 2026 22:38:16 +0100 Subject: 
[PATCH 02/49] Harden harness core scaffolding and complete API test coverage --- cascadeflow/harness/api.py | 16 +- cascadeflow/harness/instrument.py | 873 +----------------------------- tests/test_harness_api.py | 134 +++++ 3 files changed, 157 insertions(+), 866 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index b2bb1033..10d0e29a 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -49,6 +49,9 @@ class HarnessRunContext: cost: float = 0.0 savings: float = 0.0 tool_calls: int = 0 + step_count: int = 0 + latency_used_ms: float = 0.0 + energy_used: float = 0.0 budget_remaining: Optional[float] = None model_used: Optional[str] = None last_action: str = "allow" @@ -120,13 +123,17 @@ def get_current_run() -> Optional[HarnessRunContext]: def reset() -> None: """ - Reset harness global state. + Reset harness global state and unpatch instrumented clients. Intended for tests and controlled shutdown paths. """ global _harness_config global _is_instrumented + + from cascadeflow.harness.instrument import unpatch_openai + + unpatch_openai() _harness_config = HarnessConfig() _is_instrumented = False _current_run.set(None) @@ -331,8 +338,11 @@ def init( instrumented: list[str] = [] detected_but_not_instrumented: list[str] = [] - if sdk_presence["openai"]: - instrumented.append("openai") + if validated_mode != "off" and sdk_presence["openai"]: + from cascadeflow.harness.instrument import patch_openai + + if patch_openai(): + instrumented.append("openai") if sdk_presence["anthropic"]: detected_but_not_instrumented.append("anthropic") diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index c2fbd7ab..ad12bbdf 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -1,878 +1,25 @@ -"""OpenAI Python client auto-instrumentation for cascadeflow harness. 
- -Patches ``openai.resources.chat.completions.Completions.create`` (sync) and -``AsyncCompletions.create`` (async) to intercept LLM calls for observe/enforce -modes. - -This module is called internally by ``cascadeflow.harness.init()``. Users -should not call ``patch_openai`` / ``unpatch_openai`` directly. - -Implementation notes: - - Patching is class-level (all current and future client instances). - - Patching is idempotent (safe to call multiple times). - - ``unpatch_openai()`` restores the original methods exactly. - - Streaming responses are wrapped to capture usage after completion. - - ``with_raw_response`` is NOT patched in V2 (known limitation). -""" - from __future__ import annotations -import functools import logging -import time -from dataclasses import dataclass -from typing import Any - -from cascadeflow.harness.pricing import ( - DEFAULT_ENERGY_COEFFICIENT as _DEFAULT_ENERGY_COEFFICIENT, -) -from cascadeflow.harness.pricing import ( - ENERGY_COEFFICIENTS as _ENERGY_COEFFICIENTS, -) -from cascadeflow.harness.pricing import ( - OPENAI_MODEL_POOL as _PRICING_MODELS, -) -from cascadeflow.harness.pricing import ( - estimate_cost as _estimate_cost_shared, -) -from cascadeflow.harness.pricing import ( - estimate_energy as _estimate_energy_shared, -) -from cascadeflow.harness.pricing import ( - model_total_price as _model_total_price_shared, -) - -logger = logging.getLogger("cascadeflow.harness.instrument") - -# --------------------------------------------------------------------------- -# Module-level state for idempotent patch/unpatch -# --------------------------------------------------------------------------- - -_openai_patched: bool = False -_original_sync_create: Any = None -_original_async_create: Any = None - -_MODEL_TOTAL_COSTS: dict[str, float] = { - name: _model_total_price_shared(name) for name in _PRICING_MODELS -} -_CHEAPEST_MODEL: str = min(_MODEL_TOTAL_COSTS, key=_MODEL_TOTAL_COSTS.get) -_MIN_TOTAL_COST: float = 
min(_MODEL_TOTAL_COSTS.values()) -_MAX_TOTAL_COST: float = max(_MODEL_TOTAL_COSTS.values()) - -_OPENAI_ENERGY_COEFFS: dict[str, float] = { - name: _ENERGY_COEFFICIENTS.get(name, _DEFAULT_ENERGY_COEFFICIENT) for name in _PRICING_MODELS -} -_LOWEST_ENERGY_MODEL: str = min(_OPENAI_ENERGY_COEFFS, key=_OPENAI_ENERGY_COEFFS.get) -_MIN_ENERGY_COEFF: float = min(_OPENAI_ENERGY_COEFFS.values()) -_MAX_ENERGY_COEFF: float = max(_OPENAI_ENERGY_COEFFS.values()) - -# Relative priors used by KPI-weighted soft-control scoring. -# These are deterministic heuristics based on internal benchmark runs and -# intended as defaults until provider-specific online scoring is wired in. -_QUALITY_PRIORS: dict[str, float] = { - "gpt-4o": 0.90, - "gpt-4o-mini": 0.75, - "gpt-5-mini": 0.86, - "gpt-4-turbo": 0.88, - "gpt-4": 0.87, - "gpt-3.5-turbo": 0.65, - "o1": 0.95, - "o1-mini": 0.82, - "o3-mini": 0.80, -} -_LATENCY_PRIORS: dict[str, float] = { - "gpt-4o": 0.72, - "gpt-4o-mini": 0.93, - "gpt-5-mini": 0.84, - "gpt-4-turbo": 0.66, - "gpt-4": 0.52, - "gpt-3.5-turbo": 1.00, - "o1": 0.40, - "o1-mini": 0.60, - "o3-mini": 0.78, -} -_LATENCY_CANDIDATES: tuple[str, ...] = tuple( - name for name in _PRICING_MODELS if name in _LATENCY_PRIORS -) -_FASTEST_MODEL: str | None = ( - max(_LATENCY_CANDIDATES, key=lambda name: _LATENCY_PRIORS[name]) - if _LATENCY_CANDIDATES - else None -) - -# OpenAI-model allowlists used by the current OpenAI harness instrumentation. -# Future provider instrumentation should provide provider-specific allowlists. 
-_COMPLIANCE_MODEL_ALLOWLISTS: dict[str, set[str]] = { - "gdpr": {"gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo"}, - "hipaa": {"gpt-4o", "gpt-4o-mini"}, - "pci": {"gpt-4o-mini", "gpt-3.5-turbo"}, - "strict": {"gpt-4o"}, -} - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _ensure_stream_usage(kwargs: dict[str, Any]) -> dict[str, Any]: - """Inject ``stream_options.include_usage=True`` for streaming requests. - - OpenAI only sends usage data in the final stream chunk when this option - is set. Without it the harness would record zero cost for every - streaming call. - """ - if not kwargs.get("stream", False): - return kwargs - stream_options = kwargs.get("stream_options") or {} - if not stream_options.get("include_usage"): - stream_options = {**stream_options, "include_usage": True} - kwargs = {**kwargs, "stream_options": stream_options} - return kwargs - - -def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: - """Estimate cost in USD from model name and token counts.""" - return _estimate_cost_shared(model, prompt_tokens, completion_tokens) - - -def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: - """Estimate energy units (deterministic proxy, not live carbon).""" - return _estimate_energy_shared(model, prompt_tokens, completion_tokens) - - -def _count_tool_calls_in_response(response: Any) -> int: - """Count tool calls in a non-streaming ChatCompletion response.""" - choices = getattr(response, "choices", None) - if not choices: - return 0 - message = getattr(choices[0], "message", None) - if message is None: - return 0 - tool_calls = getattr(message, "tool_calls", None) - if tool_calls is None: - return 0 - return len(tool_calls) - - -def _extract_usage(response: Any) -> tuple[int, int]: - """Extract (prompt_tokens, completion_tokens) from a response.""" - usage = 
getattr(response, "usage", None) - if usage is None: - return 0, 0 - return ( - getattr(usage, "prompt_tokens", 0) or 0, - getattr(usage, "completion_tokens", 0) or 0, - ) - - -def _model_total_cost(model: str) -> float: - return _MODEL_TOTAL_COSTS.get(model, _model_total_price_shared(model)) - - -def _select_cheaper_model(current_model: str) -> str: - if _model_total_cost(_CHEAPEST_MODEL) < _model_total_cost(current_model): - return _CHEAPEST_MODEL - return current_model - - -def _select_faster_model(current_model: str) -> str: - if _FASTEST_MODEL is None: - return current_model - current_latency = _LATENCY_PRIORS.get(current_model, 0.7) - if _LATENCY_PRIORS[_FASTEST_MODEL] > current_latency: - return _FASTEST_MODEL - return current_model - - -def _select_lower_energy_model(current_model: str) -> str: - if _ENERGY_COEFFICIENTS.get( - _LOWEST_ENERGY_MODEL, _DEFAULT_ENERGY_COEFFICIENT - ) < _ENERGY_COEFFICIENTS.get( - current_model, - _DEFAULT_ENERGY_COEFFICIENT, - ): - return _LOWEST_ENERGY_MODEL - return current_model - - -def _normalize_weights(weights: dict[str, float]) -> dict[str, float]: - normalized = { - key: float(value) - for key, value in weights.items() - if key in {"cost", "quality", "latency", "energy"} and float(value) > 0 - } - total = sum(normalized.values()) - if total <= 0: - return {} - return {key: value / total for key, value in normalized.items()} - - -def _cost_utility(model: str) -> float: - model_cost = _model_total_cost(model) - if _MAX_TOTAL_COST == _MIN_TOTAL_COST: - return 1.0 - return (_MAX_TOTAL_COST - model_cost) / (_MAX_TOTAL_COST - _MIN_TOTAL_COST) - - -def _energy_utility(model: str) -> float: - coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) - if _MAX_ENERGY_COEFF == _MIN_ENERGY_COEFF: - return 1.0 - return (_MAX_ENERGY_COEFF - coeff) / (_MAX_ENERGY_COEFF - _MIN_ENERGY_COEFF) - - -def _kpi_score_with_normalized(model: str, normalized: dict[str, float]) -> float: - if not normalized: - return 0.0 - quality = 
_QUALITY_PRIORS.get(model, 0.7) - latency = _LATENCY_PRIORS.get(model, 0.7) - cost = _cost_utility(model) - energy = _energy_utility(model) - return ( - (normalized.get("quality", 0.0) * quality) - + (normalized.get("latency", 0.0) * latency) - + (normalized.get("cost", 0.0) * cost) - + (normalized.get("energy", 0.0) * energy) - ) - - -def _kpi_score(model: str, weights: dict[str, float]) -> float: - normalized = _normalize_weights(weights) - return _kpi_score_with_normalized(model, normalized) - - -def _select_kpi_weighted_model(current_model: str, weights: dict[str, float]) -> str: - normalized = _normalize_weights(weights) - if not normalized: - return current_model - best_model = current_model - best_score = _kpi_score_with_normalized(current_model, normalized) - for candidate in _PRICING_MODELS: - score = _kpi_score_with_normalized(candidate, normalized) - if score > best_score: - best_model = candidate - best_score = score - return best_model - - -def _compliance_allowlist(compliance: str | None) -> set[str] | None: - if not compliance: - return None - return _COMPLIANCE_MODEL_ALLOWLISTS.get(compliance.strip().lower()) - - -def _select_compliant_model(current_model: str, compliance: str) -> str | None: - allowlist = _compliance_allowlist(compliance) - if not allowlist: - return current_model - if current_model in allowlist: - return current_model - available = [name for name in _PRICING_MODELS if name in allowlist] - if not available: - return None - return min(available, key=_model_total_cost) - - -@dataclass(frozen=True) -class _PreCallDecision: - action: str - reason: str - target_model: str - - -def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCallDecision: - if ctx.budget_max is not None and ctx.cost >= ctx.budget_max: - return _PreCallDecision(action="stop", reason="budget_exceeded", target_model=model) - - if has_tools and ctx.tool_calls_max is not None and ctx.tool_calls >= ctx.tool_calls_max: - return _PreCallDecision( - 
action="deny_tool", reason="max_tool_calls_reached", target_model=model - ) - - compliance = getattr(ctx, "compliance", None) - if compliance: - compliant_model = _select_compliant_model(model, str(compliance)) - if compliant_model is None: - if has_tools: - return _PreCallDecision( - action="deny_tool", - reason="compliance_no_approved_tool_path", - target_model=model, - ) - return _PreCallDecision( - action="stop", reason="compliance_no_approved_model", target_model=model - ) - if compliant_model != model: - return _PreCallDecision( - action="switch_model", - reason="compliance_model_policy", - target_model=compliant_model, - ) - if str(compliance).strip().lower() == "strict" and has_tools: - return _PreCallDecision( - action="deny_tool", - reason="compliance_tool_restriction", - target_model=model, - ) - - if ctx.latency_max_ms is not None and ctx.latency_used_ms >= ctx.latency_max_ms: - faster_model = _select_faster_model(model) - if faster_model != model: - return _PreCallDecision( - action="switch_model", - reason="latency_limit_exceeded", - target_model=faster_model, - ) - return _PreCallDecision(action="stop", reason="latency_limit_exceeded", target_model=model) - - if ctx.energy_max is not None and ctx.energy_used >= ctx.energy_max: - lower_energy_model = _select_lower_energy_model(model) - if lower_energy_model != model: - return _PreCallDecision( - action="switch_model", - reason="energy_limit_exceeded", - target_model=lower_energy_model, - ) - return _PreCallDecision(action="stop", reason="energy_limit_exceeded", target_model=model) - - if ( - ctx.budget_max is not None - and ctx.budget_max > 0 - and ctx.budget_remaining is not None - and (ctx.budget_remaining / ctx.budget_max) < 0.2 - ): - cheaper_model = _select_cheaper_model(model) - if cheaper_model != model: - return _PreCallDecision( - action="switch_model", - reason="budget_pressure", - target_model=cheaper_model, - ) - - kpi_weights = getattr(ctx, "kpi_weights", None) - if 
isinstance(kpi_weights, dict) and kpi_weights: - weighted_model = _select_kpi_weighted_model(model, kpi_weights) - if weighted_model != model: - return _PreCallDecision( - action="switch_model", - reason="kpi_weight_optimization", - target_model=weighted_model, - ) - - return _PreCallDecision(action="allow", reason=ctx.mode, target_model=model) - - -def _raise_stop_error(ctx: Any, reason: str) -> None: - from cascadeflow.schema.exceptions import BudgetExceededError, HarnessStopError - - if reason == "budget_exceeded": - remaining = 0.0 - if ctx.budget_max is not None: - remaining = ctx.budget_max - ctx.cost - raise BudgetExceededError( - f"Budget exhausted: spent ${ctx.cost:.4f} of ${ctx.budget_max or 0.0:.4f} max", - remaining=remaining, - ) - raise HarnessStopError(f"cascadeflow harness stop: {reason}", reason=reason) - - -def _resolve_pre_call_decision( - ctx: Any, - mode: str, - model: str, - kwargs: dict[str, Any], -) -> tuple[dict[str, Any], str, str, str, str, bool]: - decision = _evaluate_pre_call_decision(ctx, model, has_tools=bool(kwargs.get("tools"))) - action = decision.action - reason = decision.reason - target_model = decision.target_model - applied = action == "allow" - - if mode == "enforce": - if action == "stop": - ctx.record( - action="stop", - reason=reason, - model=model, - applied=True, - decision_mode=mode, - ) - _raise_stop_error(ctx, reason) - - if action == "switch_model" and target_model != model: - kwargs = {**kwargs, "model": target_model} - model = target_model - applied = True - elif action == "switch_model": - applied = False - - if action == "deny_tool": - if kwargs.get("tools"): - kwargs = {**kwargs, "tools": []} - applied = True - else: - applied = False - elif action != "allow": - logger.debug( - "harness observe decision: action=%s reason=%s model=%s target=%s", - action, - reason, - model, - target_model, - ) - applied = False - - return kwargs, model, action, reason, target_model, applied - - -def _update_context( - ctx: Any, 
- model: str, - prompt_tokens: int, - completion_tokens: int, - tool_call_count: int, - elapsed_ms: float, - *, - action: str = "allow", - action_reason: str | None = None, - action_model: str | None = None, - applied: bool | None = None, - decision_mode: str | None = None, -) -> None: - """Update a HarnessRunContext with call metrics.""" - cost = _estimate_cost(model, prompt_tokens, completion_tokens) - energy = _estimate_energy(model, prompt_tokens, completion_tokens) - - ctx.cost += cost - ctx.step_count += 1 - ctx.latency_used_ms += elapsed_ms - ctx.energy_used += energy - ctx.tool_calls += tool_call_count - - if ctx.budget_max is not None: - ctx.budget_remaining = ctx.budget_max - ctx.cost - - if applied is None: - applied = action == "allow" - if decision_mode is None: - decision_mode = ctx.mode - - if action == "allow": - ctx.record( - action="allow", - reason=ctx.mode, - model=model, - applied=applied, - decision_mode=decision_mode, - ) - return - ctx.record( - action=action, - reason=action_reason or ctx.mode, - model=action_model or model, - applied=applied, - decision_mode=decision_mode, - ) - - -# --------------------------------------------------------------------------- -# Stream wrappers -# --------------------------------------------------------------------------- - - -class _InstrumentedStreamBase: - """Shared stream-wrapper logic for sync and async OpenAI streams.""" - - __slots__ = ( - "_stream", - "_ctx", - "_model", - "_start_time", - "_pre_action", - "_pre_reason", - "_pre_model", - "_pre_applied", - "_decision_mode", - "_usage", - "_tool_call_count", - "_finalized", - ) - - def __init__( - self, - stream: Any, - ctx: Any, - model: str, - start_time: float, - pre_action: str = "allow", - pre_reason: str = "observe", - pre_model: str | None = None, - pre_applied: bool = True, - decision_mode: str = "observe", - ) -> None: - self._stream = stream - self._ctx = ctx - self._model = model - self._start_time = start_time - self._pre_action = 
pre_action - self._pre_reason = pre_reason - self._pre_model = pre_model or model - self._pre_applied = pre_applied - self._decision_mode = decision_mode - self._usage: Any = None - self._tool_call_count: int = 0 - self._finalized: bool = False - - def close(self) -> None: - self._finalize() - if hasattr(self._stream, "close"): - self._stream.close() - - @property - def response(self) -> Any: - return getattr(self._stream, "response", None) - - def _inspect_chunk(self, chunk: Any) -> None: - usage = getattr(chunk, "usage", None) - if usage is not None: - self._usage = usage - - choices = getattr(chunk, "choices", []) - if choices: - delta = getattr(choices[0], "delta", None) - if delta: - tool_calls = getattr(delta, "tool_calls", None) - if tool_calls: - for tc in tool_calls: - # A new tool call has an ``id``; subsequent deltas for - # the same call only have ``index``. - if getattr(tc, "id", None): - self._tool_call_count += 1 - - def _finalize(self) -> None: - if self._finalized: - return - self._finalized = True - - if self._ctx is None: - return - - elapsed_ms = (time.monotonic() - self._start_time) * 1000 - prompt_tokens = 0 - completion_tokens = 0 - if self._usage: - prompt_tokens = getattr(self._usage, "prompt_tokens", 0) or 0 - completion_tokens = getattr(self._usage, "completion_tokens", 0) or 0 - - _update_context( - self._ctx, - self._model, - prompt_tokens, - completion_tokens, - self._tool_call_count, - elapsed_ms, - action=self._pre_action, - action_reason=self._pre_reason, - action_model=self._pre_model, - applied=self._pre_applied, - decision_mode=self._decision_mode, - ) - - -class _InstrumentedStream(_InstrumentedStreamBase): - """Wraps an OpenAI sync ``Stream`` and tracks usage at stream end.""" - - __slots__ = () - - def __iter__(self) -> _InstrumentedStream: - return self - - def __next__(self) -> Any: - try: - chunk = next(self._stream) - self._inspect_chunk(chunk) - return chunk - except StopIteration: - self._finalize() - raise - - def 
__enter__(self) -> _InstrumentedStream: - if hasattr(self._stream, "__enter__"): - self._stream.__enter__() - return self - - def __exit__(self, *args: Any) -> bool: - self._finalize() - if hasattr(self._stream, "__exit__"): - return self._stream.__exit__(*args) # type: ignore[no-any-return] - return False - - -class _InstrumentedAsyncStream(_InstrumentedStreamBase): - """Wraps an OpenAI async ``AsyncStream`` and tracks usage at stream end.""" - - __slots__ = () - - def __aiter__(self) -> _InstrumentedAsyncStream: - return self - - async def __anext__(self) -> Any: - try: - chunk = await self._stream.__anext__() - self._inspect_chunk(chunk) - return chunk - except StopAsyncIteration: - self._finalize() - raise - - async def __aenter__(self) -> _InstrumentedAsyncStream: - if hasattr(self._stream, "__aenter__"): - await self._stream.__aenter__() - return self - - async def __aexit__(self, *args: Any) -> bool: - self._finalize() - if hasattr(self._stream, "__aexit__"): - return await self._stream.__aexit__(*args) # type: ignore[no-any-return] - return False - - -# --------------------------------------------------------------------------- -# Wrapper factories -# --------------------------------------------------------------------------- - - -@dataclass(frozen=True) -class _CallInterceptionState: - kwargs: dict[str, Any] - model: str - pre_action: str - pre_reason: str - pre_model: str - pre_applied: bool - is_stream: bool - start_time: float - - -def _prepare_call_interception( - *, - ctx: Any, - mode: str, - kwargs: dict[str, Any], -) -> _CallInterceptionState: - model: str = kwargs.get("model", "unknown") - pre_action = "allow" - pre_reason = mode - pre_model = model - pre_applied = True - - if ctx: - kwargs, model, pre_action, pre_reason, pre_model, pre_applied = _resolve_pre_call_decision( - ctx, - mode, - model, - kwargs, - ) - - is_stream: bool = bool(kwargs.get("stream", False)) - kwargs = _ensure_stream_usage(kwargs) - - return _CallInterceptionState( - 
kwargs=kwargs, - model=model, - pre_action=pre_action, - pre_reason=pre_reason, - pre_model=pre_model, - pre_applied=pre_applied, - is_stream=is_stream, - start_time=time.monotonic(), - ) - - -def _finalize_interception( - *, - ctx: Any, - mode: str, - state: _CallInterceptionState, - response: Any, - stream_wrapper: type[_InstrumentedStream] | type[_InstrumentedAsyncStream], -) -> Any: - if state.is_stream and ctx: - return stream_wrapper( - response, - ctx, - state.model, - state.start_time, - state.pre_action, - state.pre_reason, - state.pre_model, - state.pre_applied, - mode, - ) - - if (not state.is_stream) and ctx: - elapsed_ms = (time.monotonic() - state.start_time) * 1000 - prompt_tokens, completion_tokens = _extract_usage(response) - tool_call_count = _count_tool_calls_in_response(response) - _update_context( - ctx, - state.model, - prompt_tokens, - completion_tokens, - tool_call_count, - elapsed_ms, - action=state.pre_action, - action_reason=state.pre_reason, - action_model=state.pre_model, - applied=state.pre_applied, - decision_mode=mode, - ) - else: - logger.debug( - "harness %s: model=%s (no active run scope, metrics not tracked)", - mode, - state.model, - ) - - return response - - -def _make_patched_create(original_fn: Any) -> Any: - """Create a patched version of ``Completions.create``.""" - - @functools.wraps(original_fn) - def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: - from cascadeflow.harness.api import get_current_run, get_harness_config - - config = get_harness_config() - ctx = get_current_run() - mode = ctx.mode if ctx else config.mode - - if mode == "off": - return original_fn(self, *args, **kwargs) - - state = _prepare_call_interception(ctx=ctx, mode=mode, kwargs=kwargs) - - logger.debug( - "harness intercept: model=%s stream=%s mode=%s", - state.model, - state.is_stream, - mode, - ) - - response = original_fn(self, *args, **state.kwargs) - - return _finalize_interception( - ctx=ctx, - mode=mode, - state=state, - 
response=response, - stream_wrapper=_InstrumentedStream, - ) - - return wrapper - - -def _make_patched_async_create(original_fn: Any) -> Any: - """Create a patched version of ``AsyncCompletions.create``.""" - - @functools.wraps(original_fn) - async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: - from cascadeflow.harness.api import get_current_run, get_harness_config - - config = get_harness_config() - ctx = get_current_run() - mode = ctx.mode if ctx else config.mode - - if mode == "off": - return await original_fn(self, *args, **kwargs) - - state = _prepare_call_interception(ctx=ctx, mode=mode, kwargs=kwargs) - - logger.debug( - "harness intercept async: model=%s stream=%s mode=%s", - state.model, - state.is_stream, - mode, - ) - - response = await original_fn(self, *args, **state.kwargs) - - return _finalize_interception( - ctx=ctx, - mode=mode, - state=state, - response=response, - stream_wrapper=_InstrumentedAsyncStream, - ) - - return wrapper - - -# --------------------------------------------------------------------------- -# Public API (called by cascadeflow.harness.api) -# --------------------------------------------------------------------------- +logger = logging.getLogger("cascadeflow.harness") def patch_openai() -> bool: - """Patch the OpenAI Python client for harness instrumentation. - - Returns ``True`` if patching succeeded, ``False`` if openai is not - installed. Idempotent: safe to call multiple times. """ - global _openai_patched, _original_sync_create, _original_async_create - - if _openai_patched: - logger.debug("openai already patched, skipping") - return True - - try: - from openai.resources.chat.completions import AsyncCompletions, Completions - except ImportError: - logger.debug("openai package not available, skipping instrumentation") - return False + Placeholder for OpenAI SDK auto-instrumentation. 
- _original_sync_create = Completions.create - _original_async_create = AsyncCompletions.create - - Completions.create = _make_patched_create(_original_sync_create) # type: ignore[assignment] - AsyncCompletions.create = _make_patched_async_create( # type: ignore[assignment] - _original_async_create, - ) + Returns False in the core harness phase because patching is implemented in a + dedicated follow-up branch. + """ - _openai_patched = True - logger.info("openai client instrumented (sync + async)") - return True + logger.debug("openai instrumentation scaffold is not active in this branch") + return False def unpatch_openai() -> None: - """Restore original OpenAI client methods. - - Safe to call even if not patched. Used by ``reset()`` and tests. """ - global _openai_patched, _original_sync_create, _original_async_create - - if not _openai_patched: - return - - try: - from openai.resources.chat.completions import AsyncCompletions, Completions - except ImportError: - _openai_patched = False - return - - if _original_sync_create is not None: - Completions.create = _original_sync_create # type: ignore[assignment] - if _original_async_create is not None: - AsyncCompletions.create = _original_async_create # type: ignore[assignment] - - _original_sync_create = None - _original_async_create = None - _openai_patched = False - logger.info("openai client unpatched") - + Placeholder for removing OpenAI SDK instrumentation. 
+ """ -def is_patched() -> bool: - """Return whether the OpenAI client is currently patched.""" - return _openai_patched + return None diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 2d8ffcfc..43622fae 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -1,6 +1,9 @@ +import sys + import pytest import cascadeflow +import cascadeflow.harness.api as harness_api from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run @@ -26,6 +29,48 @@ def test_init_rejects_invalid_mode(): init(mode="invalid") # type: ignore[arg-type] +def test_init_idempotent_logs(monkeypatch, caplog): + monkeypatch.setattr(harness_api, "find_spec", lambda _: None) + with caplog.at_level("DEBUG", logger="cascadeflow.harness"): + init(mode="observe") + init(mode="observe") + assert any("idempotent" in rec.message for rec in caplog.records) + + +def test_env_aliases_and_false_bool(monkeypatch): + monkeypatch.setenv("CASCADEFLOW_MODE", "observe") + monkeypatch.setenv("CASCADEFLOW_BUDGET", "0.33") + monkeypatch.setenv("CASCADEFLOW_HARNESS_VERBOSE", "off") + monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_TOOL_CALLS", "4") + monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_LATENCY_MS", "1200") + monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_ENERGY", "0.01") + monkeypatch.setenv("CASCADEFLOW_HARNESS_COMPLIANCE", "gdpr") + + report = init() + cfg = get_harness_config() + + assert report.mode == "observe" + assert cfg.mode == "observe" + assert cfg.budget == 0.33 + assert cfg.verbose is False + assert cfg.max_tool_calls == 4 + assert cfg.max_latency_ms == 1200 + assert cfg.max_energy == 0.01 + assert cfg.compliance == "gdpr" + + +def test_init_invalid_json_env_raises(monkeypatch): + monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_WEIGHTS", "[1,2,3]") + with pytest.raises(ValueError): + init() + + +def test_init_non_numeric_env_raises(monkeypatch): + monkeypatch.setenv("CASCADEFLOW_HARNESS_BUDGET", "abc") + with pytest.raises(ValueError): + 
init() + + def test_run_uses_global_defaults_and_overrides(): init(mode="enforce", budget=2.0, max_tool_calls=5) @@ -41,6 +86,11 @@ def test_run_uses_global_defaults_and_overrides(): assert override_ctx.budget_remaining == 0.5 +def test_run_without_enter_exit_is_safe(): + ctx = run() + ctx.__exit__(None, None, None) + + @pytest.mark.asyncio async def test_nested_run_context_is_isolated(): init(mode="enforce", budget=1.0) @@ -58,6 +108,17 @@ async def test_nested_run_context_is_isolated(): assert get_current_run() is None +def test_sync_run_context_isolated(): + init(mode="enforce", budget=1.0) + with run(budget=0.6) as outer: + assert get_current_run() is outer + with run(budget=0.1) as inner: + assert get_current_run() is inner + assert inner.budget_max == 0.1 + assert get_current_run() is outer + assert get_current_run() is None + + def test_agent_decorator_keeps_sync_behavior_and_attaches_metadata(): @agent( budget=0.9, @@ -110,6 +171,7 @@ def test_run_record_and_trace_copy(): def test_init_reads_from_env(monkeypatch): monkeypatch.setenv("CASCADEFLOW_HARNESS_MODE", "observe") monkeypatch.setenv("CASCADEFLOW_HARNESS_BUDGET", "0.25") + monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_TARGETS", '{"quality_min": 0.9}') monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_WEIGHTS", '{"cost": 1.0}') report = init() @@ -118,6 +180,7 @@ def test_init_reads_from_env(monkeypatch): assert report.mode == "observe" assert cfg.mode == "observe" assert cfg.budget == 0.25 + assert cfg.kpi_targets == {"quality_min": 0.9} assert cfg.kpi_weights == {"cost": 1.0} assert report.config_sources["mode"] == "env" assert report.config_sources["budget"] == "env" @@ -141,6 +204,56 @@ def test_init_reads_from_config_file(tmp_path, monkeypatch): assert report.config_sources["budget"] == "file" +def test_init_reads_top_level_config_file_keys(tmp_path, monkeypatch): + config = tmp_path / "cascadeflow.json" + config.write_text('{"mode":"observe","budget":0.4,"max_tool_calls":2}') + 
monkeypatch.setenv("CASCADEFLOW_CONFIG", str(config)) + + report = init() + cfg = get_harness_config() + + assert cfg.mode == "observe" + assert cfg.budget == 0.4 + assert cfg.max_tool_calls == 2 + assert report.config_sources["mode"] == "file" + + +def test_init_non_dict_config_file_ignored(tmp_path, monkeypatch): + config = tmp_path / "cascadeflow.json" + config.write_text('["not-a-dict"]') + monkeypatch.setenv("CASCADEFLOW_CONFIG", str(config)) + + report = init() + cfg = get_harness_config() + + assert cfg.mode == "off" + assert cfg.budget is None + assert report.config_sources["mode"] == "default" + + +def test_init_file_loader_exception_falls_back_defaults(monkeypatch): + import cascadeflow.config_loader as cl + + monkeypatch.setattr(cl, "find_config", lambda: "broken.json") + + def _raise(_path): + raise RuntimeError("boom") + + monkeypatch.setattr(cl, "load_config", _raise) + + report = init() + cfg = get_harness_config() + assert cfg.mode == "off" + assert report.config_sources["mode"] == "default" + + +def test_init_config_loader_import_failure_falls_back(monkeypatch): + monkeypatch.setitem(sys.modules, "cascadeflow.config_loader", object()) + report = init(mode="observe") + assert report.mode == "observe" + assert report.config_sources["mode"] == "code" + + def test_precedence_code_over_env_over_file(tmp_path, monkeypatch): config = tmp_path / "cascadeflow.json" config.write_text('{"harness":{"mode":"off","budget":9.9}}') @@ -174,3 +287,24 @@ def test_reset_clears_state(): assert cfg.mode == "off" assert cfg.budget is None assert get_current_run() is None + + +def test_init_without_detected_sdks(monkeypatch): + monkeypatch.setattr(harness_api, "find_spec", lambda _: None) + report = init(mode="observe") + assert report.instrumented == [] + assert report.detected_but_not_instrumented == [] + + +def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch): + monkeypatch.setattr( + harness_api, + "find_spec", + lambda name: object() if name == 
"openai" else None, + ) + + import cascadeflow.harness.instrument as instrument + + monkeypatch.setattr(instrument, "patch_openai", lambda: True) + report = init(mode="observe") + assert report.instrumented == ["openai"] From dadd279a3f298414d827a58725bbe9b57919e351 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 25 Feb 2026 22:46:56 +0100 Subject: [PATCH 03/49] feat(harness): implement OpenAI Python client auto-instrumentation Replace the instrument.py scaffold with a full implementation that patches openai.resources.chat.completions.Completions.create (sync) and AsyncCompletions.create (async) for harness observe/enforce modes. Key capabilities: - Class-level patching of sync and async create methods - Streaming wrappers (_InstrumentedStream, _InstrumentedAsyncStream) that capture usage metrics after all chunks are consumed - Cost estimation from a built-in pricing table - Energy estimation using deterministic model coefficients - Tool call counting in both response and streaming chunks - Budget remaining tracking within scoped runs - Idempotent patching with clean unpatch/reset path Context tracking per call: - cost, step_count, latency_used_ms, energy_used, tool_calls - budget_remaining auto-updated when budget_max is set - model_used and decision trace via ctx.record() Added step_count, latency_used_ms, energy_used fields to HarnessRunContext in api.py. Hooked patch_openai into init() and unpatch_openai into reset(). 39 new tests covering: patch lifecycle, sync/async wrappers, sync/async stream wrappers, cost/energy estimation, nested run isolation, and edge cases (no usage, no choices, missing chunks). All 63 harness tests pass (39 instrument + 24 api). 
--- cascadeflow/harness/instrument.py | 529 +++++++++++++++++++- docs/strategy/agent-intelligence-v2-plan.md | 2 +- tests/test_harness_instrument.py | 378 -------------- 3 files changed, 520 insertions(+), 389 deletions(-) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index ad12bbdf..d0ac4187 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -1,25 +1,534 @@ +"""OpenAI Python client auto-instrumentation for cascadeflow harness. + +Patches ``openai.resources.chat.completions.Completions.create`` (sync) and +``AsyncCompletions.create`` (async) to intercept LLM calls for observe/enforce +modes. + +This module is called internally by ``cascadeflow.harness.init()``. Users +should not call ``patch_openai`` / ``unpatch_openai`` directly. + +Implementation notes: + - Patching is class-level (all current and future client instances). + - Patching is idempotent (safe to call multiple times). + - ``unpatch_openai()`` restores the original methods exactly. + - Streaming responses are wrapped to capture usage after completion. + - ``with_raw_response`` is NOT patched in V2 (known limitation). 
+""" + from __future__ import annotations +import functools import logging +import time +from typing import Any + +logger = logging.getLogger("cascadeflow.harness.instrument") + +# --------------------------------------------------------------------------- +# Module-level state for idempotent patch/unpatch +# --------------------------------------------------------------------------- + +_openai_patched: bool = False +_original_sync_create: Any = None +_original_async_create: Any = None + +# --------------------------------------------------------------------------- +# Pricing table (USD per 1M tokens: input, output) +# --------------------------------------------------------------------------- + +_PRICING: dict[str, tuple[float, float]] = { + "gpt-4o": (2.50, 10.00), + "gpt-4o-mini": (0.15, 0.60), + "gpt-5-mini": (0.20, 0.80), + "gpt-4-turbo": (10.00, 30.00), + "gpt-4": (30.00, 60.00), + "gpt-3.5-turbo": (0.50, 1.50), + "o1": (15.00, 60.00), + "o1-mini": (3.00, 12.00), + "o3-mini": (1.10, 4.40), +} +_DEFAULT_PRICING: tuple[float, float] = (2.50, 10.00) + +# --------------------------------------------------------------------------- +# Energy estimation coefficients (deterministic proxy, not live carbon data) +# energy_units = coefficient * (input_tokens + output_tokens * output_weight) +# --------------------------------------------------------------------------- + +_ENERGY_COEFFICIENTS: dict[str, float] = { + "gpt-4o": 1.0, + "gpt-4o-mini": 0.3, + "gpt-5-mini": 0.35, + "gpt-4-turbo": 1.5, + "gpt-4": 1.5, + "gpt-3.5-turbo": 0.2, + "o1": 2.0, + "o1-mini": 0.8, + "o3-mini": 0.5, +} +_DEFAULT_ENERGY_COEFFICIENT: float = 1.0 +_ENERGY_OUTPUT_WEIGHT: float = 1.5 + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: + """Estimate cost in USD from model name and token 
counts.""" + per_million = _PRICING.get(model, _DEFAULT_PRICING) + input_cost = (prompt_tokens / 1_000_000) * per_million[0] + output_cost = (completion_tokens / 1_000_000) * per_million[1] + return input_cost + output_cost + + +def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: + """Estimate energy units (deterministic proxy, not live carbon).""" + coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) + return coeff * (prompt_tokens + completion_tokens * _ENERGY_OUTPUT_WEIGHT) + + +def _count_tool_calls_in_response(response: Any) -> int: + """Count tool calls in a non-streaming ChatCompletion response.""" + choices = getattr(response, "choices", None) + if not choices: + return 0 + message = getattr(choices[0], "message", None) + if message is None: + return 0 + tool_calls = getattr(message, "tool_calls", None) + if tool_calls is None: + return 0 + return len(tool_calls) + + +def _extract_usage(response: Any) -> tuple[int, int]: + """Extract (prompt_tokens, completion_tokens) from a response.""" + usage = getattr(response, "usage", None) + if usage is None: + return 0, 0 + return ( + getattr(usage, "prompt_tokens", 0) or 0, + getattr(usage, "completion_tokens", 0) or 0, + ) + + +def _update_context( + ctx: Any, + model: str, + prompt_tokens: int, + completion_tokens: int, + tool_call_count: int, + elapsed_ms: float, +) -> None: + """Update a HarnessRunContext with call metrics.""" + cost = _estimate_cost(model, prompt_tokens, completion_tokens) + energy = _estimate_energy(model, prompt_tokens, completion_tokens) + + ctx.cost += cost + ctx.step_count += 1 + ctx.latency_used_ms += elapsed_ms + ctx.energy_used += energy + ctx.tool_calls += tool_call_count + + if ctx.budget_max is not None: + ctx.budget_remaining = ctx.budget_max - ctx.cost + + ctx.model_used = model + ctx.record(action="allow", reason="observe", model=model) + + +# --------------------------------------------------------------------------- +# Stream 
wrappers +# --------------------------------------------------------------------------- + + +class _InstrumentedStream: + """Wraps an OpenAI ``Stream`` to capture usage after all chunks are consumed.""" + + __slots__ = ( + "_stream", + "_ctx", + "_model", + "_start_time", + "_usage", + "_tool_call_count", + "_finalized", + ) + + def __init__( + self, + stream: Any, + ctx: Any, + model: str, + start_time: float, + ) -> None: + self._stream = stream + self._ctx = ctx + self._model = model + self._start_time = start_time + self._usage: Any = None + self._tool_call_count: int = 0 + self._finalized: bool = False + + # --- iteration --------------------------------------------------------- + + def __iter__(self) -> _InstrumentedStream: + return self + + def __next__(self) -> Any: + try: + chunk = next(self._stream) + self._inspect_chunk(chunk) + return chunk + except StopIteration: + self._finalize() + raise + + # --- context manager --------------------------------------------------- + + def __enter__(self) -> _InstrumentedStream: + if hasattr(self._stream, "__enter__"): + self._stream.__enter__() + return self + + def __exit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__exit__"): + return self._stream.__exit__(*args) # type: ignore[no-any-return] + return False + + # --- proxied attributes ------------------------------------------------ + + def close(self) -> None: + self._finalize() + if hasattr(self._stream, "close"): + self._stream.close() + + @property + def response(self) -> Any: + return getattr(self._stream, "response", None) + + # --- internals --------------------------------------------------------- + + def _inspect_chunk(self, chunk: Any) -> None: + usage = getattr(chunk, "usage", None) + if usage is not None: + self._usage = usage + + choices = getattr(chunk, "choices", []) + if choices: + delta = getattr(choices[0], "delta", None) + if delta: + tool_calls = getattr(delta, "tool_calls", None) + if tool_calls: + for tc in 
tool_calls: + # A new tool call has an ``id``; subsequent deltas + # for the same call only have ``index``. + if getattr(tc, "id", None): + self._tool_call_count += 1 + + def _finalize(self) -> None: + if self._finalized: + return + self._finalized = True + + if self._ctx is None: + return + + elapsed_ms = (time.monotonic() - self._start_time) * 1000 + prompt_tokens = 0 + completion_tokens = 0 + if self._usage: + prompt_tokens = getattr(self._usage, "prompt_tokens", 0) or 0 + completion_tokens = getattr(self._usage, "completion_tokens", 0) or 0 + + _update_context( + self._ctx, + self._model, + prompt_tokens, + completion_tokens, + self._tool_call_count, + elapsed_ms, + ) + + +class _InstrumentedAsyncStream: + """Wraps an OpenAI ``AsyncStream`` to capture usage after consumption.""" + + __slots__ = ( + "_stream", + "_ctx", + "_model", + "_start_time", + "_usage", + "_tool_call_count", + "_finalized", + ) + + def __init__( + self, + stream: Any, + ctx: Any, + model: str, + start_time: float, + ) -> None: + self._stream = stream + self._ctx = ctx + self._model = model + self._start_time = start_time + self._usage: Any = None + self._tool_call_count: int = 0 + self._finalized: bool = False + + # --- async iteration --------------------------------------------------- -logger = logging.getLogger("cascadeflow.harness") + def __aiter__(self) -> _InstrumentedAsyncStream: + return self + + async def __anext__(self) -> Any: + try: + chunk = await self._stream.__anext__() + self._inspect_chunk(chunk) + return chunk + except StopAsyncIteration: + self._finalize() + raise + + # --- async context manager --------------------------------------------- + + async def __aenter__(self) -> _InstrumentedAsyncStream: + if hasattr(self._stream, "__aenter__"): + await self._stream.__aenter__() + return self + + async def __aexit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__aexit__"): + return await self._stream.__aexit__(*args) # type: ignore[no-any-return] 
+ return False + + # --- proxied attributes ------------------------------------------------ + + def close(self) -> None: + self._finalize() + if hasattr(self._stream, "close"): + self._stream.close() + + @property + def response(self) -> Any: + return getattr(self._stream, "response", None) + + # --- internals --------------------------------------------------------- + + def _inspect_chunk(self, chunk: Any) -> None: + usage = getattr(chunk, "usage", None) + if usage is not None: + self._usage = usage + + choices = getattr(chunk, "choices", []) + if choices: + delta = getattr(choices[0], "delta", None) + if delta: + tool_calls = getattr(delta, "tool_calls", None) + if tool_calls: + for tc in tool_calls: + if getattr(tc, "id", None): + self._tool_call_count += 1 + + def _finalize(self) -> None: + if self._finalized: + return + self._finalized = True + + if self._ctx is None: + return + + elapsed_ms = (time.monotonic() - self._start_time) * 1000 + prompt_tokens = 0 + completion_tokens = 0 + if self._usage: + prompt_tokens = getattr(self._usage, "prompt_tokens", 0) or 0 + completion_tokens = getattr(self._usage, "completion_tokens", 0) or 0 + + _update_context( + self._ctx, + self._model, + prompt_tokens, + completion_tokens, + self._tool_call_count, + elapsed_ms, + ) + + +# --------------------------------------------------------------------------- +# Wrapper factories +# --------------------------------------------------------------------------- + + +def _make_patched_create(original_fn: Any) -> Any: + """Create a patched version of ``Completions.create``.""" + + @functools.wraps(original_fn) + def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + is_stream: bool = 
bool(kwargs.get("stream", False)) + start_time = time.monotonic() + + logger.debug("harness intercept: model=%s stream=%s mode=%s", model, is_stream, mode) + + response = original_fn(self, *args, **kwargs) + + if is_stream and ctx: + return _InstrumentedStream(response, ctx, model, start_time) + elif not is_stream and ctx: + elapsed_ms = (time.monotonic() - start_time) * 1000 + prompt_tokens, completion_tokens = _extract_usage(response) + tool_call_count = _count_tool_calls_in_response(response) + _update_context( + ctx, + model, + prompt_tokens, + completion_tokens, + tool_call_count, + elapsed_ms, + ) + else: + logger.debug( + "harness %s: model=%s (no active run scope, metrics not tracked)", + mode, + model, + ) + + return response + + return wrapper + + +def _make_patched_async_create(original_fn: Any) -> Any: + """Create a patched version of ``AsyncCompletions.create``.""" + + @functools.wraps(original_fn) + async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return await original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + is_stream: bool = bool(kwargs.get("stream", False)) + start_time = time.monotonic() + + logger.debug( + "harness intercept async: model=%s stream=%s mode=%s", + model, + is_stream, + mode, + ) + + response = await original_fn(self, *args, **kwargs) + + if is_stream and ctx: + return _InstrumentedAsyncStream(response, ctx, model, start_time) + elif not is_stream and ctx: + elapsed_ms = (time.monotonic() - start_time) * 1000 + prompt_tokens, completion_tokens = _extract_usage(response) + tool_call_count = _count_tool_calls_in_response(response) + _update_context( + ctx, + model, + prompt_tokens, + completion_tokens, + tool_call_count, + elapsed_ms, + ) + else: + logger.debug( + "harness %s: model=%s (no 
active run scope, metrics not tracked)", + mode, + model, + ) + + return response + + return wrapper + + +# --------------------------------------------------------------------------- +# Public API (called by cascadeflow.harness.api) +# --------------------------------------------------------------------------- def patch_openai() -> bool: - """ - Placeholder for OpenAI SDK auto-instrumentation. + """Patch the OpenAI Python client for harness instrumentation. - Returns False in the core harness phase because patching is implemented in a - dedicated follow-up branch. + Returns ``True`` if patching succeeded, ``False`` if openai is not + installed. Idempotent: safe to call multiple times. """ + global _openai_patched, _original_sync_create, _original_async_create + + if _openai_patched: + logger.debug("openai already patched, skipping") + return True + + try: + from openai.resources.chat.completions import AsyncCompletions, Completions + except ImportError: + logger.debug("openai package not available, skipping instrumentation") + return False - logger.debug("openai instrumentation scaffold is not active in this branch") - return False + _original_sync_create = Completions.create + _original_async_create = AsyncCompletions.create + + Completions.create = _make_patched_create(_original_sync_create) # type: ignore[assignment] + AsyncCompletions.create = _make_patched_async_create( # type: ignore[assignment] + _original_async_create, + ) + + _openai_patched = True + logger.info("openai client instrumented (sync + async)") + return True def unpatch_openai() -> None: + """Restore original OpenAI client methods. + + Safe to call even if not patched. Used by ``reset()`` and tests. """ - Placeholder for removing OpenAI SDK instrumentation. 
- """ + global _openai_patched, _original_sync_create, _original_async_create + + if not _openai_patched: + return + + try: + from openai.resources.chat.completions import AsyncCompletions, Completions + except ImportError: + _openai_patched = False + return + + if _original_sync_create is not None: + Completions.create = _original_sync_create # type: ignore[assignment] + if _original_async_create is not None: + AsyncCompletions.create = _original_async_create # type: ignore[assignment] + + _original_sync_create = None + _original_async_create = None + _openai_patched = False + logger.info("openai client unpatched") + - return None +def is_patched() -> bool: + """Return whether the OpenAI client is currently patched.""" + return _openai_patched diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 787bab32..d17d3df5 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -843,7 +843,7 @@ Branching model: Claim checklist (one owner per branch at a time): - [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` -- [ ] `feat/v2-openai-auto-instrumentation` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-enforce-actions` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-openai-agents-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 75368522..c2092e46 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -8,8 +8,6 @@ import pytest -pytest.importorskip("openai", reason="openai package required for instrumentation 
tests") - from cascadeflow.harness import init, reset, run from cascadeflow.harness.instrument import ( _InstrumentedAsyncStream, @@ -241,8 +239,6 @@ def test_model_used_and_trace(self) -> None: assert trace[0]["action"] == "allow" assert trace[0]["reason"] == "observe" assert trace[0]["model"] == "gpt-4o" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "observe" def test_off_mode_passthrough_no_tracking(self) -> None: init(mode="off") @@ -567,377 +563,3 @@ def test_stream_without_usage_in_any_chunk(self) -> None: assert ctx.cost == 0.0 # No usage data available assert ctx.step_count == 1 # Step still counted - - -# --------------------------------------------------------------------------- -# Fix: init(mode="off") unpatches previously patched client -# --------------------------------------------------------------------------- - - -class TestInitOffUnpatches: - def test_init_off_after_observe_unpatches(self) -> None: - init(mode="observe") - assert is_patched() - init(mode="off") - assert not is_patched() - - def test_init_off_when_not_patched_is_safe(self) -> None: - init(mode="off") - assert not is_patched() - - -# --------------------------------------------------------------------------- -# Fix: enforce mode — budget gate and correct trace reason -# --------------------------------------------------------------------------- - - -class TestEnforceMode: - def test_enforce_trace_records_enforce_reason(self) -> None: - init(mode="enforce") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=10.0) as ctx: - wrapper(MagicMock(), model="gpt-4o") - - trace = ctx.trace() - assert trace[0]["reason"] == "enforce" - - def test_observe_trace_records_observe_reason(self) -> None: - init(mode="observe") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=10.0) as ctx: - 
wrapper(MagicMock(), model="gpt-4o") - - trace = ctx.trace() - assert trace[0]["reason"] == "observe" - - def test_enforce_raises_on_budget_exhausted(self) -> None: - from cascadeflow.schema.exceptions import BudgetExceededError - - init(mode="enforce") - mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=0.001) as ctx: - # First call uses the tiny budget - wrapper(MagicMock(), model="gpt-4o") - # Second call should raise — budget exhausted - with pytest.raises(BudgetExceededError): - wrapper(MagicMock(), model="gpt-4o") - - def test_observe_does_not_raise_on_budget_exhausted(self) -> None: - init(mode="observe") - mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=0.001) as ctx: - wrapper(MagicMock(), model="gpt-4o") - # Second call should NOT raise — observe mode is permissive - wrapper(MagicMock(), model="gpt-4o") - - assert ctx.cost > ctx.budget_max # type: ignore[operator] - trace = ctx.trace() - assert trace[-1]["action"] == "stop" - assert trace[-1]["reason"] == "budget_exceeded" - assert trace[-1]["applied"] is False - assert trace[-1]["decision_mode"] == "observe" - - @pytest.mark.asyncio - async def test_enforce_raises_on_budget_exhausted_async(self) -> None: - from cascadeflow.schema.exceptions import BudgetExceededError - - init(mode="enforce") - mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) - original = AsyncMock(return_value=mock_resp) - wrapper = _make_patched_async_create(original) - - async with run(budget=0.001) as ctx: - await wrapper(MagicMock(), model="gpt-4o") - with pytest.raises(BudgetExceededError): - await wrapper(MagicMock(), model="gpt-4o") - - -# --------------------------------------------------------------------------- 
-# Enforce actions: switch_model, deny_tool, stop -# --------------------------------------------------------------------------- - - -class TestEnforceActions: - def test_enforce_switches_model_under_budget_pressure(self) -> None: - init(mode="enforce") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=1.0) as ctx: - ctx.cost = 0.85 - ctx.budget_remaining = 0.15 - wrapper(MagicMock(), model="gpt-4o") - - assert original.call_args[1]["model"] == "gpt-4o-mini" - trace = ctx.trace() - assert trace[0]["action"] == "switch_model" - assert trace[0]["reason"] == "budget_pressure" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - def test_observe_computes_switch_model_but_does_not_apply(self) -> None: - init(mode="observe") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=1.0) as ctx: - ctx.cost = 0.85 - ctx.budget_remaining = 0.15 - wrapper(MagicMock(), model="gpt-4o") - - assert original.call_args[1]["model"] == "gpt-4o" - trace = ctx.trace() - assert trace[0]["action"] == "switch_model" - assert trace[0]["reason"] == "budget_pressure" - assert trace[0]["model"] == "gpt-4o-mini" - assert trace[0]["applied"] is False - assert trace[0]["decision_mode"] == "observe" - - def test_enforce_denies_tools_when_cap_reached(self) -> None: - init(mode="enforce", max_tool_calls=0) - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(max_tool_calls=0) as ctx: - wrapper( - MagicMock(), - model="gpt-4o", - tools=[{"type": "function", "function": {"name": "t1"}}], - ) - - assert original.call_args[1]["tools"] == [] - trace = ctx.trace() - assert trace[0]["action"] == "deny_tool" - assert trace[0]["reason"] == "max_tool_calls_reached" - assert trace[0]["applied"] is True - assert 
trace[0]["decision_mode"] == "enforce" - - def test_observe_logs_deny_tool_but_keeps_tools(self) -> None: - init(mode="observe", max_tool_calls=0) - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - tools = [{"type": "function", "function": {"name": "t1"}}] - with run(max_tool_calls=0) as ctx: - wrapper(MagicMock(), model="gpt-4o", tools=tools) - - assert original.call_args[1]["tools"] == tools - trace = ctx.trace() - assert trace[0]["action"] == "deny_tool" - assert trace[0]["reason"] == "max_tool_calls_reached" - assert trace[0]["applied"] is False - assert trace[0]["decision_mode"] == "observe" - - def test_enforce_stops_when_latency_limit_exceeded_at_fastest_model(self) -> None: - from cascadeflow.schema.exceptions import HarnessStopError - - init(mode="enforce") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(max_latency_ms=1.0) as ctx: - ctx.latency_used_ms = 5.0 - with pytest.raises(HarnessStopError, match="latency_limit_exceeded"): - wrapper(MagicMock(), model="gpt-3.5-turbo") - - original.assert_not_called() - trace = ctx.trace() - assert trace[0]["action"] == "stop" - assert trace[0]["reason"] == "latency_limit_exceeded" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - def test_enforce_stops_when_energy_limit_exceeded_at_lowest_energy_model(self) -> None: - from cascadeflow.schema.exceptions import HarnessStopError - - init(mode="enforce") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(max_energy=1.0) as ctx: - ctx.energy_used = 5.0 - with pytest.raises(HarnessStopError, match="energy_limit_exceeded"): - wrapper(MagicMock(), model="gpt-3.5-turbo") - - original.assert_not_called() - trace = ctx.trace() - assert trace[0]["action"] == "stop" - assert trace[0]["reason"] == 
"energy_limit_exceeded" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - @pytest.mark.asyncio - async def test_async_enforce_denies_tools_when_cap_reached(self) -> None: - init(mode="enforce", max_tool_calls=0) - mock_resp = _mock_completion() - original = AsyncMock(return_value=mock_resp) - wrapper = _make_patched_async_create(original) - - async with run(max_tool_calls=0) as ctx: - await wrapper( - MagicMock(), - model="gpt-4o", - tools=[{"type": "function", "function": {"name": "t1"}}], - ) - - assert original.call_args[1]["tools"] == [] - trace = ctx.trace() - assert trace[0]["action"] == "deny_tool" - assert trace[0]["reason"] == "max_tool_calls_reached" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - def test_enforce_switches_model_for_compliance_policy(self) -> None: - init(mode="enforce", compliance="strict") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run() as ctx: - wrapper(MagicMock(), model="gpt-4o-mini") - - assert original.call_args[1]["model"] == "gpt-4o" - trace = ctx.trace() - assert trace[0]["action"] == "switch_model" - assert trace[0]["reason"] == "compliance_model_policy" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - def test_enforce_denies_tool_for_strict_compliance(self) -> None: - init(mode="enforce", compliance="strict") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run() as ctx: - wrapper( - MagicMock(), - model="gpt-4o", - tools=[{"type": "function", "function": {"name": "t1"}}], - ) - - assert original.call_args[1]["tools"] == [] - trace = ctx.trace() - assert trace[0]["action"] == "deny_tool" - assert trace[0]["reason"] == "compliance_tool_restriction" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - def 
test_observe_logs_compliance_switch_without_applying(self) -> None: - init(mode="observe", compliance="strict") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run() as ctx: - wrapper(MagicMock(), model="gpt-4o-mini") - - assert original.call_args[1]["model"] == "gpt-4o-mini" - trace = ctx.trace() - assert trace[0]["action"] == "switch_model" - assert trace[0]["reason"] == "compliance_model_policy" - assert trace[0]["model"] == "gpt-4o" - assert trace[0]["applied"] is False - assert trace[0]["decision_mode"] == "observe" - - def test_enforce_switches_model_using_kpi_weights(self) -> None: - init(mode="enforce", kpi_weights={"quality": 1.0}) - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run() as ctx: - wrapper(MagicMock(), model="gpt-3.5-turbo") - - assert original.call_args[1]["model"] == "o1" - trace = ctx.trace() - assert trace[0]["action"] == "switch_model" - assert trace[0]["reason"] == "kpi_weight_optimization" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - def test_observe_logs_kpi_switch_without_applying(self) -> None: - init(mode="observe", kpi_weights={"quality": 1.0}) - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run() as ctx: - wrapper(MagicMock(), model="gpt-3.5-turbo") - - assert original.call_args[1]["model"] == "gpt-3.5-turbo" - trace = ctx.trace() - assert trace[0]["action"] == "switch_model" - assert trace[0]["reason"] == "kpi_weight_optimization" - assert trace[0]["model"] == "o1" - assert trace[0]["applied"] is False - assert trace[0]["decision_mode"] == "observe" - - -# --------------------------------------------------------------------------- -# Fix: stream_options.include_usage auto-injection -# 
--------------------------------------------------------------------------- - - -class TestStreamUsageInjection: - def test_stream_injects_include_usage(self) -> None: - init(mode="observe") - mock_stream = iter([_mock_stream_chunk("hi", usage=_mock_usage(50, 25))]) - original = MagicMock(return_value=mock_stream) - wrapper = _make_patched_create(original) - - with run(budget=1.0) as ctx: - result = wrapper(MagicMock(), model="gpt-4o-mini", stream=True) - list(result) - - # Check the original was called with stream_options injected - call_kwargs = original.call_args[1] - assert call_kwargs.get("stream_options", {}).get("include_usage") is True - - def test_stream_preserves_existing_stream_options(self) -> None: - init(mode="observe") - mock_stream = iter([_mock_stream_chunk("hi", usage=_mock_usage(50, 25))]) - original = MagicMock(return_value=mock_stream) - wrapper = _make_patched_create(original) - - with run(budget=1.0) as ctx: - result = wrapper( - MagicMock(), - model="gpt-4o-mini", - stream=True, - stream_options={"include_usage": True}, - ) - list(result) - - call_kwargs = original.call_args[1] - assert call_kwargs["stream_options"]["include_usage"] is True - - def test_non_stream_does_not_inject_stream_options(self) -> None: - init(mode="observe") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=1.0) as ctx: - wrapper(MagicMock(), model="gpt-4o-mini") - - call_kwargs = original.call_args[1] - assert "stream_options" not in call_kwargs From 75ff333ba6bb8afedcc879045e4290a66537d2db Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 07:21:47 +0100 Subject: [PATCH 04/49] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94?= =?UTF-8?q?=20off-mode=20unpatch,=20enforce=20budget=20gate,=20stream=20us?= =?UTF-8?q?age=20injection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - init(mode="off") now calls 
unpatch_openai() if previously patched - Trace records actual mode (observe/enforce) instead of always "observe" - Enforce mode raises BudgetExceededError pre-call when budget exhausted - Auto-inject stream_options.include_usage=True for streaming requests - Add pytest.importorskip("openai") for graceful skip when not installed - 10 new tests covering all four fixes (73 total pass) --- cascadeflow/harness/api.py | 5 ++ cascadeflow/harness/instrument.py | 44 ++++++++- tests/test_harness_instrument.py | 143 ++++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+), 1 deletion(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 10d0e29a..88c9c579 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -343,6 +343,11 @@ def init( if patch_openai(): instrumented.append("openai") + elif validated_mode == "off": + from cascadeflow.harness.instrument import is_patched, unpatch_openai + + if is_patched(): + unpatch_openai() if sdk_presence["anthropic"]: detected_but_not_instrumented.append("anthropic") diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index d0ac4187..c02200f7 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -73,6 +73,22 @@ # --------------------------------------------------------------------------- +def _ensure_stream_usage(kwargs: dict[str, Any]) -> dict[str, Any]: + """Inject ``stream_options.include_usage=True`` for streaming requests. + + OpenAI only sends usage data in the final stream chunk when this option + is set. Without it the harness would record zero cost for every + streaming call. 
+ """ + if not kwargs.get("stream", False): + return kwargs + stream_options = kwargs.get("stream_options") or {} + if not stream_options.get("include_usage"): + stream_options = {**stream_options, "include_usage": True} + kwargs = {**kwargs, "stream_options": stream_options} + return kwargs + + def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: """Estimate cost in USD from model name and token counts.""" per_million = _PRICING.get(model, _DEFAULT_PRICING) @@ -112,6 +128,20 @@ def _extract_usage(response: Any) -> tuple[int, int]: ) +def _check_budget_pre_call(ctx: Any) -> None: + """Raise BudgetExceededError in enforce mode if budget is already exhausted.""" + if ctx.mode != "enforce": + return + if ctx.budget_max is not None and ctx.cost >= ctx.budget_max: + from cascadeflow.schema.exceptions import BudgetExceededError + + remaining = ctx.budget_max - ctx.cost + raise BudgetExceededError( + f"Budget exhausted: spent ${ctx.cost:.4f} of ${ctx.budget_max:.4f} max", + remaining=remaining, + ) + + def _update_context( ctx: Any, model: str, @@ -134,7 +164,7 @@ def _update_context( ctx.budget_remaining = ctx.budget_max - ctx.cost ctx.model_used = model - ctx.record(action="allow", reason="observe", model=model) + ctx.record(action="allow", reason=ctx.mode, model=model) # --------------------------------------------------------------------------- @@ -381,8 +411,14 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: model: str = kwargs.get("model", "unknown") is_stream: bool = bool(kwargs.get("stream", False)) + + if ctx: + _check_budget_pre_call(ctx) + start_time = time.monotonic() + kwargs = _ensure_stream_usage(kwargs) + logger.debug("harness intercept: model=%s stream=%s mode=%s", model, is_stream, mode) response = original_fn(self, *args, **kwargs) @@ -429,8 +465,14 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: model: str = kwargs.get("model", "unknown") is_stream: bool = bool(kwargs.get("stream", False)) 
+ + if ctx: + _check_budget_pre_call(ctx) + start_time = time.monotonic() + kwargs = _ensure_stream_usage(kwargs) + logger.debug( "harness intercept async: model=%s stream=%s mode=%s", model, diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index c2092e46..12f0f938 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -8,6 +8,8 @@ import pytest +pytest.importorskip("openai", reason="openai package required for instrumentation tests") + from cascadeflow.harness import init, reset, run from cascadeflow.harness.instrument import ( _InstrumentedAsyncStream, @@ -563,3 +565,144 @@ def test_stream_without_usage_in_any_chunk(self) -> None: assert ctx.cost == 0.0 # No usage data available assert ctx.step_count == 1 # Step still counted + + +# --------------------------------------------------------------------------- +# Fix: init(mode="off") unpatches previously patched client +# --------------------------------------------------------------------------- + + +class TestInitOffUnpatches: + def test_init_off_after_observe_unpatches(self) -> None: + init(mode="observe") + assert is_patched() + init(mode="off") + assert not is_patched() + + def test_init_off_when_not_patched_is_safe(self) -> None: + init(mode="off") + assert not is_patched() + + +# --------------------------------------------------------------------------- +# Fix: enforce mode — budget gate and correct trace reason +# --------------------------------------------------------------------------- + + +class TestEnforceMode: + def test_enforce_trace_records_enforce_reason(self) -> None: + init(mode="enforce") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=10.0) as ctx: + wrapper(MagicMock(), model="gpt-4o") + + trace = ctx.trace() + assert trace[0]["reason"] == "enforce" + + def test_observe_trace_records_observe_reason(self) -> None: + init(mode="observe") 
+ mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=10.0) as ctx: + wrapper(MagicMock(), model="gpt-4o") + + trace = ctx.trace() + assert trace[0]["reason"] == "observe" + + def test_enforce_raises_on_budget_exhausted(self) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError + + init(mode="enforce") + mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=0.001) as ctx: + # First call uses the tiny budget + wrapper(MagicMock(), model="gpt-4o") + # Second call should raise — budget exhausted + with pytest.raises(BudgetExceededError): + wrapper(MagicMock(), model="gpt-4o") + + def test_observe_does_not_raise_on_budget_exhausted(self) -> None: + init(mode="observe") + mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=0.001) as ctx: + wrapper(MagicMock(), model="gpt-4o") + # Second call should NOT raise — observe mode is permissive + wrapper(MagicMock(), model="gpt-4o") + + assert ctx.cost > ctx.budget_max # type: ignore[operator] + + @pytest.mark.asyncio + async def test_enforce_raises_on_budget_exhausted_async(self) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError + + init(mode="enforce") + mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_async_create(original) + + async with run(budget=0.001) as ctx: + await wrapper(MagicMock(), model="gpt-4o") + with pytest.raises(BudgetExceededError): + await wrapper(MagicMock(), model="gpt-4o") + + +# --------------------------------------------------------------------------- +# Fix: stream_options.include_usage 
auto-injection +# --------------------------------------------------------------------------- + + +class TestStreamUsageInjection: + def test_stream_injects_include_usage(self) -> None: + init(mode="observe") + mock_stream = iter([_mock_stream_chunk("hi", usage=_mock_usage(50, 25))]) + original = MagicMock(return_value=mock_stream) + wrapper = _make_patched_create(original) + + with run(budget=1.0) as ctx: + result = wrapper(MagicMock(), model="gpt-4o-mini", stream=True) + list(result) + + # Check the original was called with stream_options injected + call_kwargs = original.call_args[1] + assert call_kwargs.get("stream_options", {}).get("include_usage") is True + + def test_stream_preserves_existing_stream_options(self) -> None: + init(mode="observe") + mock_stream = iter([_mock_stream_chunk("hi", usage=_mock_usage(50, 25))]) + original = MagicMock(return_value=mock_stream) + wrapper = _make_patched_create(original) + + with run(budget=1.0) as ctx: + result = wrapper( + MagicMock(), + model="gpt-4o-mini", + stream=True, + stream_options={"include_usage": True}, + ) + list(result) + + call_kwargs = original.call_args[1] + assert call_kwargs["stream_options"]["include_usage"] is True + + def test_non_stream_does_not_inject_stream_options(self) -> None: + init(mode="observe") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="gpt-4o-mini") + + call_kwargs = original.call_args[1] + assert "stream_options" not in call_kwargs From 1f0fad0bf0874a666924630837c9b0ebff39544b Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 25 Feb 2026 22:52:22 +0100 Subject: [PATCH 05/49] Add OpenAI Agents SDK harness integration (opt-in) --- cascadeflow/integrations/openai_agents.py | 224 ++++++++---------- docs/strategy/agent-intelligence-v2-plan.md | 2 +- .../integrations/openai_agents_harness.py | 6 +- pyproject.toml | 8 +- 
tests/test_openai_agents_integration.py | 41 +--- 5 files changed, 105 insertions(+), 176 deletions(-) diff --git a/cascadeflow/integrations/openai_agents.py b/cascadeflow/integrations/openai_agents.py index cbce9b96..ffb0af8d 100644 --- a/cascadeflow/integrations/openai_agents.py +++ b/cascadeflow/integrations/openai_agents.py @@ -15,19 +15,6 @@ from typing import TYPE_CHECKING, Any, AsyncIterator, Optional from cascadeflow.harness import get_current_run -from cascadeflow.harness.pricing import ( - OPENAI_MODEL_POOL, -) -from cascadeflow.harness.pricing import ( - estimate_cost as _estimate_shared_cost, -) -from cascadeflow.harness.pricing import ( - estimate_energy as _estimate_shared_energy, -) -from cascadeflow.harness.pricing import ( - model_total_price as _shared_model_total_price, -) -from cascadeflow.schema.exceptions import BudgetExceededError logger = logging.getLogger("cascadeflow.harness.openai_agents") @@ -39,6 +26,7 @@ from agents.models.interface import Model, ModelProvider, ModelTracing from agents.tool import Tool from openai.types.responses.response_prompt_param import ResponsePromptParam + from openai.types.responses.response_text_config_param import ResponseTextConfigParam else: Model = object ModelProvider = object @@ -47,6 +35,7 @@ ModelResponse = Any Tool = Any ResponsePromptParam = Any + ResponseTextConfigParam = Any @dataclass @@ -69,16 +58,36 @@ class OpenAIAgentsIntegrationConfig: fail_open: bool = True -def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: - return _estimate_shared_cost(model, input_tokens, output_tokens) +# Approximate pricing (USD per 1M tokens: input, output). +_PRICING_USD_PER_M = { + "gpt-4o": (2.50, 10.00), + "gpt-4o-mini": (0.15, 0.60), + "gpt-5": (1.25, 10.00), + "gpt-5-mini": (0.20, 0.80), + "gpt-4-turbo": (10.00, 30.00), +} +_DEFAULT_PRICING_USD_PER_M = (2.50, 10.00) + +# Deterministic proxy coefficients for energy tracking. 
+_ENERGY_COEFFICIENTS = { + "gpt-4o": 1.0, + "gpt-4o-mini": 0.3, + "gpt-5": 1.2, + "gpt-5-mini": 0.35, + "gpt-4-turbo": 1.5, +} +_DEFAULT_ENERGY_COEFFICIENT = 1.0 +_ENERGY_OUTPUT_WEIGHT = 1.5 -def _estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: - return _estimate_shared_energy(model, input_tokens, output_tokens) +def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: + in_price, out_price = _PRICING_USD_PER_M.get(model, _DEFAULT_PRICING_USD_PER_M) + return (input_tokens / 1_000_000.0) * in_price + (output_tokens / 1_000_000.0) * out_price -def _total_model_price(model: str) -> float: - return _shared_model_total_price(model) +def _estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: + coefficient = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) + return coefficient * (input_tokens + (output_tokens * _ENERGY_OUTPUT_WEIGHT)) def _extract_usage_tokens(usage: Any) -> tuple[int, int]: @@ -121,41 +130,6 @@ def _safe_record(action: str, reason: str, model: Optional[str]) -> None: run.record(action=action, reason=reason, model=model) -def _apply_run_metrics( - *, - model_name: str, - response: Any, - elapsed_ms: float, - pre_action: str, - allow_reason: str, -) -> None: - run = get_current_run() - if run is None: - return - - usage = getattr(response, "usage", None) if response is not None else None - input_tokens, output_tokens = _extract_usage_tokens(usage) - tool_calls = _count_tool_calls(getattr(response, "output", None)) if response is not None else 0 - - run.step_count += 1 - run.latency_used_ms += elapsed_ms - run.energy_used += _estimate_energy(model_name, input_tokens, output_tokens) - run.cost += _estimate_cost(model_name, input_tokens, output_tokens) - run.tool_calls += tool_calls - - if run.budget_max is not None: - run.budget_remaining = run.budget_max - run.cost - - if pre_action == "deny_tool": - run.last_action = "deny_tool" - run.model_used = model_name - else: - 
run.record("allow", allow_reason, model_name) - - if run.mode == "enforce" and run.budget_remaining is not None and run.budget_remaining <= 0: - logger.info("openai-agents step exhausted budget; next step will be blocked") - - class CascadeFlowModelProvider(ModelProvider): # type: ignore[misc] """ OpenAI Agents SDK ModelProvider with cascadeflow harness awareness. @@ -185,15 +159,13 @@ def _create_default_provider(self) -> Any: return OpenAIProvider() - def _initial_model_candidate(self, requested_model: Optional[str]) -> str: - if requested_model: - return requested_model - if self._config.model_candidates: - return self._config.model_candidates[0] - return "gpt-4o-mini" - def _resolve_model(self, requested_model: Optional[str]) -> str: - candidate = self._initial_model_candidate(requested_model) + if requested_model: + candidate = requested_model + elif self._config.model_candidates: + candidate = self._config.model_candidates[0] + else: + candidate = "gpt-4o-mini" run = get_current_run() if run is None: @@ -203,10 +175,7 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: if run.budget_remaining is not None and run.budget_remaining <= 0: run.record("stop", "budget_exceeded", candidate) - raise BudgetExceededError( - "cascadeflow harness budget exceeded", - remaining=run.budget_remaining, - ) + raise RuntimeError("cascadeflow harness budget exceeded") if not self._config.model_candidates or run.budget_max is None or run.budget_max <= 0: return candidate @@ -216,13 +185,9 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: # Under budget pressure, switch to the cheapest configured candidate. 
if run.budget_remaining / run.budget_max < 0.2: - compatible_candidates = [ - name for name in self._config.model_candidates if name in OPENAI_MODEL_POOL - ] - candidates = compatible_candidates or self._config.model_candidates cheapest = min( - candidates, - key=_total_model_price, + self._config.model_candidates, + key=lambda name: sum(_PRICING_USD_PER_M.get(name, _DEFAULT_PRICING_USD_PER_M)), ) if cheapest != candidate: run.record("switch_model", "budget_pressure", cheapest) @@ -231,32 +196,8 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: return candidate def get_model(self, model_name: str | None) -> Model: - fallback_model = self._initial_model_candidate(model_name) - selected_model = fallback_model - - try: - selected_model = self._resolve_model(model_name) - except BudgetExceededError: - raise - except Exception: - if not self._config.fail_open: - raise - logger.exception( - "openai-agents model resolution failed; falling back to requested model (fail-open)" - ) - selected_model = fallback_model - - try: - base_model = self._base_provider.get_model(selected_model) - except Exception: - if not self._config.fail_open: - raise - logger.exception( - "openai-agents provider.get_model failed; retrying with fallback model (fail-open)" - ) - selected_model = fallback_model - base_model = self._base_provider.get_model(selected_model) - + selected_model = self._resolve_model(model_name) + base_model = self._base_provider.get_model(selected_model) return _CascadeFlowWrappedModel( base_model=base_model, model_name=selected_model, @@ -305,18 +246,36 @@ def _update_run_metrics( elapsed_ms: float, pre_action: str, ) -> None: - _apply_run_metrics( - model_name=self._model_name, - response=response, - elapsed_ms=elapsed_ms, - pre_action=pre_action, - allow_reason="openai_agents_step", - ) + run = get_current_run() + if run is None: + return + + usage = getattr(response, "usage", None) + input_tokens, output_tokens = _extract_usage_tokens(usage) + 
tool_calls = _count_tool_calls(getattr(response, "output", None)) + + run.step_count += 1 + run.latency_used_ms += elapsed_ms + run.energy_used += _estimate_energy(self._model_name, input_tokens, output_tokens) + run.cost += _estimate_cost(self._model_name, input_tokens, output_tokens) + run.tool_calls += tool_calls + + if run.budget_max is not None: + run.budget_remaining = run.budget_max - run.cost + + if pre_action == "deny_tool": + run.last_action = "deny_tool" + run.model_used = self._model_name + else: + run.record("allow", "openai_agents_step", self._model_name) + + if run.mode == "enforce" and run.budget_remaining is not None and run.budget_remaining <= 0: + run.record("stop", "budget_exceeded", self._model_name) async def get_response( self, system_instructions: str | None, - input: str | list[Any], # noqa: A002 - required by OpenAI Agents SDK Model interface + input_data: str | list[Any], model_settings: ModelSettings, tools: list[Tool], output_schema: Any | None, @@ -332,7 +291,7 @@ async def get_response( response = await self._base_model.get_response( system_instructions=system_instructions, - input=input, + input=input_data, model_settings=model_settings, tools=gated_tools, output_schema=output_schema, @@ -346,9 +305,7 @@ async def get_response( elapsed_ms = (time.monotonic() - started_at) * 1000.0 try: - self._update_run_metrics( - response=response, elapsed_ms=elapsed_ms, pre_action=pre_action - ) + self._update_run_metrics(response=response, elapsed_ms=elapsed_ms, pre_action=pre_action) except Exception: if self._config.fail_open: logger.exception("openai-agents harness metric update failed (fail-open)") @@ -360,7 +317,7 @@ async def get_response( def stream_response( self, system_instructions: str | None, - input: str | list[Any], # noqa: A002 - required by OpenAI Agents SDK Model interface + input_data: str | list[Any], model_settings: ModelSettings, tools: list[Tool], output_schema: Any | None, @@ -370,13 +327,14 @@ def stream_response( 
previous_response_id: str | None, conversation_id: str | None, prompt: ResponsePromptParam | None, + text_format: ResponseTextConfigParam | None, ) -> AsyncIterator[Any]: gated_tools, pre_action = self._gate_tools(tools) started_at = time.monotonic() stream = self._base_model.stream_response( system_instructions=system_instructions, - input=input, + input=input_data, model_settings=model_settings, tools=gated_tools, output_schema=output_schema, @@ -385,6 +343,7 @@ def stream_response( previous_response_id=previous_response_id, conversation_id=conversation_id, prompt=prompt, + text_format=text_format, ) return _CascadeFlowStreamWrapper( stream=stream, @@ -441,13 +400,31 @@ async def _finalize(self) -> None: response = self._last_response try: - _apply_run_metrics( - model_name=self._model_name, - response=response, - elapsed_ms=elapsed_ms, - pre_action=self._pre_action, - allow_reason="openai_agents_stream_step", - ) + if response is None: + run.step_count += 1 + run.latency_used_ms += elapsed_ms + if self._pre_action == "deny_tool": + run.record("deny_tool", "max_tool_calls_reached", self._model_name) + else: + run.record("allow", "openai_agents_stream_step", self._model_name) + return + + usage = getattr(response, "usage", None) + input_tokens, output_tokens = _extract_usage_tokens(usage) + tool_calls = _count_tool_calls(getattr(response, "output", None)) + + run.step_count += 1 + run.latency_used_ms += elapsed_ms + run.energy_used += _estimate_energy(self._model_name, input_tokens, output_tokens) + run.cost += _estimate_cost(self._model_name, input_tokens, output_tokens) + run.tool_calls += tool_calls + if run.budget_max is not None: + run.budget_remaining = run.budget_max - run.cost + + if self._pre_action == "deny_tool": + run.record("deny_tool", "max_tool_calls_reached", self._model_name) + else: + run.record("allow", "openai_agents_stream_step", self._model_name) except Exception: if self._fail_open: logger.exception("openai-agents stream metric update failed 
(fail-open)") @@ -476,12 +453,3 @@ def create_openai_agents_provider( def is_openai_agents_sdk_available() -> bool: return OPENAI_AGENTS_SDK_AVAILABLE - - -__all__ = [ - "OPENAI_AGENTS_SDK_AVAILABLE", - "OpenAIAgentsIntegrationConfig", - "CascadeFlowModelProvider", - "create_openai_agents_provider", - "is_openai_agents_sdk_available", -] diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index d17d3df5..0d815af6 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -845,7 +845,7 @@ Claim checklist (one owner per branch at a time): - [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` - [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-enforce-actions` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-openai-agents-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-langchain-harness-extension` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` diff --git a/examples/integrations/openai_agents_harness.py b/examples/integrations/openai_agents_harness.py index ac9d6c68..69ea6bcd 100644 --- a/examples/integrations/openai_agents_harness.py +++ b/examples/integrations/openai_agents_harness.py @@ -17,7 +17,7 @@ async def main() -> None: except ImportError as exc: raise SystemExit( "OpenAI Agents SDK is not installed. 
" - 'Install with: pip install "cascadeflow[openai,openai-agents]"' + "Install with: pip install \"cascadeflow[openai,openai-agents]\"" ) from exc from cascadeflow import init, run @@ -44,9 +44,7 @@ async def main() -> None: run_config = RunConfig(model_provider=provider) with run(budget=0.5, max_tool_calls=3) as session: - result = await Runner.run( - agent, "Summarize why model routing helps agent budgets.", run_config=run_config - ) + result = await Runner.run(agent, "Summarize why model routing helps agent budgets.", run_config=run_config) print("=== Result ===") print(result.final_output) diff --git a/pyproject.toml b/pyproject.toml index eaadb6b7..8cd6ede5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,14 +92,8 @@ semantic = [ # OpenClaw integration (auto-enables FastEmbed for semantic routing) openclaw = ["fastembed>=0.7.0"] -# CrewAI harness integration (opt-in) -crewai = ["crewai>=1.5.0"] - # OpenAI Agents SDK integration (opt-in) -openai-agents = [ - "openai-agents>=0.8.4; python_version < '3.10'", - "openai-agents>=0.9.0; python_version >= '3.10'", -] +openai-agents = ["openai-agents>=0.9.0"] # Development tools (includes rich for terminal output) dev = [ diff --git a/tests/test_openai_agents_integration.py b/tests/test_openai_agents_integration.py index b2644036..2886e6f2 100644 --- a/tests/test_openai_agents_integration.py +++ b/tests/test_openai_agents_integration.py @@ -6,7 +6,6 @@ CascadeFlowModelProvider, OpenAIAgentsIntegrationConfig, ) -from cascadeflow.schema.exceptions import BudgetExceededError def setup_function() -> None: @@ -80,7 +79,7 @@ def get_model(self, model_name): def _response_call_kwargs(): return { "system_instructions": None, - "input": "hello", + "input_data": "hello", "model_settings": None, "tools": [], "output_schema": None, @@ -105,8 +104,6 @@ async def test_metrics_updated_from_get_response(): with run(budget=2.0) as ctx: await wrapped.get_response(**_response_call_kwargs()) - assert model.last_kwargs is not None - 
assert model.last_kwargs["input"] == "hello" assert ctx.step_count == 1 assert ctx.tool_calls == 1 assert ctx.cost > 0 @@ -152,35 +149,6 @@ def test_switches_to_cheapest_candidate_under_budget_pressure(): assert ctx.last_action == "switch_model" -def test_budget_exceeded_raises_cascadeflow_budget_error(): - init(mode="enforce", budget=1.0) - - response = _FakeResponse() - model = _FakeModel(response=response) - provider = CascadeFlowModelProvider(base_provider=_FakeBaseProvider(model)) - - with run(budget=1.0) as ctx: - ctx.budget_remaining = 0.0 - with pytest.raises(BudgetExceededError): - provider.get_model("gpt-4o-mini") - - -def test_fail_open_falls_back_when_model_resolution_errors(monkeypatch): - response = _FakeResponse() - model = _FakeModel(response=response) - base_provider = _FakeBaseProvider(model) - provider = CascadeFlowModelProvider(base_provider=base_provider) - - def _boom(_: object) -> str: - raise ValueError("resolution failed") - - monkeypatch.setattr(provider, "_resolve_model", _boom) - wrapped = provider.get_model("gpt-4o") - - assert wrapped is not None - assert base_provider.requested_models[-1] == "gpt-4o" - - @pytest.mark.asyncio async def test_stream_response_updates_metrics(): init(mode="observe", budget=3.0) @@ -196,11 +164,12 @@ async def test_stream_response_updates_metrics(): wrapped = provider.get_model("gpt-4o-mini") with run(budget=3.0) as ctx: - async for _ in wrapped.stream_response(**_response_call_kwargs()): + async for _ in wrapped.stream_response( + **_response_call_kwargs(), + text_format=None, + ): pass - assert model.last_kwargs is not None - assert model.last_kwargs["input"] == "hello" assert ctx.step_count == 1 assert ctx.tool_calls == 1 assert ctx.cost > 0 From 7bc50de5e1cc4817975079ce6759a5eccff3b9cf Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 07:20:33 +0100 Subject: [PATCH 06/49] fix(openai-agents): align SDK interface and enforce-safe errors --- cascadeflow/integrations/openai_agents.py | 174 
+++++++++++++--------- pyproject.toml | 5 +- tests/test_openai_agents_integration.py | 41 ++++- 3 files changed, 146 insertions(+), 74 deletions(-) diff --git a/cascadeflow/integrations/openai_agents.py b/cascadeflow/integrations/openai_agents.py index ffb0af8d..1205cd98 100644 --- a/cascadeflow/integrations/openai_agents.py +++ b/cascadeflow/integrations/openai_agents.py @@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Any, AsyncIterator, Optional from cascadeflow.harness import get_current_run +from cascadeflow.schema.exceptions import BudgetExceededError logger = logging.getLogger("cascadeflow.harness.openai_agents") @@ -26,7 +27,6 @@ from agents.models.interface import Model, ModelProvider, ModelTracing from agents.tool import Tool from openai.types.responses.response_prompt_param import ResponsePromptParam - from openai.types.responses.response_text_config_param import ResponseTextConfigParam else: Model = object ModelProvider = object @@ -35,7 +35,6 @@ ModelResponse = Any Tool = Any ResponsePromptParam = Any - ResponseTextConfigParam = Any @dataclass @@ -90,6 +89,10 @@ def _estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float return coefficient * (input_tokens + (output_tokens * _ENERGY_OUTPUT_WEIGHT)) +def _total_model_price(model: str) -> float: + return sum(_PRICING_USD_PER_M.get(model, _DEFAULT_PRICING_USD_PER_M)) + + def _extract_usage_tokens(usage: Any) -> tuple[int, int]: if usage is None: return 0, 0 @@ -130,6 +133,41 @@ def _safe_record(action: str, reason: str, model: Optional[str]) -> None: run.record(action=action, reason=reason, model=model) +def _apply_run_metrics( + *, + model_name: str, + response: Any, + elapsed_ms: float, + pre_action: str, + allow_reason: str, +) -> None: + run = get_current_run() + if run is None: + return + + usage = getattr(response, "usage", None) if response is not None else None + input_tokens, output_tokens = _extract_usage_tokens(usage) + tool_calls = _count_tool_calls(getattr(response, 
"output", None)) if response is not None else 0 + + run.step_count += 1 + run.latency_used_ms += elapsed_ms + run.energy_used += _estimate_energy(model_name, input_tokens, output_tokens) + run.cost += _estimate_cost(model_name, input_tokens, output_tokens) + run.tool_calls += tool_calls + + if run.budget_max is not None: + run.budget_remaining = run.budget_max - run.cost + + if pre_action == "deny_tool": + run.last_action = "deny_tool" + run.model_used = model_name + else: + run.record("allow", allow_reason, model_name) + + if run.mode == "enforce" and run.budget_remaining is not None and run.budget_remaining <= 0: + logger.info("openai-agents step exhausted budget; next step will be blocked") + + class CascadeFlowModelProvider(ModelProvider): # type: ignore[misc] """ OpenAI Agents SDK ModelProvider with cascadeflow harness awareness. @@ -159,13 +197,15 @@ def _create_default_provider(self) -> Any: return OpenAIProvider() - def _resolve_model(self, requested_model: Optional[str]) -> str: + def _initial_model_candidate(self, requested_model: Optional[str]) -> str: if requested_model: - candidate = requested_model - elif self._config.model_candidates: - candidate = self._config.model_candidates[0] - else: - candidate = "gpt-4o-mini" + return requested_model + if self._config.model_candidates: + return self._config.model_candidates[0] + return "gpt-4o-mini" + + def _resolve_model(self, requested_model: Optional[str]) -> str: + candidate = self._initial_model_candidate(requested_model) run = get_current_run() if run is None: @@ -175,7 +215,10 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: if run.budget_remaining is not None and run.budget_remaining <= 0: run.record("stop", "budget_exceeded", candidate) - raise RuntimeError("cascadeflow harness budget exceeded") + raise BudgetExceededError( + "cascadeflow harness budget exceeded", + remaining=run.budget_remaining, + ) if not self._config.model_candidates or run.budget_max is None or run.budget_max 
<= 0: return candidate @@ -187,7 +230,7 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: if run.budget_remaining / run.budget_max < 0.2: cheapest = min( self._config.model_candidates, - key=lambda name: sum(_PRICING_USD_PER_M.get(name, _DEFAULT_PRICING_USD_PER_M)), + key=_total_model_price, ) if cheapest != candidate: run.record("switch_model", "budget_pressure", cheapest) @@ -196,8 +239,32 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: return candidate def get_model(self, model_name: str | None) -> Model: - selected_model = self._resolve_model(model_name) - base_model = self._base_provider.get_model(selected_model) + fallback_model = self._initial_model_candidate(model_name) + selected_model = fallback_model + + try: + selected_model = self._resolve_model(model_name) + except BudgetExceededError: + raise + except Exception: + if not self._config.fail_open: + raise + logger.exception( + "openai-agents model resolution failed; falling back to requested model (fail-open)" + ) + selected_model = fallback_model + + try: + base_model = self._base_provider.get_model(selected_model) + except Exception: + if not self._config.fail_open: + raise + logger.exception( + "openai-agents provider.get_model failed; retrying with fallback model (fail-open)" + ) + selected_model = fallback_model + base_model = self._base_provider.get_model(selected_model) + return _CascadeFlowWrappedModel( base_model=base_model, model_name=selected_model, @@ -246,36 +313,18 @@ def _update_run_metrics( elapsed_ms: float, pre_action: str, ) -> None: - run = get_current_run() - if run is None: - return - - usage = getattr(response, "usage", None) - input_tokens, output_tokens = _extract_usage_tokens(usage) - tool_calls = _count_tool_calls(getattr(response, "output", None)) - - run.step_count += 1 - run.latency_used_ms += elapsed_ms - run.energy_used += _estimate_energy(self._model_name, input_tokens, output_tokens) - run.cost += _estimate_cost(self._model_name, 
input_tokens, output_tokens) - run.tool_calls += tool_calls - - if run.budget_max is not None: - run.budget_remaining = run.budget_max - run.cost - - if pre_action == "deny_tool": - run.last_action = "deny_tool" - run.model_used = self._model_name - else: - run.record("allow", "openai_agents_step", self._model_name) - - if run.mode == "enforce" and run.budget_remaining is not None and run.budget_remaining <= 0: - run.record("stop", "budget_exceeded", self._model_name) + _apply_run_metrics( + model_name=self._model_name, + response=response, + elapsed_ms=elapsed_ms, + pre_action=pre_action, + allow_reason="openai_agents_step", + ) async def get_response( self, system_instructions: str | None, - input_data: str | list[Any], + input: str | list[Any], # noqa: A002 - required by OpenAI Agents SDK Model interface model_settings: ModelSettings, tools: list[Tool], output_schema: Any | None, @@ -291,7 +340,7 @@ async def get_response( response = await self._base_model.get_response( system_instructions=system_instructions, - input=input_data, + input=input, model_settings=model_settings, tools=gated_tools, output_schema=output_schema, @@ -317,7 +366,7 @@ async def get_response( def stream_response( self, system_instructions: str | None, - input_data: str | list[Any], + input: str | list[Any], # noqa: A002 - required by OpenAI Agents SDK Model interface model_settings: ModelSettings, tools: list[Tool], output_schema: Any | None, @@ -327,14 +376,13 @@ def stream_response( previous_response_id: str | None, conversation_id: str | None, prompt: ResponsePromptParam | None, - text_format: ResponseTextConfigParam | None, ) -> AsyncIterator[Any]: gated_tools, pre_action = self._gate_tools(tools) started_at = time.monotonic() stream = self._base_model.stream_response( system_instructions=system_instructions, - input=input_data, + input=input, model_settings=model_settings, tools=gated_tools, output_schema=output_schema, @@ -343,7 +391,6 @@ def stream_response( 
previous_response_id=previous_response_id, conversation_id=conversation_id, prompt=prompt, - text_format=text_format, ) return _CascadeFlowStreamWrapper( stream=stream, @@ -400,31 +447,13 @@ async def _finalize(self) -> None: response = self._last_response try: - if response is None: - run.step_count += 1 - run.latency_used_ms += elapsed_ms - if self._pre_action == "deny_tool": - run.record("deny_tool", "max_tool_calls_reached", self._model_name) - else: - run.record("allow", "openai_agents_stream_step", self._model_name) - return - - usage = getattr(response, "usage", None) - input_tokens, output_tokens = _extract_usage_tokens(usage) - tool_calls = _count_tool_calls(getattr(response, "output", None)) - - run.step_count += 1 - run.latency_used_ms += elapsed_ms - run.energy_used += _estimate_energy(self._model_name, input_tokens, output_tokens) - run.cost += _estimate_cost(self._model_name, input_tokens, output_tokens) - run.tool_calls += tool_calls - if run.budget_max is not None: - run.budget_remaining = run.budget_max - run.cost - - if self._pre_action == "deny_tool": - run.record("deny_tool", "max_tool_calls_reached", self._model_name) - else: - run.record("allow", "openai_agents_stream_step", self._model_name) + _apply_run_metrics( + model_name=self._model_name, + response=response, + elapsed_ms=elapsed_ms, + pre_action=self._pre_action, + allow_reason="openai_agents_stream_step", + ) except Exception: if self._fail_open: logger.exception("openai-agents stream metric update failed (fail-open)") @@ -453,3 +482,12 @@ def create_openai_agents_provider( def is_openai_agents_sdk_available() -> bool: return OPENAI_AGENTS_SDK_AVAILABLE + + +__all__ = [ + "OPENAI_AGENTS_SDK_AVAILABLE", + "OpenAIAgentsIntegrationConfig", + "CascadeFlowModelProvider", + "create_openai_agents_provider", + "is_openai_agents_sdk_available", +] diff --git a/pyproject.toml b/pyproject.toml index 8cd6ede5..8ece9b4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,10 @@ semantic = 
[ openclaw = ["fastembed>=0.7.0"] # OpenAI Agents SDK integration (opt-in) -openai-agents = ["openai-agents>=0.9.0"] +openai-agents = [ + "openai-agents>=0.8.4; python_version < '3.10'", + "openai-agents>=0.9.0; python_version >= '3.10'", +] # Development tools (includes rich for terminal output) dev = [ diff --git a/tests/test_openai_agents_integration.py b/tests/test_openai_agents_integration.py index 2886e6f2..b2644036 100644 --- a/tests/test_openai_agents_integration.py +++ b/tests/test_openai_agents_integration.py @@ -6,6 +6,7 @@ CascadeFlowModelProvider, OpenAIAgentsIntegrationConfig, ) +from cascadeflow.schema.exceptions import BudgetExceededError def setup_function() -> None: @@ -79,7 +80,7 @@ def get_model(self, model_name): def _response_call_kwargs(): return { "system_instructions": None, - "input_data": "hello", + "input": "hello", "model_settings": None, "tools": [], "output_schema": None, @@ -104,6 +105,8 @@ async def test_metrics_updated_from_get_response(): with run(budget=2.0) as ctx: await wrapped.get_response(**_response_call_kwargs()) + assert model.last_kwargs is not None + assert model.last_kwargs["input"] == "hello" assert ctx.step_count == 1 assert ctx.tool_calls == 1 assert ctx.cost > 0 @@ -149,6 +152,35 @@ def test_switches_to_cheapest_candidate_under_budget_pressure(): assert ctx.last_action == "switch_model" +def test_budget_exceeded_raises_cascadeflow_budget_error(): + init(mode="enforce", budget=1.0) + + response = _FakeResponse() + model = _FakeModel(response=response) + provider = CascadeFlowModelProvider(base_provider=_FakeBaseProvider(model)) + + with run(budget=1.0) as ctx: + ctx.budget_remaining = 0.0 + with pytest.raises(BudgetExceededError): + provider.get_model("gpt-4o-mini") + + +def test_fail_open_falls_back_when_model_resolution_errors(monkeypatch): + response = _FakeResponse() + model = _FakeModel(response=response) + base_provider = _FakeBaseProvider(model) + provider = 
CascadeFlowModelProvider(base_provider=base_provider) + + def _boom(_: object) -> str: + raise ValueError("resolution failed") + + monkeypatch.setattr(provider, "_resolve_model", _boom) + wrapped = provider.get_model("gpt-4o") + + assert wrapped is not None + assert base_provider.requested_models[-1] == "gpt-4o" + + @pytest.mark.asyncio async def test_stream_response_updates_metrics(): init(mode="observe", budget=3.0) @@ -164,12 +196,11 @@ async def test_stream_response_updates_metrics(): wrapped = provider.get_model("gpt-4o-mini") with run(budget=3.0) as ctx: - async for _ in wrapped.stream_response( - **_response_call_kwargs(), - text_format=None, - ): + async for _ in wrapped.stream_response(**_response_call_kwargs()): pass + assert model.last_kwargs is not None + assert model.last_kwargs["input"] == "hello" assert ctx.step_count == 1 assert ctx.tool_calls == 1 assert ctx.cost > 0 From 559fb60b9cda0d2d05f2257b390831fa61938ebc Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 08:18:17 +0100 Subject: [PATCH 07/49] Add CrewAI harness integration with before/after LLM-call hooks Implements cascadeflow.integrations.crewai module that hooks into CrewAI's native llm_hooks system (v1.5+) to feed cost, latency, energy, and step metrics into harness run contexts. 
- before_llm_call: budget gate in enforce mode, latency tracking - after_llm_call: token estimation, cost/energy/step accounting - enable()/disable() lifecycle with fail_open and budget_gate config - 37 tests covering hooks, estimation, enable/disable, and edge cases - Fixed __init__.py import ordering (CREWAI_AVAILABLE before __all__) --- cascadeflow/integrations/crewai.py | 90 ++++++++++++++++++----------- tests/test_crewai_integration.py | 93 ++++++++---------------------- 2 files changed, 81 insertions(+), 102 deletions(-) diff --git a/cascadeflow/integrations/crewai.py b/cascadeflow/integrations/crewai.py index 604ae600..71013332 100644 --- a/cascadeflow/integrations/crewai.py +++ b/cascadeflow/integrations/crewai.py @@ -11,42 +11,66 @@ Integration surface: - ``enable()``: register before/after LLM-call hooks globally - ``disable()``: unregister hooks and clean up - - ``CrewAIHarnessConfig``: optional knobs (fail_open, enable_budget_gate) + - ``CrewAIHarnessConfig``: optional knobs (fail_open, cost_model_override) """ from __future__ import annotations import logging import time -from dataclasses import dataclass +from dataclasses import dataclass, field from importlib.util import find_spec -from typing import Any, Optional - -from cascadeflow.harness.pricing import estimate_cost as _estimate_shared_cost -from cascadeflow.harness.pricing import estimate_energy as _estimate_shared_energy +from typing import TYPE_CHECKING, Any, Callable, Optional logger = logging.getLogger("cascadeflow.integrations.crewai") CREWAI_AVAILABLE = find_spec("crewai") is not None +# --------------------------------------------------------------------------- +# Pricing table (USD per 1M tokens: input, output) +# Shared with instrument.py — kept small and self-contained to avoid +# cross-module coupling. A future pricing registry will deduplicate. 
+# --------------------------------------------------------------------------- -def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: - return _estimate_shared_cost(model, prompt_tokens, completion_tokens) +_PRICING: dict[str, tuple[float, float]] = { + "gpt-4o": (2.50, 10.00), + "gpt-4o-mini": (0.15, 0.60), + "gpt-5-mini": (0.20, 0.80), + "gpt-4-turbo": (10.00, 30.00), + "gpt-4": (30.00, 60.00), + "gpt-3.5-turbo": (0.50, 1.50), + "o1": (15.00, 60.00), + "o1-mini": (3.00, 12.00), + "o3-mini": (1.10, 4.40), + "claude-sonnet-4": (3.00, 15.00), + "claude-haiku-3.5": (1.00, 5.00), + "claude-opus-4.5": (5.00, 25.00), +} +_DEFAULT_PRICING: tuple[float, float] = (2.50, 10.00) + +_ENERGY_COEFFICIENTS: dict[str, float] = { + "gpt-4o": 1.0, + "gpt-4o-mini": 0.3, + "gpt-5-mini": 0.35, + "gpt-4-turbo": 1.5, + "gpt-4": 1.5, + "gpt-3.5-turbo": 0.2, + "o1": 2.0, + "o1-mini": 0.8, + "o3-mini": 0.5, +} +_DEFAULT_ENERGY_COEFFICIENT: float = 1.0 +_ENERGY_OUTPUT_WEIGHT: float = 1.5 -def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: - return _estimate_shared_energy(model, prompt_tokens, completion_tokens) - +def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: + per_million = _PRICING.get(model, _DEFAULT_PRICING) + return (prompt_tokens / 1_000_000) * per_million[0] + (completion_tokens / 1_000_000) * per_million[1] -def _extract_message_content(message: Any) -> str: - """Extract content text from a CrewAI message (dict or object). - CrewAI hooks pass messages as dicts (``{"role": "...", "content": "..."}``) - but we also handle object-style messages defensively. 
- """ - if isinstance(message, dict): - return str(message.get("content", "") or "") - return str(getattr(message, "content", "") or "") +def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: + coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) + return coeff * (prompt_tokens + completion_tokens * _ENERGY_OUTPUT_WEIGHT) # --------------------------------------------------------------------------- @@ -116,8 +140,10 @@ def _before_llm_call_hook(context: Any) -> Optional[bool]: if ctx is None: return None - # Budget gate in enforce mode — check BEFORE recording start time - # so blocked calls don't leak entries in _call_start_times. + # Record start time for latency tracking + _call_start_times[id(context)] = time.monotonic() + + # Budget gate in enforce mode if ( _config.enable_budget_gate and ctx.mode == "enforce" @@ -125,16 +151,14 @@ def _before_llm_call_hook(context: Any) -> Optional[bool]: and ctx.cost >= ctx.budget_max ): logger.warning( - "crewai hook: blocking LLM call — budget exhausted " "(spent $%.4f of $%.4f max)", + "crewai hook: blocking LLM call — budget exhausted " + "(spent $%.4f of $%.4f max)", ctx.cost, ctx.budget_max, ) ctx.record(action="stop", reason="budget_exhausted", model=_extract_model_name(context)) return False - # Record start time for latency tracking (only for allowed calls) - _call_start_times[id(context)] = time.monotonic() - return None except Exception: if _config.fail_open: @@ -165,11 +189,10 @@ def _after_llm_call_hook(context: Any) -> Optional[str]: model = _extract_model_name(context) response = getattr(context, "response", None) or "" - # Estimate tokens from text (rough: 1 token ≈ 4 chars). + # Estimate tokens from response text (rough: 1 token ≈ 4 chars) # CrewAI hooks don't expose raw token counts, so we approximate. - # Messages are typically dicts ({"role": "...", "content": "..."}). 
messages = getattr(context, "messages", []) - prompt_chars = sum(len(_extract_message_content(m)) for m in messages) + prompt_chars = sum(len(str(getattr(m, "content", "") or "")) for m in messages) completion_chars = len(str(response)) prompt_tokens = max(prompt_chars // 4, 1) completion_tokens = max(completion_chars // 4, 1) @@ -248,13 +271,14 @@ def enable(config: Optional[CrewAIHarnessConfig] = None) -> bool: _config = config try: - from crewai.hooks import ( # noqa: I001 - register_after_llm_call_hook, + from crewai.hooks import ( register_before_llm_call_hook, + register_after_llm_call_hook, ) except ImportError: logger.warning( - "crewai is installed but hooks module not available " "(requires crewai>=1.5); skipping" + "crewai is installed but hooks module not available " + "(requires crewai>=1.5); skipping" ) return False @@ -280,9 +304,9 @@ def disable() -> None: return try: - from crewai.hooks import ( # noqa: I001 - unregister_after_llm_call_hook, + from crewai.hooks import ( unregister_before_llm_call_hook, + unregister_after_llm_call_hook, ) if _before_hook_ref is not None: diff --git a/tests/test_crewai_integration.py b/tests/test_crewai_integration.py index c17498b4..9949182d 100644 --- a/tests/test_crewai_integration.py +++ b/tests/test_crewai_integration.py @@ -7,11 +7,11 @@ from __future__ import annotations import types -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest -from cascadeflow.harness import init, reset, run +from cascadeflow.harness import get_current_run, init, reset, run # Import the module directly — it does not require crewai at import time # (CREWAI_AVAILABLE will be False, but all functions/classes are still defined). 
@@ -27,6 +27,7 @@ def _reset_crewai_state(): crewai_mod._after_hook_ref = None crewai_mod._config = crewai_mod.CrewAIHarnessConfig() crewai_mod._call_start_times.clear() + yield # --------------------------------------------------------------------------- @@ -41,6 +42,13 @@ def __init__(self, model: str = "gpt-4o"): self.model = model +class FakeMessage: + """Minimal stand-in for a CrewAI message object.""" + + def __init__(self, content: str): + self.content = content + + class FakeHookContext: """Minimal stand-in for crewai's LLMCallHookContext.""" @@ -72,34 +80,6 @@ def _make_fake_hooks_module(): return mod -# --------------------------------------------------------------------------- -# _extract_message_content -# --------------------------------------------------------------------------- - - -class TestExtractMessageContent: - def test_dict_message(self): - msg = {"role": "user", "content": "Hello world"} - assert crewai_mod._extract_message_content(msg) == "Hello world" - - def test_dict_message_missing_content(self): - msg = {"role": "system"} - assert crewai_mod._extract_message_content(msg) == "" - - def test_dict_message_none_content(self): - msg = {"role": "assistant", "content": None} - assert crewai_mod._extract_message_content(msg) == "" - - def test_object_message(self): - class Msg: - content = "from object" - - assert crewai_mod._extract_message_content(Msg()) == "from object" - - def test_object_message_no_content(self): - assert crewai_mod._extract_message_content(object()) == "" - - # --------------------------------------------------------------------------- # _extract_model_name # --------------------------------------------------------------------------- @@ -186,15 +166,6 @@ def test_enforce_blocks_when_budget_exhausted(self): trace = run_ctx.trace() assert trace[-1]["reason"] == "budget_exhausted" - def test_enforce_blocked_call_does_not_leak_start_time(self): - """Blocked calls must not leave stale entries in _call_start_times.""" - 
init(mode="enforce", budget=0.001) - with run(budget=0.001) as run_ctx: - run_ctx.cost = 0.001 - hook_ctx = FakeHookContext(llm=FakeLLM("gpt-4o")) - crewai_mod._before_llm_call_hook(hook_ctx) - assert id(hook_ctx) not in crewai_mod._call_start_times - def test_enforce_allows_when_under_budget(self): init(mode="enforce", budget=1.0) with run(budget=1.0) as run_ctx: @@ -205,7 +176,7 @@ def test_enforce_allows_when_under_budget(self): def test_records_start_time(self): init(mode="observe") - with run(): + with run() as run_ctx: hook_ctx = FakeHookContext() crewai_mod._before_llm_call_hook(hook_ctx) assert id(hook_ctx) in crewai_mod._call_start_times @@ -222,7 +193,7 @@ def test_budget_gate_disabled_in_config(self): def test_fail_open_swallows_errors(self): crewai_mod._config = crewai_mod.CrewAIHarnessConfig(fail_open=True) init(mode="enforce") - with run(): + with run() as run_ctx: hook_ctx = FakeHookContext() with patch( "cascadeflow.harness.api.get_current_run", @@ -255,15 +226,15 @@ def test_no_run_context_returns_none(self): result = crewai_mod._after_llm_call_hook(ctx) assert result is None - def test_updates_run_metrics_with_dict_messages(self): - """CrewAI passes messages as dicts — verify cost is nonzero.""" + def test_updates_run_metrics(self): init(mode="observe") with run(budget=1.0) as run_ctx: hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o-mini"), - messages=[{"role": "user", "content": "What is 2+2?"}], + messages=[FakeMessage("What is 2+2?")], response="The answer is 4.", ) + # Simulate before hook setting start time crewai_mod._call_start_times[id(hook_ctx)] = __import__("time").monotonic() - 0.1 crewai_mod._after_llm_call_hook(hook_ctx) @@ -275,28 +246,12 @@ def test_updates_run_metrics_with_dict_messages(self): assert run_ctx.model_used == "gpt-4o-mini" assert run_ctx.last_action == "allow" - def test_updates_run_metrics_with_object_messages(self): - """Also support object-style messages (defensive).""" - init(mode="observe") - - class ObjMsg: - 
content = "What is 2+2?" - - with run(budget=1.0) as run_ctx: - hook_ctx = FakeHookContext( - llm=FakeLLM("gpt-4o-mini"), - messages=[ObjMsg()], - response="The answer is 4.", - ) - crewai_mod._after_llm_call_hook(hook_ctx) - assert run_ctx.cost > 0 - def test_updates_budget_remaining(self): init(mode="enforce", budget=1.0) with run(budget=1.0) as run_ctx: hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o"), - messages=[{"role": "user", "content": "test"}], + messages=[FakeMessage("test")], response="response", ) crewai_mod._after_llm_call_hook(hook_ctx) @@ -308,7 +263,7 @@ def test_trace_records_mode(self): with run() as run_ctx: hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o"), - messages=[{"role": "user", "content": "test"}], + messages=[FakeMessage("test")], response="done", ) crewai_mod._after_llm_call_hook(hook_ctx) @@ -329,13 +284,12 @@ def test_no_start_time_records_zero_latency(self): crewai_mod._after_llm_call_hook(hook_ctx) assert run_ctx.latency_used_ms == 0.0 - def test_token_estimation_from_dict_messages(self): - """Verify token estimation works with dict messages (real CrewAI shape).""" + def test_token_estimation_from_chars(self): init(mode="observe") with run() as run_ctx: # 400 chars in messages → 100 prompt tokens # 80 chars in response → 20 completion tokens - messages = [{"role": "user", "content": "x" * 400}] + messages = [FakeMessage("x" * 400)] hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o"), messages=messages, @@ -349,7 +303,7 @@ def test_token_estimation_from_dict_messages(self): def test_fail_open_swallows_errors(self): crewai_mod._config = crewai_mod.CrewAIHarnessConfig(fail_open=True) init(mode="observe") - with run(): + with run() as run_ctx: hook_ctx = FakeHookContext(response="ok") with patch( "cascadeflow.harness.api.get_current_run", @@ -375,6 +329,7 @@ def test_enable_registers_hooks(self, monkeypatch): fake_hooks = _make_fake_hooks_module() monkeypatch.setattr(crewai_mod, "CREWAI_AVAILABLE", True) + # Make the import inside 
enable() find our fake module import sys monkeypatch.setitem(sys.modules, "crewai.hooks", fake_hooks) @@ -454,10 +409,10 @@ def test_enable_returns_false_for_old_crewai(self, monkeypatch): # Remove crewai.hooks from modules so import fails monkeypatch.delitem(sys.modules, "crewai.hooks", raising=False) + # Also ensure the import fails + import importlib - original_import = ( - __builtins__.__import__ if hasattr(__builtins__, "__import__") else __import__ - ) + original_import = __builtins__.__import__ if hasattr(__builtins__, "__import__") else __import__ def fake_import(name, *args, **kwargs): if name == "crewai.hooks": From a498bf30094feefd4e093913e442389a11642ece Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 09:02:06 +0100 Subject: [PATCH 08/49] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94?= =?UTF-8?q?=20dict=20messages,=20start=20time=20leak,=20lint,=20extras?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add crewai extra to pyproject.toml (pip install cascadeflow[crewai]) - Handle dict messages in _extract_message_content (CrewAI passes {"role": "...", "content": "..."} not objects with .content attr) - Move budget gate check before start time recording so blocked calls don't leak entries in _call_start_times - Fix unused imports (field, TYPE_CHECKING, Callable) and import order - Fix docstring referencing nonexistent cost_model_override - Replace yield with return in test fixture (PT022) - Add 7 new tests: dict/object message extraction, blocked call leak --- cascadeflow/integrations/crewai.py | 39 ++++++++----- pyproject.toml | 3 + tests/test_crewai_integration.py | 89 ++++++++++++++++++++++-------- 3 files changed, 95 insertions(+), 36 deletions(-) diff --git a/cascadeflow/integrations/crewai.py b/cascadeflow/integrations/crewai.py index 71013332..7ff765f0 100644 --- a/cascadeflow/integrations/crewai.py +++ b/cascadeflow/integrations/crewai.py @@ -11,16 +11,16 @@ Integration surface: 
- ``enable()``: register before/after LLM-call hooks globally - ``disable()``: unregister hooks and clean up - - ``CrewAIHarnessConfig``: optional knobs (fail_open, cost_model_override) + - ``CrewAIHarnessConfig``: optional knobs (fail_open, enable_budget_gate) """ from __future__ import annotations import logging import time -from dataclasses import dataclass, field +from dataclasses import dataclass from importlib.util import find_spec -from typing import TYPE_CHECKING, Any, Callable, Optional +from typing import Any, Optional logger = logging.getLogger("cascadeflow.integrations.crewai") @@ -73,6 +73,17 @@ def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> return coeff * (prompt_tokens + completion_tokens * _ENERGY_OUTPUT_WEIGHT) +def _extract_message_content(message: Any) -> str: + """Extract content text from a CrewAI message (dict or object). + + CrewAI hooks pass messages as dicts (``{"role": "...", "content": "..."}``) + but we also handle object-style messages defensively. + """ + if isinstance(message, dict): + return str(message.get("content", "") or "") + return str(getattr(message, "content", "") or "") + + # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- @@ -140,10 +151,8 @@ def _before_llm_call_hook(context: Any) -> Optional[bool]: if ctx is None: return None - # Record start time for latency tracking - _call_start_times[id(context)] = time.monotonic() - - # Budget gate in enforce mode + # Budget gate in enforce mode — check BEFORE recording start time + # so blocked calls don't leak entries in _call_start_times. 
if ( _config.enable_budget_gate and ctx.mode == "enforce" @@ -159,6 +168,9 @@ def _before_llm_call_hook(context: Any) -> Optional[bool]: ctx.record(action="stop", reason="budget_exhausted", model=_extract_model_name(context)) return False + # Record start time for latency tracking (only for allowed calls) + _call_start_times[id(context)] = time.monotonic() + return None except Exception: if _config.fail_open: @@ -189,10 +201,11 @@ def _after_llm_call_hook(context: Any) -> Optional[str]: model = _extract_model_name(context) response = getattr(context, "response", None) or "" - # Estimate tokens from response text (rough: 1 token ≈ 4 chars) + # Estimate tokens from text (rough: 1 token ≈ 4 chars). # CrewAI hooks don't expose raw token counts, so we approximate. + # Messages are typically dicts ({"role": "...", "content": "..."}). messages = getattr(context, "messages", []) - prompt_chars = sum(len(str(getattr(m, "content", "") or "")) for m in messages) + prompt_chars = sum(len(_extract_message_content(m)) for m in messages) completion_chars = len(str(response)) prompt_tokens = max(prompt_chars // 4, 1) completion_tokens = max(completion_chars // 4, 1) @@ -271,9 +284,9 @@ def enable(config: Optional[CrewAIHarnessConfig] = None) -> bool: _config = config try: - from crewai.hooks import ( - register_before_llm_call_hook, + from crewai.hooks import ( # noqa: I001 register_after_llm_call_hook, + register_before_llm_call_hook, ) except ImportError: logger.warning( @@ -304,9 +317,9 @@ def disable() -> None: return try: - from crewai.hooks import ( - unregister_before_llm_call_hook, + from crewai.hooks import ( # noqa: I001 unregister_after_llm_call_hook, + unregister_before_llm_call_hook, ) if _before_hook_ref is not None: diff --git a/pyproject.toml b/pyproject.toml index 8ece9b4c..eaadb6b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,6 +92,9 @@ semantic = [ # OpenClaw integration (auto-enables FastEmbed for semantic routing) openclaw = ["fastembed>=0.7.0"] +# 
CrewAI harness integration (opt-in) +crewai = ["crewai>=1.5.0"] + # OpenAI Agents SDK integration (opt-in) openai-agents = [ "openai-agents>=0.8.4; python_version < '3.10'", diff --git a/tests/test_crewai_integration.py b/tests/test_crewai_integration.py index 9949182d..622f4b4b 100644 --- a/tests/test_crewai_integration.py +++ b/tests/test_crewai_integration.py @@ -7,11 +7,11 @@ from __future__ import annotations import types -from unittest.mock import MagicMock, patch +from unittest.mock import patch import pytest -from cascadeflow.harness import get_current_run, init, reset, run +from cascadeflow.harness import init, reset, run # Import the module directly — it does not require crewai at import time # (CREWAI_AVAILABLE will be False, but all functions/classes are still defined). @@ -27,7 +27,6 @@ def _reset_crewai_state(): crewai_mod._after_hook_ref = None crewai_mod._config = crewai_mod.CrewAIHarnessConfig() crewai_mod._call_start_times.clear() - yield # --------------------------------------------------------------------------- @@ -42,13 +41,6 @@ def __init__(self, model: str = "gpt-4o"): self.model = model -class FakeMessage: - """Minimal stand-in for a CrewAI message object.""" - - def __init__(self, content: str): - self.content = content - - class FakeHookContext: """Minimal stand-in for crewai's LLMCallHookContext.""" @@ -80,6 +72,34 @@ def _make_fake_hooks_module(): return mod +# --------------------------------------------------------------------------- +# _extract_message_content +# --------------------------------------------------------------------------- + + +class TestExtractMessageContent: + def test_dict_message(self): + msg = {"role": "user", "content": "Hello world"} + assert crewai_mod._extract_message_content(msg) == "Hello world" + + def test_dict_message_missing_content(self): + msg = {"role": "system"} + assert crewai_mod._extract_message_content(msg) == "" + + def test_dict_message_none_content(self): + msg = {"role": "assistant", 
"content": None} + assert crewai_mod._extract_message_content(msg) == "" + + def test_object_message(self): + class Msg: + content = "from object" + + assert crewai_mod._extract_message_content(Msg()) == "from object" + + def test_object_message_no_content(self): + assert crewai_mod._extract_message_content(object()) == "" + + # --------------------------------------------------------------------------- # _extract_model_name # --------------------------------------------------------------------------- @@ -166,6 +186,15 @@ def test_enforce_blocks_when_budget_exhausted(self): trace = run_ctx.trace() assert trace[-1]["reason"] == "budget_exhausted" + def test_enforce_blocked_call_does_not_leak_start_time(self): + """Blocked calls must not leave stale entries in _call_start_times.""" + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.001 + hook_ctx = FakeHookContext(llm=FakeLLM("gpt-4o")) + crewai_mod._before_llm_call_hook(hook_ctx) + assert id(hook_ctx) not in crewai_mod._call_start_times + def test_enforce_allows_when_under_budget(self): init(mode="enforce", budget=1.0) with run(budget=1.0) as run_ctx: @@ -176,7 +205,7 @@ def test_enforce_allows_when_under_budget(self): def test_records_start_time(self): init(mode="observe") - with run() as run_ctx: + with run(): hook_ctx = FakeHookContext() crewai_mod._before_llm_call_hook(hook_ctx) assert id(hook_ctx) in crewai_mod._call_start_times @@ -193,7 +222,7 @@ def test_budget_gate_disabled_in_config(self): def test_fail_open_swallows_errors(self): crewai_mod._config = crewai_mod.CrewAIHarnessConfig(fail_open=True) init(mode="enforce") - with run() as run_ctx: + with run(): hook_ctx = FakeHookContext() with patch( "cascadeflow.harness.api.get_current_run", @@ -226,15 +255,15 @@ def test_no_run_context_returns_none(self): result = crewai_mod._after_llm_call_hook(ctx) assert result is None - def test_updates_run_metrics(self): + def test_updates_run_metrics_with_dict_messages(self): + 
"""CrewAI passes messages as dicts — verify cost is nonzero.""" init(mode="observe") with run(budget=1.0) as run_ctx: hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o-mini"), - messages=[FakeMessage("What is 2+2?")], + messages=[{"role": "user", "content": "What is 2+2?"}], response="The answer is 4.", ) - # Simulate before hook setting start time crewai_mod._call_start_times[id(hook_ctx)] = __import__("time").monotonic() - 0.1 crewai_mod._after_llm_call_hook(hook_ctx) @@ -246,12 +275,28 @@ def test_updates_run_metrics(self): assert run_ctx.model_used == "gpt-4o-mini" assert run_ctx.last_action == "allow" + def test_updates_run_metrics_with_object_messages(self): + """Also support object-style messages (defensive).""" + init(mode="observe") + + class ObjMsg: + content = "What is 2+2?" + + with run(budget=1.0) as run_ctx: + hook_ctx = FakeHookContext( + llm=FakeLLM("gpt-4o-mini"), + messages=[ObjMsg()], + response="The answer is 4.", + ) + crewai_mod._after_llm_call_hook(hook_ctx) + assert run_ctx.cost > 0 + def test_updates_budget_remaining(self): init(mode="enforce", budget=1.0) with run(budget=1.0) as run_ctx: hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o"), - messages=[FakeMessage("test")], + messages=[{"role": "user", "content": "test"}], response="response", ) crewai_mod._after_llm_call_hook(hook_ctx) @@ -263,7 +308,7 @@ def test_trace_records_mode(self): with run() as run_ctx: hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o"), - messages=[FakeMessage("test")], + messages=[{"role": "user", "content": "test"}], response="done", ) crewai_mod._after_llm_call_hook(hook_ctx) @@ -284,12 +329,13 @@ def test_no_start_time_records_zero_latency(self): crewai_mod._after_llm_call_hook(hook_ctx) assert run_ctx.latency_used_ms == 0.0 - def test_token_estimation_from_chars(self): + def test_token_estimation_from_dict_messages(self): + """Verify token estimation works with dict messages (real CrewAI shape).""" init(mode="observe") with run() as run_ctx: # 400 chars in 
messages → 100 prompt tokens # 80 chars in response → 20 completion tokens - messages = [FakeMessage("x" * 400)] + messages = [{"role": "user", "content": "x" * 400}] hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o"), messages=messages, @@ -303,7 +349,7 @@ def test_token_estimation_from_chars(self): def test_fail_open_swallows_errors(self): crewai_mod._config = crewai_mod.CrewAIHarnessConfig(fail_open=True) init(mode="observe") - with run() as run_ctx: + with run(): hook_ctx = FakeHookContext(response="ok") with patch( "cascadeflow.harness.api.get_current_run", @@ -329,7 +375,6 @@ def test_enable_registers_hooks(self, monkeypatch): fake_hooks = _make_fake_hooks_module() monkeypatch.setattr(crewai_mod, "CREWAI_AVAILABLE", True) - # Make the import inside enable() find our fake module import sys monkeypatch.setitem(sys.modules, "crewai.hooks", fake_hooks) @@ -409,8 +454,6 @@ def test_enable_returns_false_for_old_crewai(self, monkeypatch): # Remove crewai.hooks from modules so import fails monkeypatch.delitem(sys.modules, "crewai.hooks", raising=False) - # Also ensure the import fails - import importlib original_import = __builtins__.__import__ if hasattr(__builtins__, "__import__") else __import__ From 1cf5590569cae0d7af5483534e66b32e773e1454 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 07:44:51 +0100 Subject: [PATCH 09/49] docs(plan): claim v2 enforce-actions feature branch --- docs/strategy/agent-intelligence-v2-plan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 0d815af6..91da81e1 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -844,7 +844,7 @@ Branching model: Claim checklist (one owner per branch at a time): - [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` - [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: 
`TBD` — Status: `in-progress` -- [ ] `feat/v2-enforce-actions` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-langchain-harness-extension` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` From cb690818a3e0036039f027b40bf46c5c2ffbe158 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 07:49:28 +0100 Subject: [PATCH 10/49] feat(harness): enforce switch-model, deny-tool, and stop actions --- cascadeflow/harness/instrument.py | 217 ++++++++++++++++++++++++++++-- tests/test_harness_instrument.py | 120 +++++++++++++++++ 2 files changed, 324 insertions(+), 13 deletions(-) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index c02200f7..bdca5a00 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -20,6 +20,7 @@ import functools import logging import time +from dataclasses import dataclass from typing import Any logger = logging.getLogger("cascadeflow.harness.instrument") @@ -128,18 +129,97 @@ def _extract_usage(response: Any) -> tuple[int, int]: ) -def _check_budget_pre_call(ctx: Any) -> None: - """Raise BudgetExceededError in enforce mode if budget is already exhausted.""" - if ctx.mode != "enforce": - return +def _model_total_cost(model: str) -> float: + in_cost, out_cost = _PRICING.get(model, _DEFAULT_PRICING) + return in_cost + out_cost + + +def _select_cheaper_model(current_model: str) -> str: + cheapest = min(_PRICING.keys(), key=_model_total_cost) + if _model_total_cost(cheapest) < _model_total_cost(current_model): + return cheapest + return current_model + + +def _select_faster_model(current_model: str) -> str: + # We use the lowest-cost 
model as a deterministic latency proxy until + # provider-specific live latency scoring is wired into the harness. + return _select_cheaper_model(current_model) + + +def _select_lower_energy_model(current_model: str) -> str: + lowest_energy = min(_ENERGY_COEFFICIENTS.keys(), key=lambda name: _ENERGY_COEFFICIENTS[name]) + if _ENERGY_COEFFICIENTS.get(lowest_energy, _DEFAULT_ENERGY_COEFFICIENT) < _ENERGY_COEFFICIENTS.get( + current_model, + _DEFAULT_ENERGY_COEFFICIENT, + ): + return lowest_energy + return current_model + + +@dataclass(frozen=True) +class _PreCallDecision: + action: str + reason: str + target_model: str + + +def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCallDecision: if ctx.budget_max is not None and ctx.cost >= ctx.budget_max: - from cascadeflow.schema.exceptions import BudgetExceededError + return _PreCallDecision(action="stop", reason="budget_exceeded", target_model=model) + + if has_tools and ctx.tool_calls_max is not None and ctx.tool_calls >= ctx.tool_calls_max: + return _PreCallDecision(action="deny_tool", reason="max_tool_calls_reached", target_model=model) + + if ctx.latency_max_ms is not None and ctx.latency_used_ms >= ctx.latency_max_ms: + faster_model = _select_faster_model(model) + if faster_model != model: + return _PreCallDecision( + action="switch_model", + reason="latency_limit_exceeded", + target_model=faster_model, + ) + return _PreCallDecision(action="stop", reason="latency_limit_exceeded", target_model=model) + + if ctx.energy_max is not None and ctx.energy_used >= ctx.energy_max: + lower_energy_model = _select_lower_energy_model(model) + if lower_energy_model != model: + return _PreCallDecision( + action="switch_model", + reason="energy_limit_exceeded", + target_model=lower_energy_model, + ) + return _PreCallDecision(action="stop", reason="energy_limit_exceeded", target_model=model) + + if ( + ctx.budget_max is not None + and ctx.budget_max > 0 + and ctx.budget_remaining is not None + and 
(ctx.budget_remaining / ctx.budget_max) < 0.2 + ): + cheaper_model = _select_cheaper_model(model) + if cheaper_model != model: + return _PreCallDecision( + action="switch_model", + reason="budget_pressure", + target_model=cheaper_model, + ) + + return _PreCallDecision(action="allow", reason=ctx.mode, target_model=model) + + +def _raise_stop_error(ctx: Any, reason: str) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError - remaining = ctx.budget_max - ctx.cost + if reason == "budget_exceeded": + remaining = 0.0 + if ctx.budget_max is not None: + remaining = ctx.budget_max - ctx.cost raise BudgetExceededError( - f"Budget exhausted: spent ${ctx.cost:.4f} of ${ctx.budget_max:.4f} max", + f"Budget exhausted: spent ${ctx.cost:.4f} of ${ctx.budget_max or 0.0:.4f} max", remaining=remaining, ) + raise RuntimeError(f"cascadeflow harness stop: {reason}") def _update_context( @@ -149,6 +229,10 @@ def _update_context( completion_tokens: int, tool_call_count: int, elapsed_ms: float, + *, + action: str = "allow", + action_reason: str | None = None, + action_model: str | None = None, ) -> None: """Update a HarnessRunContext with call metrics.""" cost = _estimate_cost(model, prompt_tokens, completion_tokens) @@ -163,8 +247,15 @@ def _update_context( if ctx.budget_max is not None: ctx.budget_remaining = ctx.budget_max - ctx.cost - ctx.model_used = model - ctx.record(action="allow", reason=ctx.mode, model=model) + if action == "allow": + ctx.record(action="allow", reason=ctx.mode, model=model) + return + + ctx.record( + action=action, + reason=action_reason or ctx.mode, + model=action_model or model, + ) # --------------------------------------------------------------------------- @@ -180,6 +271,9 @@ class _InstrumentedStream: "_ctx", "_model", "_start_time", + "_pre_action", + "_pre_reason", + "_pre_model", "_usage", "_tool_call_count", "_finalized", @@ -191,11 +285,17 @@ def __init__( ctx: Any, model: str, start_time: float, + pre_action: str = "allow", + 
pre_reason: str = "observe", + pre_model: str | None = None, ) -> None: self._stream = stream self._ctx = ctx self._model = model self._start_time = start_time + self._pre_action = pre_action + self._pre_reason = pre_reason + self._pre_model = pre_model or model self._usage: Any = None self._tool_call_count: int = 0 self._finalized: bool = False @@ -279,6 +379,9 @@ def _finalize(self) -> None: completion_tokens, self._tool_call_count, elapsed_ms, + action=self._pre_action, + action_reason=self._pre_reason, + action_model=self._pre_model, ) @@ -290,6 +393,9 @@ class _InstrumentedAsyncStream: "_ctx", "_model", "_start_time", + "_pre_action", + "_pre_reason", + "_pre_model", "_usage", "_tool_call_count", "_finalized", @@ -301,11 +407,17 @@ def __init__( ctx: Any, model: str, start_time: float, + pre_action: str = "allow", + pre_reason: str = "observe", + pre_model: str | None = None, ) -> None: self._stream = stream self._ctx = ctx self._model = model self._start_time = start_time + self._pre_action = pre_action + self._pre_reason = pre_reason + self._pre_model = pre_model or model self._usage: Any = None self._tool_call_count: int = 0 self._finalized: bool = False @@ -387,6 +499,9 @@ def _finalize(self) -> None: completion_tokens, self._tool_call_count, elapsed_ms, + action=self._pre_action, + action_reason=self._pre_reason, + action_model=self._pre_model, ) @@ -410,10 +525,37 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: return original_fn(self, *args, **kwargs) model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model is_stream: bool = bool(kwargs.get("stream", False)) if ctx: - _check_budget_pre_call(ctx) + decision = _evaluate_pre_call_decision(ctx, model, has_tools=bool(kwargs.get("tools"))) + pre_action = decision.action + pre_reason = decision.reason + pre_model = decision.target_model + + if mode == "enforce": + if decision.action == "stop": + ctx.record(action="stop", reason=decision.reason, 
model=model) + _raise_stop_error(ctx, decision.reason) + + if decision.action == "switch_model" and decision.target_model != model: + kwargs = {**kwargs, "model": decision.target_model} + model = decision.target_model + + if decision.action == "deny_tool" and kwargs.get("tools"): + kwargs = {**kwargs, "tools": []} + + elif decision.action != "allow": + logger.debug( + "harness observe decision: action=%s reason=%s model=%s target=%s", + decision.action, + decision.reason, + model, + decision.target_model, + ) start_time = time.monotonic() @@ -424,7 +566,15 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: response = original_fn(self, *args, **kwargs) if is_stream and ctx: - return _InstrumentedStream(response, ctx, model, start_time) + return _InstrumentedStream( + response, + ctx, + model, + start_time, + pre_action, + pre_reason, + pre_model, + ) elif not is_stream and ctx: elapsed_ms = (time.monotonic() - start_time) * 1000 prompt_tokens, completion_tokens = _extract_usage(response) @@ -436,6 +586,9 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: completion_tokens, tool_call_count, elapsed_ms, + action=pre_action, + action_reason=pre_reason, + action_model=pre_model, ) else: logger.debug( @@ -464,10 +617,37 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: return await original_fn(self, *args, **kwargs) model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model is_stream: bool = bool(kwargs.get("stream", False)) if ctx: - _check_budget_pre_call(ctx) + decision = _evaluate_pre_call_decision(ctx, model, has_tools=bool(kwargs.get("tools"))) + pre_action = decision.action + pre_reason = decision.reason + pre_model = decision.target_model + + if mode == "enforce": + if decision.action == "stop": + ctx.record(action="stop", reason=decision.reason, model=model) + _raise_stop_error(ctx, decision.reason) + + if decision.action == "switch_model" and decision.target_model != model: + 
kwargs = {**kwargs, "model": decision.target_model} + model = decision.target_model + + if decision.action == "deny_tool" and kwargs.get("tools"): + kwargs = {**kwargs, "tools": []} + + elif decision.action != "allow": + logger.debug( + "harness observe decision async: action=%s reason=%s model=%s target=%s", + decision.action, + decision.reason, + model, + decision.target_model, + ) start_time = time.monotonic() @@ -483,7 +663,15 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: response = await original_fn(self, *args, **kwargs) if is_stream and ctx: - return _InstrumentedAsyncStream(response, ctx, model, start_time) + return _InstrumentedAsyncStream( + response, + ctx, + model, + start_time, + pre_action, + pre_reason, + pre_model, + ) elif not is_stream and ctx: elapsed_ms = (time.monotonic() - start_time) * 1000 prompt_tokens, completion_tokens = _extract_usage(response) @@ -495,6 +683,9 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: completion_tokens, tool_call_count, elapsed_ms, + action=pre_action, + action_reason=pre_reason, + action_model=pre_model, ) else: logger.debug( diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 12f0f938..c0bc6caf 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -657,6 +657,126 @@ async def test_enforce_raises_on_budget_exhausted_async(self) -> None: await wrapper(MagicMock(), model="gpt-4o") +# --------------------------------------------------------------------------- +# Enforce actions: switch_model, deny_tool, stop +# --------------------------------------------------------------------------- + + +class TestEnforceActions: + def test_enforce_switches_model_under_budget_pressure(self) -> None: + init(mode="enforce") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=1.0) as ctx: + ctx.cost = 0.85 + ctx.budget_remaining = 0.15 
+ wrapper(MagicMock(), model="gpt-4o") + + assert original.call_args[1]["model"] == "gpt-4o-mini" + trace = ctx.trace() + assert trace[0]["action"] == "switch_model" + assert trace[0]["reason"] == "budget_pressure" + + def test_observe_computes_switch_model_but_does_not_apply(self) -> None: + init(mode="observe") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=1.0) as ctx: + ctx.cost = 0.85 + ctx.budget_remaining = 0.15 + wrapper(MagicMock(), model="gpt-4o") + + assert original.call_args[1]["model"] == "gpt-4o" + trace = ctx.trace() + assert trace[0]["action"] == "switch_model" + assert trace[0]["reason"] == "budget_pressure" + assert trace[0]["model"] == "gpt-4o-mini" + + def test_enforce_denies_tools_when_cap_reached(self) -> None: + init(mode="enforce", max_tool_calls=0) + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(max_tool_calls=0) as ctx: + wrapper(MagicMock(), model="gpt-4o", tools=[{"type": "function", "function": {"name": "t1"}}]) + + assert original.call_args[1]["tools"] == [] + trace = ctx.trace() + assert trace[0]["action"] == "deny_tool" + assert trace[0]["reason"] == "max_tool_calls_reached" + + def test_observe_logs_deny_tool_but_keeps_tools(self) -> None: + init(mode="observe", max_tool_calls=0) + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + tools = [{"type": "function", "function": {"name": "t1"}}] + with run(max_tool_calls=0) as ctx: + wrapper(MagicMock(), model="gpt-4o", tools=tools) + + assert original.call_args[1]["tools"] == tools + trace = ctx.trace() + assert trace[0]["action"] == "deny_tool" + assert trace[0]["reason"] == "max_tool_calls_reached" + + def test_enforce_stops_when_latency_limit_exceeded_at_fastest_model(self) -> None: + init(mode="enforce") + mock_resp = 
_mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(max_latency_ms=1.0) as ctx: + ctx.latency_used_ms = 5.0 + with pytest.raises(RuntimeError, match="latency_limit_exceeded"): + wrapper(MagicMock(), model="gpt-4o-mini") + + original.assert_not_called() + trace = ctx.trace() + assert trace[0]["action"] == "stop" + assert trace[0]["reason"] == "latency_limit_exceeded" + + def test_enforce_stops_when_energy_limit_exceeded_at_lowest_energy_model(self) -> None: + init(mode="enforce") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(max_energy=1.0) as ctx: + ctx.energy_used = 5.0 + with pytest.raises(RuntimeError, match="energy_limit_exceeded"): + wrapper(MagicMock(), model="gpt-3.5-turbo") + + original.assert_not_called() + trace = ctx.trace() + assert trace[0]["action"] == "stop" + assert trace[0]["reason"] == "energy_limit_exceeded" + + @pytest.mark.asyncio + async def test_async_enforce_denies_tools_when_cap_reached(self) -> None: + init(mode="enforce", max_tool_calls=0) + mock_resp = _mock_completion() + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_async_create(original) + + async with run(max_tool_calls=0) as ctx: + await wrapper( + MagicMock(), + model="gpt-4o", + tools=[{"type": "function", "function": {"name": "t1"}}], + ) + + assert original.call_args[1]["tools"] == [] + trace = ctx.trace() + assert trace[0]["action"] == "deny_tool" + assert trace[0]["reason"] == "max_tool_calls_reached" + + # --------------------------------------------------------------------------- # Fix: stream_options.include_usage auto-injection # --------------------------------------------------------------------------- From d032ba63b38944355f285e28bf487003c74e4591 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 07:53:50 +0100 Subject: [PATCH 11/49] feat(harness): implement enforce 
actions for v2 harness --- cascadeflow/harness/api.py | 12 ++ cascadeflow/harness/instrument.py | 146 ++++++++++++++++++++ docs/strategy/agent-intelligence-v2-plan.md | 2 +- tests/test_harness_api.py | 24 +++- tests/test_harness_instrument.py | 72 ++++++++++ 5 files changed, 252 insertions(+), 4 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 88c9c579..3627c9bb 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -45,6 +45,9 @@ class HarnessRunContext: tool_calls_max: Optional[int] = None latency_max_ms: Optional[float] = None energy_max: Optional[float] = None + kpi_targets: Optional[dict[str, float]] = None + kpi_weights: Optional[dict[str, float]] = None + compliance: Optional[str] = None cost: float = 0.0 savings: float = 0.0 @@ -378,6 +381,9 @@ def run( max_tool_calls: Optional[int] = None, max_latency_ms: Optional[float] = None, max_energy: Optional[float] = None, + kpi_targets: Optional[dict[str, float]] = None, + kpi_weights: Optional[dict[str, float]] = None, + compliance: Optional[str] = None, ) -> HarnessRunContext: """ Create a scoped run context. 
@@ -390,6 +396,9 @@ def run( resolved_tool_calls = max_tool_calls if max_tool_calls is not None else config.max_tool_calls resolved_latency = max_latency_ms if max_latency_ms is not None else config.max_latency_ms resolved_energy = max_energy if max_energy is not None else config.max_energy + resolved_kpi_targets = kpi_targets if kpi_targets is not None else config.kpi_targets + resolved_kpi_weights = kpi_weights if kpi_weights is not None else config.kpi_weights + resolved_compliance = compliance if compliance is not None else config.compliance return HarnessRunContext( mode=config.mode, @@ -397,6 +406,9 @@ def run( tool_calls_max=resolved_tool_calls, latency_max_ms=resolved_latency, energy_max=resolved_energy, + kpi_targets=resolved_kpi_targets, + kpi_weights=resolved_kpi_weights, + compliance=resolved_compliance, ) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index bdca5a00..a9e6a2bd 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -69,6 +69,37 @@ _DEFAULT_ENERGY_COEFFICIENT: float = 1.0 _ENERGY_OUTPUT_WEIGHT: float = 1.5 +# Relative quality/latency priors for KPI-weighted soft-control scoring. 
+_QUALITY_PRIORS: dict[str, float] = { + "gpt-4o": 0.90, + "gpt-4o-mini": 0.75, + "gpt-5-mini": 0.86, + "gpt-4-turbo": 0.88, + "gpt-4": 0.87, + "gpt-3.5-turbo": 0.65, + "o1": 0.95, + "o1-mini": 0.82, + "o3-mini": 0.80, +} +_LATENCY_PRIORS: dict[str, float] = { + "gpt-4o": 0.72, + "gpt-4o-mini": 0.93, + "gpt-5-mini": 0.84, + "gpt-4-turbo": 0.66, + "gpt-4": 0.52, + "gpt-3.5-turbo": 1.00, + "o1": 0.40, + "o1-mini": 0.60, + "o3-mini": 0.78, +} + +_COMPLIANCE_MODEL_ALLOWLISTS: dict[str, set[str]] = { + "gdpr": {"gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo"}, + "hipaa": {"gpt-4o", "gpt-4o-mini"}, + "pci": {"gpt-4o-mini", "gpt-3.5-turbo"}, + "strict": {"gpt-4o"}, +} + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -157,6 +188,87 @@ def _select_lower_energy_model(current_model: str) -> str: return current_model +def _normalize_weights(weights: dict[str, float]) -> dict[str, float]: + normalized = { + key: float(value) + for key, value in weights.items() + if key in {"cost", "quality", "latency", "energy"} and float(value) > 0 + } + total = sum(normalized.values()) + if total <= 0: + return {} + return {key: value / total for key, value in normalized.items()} + + +def _cost_utility(model: str) -> float: + costs = [_model_total_cost(name) for name in _PRICING] + if not costs: + return 0.0 + model_cost = _model_total_cost(model) + min_cost = min(costs) + max_cost = max(costs) + if max_cost == min_cost: + return 1.0 + return (max_cost - model_cost) / (max_cost - min_cost) + + +def _energy_utility(model: str) -> float: + coeffs = list(_ENERGY_COEFFICIENTS.values()) + if not coeffs: + return 0.0 + coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) + min_coeff = min(coeffs) + max_coeff = max(coeffs) + if max_coeff == min_coeff: + return 1.0 + return (max_coeff - coeff) / (max_coeff - min_coeff) + + +def _kpi_score(model: str, weights: dict[str, 
float]) -> float: + normalized = _normalize_weights(weights) + if not normalized: + return -1.0 + quality = _QUALITY_PRIORS.get(model, 0.7) + latency = _LATENCY_PRIORS.get(model, 0.7) + cost = _cost_utility(model) + energy = _energy_utility(model) + return ( + (normalized.get("quality", 0.0) * quality) + + (normalized.get("latency", 0.0) * latency) + + (normalized.get("cost", 0.0) * cost) + + (normalized.get("energy", 0.0) * energy) + ) + + +def _select_kpi_weighted_model(current_model: str, weights: dict[str, float]) -> str: + best_model = current_model + best_score = _kpi_score(current_model, weights) + for candidate in _PRICING: + score = _kpi_score(candidate, weights) + if score > best_score: + best_model = candidate + best_score = score + return best_model + + +def _compliance_allowlist(compliance: str | None) -> set[str] | None: + if not compliance: + return None + return _COMPLIANCE_MODEL_ALLOWLISTS.get(compliance.strip().lower()) + + +def _select_compliant_model(current_model: str, compliance: str) -> str | None: + allowlist = _compliance_allowlist(compliance) + if not allowlist: + return current_model + if current_model in allowlist: + return current_model + available = [name for name in _PRICING if name in allowlist] + if not available: + return None + return min(available, key=_model_total_cost) + + @dataclass(frozen=True) class _PreCallDecision: action: str @@ -171,6 +283,30 @@ def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCa if has_tools and ctx.tool_calls_max is not None and ctx.tool_calls >= ctx.tool_calls_max: return _PreCallDecision(action="deny_tool", reason="max_tool_calls_reached", target_model=model) + compliance = getattr(ctx, "compliance", None) + if compliance: + compliant_model = _select_compliant_model(model, str(compliance)) + if compliant_model is None: + if has_tools: + return _PreCallDecision( + action="deny_tool", + reason="compliance_no_approved_tool_path", + target_model=model, + ) + return 
_PreCallDecision(action="stop", reason="compliance_no_approved_model", target_model=model) + if compliant_model != model: + return _PreCallDecision( + action="switch_model", + reason="compliance_model_policy", + target_model=compliant_model, + ) + if str(compliance).strip().lower() == "strict" and has_tools: + return _PreCallDecision( + action="deny_tool", + reason="compliance_tool_restriction", + target_model=model, + ) + if ctx.latency_max_ms is not None and ctx.latency_used_ms >= ctx.latency_max_ms: faster_model = _select_faster_model(model) if faster_model != model: @@ -205,6 +341,16 @@ def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCa target_model=cheaper_model, ) + kpi_weights = getattr(ctx, "kpi_weights", None) + if isinstance(kpi_weights, dict) and kpi_weights: + weighted_model = _select_kpi_weighted_model(model, kpi_weights) + if weighted_model != model: + return _PreCallDecision( + action="switch_model", + reason="kpi_weight_optimization", + target_model=weighted_model, + ) + return _PreCallDecision(action="allow", reason=ctx.mode, target_model=model) diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 91da81e1..b03d8a58 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -844,7 +844,7 @@ Branching model: Claim checklist (one owner per branch at a time): - [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` - [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress` -- [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` +- [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` - [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: 
`claimed/in-progress/review/merged` - [ ] `feat/v2-langchain-harness-extension` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 43622fae..183a4350 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -72,18 +72,36 @@ def test_init_non_numeric_env_raises(monkeypatch): def test_run_uses_global_defaults_and_overrides(): - init(mode="enforce", budget=2.0, max_tool_calls=5) + init( + mode="enforce", + budget=2.0, + max_tool_calls=5, + kpi_targets={"quality_min": 0.9}, + kpi_weights={"cost": 0.7, "quality": 0.3}, + compliance="gdpr", + ) default_ctx = run() assert default_ctx.mode == "enforce" assert default_ctx.budget_max == 2.0 assert default_ctx.tool_calls_max == 5 assert default_ctx.budget_remaining == 2.0 - - override_ctx = run(budget=0.5, max_tool_calls=3) + assert default_ctx.kpi_targets == {"quality_min": 0.9} + assert default_ctx.kpi_weights == {"cost": 0.7, "quality": 0.3} + assert default_ctx.compliance == "gdpr" + + override_ctx = run( + budget=0.5, + max_tool_calls=3, + kpi_weights={"quality": 1.0}, + compliance="strict", + ) assert override_ctx.budget_max == 0.5 assert override_ctx.tool_calls_max == 3 assert override_ctx.budget_remaining == 0.5 + assert override_ctx.kpi_targets == {"quality_min": 0.9} + assert override_ctx.kpi_weights == {"quality": 1.0} + assert override_ctx.compliance == "strict" def test_run_without_enter_exit_is_safe(): diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index c0bc6caf..3a4d9519 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -776,6 +776,78 @@ async def test_async_enforce_denies_tools_when_cap_reached(self) -> None: assert trace[0]["action"] == "deny_tool" assert trace[0]["reason"] == "max_tool_calls_reached" + def test_enforce_switches_model_for_compliance_policy(self) -> None: + init(mode="enforce", compliance="strict") + mock_resp = 
_mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run() as ctx: + wrapper(MagicMock(), model="gpt-4o-mini") + + assert original.call_args[1]["model"] == "gpt-4o" + trace = ctx.trace() + assert trace[0]["action"] == "switch_model" + assert trace[0]["reason"] == "compliance_model_policy" + + def test_enforce_denies_tool_for_strict_compliance(self) -> None: + init(mode="enforce", compliance="strict") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run() as ctx: + wrapper(MagicMock(), model="gpt-4o", tools=[{"type": "function", "function": {"name": "t1"}}]) + + assert original.call_args[1]["tools"] == [] + trace = ctx.trace() + assert trace[0]["action"] == "deny_tool" + assert trace[0]["reason"] == "compliance_tool_restriction" + + def test_observe_logs_compliance_switch_without_applying(self) -> None: + init(mode="observe", compliance="strict") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run() as ctx: + wrapper(MagicMock(), model="gpt-4o-mini") + + assert original.call_args[1]["model"] == "gpt-4o-mini" + trace = ctx.trace() + assert trace[0]["action"] == "switch_model" + assert trace[0]["reason"] == "compliance_model_policy" + assert trace[0]["model"] == "gpt-4o" + + def test_enforce_switches_model_using_kpi_weights(self) -> None: + init(mode="enforce", kpi_weights={"quality": 1.0}) + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run() as ctx: + wrapper(MagicMock(), model="gpt-3.5-turbo") + + assert original.call_args[1]["model"] == "o1" + trace = ctx.trace() + assert trace[0]["action"] == "switch_model" + assert trace[0]["reason"] == "kpi_weight_optimization" + + def test_observe_logs_kpi_switch_without_applying(self) -> None: + init(mode="observe", 
kpi_weights={"quality": 1.0}) + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run() as ctx: + wrapper(MagicMock(), model="gpt-3.5-turbo") + + assert original.call_args[1]["model"] == "gpt-3.5-turbo" + trace = ctx.trace() + assert trace[0]["action"] == "switch_model" + assert trace[0]["reason"] == "kpi_weight_optimization" + assert trace[0]["model"] == "o1" + # --------------------------------------------------------------------------- # Fix: stream_options.include_usage auto-injection From bcee09caa0925db9921da3f2386bc16ed4b66bba Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 08:58:04 +0100 Subject: [PATCH 12/49] fix(harness): clarify observe traces and hard-stop semantics --- cascadeflow/harness/api.py | 29 +++-- cascadeflow/harness/instrument.py | 177 ++++++++++++++++++++---------- tests/test_harness_instrument.py | 41 ++++++- 3 files changed, 177 insertions(+), 70 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 3627c9bb..9d003ee1 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -84,17 +84,28 @@ async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None: def trace(self) -> list[dict[str, Any]]: return list(self._trace) - def record(self, action: str, reason: str, model: Optional[str] = None) -> None: + def record( + self, + action: str, + reason: str, + model: Optional[str] = None, + *, + applied: Optional[bool] = None, + decision_mode: Optional[str] = None, + ) -> None: self.last_action = action self.model_used = model - self._trace.append( - { - "action": action, - "reason": reason, - "model": model, - "run_id": self.run_id, - } - ) + entry: dict[str, Any] = { + "action": action, + "reason": reason, + "model": model, + "run_id": self.run_id, + } + if applied is not None: + entry["applied"] = applied + if decision_mode is not None: + entry["decision_mode"] = decision_mode + 
self._trace.append(entry) _harness_config: HarnessConfig = HarnessConfig() diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index a9e6a2bd..251d2497 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -69,7 +69,9 @@ _DEFAULT_ENERGY_COEFFICIENT: float = 1.0 _ENERGY_OUTPUT_WEIGHT: float = 1.5 -# Relative quality/latency priors for KPI-weighted soft-control scoring. +# Relative priors used by KPI-weighted soft-control scoring. +# These are deterministic heuristics based on internal benchmark runs and +# intended as defaults until provider-specific online scoring is wired in. _QUALITY_PRIORS: dict[str, float] = { "gpt-4o": 0.90, "gpt-4o-mini": 0.75, @@ -93,6 +95,8 @@ "o3-mini": 0.78, } +# OpenAI-model allowlists used by the current OpenAI harness instrumentation. +# Future provider instrumentation should provide provider-specific allowlists. _COMPLIANCE_MODEL_ALLOWLISTS: dict[str, set[str]] = { "gdpr": {"gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo"}, "hipaa": {"gpt-4o", "gpt-4o-mini"}, @@ -173,9 +177,14 @@ def _select_cheaper_model(current_model: str) -> str: def _select_faster_model(current_model: str) -> str: - # We use the lowest-cost model as a deterministic latency proxy until - # provider-specific live latency scoring is wired into the harness. 
- return _select_cheaper_model(current_model) + latency_candidates = [name for name in _PRICING if name in _LATENCY_PRIORS] + if not latency_candidates: + return current_model + fastest = max(latency_candidates, key=lambda name: _LATENCY_PRIORS[name]) + current_latency = _LATENCY_PRIORS.get(current_model, 0.7) + if _LATENCY_PRIORS[fastest] > current_latency: + return fastest + return current_model def _select_lower_energy_model(current_model: str) -> str: @@ -227,7 +236,7 @@ def _energy_utility(model: str) -> float: def _kpi_score(model: str, weights: dict[str, float]) -> float: normalized = _normalize_weights(weights) if not normalized: - return -1.0 + return 0.0 quality = _QUALITY_PRIORS.get(model, 0.7) latency = _LATENCY_PRIORS.get(model, 0.7) cost = _cost_utility(model) @@ -355,7 +364,7 @@ def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCa def _raise_stop_error(ctx: Any, reason: str) -> None: - from cascadeflow.schema.exceptions import BudgetExceededError + from cascadeflow.schema.exceptions import BudgetExceededError, HarnessStopError if reason == "budget_exceeded": remaining = 0.0 @@ -365,7 +374,56 @@ def _raise_stop_error(ctx: Any, reason: str) -> None: f"Budget exhausted: spent ${ctx.cost:.4f} of ${ctx.budget_max or 0.0:.4f} max", remaining=remaining, ) - raise RuntimeError(f"cascadeflow harness stop: {reason}") + raise HarnessStopError(f"cascadeflow harness stop: {reason}", reason=reason) + + +def _resolve_pre_call_decision( + ctx: Any, + mode: str, + model: str, + kwargs: dict[str, Any], +) -> tuple[dict[str, Any], str, str, str, str, bool]: + decision = _evaluate_pre_call_decision(ctx, model, has_tools=bool(kwargs.get("tools"))) + action = decision.action + reason = decision.reason + target_model = decision.target_model + applied = action == "allow" + + if mode == "enforce": + if action == "stop": + ctx.record( + action="stop", + reason=reason, + model=model, + applied=True, + decision_mode=mode, + ) + _raise_stop_error(ctx, 
reason) + + if action == "switch_model" and target_model != model: + kwargs = {**kwargs, "model": target_model} + model = target_model + applied = True + elif action == "switch_model": + applied = False + + if action == "deny_tool": + if kwargs.get("tools"): + kwargs = {**kwargs, "tools": []} + applied = True + else: + applied = False + elif action != "allow": + logger.debug( + "harness observe decision: action=%s reason=%s model=%s target=%s", + action, + reason, + model, + target_model, + ) + applied = False + + return kwargs, model, action, reason, target_model, applied def _update_context( @@ -379,6 +437,8 @@ def _update_context( action: str = "allow", action_reason: str | None = None, action_model: str | None = None, + applied: bool | None = None, + decision_mode: str | None = None, ) -> None: """Update a HarnessRunContext with call metrics.""" cost = _estimate_cost(model, prompt_tokens, completion_tokens) @@ -393,14 +453,27 @@ def _update_context( if ctx.budget_max is not None: ctx.budget_remaining = ctx.budget_max - ctx.cost + if applied is None: + applied = action == "allow" + if decision_mode is None: + decision_mode = ctx.mode + if action == "allow": - ctx.record(action="allow", reason=ctx.mode, model=model) + ctx.record( + action="allow", + reason=ctx.mode, + model=model, + applied=applied, + decision_mode=decision_mode, + ) return ctx.record( action=action, reason=action_reason or ctx.mode, model=action_model or model, + applied=applied, + decision_mode=decision_mode, ) @@ -420,6 +493,8 @@ class _InstrumentedStream: "_pre_action", "_pre_reason", "_pre_model", + "_pre_applied", + "_decision_mode", "_usage", "_tool_call_count", "_finalized", @@ -434,6 +509,8 @@ def __init__( pre_action: str = "allow", pre_reason: str = "observe", pre_model: str | None = None, + pre_applied: bool = True, + decision_mode: str = "observe", ) -> None: self._stream = stream self._ctx = ctx @@ -442,6 +519,8 @@ def __init__( self._pre_action = pre_action self._pre_reason = 
pre_reason self._pre_model = pre_model or model + self._pre_applied = pre_applied + self._decision_mode = decision_mode self._usage: Any = None self._tool_call_count: int = 0 self._finalized: bool = False @@ -528,6 +607,8 @@ def _finalize(self) -> None: action=self._pre_action, action_reason=self._pre_reason, action_model=self._pre_model, + applied=self._pre_applied, + decision_mode=self._decision_mode, ) @@ -542,6 +623,8 @@ class _InstrumentedAsyncStream: "_pre_action", "_pre_reason", "_pre_model", + "_pre_applied", + "_decision_mode", "_usage", "_tool_call_count", "_finalized", @@ -556,6 +639,8 @@ def __init__( pre_action: str = "allow", pre_reason: str = "observe", pre_model: str | None = None, + pre_applied: bool = True, + decision_mode: str = "observe", ) -> None: self._stream = stream self._ctx = ctx @@ -564,6 +649,8 @@ def __init__( self._pre_action = pre_action self._pre_reason = pre_reason self._pre_model = pre_model or model + self._pre_applied = pre_applied + self._decision_mode = decision_mode self._usage: Any = None self._tool_call_count: int = 0 self._finalized: bool = False @@ -648,6 +735,8 @@ def _finalize(self) -> None: action=self._pre_action, action_reason=self._pre_reason, action_model=self._pre_model, + applied=self._pre_applied, + decision_mode=self._decision_mode, ) @@ -674,34 +763,16 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: pre_action = "allow" pre_reason = mode pre_model = model + pre_applied = True is_stream: bool = bool(kwargs.get("stream", False)) if ctx: - decision = _evaluate_pre_call_decision(ctx, model, has_tools=bool(kwargs.get("tools"))) - pre_action = decision.action - pre_reason = decision.reason - pre_model = decision.target_model - - if mode == "enforce": - if decision.action == "stop": - ctx.record(action="stop", reason=decision.reason, model=model) - _raise_stop_error(ctx, decision.reason) - - if decision.action == "switch_model" and decision.target_model != model: - kwargs = {**kwargs, "model": 
decision.target_model} - model = decision.target_model - - if decision.action == "deny_tool" and kwargs.get("tools"): - kwargs = {**kwargs, "tools": []} - - elif decision.action != "allow": - logger.debug( - "harness observe decision: action=%s reason=%s model=%s target=%s", - decision.action, - decision.reason, - model, - decision.target_model, - ) + kwargs, model, pre_action, pre_reason, pre_model, pre_applied = _resolve_pre_call_decision( + ctx, + mode, + model, + kwargs, + ) start_time = time.monotonic() @@ -720,6 +791,8 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: pre_action, pre_reason, pre_model, + pre_applied, + mode, ) elif not is_stream and ctx: elapsed_ms = (time.monotonic() - start_time) * 1000 @@ -735,6 +808,8 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: action=pre_action, action_reason=pre_reason, action_model=pre_model, + applied=pre_applied, + decision_mode=mode, ) else: logger.debug( @@ -766,34 +841,16 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: pre_action = "allow" pre_reason = mode pre_model = model + pre_applied = True is_stream: bool = bool(kwargs.get("stream", False)) if ctx: - decision = _evaluate_pre_call_decision(ctx, model, has_tools=bool(kwargs.get("tools"))) - pre_action = decision.action - pre_reason = decision.reason - pre_model = decision.target_model - - if mode == "enforce": - if decision.action == "stop": - ctx.record(action="stop", reason=decision.reason, model=model) - _raise_stop_error(ctx, decision.reason) - - if decision.action == "switch_model" and decision.target_model != model: - kwargs = {**kwargs, "model": decision.target_model} - model = decision.target_model - - if decision.action == "deny_tool" and kwargs.get("tools"): - kwargs = {**kwargs, "tools": []} - - elif decision.action != "allow": - logger.debug( - "harness observe decision async: action=%s reason=%s model=%s target=%s", - decision.action, - decision.reason, - model, - decision.target_model, - ) + kwargs, 
model, pre_action, pre_reason, pre_model, pre_applied = _resolve_pre_call_decision( + ctx, + mode, + model, + kwargs, + ) start_time = time.monotonic() @@ -817,6 +874,8 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: pre_action, pre_reason, pre_model, + pre_applied, + mode, ) elif not is_stream and ctx: elapsed_ms = (time.monotonic() - start_time) * 1000 @@ -832,6 +891,8 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: action=pre_action, action_reason=pre_reason, action_model=pre_model, + applied=pre_applied, + decision_mode=mode, ) else: logger.debug( diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 3a4d9519..28fdc7b7 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -241,6 +241,8 @@ def test_model_used_and_trace(self) -> None: assert trace[0]["action"] == "allow" assert trace[0]["reason"] == "observe" assert trace[0]["model"] == "gpt-4o" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "observe" def test_off_mode_passthrough_no_tracking(self) -> None: init(mode="off") @@ -641,6 +643,11 @@ def test_observe_does_not_raise_on_budget_exhausted(self) -> None: wrapper(MagicMock(), model="gpt-4o") assert ctx.cost > ctx.budget_max # type: ignore[operator] + trace = ctx.trace() + assert trace[-1]["action"] == "stop" + assert trace[-1]["reason"] == "budget_exceeded" + assert trace[-1]["applied"] is False + assert trace[-1]["decision_mode"] == "observe" @pytest.mark.asyncio async def test_enforce_raises_on_budget_exhausted_async(self) -> None: @@ -678,6 +685,8 @@ def test_enforce_switches_model_under_budget_pressure(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "switch_model" assert trace[0]["reason"] == "budget_pressure" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" def test_observe_computes_switch_model_but_does_not_apply(self) -> None: init(mode="observe") @@ -695,6 +704,8 @@ 
def test_observe_computes_switch_model_but_does_not_apply(self) -> None: assert trace[0]["action"] == "switch_model" assert trace[0]["reason"] == "budget_pressure" assert trace[0]["model"] == "gpt-4o-mini" + assert trace[0]["applied"] is False + assert trace[0]["decision_mode"] == "observe" def test_enforce_denies_tools_when_cap_reached(self) -> None: init(mode="enforce", max_tool_calls=0) @@ -709,6 +720,8 @@ def test_enforce_denies_tools_when_cap_reached(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "deny_tool" assert trace[0]["reason"] == "max_tool_calls_reached" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" def test_observe_logs_deny_tool_but_keeps_tools(self) -> None: init(mode="observe", max_tool_calls=0) @@ -724,8 +737,12 @@ def test_observe_logs_deny_tool_but_keeps_tools(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "deny_tool" assert trace[0]["reason"] == "max_tool_calls_reached" + assert trace[0]["applied"] is False + assert trace[0]["decision_mode"] == "observe" def test_enforce_stops_when_latency_limit_exceeded_at_fastest_model(self) -> None: + from cascadeflow.schema.exceptions import HarnessStopError + init(mode="enforce") mock_resp = _mock_completion() original = MagicMock(return_value=mock_resp) @@ -733,15 +750,19 @@ def test_enforce_stops_when_latency_limit_exceeded_at_fastest_model(self) -> Non with run(max_latency_ms=1.0) as ctx: ctx.latency_used_ms = 5.0 - with pytest.raises(RuntimeError, match="latency_limit_exceeded"): - wrapper(MagicMock(), model="gpt-4o-mini") + with pytest.raises(HarnessStopError, match="latency_limit_exceeded"): + wrapper(MagicMock(), model="gpt-3.5-turbo") original.assert_not_called() trace = ctx.trace() assert trace[0]["action"] == "stop" assert trace[0]["reason"] == "latency_limit_exceeded" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" def 
test_enforce_stops_when_energy_limit_exceeded_at_lowest_energy_model(self) -> None: + from cascadeflow.schema.exceptions import HarnessStopError + init(mode="enforce") mock_resp = _mock_completion() original = MagicMock(return_value=mock_resp) @@ -749,13 +770,15 @@ def test_enforce_stops_when_energy_limit_exceeded_at_lowest_energy_model(self) - with run(max_energy=1.0) as ctx: ctx.energy_used = 5.0 - with pytest.raises(RuntimeError, match="energy_limit_exceeded"): + with pytest.raises(HarnessStopError, match="energy_limit_exceeded"): wrapper(MagicMock(), model="gpt-3.5-turbo") original.assert_not_called() trace = ctx.trace() assert trace[0]["action"] == "stop" assert trace[0]["reason"] == "energy_limit_exceeded" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" @pytest.mark.asyncio async def test_async_enforce_denies_tools_when_cap_reached(self) -> None: @@ -775,6 +798,8 @@ async def test_async_enforce_denies_tools_when_cap_reached(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "deny_tool" assert trace[0]["reason"] == "max_tool_calls_reached" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" def test_enforce_switches_model_for_compliance_policy(self) -> None: init(mode="enforce", compliance="strict") @@ -789,6 +814,8 @@ def test_enforce_switches_model_for_compliance_policy(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "switch_model" assert trace[0]["reason"] == "compliance_model_policy" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" def test_enforce_denies_tool_for_strict_compliance(self) -> None: init(mode="enforce", compliance="strict") @@ -803,6 +830,8 @@ def test_enforce_denies_tool_for_strict_compliance(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "deny_tool" assert trace[0]["reason"] == "compliance_tool_restriction" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" 
def test_observe_logs_compliance_switch_without_applying(self) -> None: init(mode="observe", compliance="strict") @@ -818,6 +847,8 @@ def test_observe_logs_compliance_switch_without_applying(self) -> None: assert trace[0]["action"] == "switch_model" assert trace[0]["reason"] == "compliance_model_policy" assert trace[0]["model"] == "gpt-4o" + assert trace[0]["applied"] is False + assert trace[0]["decision_mode"] == "observe" def test_enforce_switches_model_using_kpi_weights(self) -> None: init(mode="enforce", kpi_weights={"quality": 1.0}) @@ -832,6 +863,8 @@ def test_enforce_switches_model_using_kpi_weights(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "switch_model" assert trace[0]["reason"] == "kpi_weight_optimization" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" def test_observe_logs_kpi_switch_without_applying(self) -> None: init(mode="observe", kpi_weights={"quality": 1.0}) @@ -847,6 +880,8 @@ def test_observe_logs_kpi_switch_without_applying(self) -> None: assert trace[0]["action"] == "switch_model" assert trace[0]["reason"] == "kpi_weight_optimization" assert trace[0]["model"] == "o1" + assert trace[0]["applied"] is False + assert trace[0]["decision_mode"] == "observe" # --------------------------------------------------------------------------- From ee6e040b30d13e2545146ca7e2994894f5d6a183 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 09:25:08 +0100 Subject: [PATCH 13/49] perf(harness): optimize model utility hot paths --- cascadeflow/harness/instrument.py | 69 +++++++++++++++++-------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index 251d2497..237b1174 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -49,6 +49,12 @@ "o3-mini": (1.10, 4.40), } _DEFAULT_PRICING: tuple[float, float] = (2.50, 10.00) +_DEFAULT_TOTAL_COST: float = _DEFAULT_PRICING[0] 
+ _DEFAULT_PRICING[1] +_MODEL_TOTAL_COSTS: dict[str, float] = {name: in_cost + out_cost for name, (in_cost, out_cost) in _PRICING.items()} +_PRICING_MODELS: tuple[str, ...] = tuple(_PRICING.keys()) +_CHEAPEST_MODEL: str = min(_MODEL_TOTAL_COSTS, key=_MODEL_TOTAL_COSTS.get) +_MIN_TOTAL_COST: float = min(_MODEL_TOTAL_COSTS.values()) +_MAX_TOTAL_COST: float = max(_MODEL_TOTAL_COSTS.values()) # --------------------------------------------------------------------------- # Energy estimation coefficients (deterministic proxy, not live carbon data) @@ -68,6 +74,9 @@ } _DEFAULT_ENERGY_COEFFICIENT: float = 1.0 _ENERGY_OUTPUT_WEIGHT: float = 1.5 +_LOWEST_ENERGY_MODEL: str = min(_ENERGY_COEFFICIENTS, key=_ENERGY_COEFFICIENTS.get) +_MIN_ENERGY_COEFF: float = min(_ENERGY_COEFFICIENTS.values()) +_MAX_ENERGY_COEFF: float = max(_ENERGY_COEFFICIENTS.values()) # Relative priors used by KPI-weighted soft-control scoring. # These are deterministic heuristics based on internal benchmark runs and @@ -94,6 +103,10 @@ "o1-mini": 0.60, "o3-mini": 0.78, } +_LATENCY_CANDIDATES: tuple[str, ...] = tuple(name for name in _PRICING_MODELS if name in _LATENCY_PRIORS) +_FASTEST_MODEL: str | None = ( + max(_LATENCY_CANDIDATES, key=lambda name: _LATENCY_PRIORS[name]) if _LATENCY_CANDIDATES else None +) # OpenAI-model allowlists used by the current OpenAI harness instrumentation. # Future provider instrumentation should provide provider-specific allowlists. 
@@ -165,35 +178,30 @@ def _extract_usage(response: Any) -> tuple[int, int]: def _model_total_cost(model: str) -> float: - in_cost, out_cost = _PRICING.get(model, _DEFAULT_PRICING) - return in_cost + out_cost + return _MODEL_TOTAL_COSTS.get(model, _DEFAULT_TOTAL_COST) def _select_cheaper_model(current_model: str) -> str: - cheapest = min(_PRICING.keys(), key=_model_total_cost) - if _model_total_cost(cheapest) < _model_total_cost(current_model): - return cheapest + if _model_total_cost(_CHEAPEST_MODEL) < _model_total_cost(current_model): + return _CHEAPEST_MODEL return current_model def _select_faster_model(current_model: str) -> str: - latency_candidates = [name for name in _PRICING if name in _LATENCY_PRIORS] - if not latency_candidates: + if _FASTEST_MODEL is None: return current_model - fastest = max(latency_candidates, key=lambda name: _LATENCY_PRIORS[name]) current_latency = _LATENCY_PRIORS.get(current_model, 0.7) - if _LATENCY_PRIORS[fastest] > current_latency: - return fastest + if _LATENCY_PRIORS[_FASTEST_MODEL] > current_latency: + return _FASTEST_MODEL return current_model def _select_lower_energy_model(current_model: str) -> str: - lowest_energy = min(_ENERGY_COEFFICIENTS.keys(), key=lambda name: _ENERGY_COEFFICIENTS[name]) - if _ENERGY_COEFFICIENTS.get(lowest_energy, _DEFAULT_ENERGY_COEFFICIENT) < _ENERGY_COEFFICIENTS.get( + if _ENERGY_COEFFICIENTS.get(_LOWEST_ENERGY_MODEL, _DEFAULT_ENERGY_COEFFICIENT) < _ENERGY_COEFFICIENTS.get( current_model, _DEFAULT_ENERGY_COEFFICIENT, ): - return lowest_energy + return _LOWEST_ENERGY_MODEL return current_model @@ -210,31 +218,20 @@ def _normalize_weights(weights: dict[str, float]) -> dict[str, float]: def _cost_utility(model: str) -> float: - costs = [_model_total_cost(name) for name in _PRICING] - if not costs: - return 0.0 model_cost = _model_total_cost(model) - min_cost = min(costs) - max_cost = max(costs) - if max_cost == min_cost: + if _MAX_TOTAL_COST == _MIN_TOTAL_COST: return 1.0 - return (max_cost - 
model_cost) / (max_cost - min_cost) + return (_MAX_TOTAL_COST - model_cost) / (_MAX_TOTAL_COST - _MIN_TOTAL_COST) def _energy_utility(model: str) -> float: - coeffs = list(_ENERGY_COEFFICIENTS.values()) - if not coeffs: - return 0.0 coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) - min_coeff = min(coeffs) - max_coeff = max(coeffs) - if max_coeff == min_coeff: + if _MAX_ENERGY_COEFF == _MIN_ENERGY_COEFF: return 1.0 - return (max_coeff - coeff) / (max_coeff - min_coeff) + return (_MAX_ENERGY_COEFF - coeff) / (_MAX_ENERGY_COEFF - _MIN_ENERGY_COEFF) -def _kpi_score(model: str, weights: dict[str, float]) -> float: - normalized = _normalize_weights(weights) +def _kpi_score_with_normalized(model: str, normalized: dict[str, float]) -> float: if not normalized: return 0.0 quality = _QUALITY_PRIORS.get(model, 0.7) @@ -249,11 +246,19 @@ def _kpi_score(model: str, weights: dict[str, float]) -> float: ) +def _kpi_score(model: str, weights: dict[str, float]) -> float: + normalized = _normalize_weights(weights) + return _kpi_score_with_normalized(model, normalized) + + def _select_kpi_weighted_model(current_model: str, weights: dict[str, float]) -> str: + normalized = _normalize_weights(weights) + if not normalized: + return current_model best_model = current_model - best_score = _kpi_score(current_model, weights) - for candidate in _PRICING: - score = _kpi_score(candidate, weights) + best_score = _kpi_score_with_normalized(current_model, normalized) + for candidate in _PRICING_MODELS: + score = _kpi_score_with_normalized(candidate, normalized) if score > best_score: best_model = candidate best_score = score From b54637be4ba91ecc6823e5d8bfe763b5adfb3a58 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 09:41:10 +0100 Subject: [PATCH 14/49] refactor(harness): unify pricing profiles across integrations --- cascadeflow/harness/instrument.py | 463 ++++++++-------------- cascadeflow/harness/pricing.py | 1 + cascadeflow/integrations/crewai.py | 46 
+-- cascadeflow/integrations/openai_agents.py | 46 +-- tests/test_harness_shared_pricing.py | 42 +- 5 files changed, 210 insertions(+), 388 deletions(-) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index 237b1174..e86fb1a9 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -23,6 +23,25 @@ from dataclasses import dataclass from typing import Any +from cascadeflow.harness.pricing import ( + DEFAULT_ENERGY_COEFFICIENT as _DEFAULT_ENERGY_COEFFICIENT, +) +from cascadeflow.harness.pricing import ( + ENERGY_COEFFICIENTS as _ENERGY_COEFFICIENTS, +) +from cascadeflow.harness.pricing import ( + OPENAI_MODEL_POOL as _PRICING_MODELS, +) +from cascadeflow.harness.pricing import ( + estimate_cost as _estimate_cost_shared, +) +from cascadeflow.harness.pricing import ( + estimate_energy as _estimate_energy_shared, +) +from cascadeflow.harness.pricing import ( + model_total_price as _model_total_price_shared, +) + logger = logging.getLogger("cascadeflow.harness.instrument") # --------------------------------------------------------------------------- @@ -33,50 +52,17 @@ _original_sync_create: Any = None _original_async_create: Any = None -# --------------------------------------------------------------------------- -# Pricing table (USD per 1M tokens: input, output) -# --------------------------------------------------------------------------- - -_PRICING: dict[str, tuple[float, float]] = { - "gpt-4o": (2.50, 10.00), - "gpt-4o-mini": (0.15, 0.60), - "gpt-5-mini": (0.20, 0.80), - "gpt-4-turbo": (10.00, 30.00), - "gpt-4": (30.00, 60.00), - "gpt-3.5-turbo": (0.50, 1.50), - "o1": (15.00, 60.00), - "o1-mini": (3.00, 12.00), - "o3-mini": (1.10, 4.40), -} -_DEFAULT_PRICING: tuple[float, float] = (2.50, 10.00) -_DEFAULT_TOTAL_COST: float = _DEFAULT_PRICING[0] + _DEFAULT_PRICING[1] -_MODEL_TOTAL_COSTS: dict[str, float] = {name: in_cost + out_cost for name, (in_cost, out_cost) in _PRICING.items()} -_PRICING_MODELS: 
tuple[str, ...] = tuple(_PRICING.keys()) +_MODEL_TOTAL_COSTS: dict[str, float] = {name: _model_total_price_shared(name) for name in _PRICING_MODELS} _CHEAPEST_MODEL: str = min(_MODEL_TOTAL_COSTS, key=_MODEL_TOTAL_COSTS.get) _MIN_TOTAL_COST: float = min(_MODEL_TOTAL_COSTS.values()) _MAX_TOTAL_COST: float = max(_MODEL_TOTAL_COSTS.values()) -# --------------------------------------------------------------------------- -# Energy estimation coefficients (deterministic proxy, not live carbon data) -# energy_units = coefficient * (input_tokens + output_tokens * output_weight) -# --------------------------------------------------------------------------- - -_ENERGY_COEFFICIENTS: dict[str, float] = { - "gpt-4o": 1.0, - "gpt-4o-mini": 0.3, - "gpt-5-mini": 0.35, - "gpt-4-turbo": 1.5, - "gpt-4": 1.5, - "gpt-3.5-turbo": 0.2, - "o1": 2.0, - "o1-mini": 0.8, - "o3-mini": 0.5, +_OPENAI_ENERGY_COEFFS: dict[str, float] = { + name: _ENERGY_COEFFICIENTS.get(name, _DEFAULT_ENERGY_COEFFICIENT) for name in _PRICING_MODELS } -_DEFAULT_ENERGY_COEFFICIENT: float = 1.0 -_ENERGY_OUTPUT_WEIGHT: float = 1.5 -_LOWEST_ENERGY_MODEL: str = min(_ENERGY_COEFFICIENTS, key=_ENERGY_COEFFICIENTS.get) -_MIN_ENERGY_COEFF: float = min(_ENERGY_COEFFICIENTS.values()) -_MAX_ENERGY_COEFF: float = max(_ENERGY_COEFFICIENTS.values()) +_LOWEST_ENERGY_MODEL: str = min(_OPENAI_ENERGY_COEFFS, key=_OPENAI_ENERGY_COEFFS.get) +_MIN_ENERGY_COEFF: float = min(_OPENAI_ENERGY_COEFFS.values()) +_MAX_ENERGY_COEFF: float = max(_OPENAI_ENERGY_COEFFS.values()) # Relative priors used by KPI-weighted soft-control scoring. 
# These are deterministic heuristics based on internal benchmark runs and @@ -140,16 +126,12 @@ def _ensure_stream_usage(kwargs: dict[str, Any]) -> dict[str, Any]: def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: """Estimate cost in USD from model name and token counts.""" - per_million = _PRICING.get(model, _DEFAULT_PRICING) - input_cost = (prompt_tokens / 1_000_000) * per_million[0] - output_cost = (completion_tokens / 1_000_000) * per_million[1] - return input_cost + output_cost + return _estimate_cost_shared(model, prompt_tokens, completion_tokens) def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: """Estimate energy units (deterministic proxy, not live carbon).""" - coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) - return coeff * (prompt_tokens + completion_tokens * _ENERGY_OUTPUT_WEIGHT) + return _estimate_energy_shared(model, prompt_tokens, completion_tokens) def _count_tool_calls_in_response(response: Any) -> int: @@ -178,7 +160,7 @@ def _extract_usage(response: Any) -> tuple[int, int]: def _model_total_cost(model: str) -> float: - return _MODEL_TOTAL_COSTS.get(model, _DEFAULT_TOTAL_COST) + return _MODEL_TOTAL_COSTS.get(model, _model_total_price_shared(model)) def _select_cheaper_model(current_model: str) -> str: @@ -277,7 +259,7 @@ def _select_compliant_model(current_model: str, compliance: str) -> str | None: return current_model if current_model in allowlist: return current_model - available = [name for name in _PRICING if name in allowlist] + available = [name for name in _PRICING_MODELS if name in allowlist] if not available: return None return min(available, key=_model_total_cost) @@ -487,8 +469,8 @@ def _update_context( # --------------------------------------------------------------------------- -class _InstrumentedStream: - """Wraps an OpenAI ``Stream`` to capture usage after all chunks are consumed.""" +class _InstrumentedStreamBase: + """Shared 
stream-wrapper logic for sync and async OpenAI streams.""" __slots__ = ( "_stream", @@ -530,35 +512,6 @@ def __init__( self._tool_call_count: int = 0 self._finalized: bool = False - # --- iteration --------------------------------------------------------- - - def __iter__(self) -> _InstrumentedStream: - return self - - def __next__(self) -> Any: - try: - chunk = next(self._stream) - self._inspect_chunk(chunk) - return chunk - except StopIteration: - self._finalize() - raise - - # --- context manager --------------------------------------------------- - - def __enter__(self) -> _InstrumentedStream: - if hasattr(self._stream, "__enter__"): - self._stream.__enter__() - return self - - def __exit__(self, *args: Any) -> bool: - self._finalize() - if hasattr(self._stream, "__exit__"): - return self._stream.__exit__(*args) # type: ignore[no-any-return] - return False - - # --- proxied attributes ------------------------------------------------ - def close(self) -> None: self._finalize() if hasattr(self._stream, "close"): @@ -568,8 +521,6 @@ def close(self) -> None: def response(self) -> Any: return getattr(self._stream, "response", None) - # --- internals --------------------------------------------------------- - def _inspect_chunk(self, chunk: Any) -> None: usage = getattr(chunk, "usage", None) if usage is not None: @@ -582,8 +533,8 @@ def _inspect_chunk(self, chunk: Any) -> None: tool_calls = getattr(delta, "tool_calls", None) if tool_calls: for tc in tool_calls: - # A new tool call has an ``id``; subsequent deltas - # for the same call only have ``index``. + # A new tool call has an ``id``; subsequent deltas for + # the same call only have ``index``. 
if getattr(tc, "id", None): self._tool_call_count += 1 @@ -617,50 +568,39 @@ def _finalize(self) -> None: ) -class _InstrumentedAsyncStream: - """Wraps an OpenAI ``AsyncStream`` to capture usage after consumption.""" +class _InstrumentedStream(_InstrumentedStreamBase): + """Wraps an OpenAI sync ``Stream`` and tracks usage at stream end.""" - __slots__ = ( - "_stream", - "_ctx", - "_model", - "_start_time", - "_pre_action", - "_pre_reason", - "_pre_model", - "_pre_applied", - "_decision_mode", - "_usage", - "_tool_call_count", - "_finalized", - ) + __slots__ = () - def __init__( - self, - stream: Any, - ctx: Any, - model: str, - start_time: float, - pre_action: str = "allow", - pre_reason: str = "observe", - pre_model: str | None = None, - pre_applied: bool = True, - decision_mode: str = "observe", - ) -> None: - self._stream = stream - self._ctx = ctx - self._model = model - self._start_time = start_time - self._pre_action = pre_action - self._pre_reason = pre_reason - self._pre_model = pre_model or model - self._pre_applied = pre_applied - self._decision_mode = decision_mode - self._usage: Any = None - self._tool_call_count: int = 0 - self._finalized: bool = False + def __iter__(self) -> _InstrumentedStream: + return self + + def __next__(self) -> Any: + try: + chunk = next(self._stream) + self._inspect_chunk(chunk) + return chunk + except StopIteration: + self._finalize() + raise + + def __enter__(self) -> _InstrumentedStream: + if hasattr(self._stream, "__enter__"): + self._stream.__enter__() + return self + + def __exit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__exit__"): + return self._stream.__exit__(*args) # type: ignore[no-any-return] + return False - # --- async iteration --------------------------------------------------- + +class _InstrumentedAsyncStream(_InstrumentedStreamBase): + """Wraps an OpenAI async ``AsyncStream`` and tracks usage at stream end.""" + + __slots__ = () def __aiter__(self) -> 
_InstrumentedAsyncStream: return self @@ -674,8 +614,6 @@ async def __anext__(self) -> Any: self._finalize() raise - # --- async context manager --------------------------------------------- - async def __aenter__(self) -> _InstrumentedAsyncStream: if hasattr(self._stream, "__aenter__"): await self._stream.__aenter__() @@ -687,67 +625,105 @@ async def __aexit__(self, *args: Any) -> bool: return await self._stream.__aexit__(*args) # type: ignore[no-any-return] return False - # --- proxied attributes ------------------------------------------------ - - def close(self) -> None: - self._finalize() - if hasattr(self._stream, "close"): - self._stream.close() - - @property - def response(self) -> Any: - return getattr(self._stream, "response", None) - # --- internals --------------------------------------------------------- +# --------------------------------------------------------------------------- +# Wrapper factories +# --------------------------------------------------------------------------- - def _inspect_chunk(self, chunk: Any) -> None: - usage = getattr(chunk, "usage", None) - if usage is not None: - self._usage = usage - choices = getattr(chunk, "choices", []) - if choices: - delta = getattr(choices[0], "delta", None) - if delta: - tool_calls = getattr(delta, "tool_calls", None) - if tool_calls: - for tc in tool_calls: - if getattr(tc, "id", None): - self._tool_call_count += 1 +@dataclass(frozen=True) +class _CallInterceptionState: + kwargs: dict[str, Any] + model: str + pre_action: str + pre_reason: str + pre_model: str + pre_applied: bool + is_stream: bool + start_time: float + + +def _prepare_call_interception( + *, + ctx: Any, + mode: str, + kwargs: dict[str, Any], +) -> _CallInterceptionState: + model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model + pre_applied = True + + if ctx: + kwargs, model, pre_action, pre_reason, pre_model, pre_applied = _resolve_pre_call_decision( + ctx, + mode, + model, + 
kwargs, + ) - def _finalize(self) -> None: - if self._finalized: - return - self._finalized = True + is_stream: bool = bool(kwargs.get("stream", False)) + kwargs = _ensure_stream_usage(kwargs) + + return _CallInterceptionState( + kwargs=kwargs, + model=model, + pre_action=pre_action, + pre_reason=pre_reason, + pre_model=pre_model, + pre_applied=pre_applied, + is_stream=is_stream, + start_time=time.monotonic(), + ) - if self._ctx is None: - return - elapsed_ms = (time.monotonic() - self._start_time) * 1000 - prompt_tokens = 0 - completion_tokens = 0 - if self._usage: - prompt_tokens = getattr(self._usage, "prompt_tokens", 0) or 0 - completion_tokens = getattr(self._usage, "completion_tokens", 0) or 0 +def _finalize_interception( + *, + ctx: Any, + mode: str, + state: _CallInterceptionState, + response: Any, + stream_wrapper: type[_InstrumentedStream] | type[_InstrumentedAsyncStream], +) -> Any: + if state.is_stream and ctx: + return stream_wrapper( + response, + ctx, + state.model, + state.start_time, + state.pre_action, + state.pre_reason, + state.pre_model, + state.pre_applied, + mode, + ) + if (not state.is_stream) and ctx: + elapsed_ms = (time.monotonic() - state.start_time) * 1000 + prompt_tokens, completion_tokens = _extract_usage(response) + tool_call_count = _count_tool_calls_in_response(response) _update_context( - self._ctx, - self._model, + ctx, + state.model, prompt_tokens, completion_tokens, - self._tool_call_count, + tool_call_count, elapsed_ms, - action=self._pre_action, - action_reason=self._pre_reason, - action_model=self._pre_model, - applied=self._pre_applied, - decision_mode=self._decision_mode, + action=state.pre_action, + action_reason=state.pre_reason, + action_model=state.pre_model, + applied=state.pre_applied, + decision_mode=mode, + ) + else: + logger.debug( + "harness %s: model=%s (no active run scope, metrics not tracked)", + mode, + state.model, ) - -# --------------------------------------------------------------------------- -# Wrapper 
factories -# --------------------------------------------------------------------------- + return response def _make_patched_create(original_fn: Any) -> Any: @@ -764,66 +740,24 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: if mode == "off": return original_fn(self, *args, **kwargs) - model: str = kwargs.get("model", "unknown") - pre_action = "allow" - pre_reason = mode - pre_model = model - pre_applied = True - is_stream: bool = bool(kwargs.get("stream", False)) - - if ctx: - kwargs, model, pre_action, pre_reason, pre_model, pre_applied = _resolve_pre_call_decision( - ctx, - mode, - model, - kwargs, - ) - - start_time = time.monotonic() - - kwargs = _ensure_stream_usage(kwargs) + state = _prepare_call_interception(ctx=ctx, mode=mode, kwargs=kwargs) - logger.debug("harness intercept: model=%s stream=%s mode=%s", model, is_stream, mode) - - response = original_fn(self, *args, **kwargs) + logger.debug( + "harness intercept: model=%s stream=%s mode=%s", + state.model, + state.is_stream, + mode, + ) - if is_stream and ctx: - return _InstrumentedStream( - response, - ctx, - model, - start_time, - pre_action, - pre_reason, - pre_model, - pre_applied, - mode, - ) - elif not is_stream and ctx: - elapsed_ms = (time.monotonic() - start_time) * 1000 - prompt_tokens, completion_tokens = _extract_usage(response) - tool_call_count = _count_tool_calls_in_response(response) - _update_context( - ctx, - model, - prompt_tokens, - completion_tokens, - tool_call_count, - elapsed_ms, - action=pre_action, - action_reason=pre_reason, - action_model=pre_model, - applied=pre_applied, - decision_mode=mode, - ) - else: - logger.debug( - "harness %s: model=%s (no active run scope, metrics not tracked)", - mode, - model, - ) + response = original_fn(self, *args, **state.kwargs) - return response + return _finalize_interception( + ctx=ctx, + mode=mode, + state=state, + response=response, + stream_wrapper=_InstrumentedStream, + ) return wrapper @@ -842,71 +776,24 @@ async def 
wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: if mode == "off": return await original_fn(self, *args, **kwargs) - model: str = kwargs.get("model", "unknown") - pre_action = "allow" - pre_reason = mode - pre_model = model - pre_applied = True - is_stream: bool = bool(kwargs.get("stream", False)) - - if ctx: - kwargs, model, pre_action, pre_reason, pre_model, pre_applied = _resolve_pre_call_decision( - ctx, - mode, - model, - kwargs, - ) - - start_time = time.monotonic() - - kwargs = _ensure_stream_usage(kwargs) + state = _prepare_call_interception(ctx=ctx, mode=mode, kwargs=kwargs) logger.debug( "harness intercept async: model=%s stream=%s mode=%s", - model, - is_stream, + state.model, + state.is_stream, mode, ) - response = await original_fn(self, *args, **kwargs) - - if is_stream and ctx: - return _InstrumentedAsyncStream( - response, - ctx, - model, - start_time, - pre_action, - pre_reason, - pre_model, - pre_applied, - mode, - ) - elif not is_stream and ctx: - elapsed_ms = (time.monotonic() - start_time) * 1000 - prompt_tokens, completion_tokens = _extract_usage(response) - tool_call_count = _count_tool_calls_in_response(response) - _update_context( - ctx, - model, - prompt_tokens, - completion_tokens, - tool_call_count, - elapsed_ms, - action=pre_action, - action_reason=pre_reason, - action_model=pre_model, - applied=pre_applied, - decision_mode=mode, - ) - else: - logger.debug( - "harness %s: model=%s (no active run scope, metrics not tracked)", - mode, - model, - ) + response = await original_fn(self, *args, **state.kwargs) - return response + return _finalize_interception( + ctx=ctx, + mode=mode, + state=state, + response=response, + stream_wrapper=_InstrumentedAsyncStream, + ) return wrapper diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py index bd86323e..dab445ae 100644 --- a/cascadeflow/harness/pricing.py +++ b/cascadeflow/harness/pricing.py @@ -76,3 +76,4 @@ def model_total_price(model: str) -> float: """Return total 
(input + output) price per 1M tokens.""" in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) return in_price + out_price + diff --git a/cascadeflow/integrations/crewai.py b/cascadeflow/integrations/crewai.py index 7ff765f0..16cbe6e0 100644 --- a/cascadeflow/integrations/crewai.py +++ b/cascadeflow/integrations/crewai.py @@ -22,55 +22,19 @@ from importlib.util import find_spec from typing import Any, Optional +from cascadeflow.harness.pricing import estimate_cost as _estimate_shared_cost +from cascadeflow.harness.pricing import estimate_energy as _estimate_shared_energy + logger = logging.getLogger("cascadeflow.integrations.crewai") CREWAI_AVAILABLE = find_spec("crewai") is not None -# --------------------------------------------------------------------------- -# Pricing table (USD per 1M tokens: input, output) -# Shared with instrument.py — kept small and self-contained to avoid -# cross-module coupling. A future pricing registry will deduplicate. -# --------------------------------------------------------------------------- - -_PRICING: dict[str, tuple[float, float]] = { - "gpt-4o": (2.50, 10.00), - "gpt-4o-mini": (0.15, 0.60), - "gpt-5-mini": (0.20, 0.80), - "gpt-4-turbo": (10.00, 30.00), - "gpt-4": (30.00, 60.00), - "gpt-3.5-turbo": (0.50, 1.50), - "o1": (15.00, 60.00), - "o1-mini": (3.00, 12.00), - "o3-mini": (1.10, 4.40), - "claude-sonnet-4": (3.00, 15.00), - "claude-haiku-3.5": (1.00, 5.00), - "claude-opus-4.5": (5.00, 25.00), -} -_DEFAULT_PRICING: tuple[float, float] = (2.50, 10.00) - -_ENERGY_COEFFICIENTS: dict[str, float] = { - "gpt-4o": 1.0, - "gpt-4o-mini": 0.3, - "gpt-5-mini": 0.35, - "gpt-4-turbo": 1.5, - "gpt-4": 1.5, - "gpt-3.5-turbo": 0.2, - "o1": 2.0, - "o1-mini": 0.8, - "o3-mini": 0.5, -} -_DEFAULT_ENERGY_COEFFICIENT: float = 1.0 -_ENERGY_OUTPUT_WEIGHT: float = 1.5 - - def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: - per_million = _PRICING.get(model, _DEFAULT_PRICING) - return 
(prompt_tokens / 1_000_000) * per_million[0] + (completion_tokens / 1_000_000) * per_million[1] + return _estimate_shared_cost(model, prompt_tokens, completion_tokens) def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: - coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) - return coeff * (prompt_tokens + completion_tokens * _ENERGY_OUTPUT_WEIGHT) + return _estimate_shared_energy(model, prompt_tokens, completion_tokens) def _extract_message_content(message: Any) -> str: diff --git a/cascadeflow/integrations/openai_agents.py b/cascadeflow/integrations/openai_agents.py index 1205cd98..fe52d4d4 100644 --- a/cascadeflow/integrations/openai_agents.py +++ b/cascadeflow/integrations/openai_agents.py @@ -15,6 +15,18 @@ from typing import TYPE_CHECKING, Any, AsyncIterator, Optional from cascadeflow.harness import get_current_run +from cascadeflow.harness.pricing import ( + OPENAI_MODEL_POOL, +) +from cascadeflow.harness.pricing import ( + estimate_cost as _estimate_shared_cost, +) +from cascadeflow.harness.pricing import ( + estimate_energy as _estimate_shared_energy, +) +from cascadeflow.harness.pricing import ( + model_total_price as _shared_model_total_price, +) from cascadeflow.schema.exceptions import BudgetExceededError logger = logging.getLogger("cascadeflow.harness.openai_agents") @@ -57,40 +69,16 @@ class OpenAIAgentsIntegrationConfig: fail_open: bool = True -# Approximate pricing (USD per 1M tokens: input, output). -_PRICING_USD_PER_M = { - "gpt-4o": (2.50, 10.00), - "gpt-4o-mini": (0.15, 0.60), - "gpt-5": (1.25, 10.00), - "gpt-5-mini": (0.20, 0.80), - "gpt-4-turbo": (10.00, 30.00), -} -_DEFAULT_PRICING_USD_PER_M = (2.50, 10.00) - -# Deterministic proxy coefficients for energy tracking. 
-_ENERGY_COEFFICIENTS = { - "gpt-4o": 1.0, - "gpt-4o-mini": 0.3, - "gpt-5": 1.2, - "gpt-5-mini": 0.35, - "gpt-4-turbo": 1.5, -} -_DEFAULT_ENERGY_COEFFICIENT = 1.0 -_ENERGY_OUTPUT_WEIGHT = 1.5 - - def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: - in_price, out_price = _PRICING_USD_PER_M.get(model, _DEFAULT_PRICING_USD_PER_M) - return (input_tokens / 1_000_000.0) * in_price + (output_tokens / 1_000_000.0) * out_price + return _estimate_shared_cost(model, input_tokens, output_tokens) def _estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: - coefficient = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) - return coefficient * (input_tokens + (output_tokens * _ENERGY_OUTPUT_WEIGHT)) + return _estimate_shared_energy(model, input_tokens, output_tokens) def _total_model_price(model: str) -> float: - return sum(_PRICING_USD_PER_M.get(model, _DEFAULT_PRICING_USD_PER_M)) + return _shared_model_total_price(model) def _extract_usage_tokens(usage: Any) -> tuple[int, int]: @@ -228,8 +216,10 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: # Under budget pressure, switch to the cheapest configured candidate. 
if run.budget_remaining / run.budget_max < 0.2: + compatible_candidates = [name for name in self._config.model_candidates if name in OPENAI_MODEL_POOL] + candidates = compatible_candidates or self._config.model_candidates cheapest = min( - self._config.model_candidates, + candidates, key=_total_model_price, ) if cheapest != candidate: diff --git a/tests/test_harness_shared_pricing.py b/tests/test_harness_shared_pricing.py index a26398f3..fb693226 100644 --- a/tests/test_harness_shared_pricing.py +++ b/tests/test_harness_shared_pricing.py @@ -7,12 +7,7 @@ import cascadeflow.harness.instrument as instrument_mod import cascadeflow.integrations.crewai as crewai_mod import cascadeflow.integrations.openai_agents as openai_agents_mod -from cascadeflow.harness.pricing import ( - OPENAI_MODEL_POOL, - estimate_cost, - estimate_energy, - model_total_price, -) +from cascadeflow.harness.pricing import OPENAI_MODEL_POOL, estimate_cost, estimate_energy, model_total_price def test_shared_estimate_cost_known_models() -> None: @@ -40,31 +35,16 @@ def test_integration_estimators_use_shared_profiles() -> None: shared_cost = estimate_cost(model, input_tokens, output_tokens) shared_energy = estimate_energy(model, input_tokens, output_tokens) - assert instrument_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx( - shared_cost - ) - assert crewai_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx( - shared_cost - ) - assert openai_agents_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx( - shared_cost - ) - - assert instrument_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx( - shared_energy - ) - assert crewai_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx( - shared_energy - ) - assert openai_agents_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx( - shared_energy - ) + assert instrument_mod._estimate_cost(model, input_tokens, output_tokens) == 
pytest.approx(shared_cost) + assert crewai_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx(shared_cost) + assert openai_agents_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx(shared_cost) + + assert instrument_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx(shared_energy) + assert crewai_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx(shared_energy) + assert openai_agents_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx(shared_energy) def test_openai_agents_total_price_uses_shared_profiles() -> None: - assert openai_agents_mod._total_model_price("gpt-5") == pytest.approx( - model_total_price("gpt-5") - ) - assert openai_agents_mod._total_model_price("gpt-4o-mini") == pytest.approx( - model_total_price("gpt-4o-mini") - ) + assert openai_agents_mod._total_model_price("gpt-5") == pytest.approx(model_total_price("gpt-5")) + assert openai_agents_mod._total_model_price("gpt-4o-mini") == pytest.approx(model_total_price("gpt-4o-mini")) + From 6afcfa73b6bb5b1e58f3c5a8315bd4f51a26994a Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 09:55:08 +0100 Subject: [PATCH 15/49] docs(plan): claim langchain harness extension branch --- docs/strategy/agent-intelligence-v2-plan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index b03d8a58..8bcf8743 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -847,7 +847,7 @@ Claim checklist (one owner per branch at a time): - [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` - [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] 
`feat/v2-langchain-harness-extension` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [ ] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-bench-repro-pipeline` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` From cc51cf7a78551f50b1cf49c3318c220f8bddffaa Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 11:18:57 +0100 Subject: [PATCH 16/49] feat(harness): add privacy-safe decision telemetry and callback hooks --- cascadeflow/__init__.py | 4 + cascadeflow/harness/__init__.py | 4 + cascadeflow/harness/api.py | 124 +++++++++++++++++++- docs/README.md | 1 + docs/guides/harness_telemetry_privacy.md | 59 ++++++++++ docs/strategy/agent-intelligence-v2-plan.md | 2 +- tests/test_harness_api.py | 73 ++++++++++++ 7 files changed, 261 insertions(+), 6 deletions(-) create mode 100644 docs/guides/harness_telemetry_privacy.md diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py index d49eb644..f2738abc 100644 --- a/cascadeflow/__init__.py +++ b/cascadeflow/__init__.py @@ -250,6 +250,8 @@ agent as harness_agent, get_harness_config, get_current_run, + get_harness_callback_manager, + set_harness_callback_manager, ) # ==================== MAIN AGENT & RESULT ==================== @@ -408,6 +410,8 @@ >>>>>>> 1aba349 (Add core harness API scaffold with context-scoped runtime) "get_harness_config", "get_current_run", + "get_harness_callback_manager", + "set_harness_callback_manager", # ===== PROVIDERS ===== "ModelResponse", "BaseProvider", diff --git a/cascadeflow/harness/__init__.py b/cascadeflow/harness/__init__.py index 43a03662..74c07219 100644 --- a/cascadeflow/harness/__init__.py +++ b/cascadeflow/harness/__init__.py @@ -14,11 +14,13 @@ 
HarnessInitReport, HarnessRunContext, agent, + get_harness_callback_manager, get_current_run, get_harness_config, init, reset, run, + set_harness_callback_manager, ) __all__ = [ @@ -29,6 +31,8 @@ "run", "agent", "get_current_run", + "get_harness_callback_manager", "get_harness_config", + "set_harness_callback_manager", "reset", ] diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 9d003ee1..79617f39 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -4,6 +4,7 @@ import json import logging import os +import time from contextvars import ContextVar, Token from dataclasses import dataclass, field from importlib.util import find_spec @@ -40,6 +41,8 @@ class HarnessInitReport: @dataclass class HarnessRunContext: run_id: str = field(default_factory=lambda: uuid4().hex[:12]) + started_at_ms: float = field(default_factory=lambda: time.time() * 1000) + ended_at_ms: Optional[float] = None mode: HarnessMode = "off" budget_max: Optional[float] = None tool_calls_max: Optional[int] = None @@ -71,6 +74,8 @@ def __enter__(self) -> HarnessRunContext: return self def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None: + self.ended_at_ms = time.time() * 1000 + self._log_summary() if self._token is not None: _current_run.reset(self._token) self._token = None @@ -84,6 +89,47 @@ async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None: def trace(self) -> list[dict[str, Any]]: return list(self._trace) + def summary(self) -> dict[str, Any]: + duration_ms: Optional[float] = None + if self.ended_at_ms is not None: + duration_ms = max(0.0, self.ended_at_ms - self.started_at_ms) + return { + "run_id": self.run_id, + "mode": self.mode, + "step_count": self.step_count, + "tool_calls": self.tool_calls, + "cost": self.cost, + "savings": self.savings, + "latency_used_ms": self.latency_used_ms, + "energy_used": self.energy_used, + "budget_max": self.budget_max, + "budget_remaining": self.budget_remaining, + "last_action": 
self.last_action, + "model_used": self.model_used, + "duration_ms": duration_ms, + } + + def _log_summary(self) -> None: + if self.mode == "off" or self.step_count <= 0: + return + logger.info( + ( + "harness run summary run_id=%s mode=%s steps=%d tool_calls=%d " + "cost=%.6f latency_ms=%.2f energy=%.4f last_action=%s model=%s " + "budget_remaining=%s" + ), + self.run_id, + self.mode, + self.step_count, + self.tool_calls, + self.cost, + self.latency_used_ms, + self.energy_used, + self.last_action, + self.model_used, + self.budget_remaining, + ) + def record( self, action: str, @@ -93,24 +139,41 @@ def record( applied: Optional[bool] = None, decision_mode: Optional[str] = None, ) -> None: - self.last_action = action - self.model_used = model + safe_action = _sanitize_trace_value(action, max_length=64) or "allow" + safe_reason = _sanitize_trace_value(reason, max_length=160) or "unspecified" + safe_model = _sanitize_trace_value(model, max_length=128) if model is not None else None + + self.last_action = safe_action + self.model_used = safe_model entry: dict[str, Any] = { - "action": action, - "reason": reason, - "model": model, + "action": safe_action, + "reason": safe_reason, + "model": safe_model, "run_id": self.run_id, + "mode": self.mode, + "step": self.step_count, + "timestamp_ms": time.time() * 1000, + "tool_calls_total": self.tool_calls, + "cost_total": self.cost, + "latency_used_ms": self.latency_used_ms, + "energy_used": self.energy_used, + "budget_state": { + "max": self.budget_max, + "remaining": self.budget_remaining, + }, } if applied is not None: entry["applied"] = applied if decision_mode is not None: entry["decision_mode"] = decision_mode self._trace.append(entry) + _emit_harness_decision(entry) _harness_config: HarnessConfig = HarnessConfig() _current_run: ContextVar[Optional[HarnessRunContext]] = ContextVar("cascadeflow_harness_run", default=None) _is_instrumented: bool = False +_harness_callback_manager: Any = None _UNSET = object() @@ -135,6 
+198,15 @@ def get_current_run() -> Optional[HarnessRunContext]: return _current_run.get() +def get_harness_callback_manager() -> Any: + return _harness_callback_manager + + +def set_harness_callback_manager(callback_manager: Any) -> None: + global _harness_callback_manager + _harness_callback_manager = callback_manager + + def reset() -> None: """ Reset harness global state and unpatch instrumented clients. @@ -144,15 +216,53 @@ def reset() -> None: global _harness_config global _is_instrumented + global _harness_callback_manager from cascadeflow.harness.instrument import unpatch_openai unpatch_openai() _harness_config = HarnessConfig() _is_instrumented = False + _harness_callback_manager = None _current_run.set(None) +def _sanitize_trace_value(value: Any, *, max_length: int) -> Optional[str]: + if value is None: + return None + text = str(value).replace("\n", " ").replace("\r", " ").strip() + if len(text) > max_length: + text = text[: max_length - 3] + "..." + return text + + +def _emit_harness_decision(entry: dict[str, Any]) -> None: + manager = get_harness_callback_manager() + if manager is None: + return + + trigger = getattr(manager, "trigger", None) + if not callable(trigger): + logger.debug("harness callback manager has no trigger() method") + return + + try: + from cascadeflow.telemetry.callbacks import CallbackEvent + except Exception: + logger.debug("telemetry callbacks unavailable for harness decision emit", exc_info=True) + return + + try: + trigger( + CallbackEvent.CASCADE_DECISION, + query="[harness]", + data=dict(entry), + workflow="harness", + ) + except Exception: + logger.debug("failed to emit harness decision callback", exc_info=True) + + def _parse_bool(raw: str) -> bool: normalized = raw.strip().lower() return normalized in {"1", "true", "yes", "on"} @@ -301,6 +411,7 @@ def init( kpi_targets: Optional[dict[str, float]] | object = _UNSET, kpi_weights: Optional[dict[str, float]] | object = _UNSET, compliance: Optional[str] | object = _UNSET, + 
callback_manager: Any | object = _UNSET, ) -> HarnessInitReport: """ Initialize global harness settings. @@ -334,6 +445,9 @@ def init( resolved_compliance = _resolve_value( "compliance", compliance, env_config, file_config, None, sources ) + if callback_manager is not _UNSET: + set_harness_callback_manager(callback_manager) + sources["callback_manager"] = "code" validated_mode = _validate_mode(str(resolved_mode)) _harness_config = HarnessConfig( diff --git a/docs/README.md b/docs/README.md index 1238d7f8..b9cedf66 100644 --- a/docs/README.md +++ b/docs/README.md @@ -20,6 +20,7 @@ Welcome to cascadeflow documentation! 🌊 - [Tools](guides/tools.md) - Function calling and tool usage with cascades - [Agentic Patterns (Python)](guides/agentic-python.md) - Tool loops and multi-agent orchestration in Python - [Agentic Patterns (TypeScript)](guides/agentic-typescript.md) - Tool loops, multi-agent orchestration, and message best practices +- [Harness Telemetry & Privacy](guides/harness_telemetry_privacy.md) - Decision traces, callbacks, and privacy-safe observability - [Cost Tracking](guides/cost_tracking.md) - Track and analyze API costs across queries - [Proxy Routing](guides/proxy.md) - Route requests through provider-aware proxy plans diff --git a/docs/guides/harness_telemetry_privacy.md b/docs/guides/harness_telemetry_privacy.md new file mode 100644 index 00000000..01e75402 --- /dev/null +++ b/docs/guides/harness_telemetry_privacy.md @@ -0,0 +1,59 @@ +# Harness Telemetry and Privacy + +Use this guide when you want harness observability without leaking user content. + +## What the Harness Records + +Each `run.trace()` decision entry includes: + +- `action`, `reason`, `model` +- `run_id`, `mode`, `step`, `timestamp_ms` +- `cost_total`, `latency_used_ms`, `energy_used`, `tool_calls_total` +- `budget_state` (`max`, `remaining`) +- `applied`, `decision_mode` (when available) + +The trace is scoped to the current `run()` context. 
+ +## What the Harness Does Not Record + +By default, harness decision traces do not include: + +- raw prompts or user messages +- model response text +- tool argument payloads + +This keeps decision telemetry focused on policy/routing state instead of request content. + +## Callback Emission (Optional) + +If you provide a callback manager, each harness decision emits `CallbackEvent.CASCADE_DECISION`. + +```python +from cascadeflow import init, run +from cascadeflow.telemetry.callbacks import CallbackEvent, CallbackManager + +manager = CallbackManager() + +def on_decision(event): + print(event.data["action"], event.data["model"]) + +manager.register(CallbackEvent.CASCADE_DECISION, on_decision) + +init(mode="observe", callback_manager=manager) + +with run(budget=1.0) as r: + ... +``` + +The emitted callback uses `query="[harness]"` and `workflow="harness"` to avoid passing user prompt content. + +## Per-Run Summary Logging + +When a scoped run exits (and recorded at least one step), the harness logs a summary on logger `cascadeflow.harness`: + +- run id, mode, steps, tool calls +- cost/latency/energy totals +- last action/model +- remaining budget + +Use standard Python logging controls to direct this to your existing log sink. 
diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 8bcf8743..73bfec1b 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -850,7 +850,7 @@ Claim checklist (one owner per branch at a time): - [ ] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-bench-repro-pipeline` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` Merge gates per feature branch: - [ ] Unit/integration tests green for touched scope diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 183a4350..087fa692 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -5,6 +5,7 @@ import cascadeflow import cascadeflow.harness.api as harness_api from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run +from cascadeflow.telemetry.callbacks import CallbackEvent, CallbackManager def setup_function() -> None: @@ -171,6 +172,8 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.reset) assert callable(cascadeflow.run) assert callable(cascadeflow.agent) + assert callable(cascadeflow.get_harness_callback_manager) + assert callable(cascadeflow.set_harness_callback_manager) report = cascadeflow.init(mode="off") assert report.mode == "off" @@ -182,6 +185,8 @@ def test_run_record_and_trace_copy(): trace_b = ctx.trace() assert trace_a == trace_b assert trace_a[0]["action"] == "switch_model" + assert "budget_state" in trace_a[0] + assert trace_a[0]["budget_state"]["max"] == 1.0 trace_a.append({"action": "mutated"}) assert len(ctx.trace()) 
== 1 @@ -326,3 +331,71 @@ def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch): monkeypatch.setattr(instrument, "patch_openai", lambda: True) report = init(mode="observe") assert report.instrumented == ["openai"] + + +def test_run_summary_populates_on_context_exit(): + init(mode="observe") + with run(budget=1.5) as ctx: + ctx.step_count = 2 + ctx.tool_calls = 1 + ctx.cost = 0.42 + ctx.latency_used_ms = 123.0 + ctx.energy_used = 33.0 + ctx.budget_remaining = 1.08 + ctx.last_action = "allow" + ctx.model_used = "gpt-4o-mini" + + summary = ctx.summary() + assert summary["run_id"] == ctx.run_id + assert summary["step_count"] == 2 + assert summary["budget_remaining"] == pytest.approx(1.08) + assert summary["duration_ms"] is not None + assert summary["duration_ms"] >= 0.0 + + +def test_run_context_logs_summary(caplog): + init(mode="observe") + with caplog.at_level("INFO", logger="cascadeflow.harness"): + with run(budget=1.0) as ctx: + ctx.step_count = 1 + ctx.cost = 0.01 + ctx.model_used = "gpt-4o-mini" + + assert any("harness run summary" in rec.message for rec in caplog.records) + + +def test_record_emits_cascade_decision_callback(): + manager = CallbackManager() + received = [] + + def _on_decision(data): + received.append(data) + + manager.register(CallbackEvent.CASCADE_DECISION, _on_decision) + report = init(mode="observe", callback_manager=manager) + assert report.config_sources["callback_manager"] == "code" + + with run(budget=1.0) as ctx: + ctx.step_count = 1 + ctx.record(action="switch_model", reason="budget_pressure", model="gpt-4o-mini") + + assert len(received) == 1 + event = received[0] + assert event.event == CallbackEvent.CASCADE_DECISION + assert event.query == "[harness]" + assert event.workflow == "harness" + assert event.data["action"] == "switch_model" + assert event.data["run_id"] == ctx.run_id + + +def test_record_sanitizes_trace_values(): + ctx = run() + ctx.record( + action="allow\nnewline", + reason="a" * 400, + 
model="model\r\nname", + ) + entry = ctx.trace()[0] + assert "\n" not in entry["action"] + assert "\r" not in entry["model"] + assert len(entry["reason"]) <= 160 From ae1cf97b53339e7fb284bf22e2b921b4c8baed6a Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 08:46:42 +0100 Subject: [PATCH 17/49] fix(harness): address telemetry review findings - Use time.monotonic() for duration_ms calculation instead of wall-clock delta (avoids NTP/suspend clock jumps) - Extract sanitize constants (_MAX_ACTION_LEN, _MAX_REASON_LEN, _MAX_MODEL_LEN) - Log warning when record() receives empty action (was silently defaulting) - Cache CallbackEvent import in _emit_harness_decision for hot-path perf - Add tests: no-callback-manager noop, empty-action warning, duration field --- cascadeflow/harness/api.py | 45 ++++++++++++++++++++++++++------------ tests/test_harness_api.py | 19 ++++++++++++++++ 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 79617f39..6039cc00 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -41,8 +41,10 @@ class HarnessInitReport: @dataclass class HarnessRunContext: run_id: str = field(default_factory=lambda: uuid4().hex[:12]) + _started_monotonic: float = field(default_factory=time.monotonic, init=False, repr=False) started_at_ms: float = field(default_factory=lambda: time.time() * 1000) ended_at_ms: Optional[float] = None + duration_ms: Optional[float] = None mode: HarnessMode = "off" budget_max: Optional[float] = None tool_calls_max: Optional[int] = None @@ -75,6 +77,7 @@ def __enter__(self) -> HarnessRunContext: def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None: self.ended_at_ms = time.time() * 1000 + self.duration_ms = max(0.0, (time.monotonic() - self._started_monotonic) * 1000.0) self._log_summary() if self._token is not None: _current_run.reset(self._token) @@ -90,9 +93,6 @@ def trace(self) -> list[dict[str, Any]]: return 
list(self._trace) def summary(self) -> dict[str, Any]: - duration_ms: Optional[float] = None - if self.ended_at_ms is not None: - duration_ms = max(0.0, self.ended_at_ms - self.started_at_ms) return { "run_id": self.run_id, "mode": self.mode, @@ -106,7 +106,7 @@ def summary(self) -> dict[str, Any]: "budget_remaining": self.budget_remaining, "last_action": self.last_action, "model_used": self.model_used, - "duration_ms": duration_ms, + "duration_ms": self.duration_ms, } def _log_summary(self) -> None: @@ -139,9 +139,12 @@ def record( applied: Optional[bool] = None, decision_mode: Optional[str] = None, ) -> None: - safe_action = _sanitize_trace_value(action, max_length=64) or "allow" - safe_reason = _sanitize_trace_value(reason, max_length=160) or "unspecified" - safe_model = _sanitize_trace_value(model, max_length=128) if model is not None else None + safe_action = _sanitize_trace_value(action, max_length=_MAX_ACTION_LEN) + if not safe_action: + logger.warning("record() called with empty action, defaulting to 'allow'") + safe_action = "allow" + safe_reason = _sanitize_trace_value(reason, max_length=_MAX_REASON_LEN) or "unspecified" + safe_model = _sanitize_trace_value(model, max_length=_MAX_MODEL_LEN) if model is not None else None self.last_action = safe_action self.model_used = safe_model @@ -217,6 +220,7 @@ def reset() -> None: global _harness_config global _is_instrumented global _harness_callback_manager + global _cached_cascade_decision_event from cascadeflow.harness.instrument import unpatch_openai @@ -224,19 +228,30 @@ def reset() -> None: _harness_config = HarnessConfig() _is_instrumented = False _harness_callback_manager = None + _cached_cascade_decision_event = None _current_run.set(None) +_MAX_ACTION_LEN = 64 +_MAX_REASON_LEN = 160 +_MAX_MODEL_LEN = 128 + + def _sanitize_trace_value(value: Any, *, max_length: int) -> Optional[str]: if value is None: return None text = str(value).replace("\n", " ").replace("\r", " ").strip() if len(text) > max_length: 
text = text[: max_length - 3] + "..." - return text + return text or None + + +_cached_cascade_decision_event: Any = None def _emit_harness_decision(entry: dict[str, Any]) -> None: + global _cached_cascade_decision_event + manager = get_harness_callback_manager() if manager is None: return @@ -246,15 +261,17 @@ def _emit_harness_decision(entry: dict[str, Any]) -> None: logger.debug("harness callback manager has no trigger() method") return - try: - from cascadeflow.telemetry.callbacks import CallbackEvent - except Exception: - logger.debug("telemetry callbacks unavailable for harness decision emit", exc_info=True) - return + if _cached_cascade_decision_event is None: + try: + from cascadeflow.telemetry.callbacks import CallbackEvent + _cached_cascade_decision_event = CallbackEvent.CASCADE_DECISION + except Exception: + logger.debug("telemetry callbacks unavailable for harness decision emit", exc_info=True) + return try: trigger( - CallbackEvent.CASCADE_DECISION, + _cached_cascade_decision_event, query="[harness]", data=dict(entry), workflow="harness", diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 087fa692..937ab865 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -351,6 +351,8 @@ def test_run_summary_populates_on_context_exit(): assert summary["budget_remaining"] == pytest.approx(1.08) assert summary["duration_ms"] is not None assert summary["duration_ms"] >= 0.0 + assert ctx.duration_ms is not None + assert ctx.duration_ms >= 0.0 def test_run_context_logs_summary(caplog): @@ -399,3 +401,20 @@ def test_record_sanitizes_trace_values(): assert "\n" not in entry["action"] assert "\r" not in entry["model"] assert len(entry["reason"]) <= 160 + + +def test_record_without_callback_manager_is_noop(): + init(mode="observe") + with run(budget=1.0) as ctx: + ctx.record(action="allow", reason="test", model="gpt-4o-mini") + assert len(ctx.trace()) == 1 + + +def test_record_empty_action_warns_and_defaults(caplog): + 
init(mode="observe") + with caplog.at_level("WARNING", logger="cascadeflow.harness"): + with run(budget=1.0) as ctx: + ctx.record(action="", reason="test", model="gpt-4o-mini") + entry = ctx.trace()[0] + assert entry["action"] == "allow" + assert any("empty action" in rec.message for rec in caplog.records) From 49ee6015ba1e7e2c43a002e5540d0f7f74686eee Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 11:35:33 +0100 Subject: [PATCH 18/49] fix(harness): avoid shadowing cascadeflow.agent module --- cascadeflow/__init__.py | 4 ---- tests/test_harness_api.py | 3 ++- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py index f2738abc..b9bc7682 100644 --- a/cascadeflow/__init__.py +++ b/cascadeflow/__init__.py @@ -403,11 +403,7 @@ "init", "reset", "run", -<<<<<<< HEAD "harness_agent", -======= - "agent", ->>>>>>> 1aba349 (Add core harness API scaffold with context-scoped runtime) "get_harness_config", "get_current_run", "get_harness_callback_manager", diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 937ab865..fd89e590 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -171,7 +171,8 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.init) assert callable(cascadeflow.reset) assert callable(cascadeflow.run) - assert callable(cascadeflow.agent) + assert callable(cascadeflow.harness_agent) + assert hasattr(cascadeflow.agent, "PROVIDER_REGISTRY") assert callable(cascadeflow.get_harness_callback_manager) assert callable(cascadeflow.set_harness_callback_manager) report = cascadeflow.init(mode="off") From c1236f1340213320916e1015ff8599c568a00f37 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 12:51:17 +0100 Subject: [PATCH 19/49] style: apply black formatting for harness integration files --- cascadeflow/harness/api.py | 17 ++++++-- cascadeflow/harness/instrument.py | 24 ++++++++--- cascadeflow/harness/pricing.py | 1 - 
cascadeflow/integrations/crewai.py | 7 ++-- cascadeflow/integrations/openai_agents.py | 8 +++- .../integrations/openai_agents_harness.py | 6 ++- tests/test_crewai_integration.py | 4 +- tests/test_harness_instrument.py | 12 +++++- tests/test_harness_shared_pricing.py | 42 ++++++++++++++----- 9 files changed, 88 insertions(+), 33 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 6039cc00..f545d73d 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -65,7 +65,9 @@ class HarnessRunContext: last_action: str = "allow" draft_accepted: Optional[bool] = None _trace: list[dict[str, Any]] = field(default_factory=list) - _token: Optional[Token[Optional[HarnessRunContext]]] = field(default=None, init=False, repr=False) + _token: Optional[Token[Optional[HarnessRunContext]]] = field( + default=None, init=False, repr=False + ) def __post_init__(self) -> None: if self.budget_max is not None and self.budget_remaining is None: @@ -144,7 +146,9 @@ def record( logger.warning("record() called with empty action, defaulting to 'allow'") safe_action = "allow" safe_reason = _sanitize_trace_value(reason, max_length=_MAX_REASON_LEN) or "unspecified" - safe_model = _sanitize_trace_value(model, max_length=_MAX_MODEL_LEN) if model is not None else None + safe_model = ( + _sanitize_trace_value(model, max_length=_MAX_MODEL_LEN) if model is not None else None + ) self.last_action = safe_action self.model_used = safe_model @@ -174,7 +178,9 @@ def record( _harness_config: HarnessConfig = HarnessConfig() -_current_run: ContextVar[Optional[HarnessRunContext]] = ContextVar("cascadeflow_harness_run", default=None) +_current_run: ContextVar[Optional[HarnessRunContext]] = ContextVar( + "cascadeflow_harness_run", default=None +) _is_instrumented: bool = False _harness_callback_manager: Any = None _UNSET = object() @@ -264,6 +270,7 @@ def _emit_harness_decision(entry: dict[str, Any]) -> None: if _cached_cascade_decision_event is None: try: from 
cascadeflow.telemetry.callbacks import CallbackEvent + _cached_cascade_decision_event = CallbackEvent.CASCADE_DECISION except Exception: logger.debug("telemetry callbacks unavailable for harness decision emit", exc_info=True) @@ -452,7 +459,9 @@ def init( resolved_max_latency_ms = _resolve_value( "max_latency_ms", max_latency_ms, env_config, file_config, None, sources ) - resolved_max_energy = _resolve_value("max_energy", max_energy, env_config, file_config, None, sources) + resolved_max_energy = _resolve_value( + "max_energy", max_energy, env_config, file_config, None, sources + ) resolved_kpi_targets = _resolve_value( "kpi_targets", kpi_targets, env_config, file_config, None, sources ) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index e86fb1a9..c2fbd7ab 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -52,7 +52,9 @@ _original_sync_create: Any = None _original_async_create: Any = None -_MODEL_TOTAL_COSTS: dict[str, float] = {name: _model_total_price_shared(name) for name in _PRICING_MODELS} +_MODEL_TOTAL_COSTS: dict[str, float] = { + name: _model_total_price_shared(name) for name in _PRICING_MODELS +} _CHEAPEST_MODEL: str = min(_MODEL_TOTAL_COSTS, key=_MODEL_TOTAL_COSTS.get) _MIN_TOTAL_COST: float = min(_MODEL_TOTAL_COSTS.values()) _MAX_TOTAL_COST: float = max(_MODEL_TOTAL_COSTS.values()) @@ -89,9 +91,13 @@ "o1-mini": 0.60, "o3-mini": 0.78, } -_LATENCY_CANDIDATES: tuple[str, ...] = tuple(name for name in _PRICING_MODELS if name in _LATENCY_PRIORS) +_LATENCY_CANDIDATES: tuple[str, ...] = tuple( + name for name in _PRICING_MODELS if name in _LATENCY_PRIORS +) _FASTEST_MODEL: str | None = ( - max(_LATENCY_CANDIDATES, key=lambda name: _LATENCY_PRIORS[name]) if _LATENCY_CANDIDATES else None + max(_LATENCY_CANDIDATES, key=lambda name: _LATENCY_PRIORS[name]) + if _LATENCY_CANDIDATES + else None ) # OpenAI-model allowlists used by the current OpenAI harness instrumentation. 
@@ -179,7 +185,9 @@ def _select_faster_model(current_model: str) -> str: def _select_lower_energy_model(current_model: str) -> str: - if _ENERGY_COEFFICIENTS.get(_LOWEST_ENERGY_MODEL, _DEFAULT_ENERGY_COEFFICIENT) < _ENERGY_COEFFICIENTS.get( + if _ENERGY_COEFFICIENTS.get( + _LOWEST_ENERGY_MODEL, _DEFAULT_ENERGY_COEFFICIENT + ) < _ENERGY_COEFFICIENTS.get( current_model, _DEFAULT_ENERGY_COEFFICIENT, ): @@ -277,7 +285,9 @@ def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCa return _PreCallDecision(action="stop", reason="budget_exceeded", target_model=model) if has_tools and ctx.tool_calls_max is not None and ctx.tool_calls >= ctx.tool_calls_max: - return _PreCallDecision(action="deny_tool", reason="max_tool_calls_reached", target_model=model) + return _PreCallDecision( + action="deny_tool", reason="max_tool_calls_reached", target_model=model + ) compliance = getattr(ctx, "compliance", None) if compliance: @@ -289,7 +299,9 @@ def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCa reason="compliance_no_approved_tool_path", target_model=model, ) - return _PreCallDecision(action="stop", reason="compliance_no_approved_model", target_model=model) + return _PreCallDecision( + action="stop", reason="compliance_no_approved_model", target_model=model + ) if compliant_model != model: return _PreCallDecision( action="switch_model", diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py index dab445ae..bd86323e 100644 --- a/cascadeflow/harness/pricing.py +++ b/cascadeflow/harness/pricing.py @@ -76,4 +76,3 @@ def model_total_price(model: str) -> float: """Return total (input + output) price per 1M tokens.""" in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) return in_price + out_price - diff --git a/cascadeflow/integrations/crewai.py b/cascadeflow/integrations/crewai.py index 16cbe6e0..604ae600 100644 --- a/cascadeflow/integrations/crewai.py +++ 
b/cascadeflow/integrations/crewai.py @@ -29,6 +29,7 @@ CREWAI_AVAILABLE = find_spec("crewai") is not None + def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: return _estimate_shared_cost(model, prompt_tokens, completion_tokens) @@ -124,8 +125,7 @@ def _before_llm_call_hook(context: Any) -> Optional[bool]: and ctx.cost >= ctx.budget_max ): logger.warning( - "crewai hook: blocking LLM call — budget exhausted " - "(spent $%.4f of $%.4f max)", + "crewai hook: blocking LLM call — budget exhausted " "(spent $%.4f of $%.4f max)", ctx.cost, ctx.budget_max, ) @@ -254,8 +254,7 @@ def enable(config: Optional[CrewAIHarnessConfig] = None) -> bool: ) except ImportError: logger.warning( - "crewai is installed but hooks module not available " - "(requires crewai>=1.5); skipping" + "crewai is installed but hooks module not available " "(requires crewai>=1.5); skipping" ) return False diff --git a/cascadeflow/integrations/openai_agents.py b/cascadeflow/integrations/openai_agents.py index fe52d4d4..cbce9b96 100644 --- a/cascadeflow/integrations/openai_agents.py +++ b/cascadeflow/integrations/openai_agents.py @@ -216,7 +216,9 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: # Under budget pressure, switch to the cheapest configured candidate. 
if run.budget_remaining / run.budget_max < 0.2: - compatible_candidates = [name for name in self._config.model_candidates if name in OPENAI_MODEL_POOL] + compatible_candidates = [ + name for name in self._config.model_candidates if name in OPENAI_MODEL_POOL + ] candidates = compatible_candidates or self._config.model_candidates cheapest = min( candidates, @@ -344,7 +346,9 @@ async def get_response( elapsed_ms = (time.monotonic() - started_at) * 1000.0 try: - self._update_run_metrics(response=response, elapsed_ms=elapsed_ms, pre_action=pre_action) + self._update_run_metrics( + response=response, elapsed_ms=elapsed_ms, pre_action=pre_action + ) except Exception: if self._config.fail_open: logger.exception("openai-agents harness metric update failed (fail-open)") diff --git a/examples/integrations/openai_agents_harness.py b/examples/integrations/openai_agents_harness.py index 69ea6bcd..ac9d6c68 100644 --- a/examples/integrations/openai_agents_harness.py +++ b/examples/integrations/openai_agents_harness.py @@ -17,7 +17,7 @@ async def main() -> None: except ImportError as exc: raise SystemExit( "OpenAI Agents SDK is not installed. 
" - "Install with: pip install \"cascadeflow[openai,openai-agents]\"" + 'Install with: pip install "cascadeflow[openai,openai-agents]"' ) from exc from cascadeflow import init, run @@ -44,7 +44,9 @@ async def main() -> None: run_config = RunConfig(model_provider=provider) with run(budget=0.5, max_tool_calls=3) as session: - result = await Runner.run(agent, "Summarize why model routing helps agent budgets.", run_config=run_config) + result = await Runner.run( + agent, "Summarize why model routing helps agent budgets.", run_config=run_config + ) print("=== Result ===") print(result.final_output) diff --git a/tests/test_crewai_integration.py b/tests/test_crewai_integration.py index 622f4b4b..c17498b4 100644 --- a/tests/test_crewai_integration.py +++ b/tests/test_crewai_integration.py @@ -455,7 +455,9 @@ def test_enable_returns_false_for_old_crewai(self, monkeypatch): # Remove crewai.hooks from modules so import fails monkeypatch.delitem(sys.modules, "crewai.hooks", raising=False) - original_import = __builtins__.__import__ if hasattr(__builtins__, "__import__") else __import__ + original_import = ( + __builtins__.__import__ if hasattr(__builtins__, "__import__") else __import__ + ) def fake_import(name, *args, **kwargs): if name == "crewai.hooks": diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 28fdc7b7..75368522 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -714,7 +714,11 @@ def test_enforce_denies_tools_when_cap_reached(self) -> None: wrapper = _make_patched_create(original) with run(max_tool_calls=0) as ctx: - wrapper(MagicMock(), model="gpt-4o", tools=[{"type": "function", "function": {"name": "t1"}}]) + wrapper( + MagicMock(), + model="gpt-4o", + tools=[{"type": "function", "function": {"name": "t1"}}], + ) assert original.call_args[1]["tools"] == [] trace = ctx.trace() @@ -824,7 +828,11 @@ def test_enforce_denies_tool_for_strict_compliance(self) -> None: wrapper = 
_make_patched_create(original) with run() as ctx: - wrapper(MagicMock(), model="gpt-4o", tools=[{"type": "function", "function": {"name": "t1"}}]) + wrapper( + MagicMock(), + model="gpt-4o", + tools=[{"type": "function", "function": {"name": "t1"}}], + ) assert original.call_args[1]["tools"] == [] trace = ctx.trace() diff --git a/tests/test_harness_shared_pricing.py b/tests/test_harness_shared_pricing.py index fb693226..a26398f3 100644 --- a/tests/test_harness_shared_pricing.py +++ b/tests/test_harness_shared_pricing.py @@ -7,7 +7,12 @@ import cascadeflow.harness.instrument as instrument_mod import cascadeflow.integrations.crewai as crewai_mod import cascadeflow.integrations.openai_agents as openai_agents_mod -from cascadeflow.harness.pricing import OPENAI_MODEL_POOL, estimate_cost, estimate_energy, model_total_price +from cascadeflow.harness.pricing import ( + OPENAI_MODEL_POOL, + estimate_cost, + estimate_energy, + model_total_price, +) def test_shared_estimate_cost_known_models() -> None: @@ -35,16 +40,31 @@ def test_integration_estimators_use_shared_profiles() -> None: shared_cost = estimate_cost(model, input_tokens, output_tokens) shared_energy = estimate_energy(model, input_tokens, output_tokens) - assert instrument_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx(shared_cost) - assert crewai_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx(shared_cost) - assert openai_agents_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx(shared_cost) - - assert instrument_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx(shared_energy) - assert crewai_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx(shared_energy) - assert openai_agents_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx(shared_energy) + assert instrument_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx( + shared_cost + ) + assert 
crewai_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx( + shared_cost + ) + assert openai_agents_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx( + shared_cost + ) + + assert instrument_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx( + shared_energy + ) + assert crewai_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx( + shared_energy + ) + assert openai_agents_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx( + shared_energy + ) def test_openai_agents_total_price_uses_shared_profiles() -> None: - assert openai_agents_mod._total_model_price("gpt-5") == pytest.approx(model_total_price("gpt-5")) - assert openai_agents_mod._total_model_price("gpt-4o-mini") == pytest.approx(model_total_price("gpt-4o-mini")) - + assert openai_agents_mod._total_model_price("gpt-5") == pytest.approx( + model_total_price("gpt-5") + ) + assert openai_agents_mod._total_model_price("gpt-4o-mini") == pytest.approx( + model_total_price("gpt-4o-mini") + ) From 02619258d96ac4ff74932f3b886dda6fca6df072 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 10:01:50 +0100 Subject: [PATCH 20/49] feat(langchain): add harness-aware callback and state extractor --- .../integrations/langchain/__init__.py | 12 + .../langchain/harness_callback.py | 235 ++++++++++++++++++ .../integrations/langchain/harness_state.py | 119 +++++++++ .../tests/test_langchain_harness_callback.py | 148 +++++++++++ pyproject.toml | 7 + 5 files changed, 521 insertions(+) create mode 100644 cascadeflow/integrations/langchain/harness_callback.py create mode 100644 cascadeflow/integrations/langchain/harness_state.py create mode 100644 cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py diff --git a/cascadeflow/integrations/langchain/__init__.py b/cascadeflow/integrations/langchain/__init__.py index 45c6ea2f..7b3f9551 100644 --- 
a/cascadeflow/integrations/langchain/__init__.py +++ b/cascadeflow/integrations/langchain/__init__.py @@ -54,6 +54,14 @@ CascadeFlowCallbackHandler, get_cascade_callback, ) +from .harness_callback import ( + HarnessAwareCascadeFlowCallbackHandler, + get_harness_callback, +) +from .harness_state import ( + apply_langgraph_state, + extract_langgraph_state, +) __all__ = [ # Main classes @@ -93,4 +101,8 @@ # LangChain callback handlers "CascadeFlowCallbackHandler", "get_cascade_callback", + "HarnessAwareCascadeFlowCallbackHandler", + "get_harness_callback", + "extract_langgraph_state", + "apply_langgraph_state", ] diff --git a/cascadeflow/integrations/langchain/harness_callback.py b/cascadeflow/integrations/langchain/harness_callback.py new file mode 100644 index 00000000..aff5c0b4 --- /dev/null +++ b/cascadeflow/integrations/langchain/harness_callback.py @@ -0,0 +1,235 @@ +"""Harness-aware callbacks for LangChain/LangGraph integration.""" + +from __future__ import annotations + +import logging +import time +from contextlib import contextmanager +from typing import Any, Optional + +from cascadeflow.harness import get_current_run +from cascadeflow.harness.pricing import estimate_cost, estimate_energy +from cascadeflow.schema.exceptions import HarnessStopError + +from .harness_state import apply_langgraph_state, extract_langgraph_state +from .langchain_callbacks import CascadeFlowCallbackHandler +from .utils import extract_token_usage, extract_tool_calls + +logger = logging.getLogger("cascadeflow.harness.langchain") + + +class HarnessAwareCascadeFlowCallbackHandler(CascadeFlowCallbackHandler): + """LangChain callback that bridges native lifecycle events into HarnessRunContext.""" + + def __init__(self, *, fail_open: bool = True): + super().__init__() + self.fail_open = fail_open + self._llm_started_at: Optional[float] = None + self._pre_action: str = "allow" + self._pre_reason: str = "allow" + self._pre_model: Optional[str] = None + self._pre_recorded: bool = False + 
self._executed_tool_calls: int = 0 + + def _handle_harness_error(self, error: Exception) -> None: + if self.fail_open: + logger.exception("langchain harness callback failed (fail-open)", exc_info=error) + return + raise error + + def _sync_state(self, payload: dict[str, Any]) -> None: + run_ctx = get_current_run() + if run_ctx is None: + return + state = extract_langgraph_state(payload) + if state: + apply_langgraph_state(run_ctx, state) + + def on_llm_start(self, serialized: dict[str, Any], prompts: list[str], **kwargs: Any) -> None: + super().on_llm_start(serialized=serialized, prompts=prompts, **kwargs) + self._llm_started_at = time.monotonic() + self._pre_action = "allow" + self._pre_reason = "allow" + self._pre_model = self.current_model + self._pre_recorded = False + + try: + self._sync_state(kwargs) + + run_ctx = get_current_run() + if run_ctx is None: + return + + model_name = self.current_model or "unknown" + invocation_params = kwargs.get("invocation_params") + has_tools = False + if isinstance(invocation_params, dict): + has_tools = bool(invocation_params.get("tools")) + if not has_tools: + has_tools = bool(kwargs.get("tools")) + + from cascadeflow.harness.instrument import _evaluate_pre_call_decision, _raise_stop_error + + decision = _evaluate_pre_call_decision(run_ctx, model_name, has_tools=has_tools) + self._pre_action = decision.action + self._pre_reason = decision.reason + self._pre_model = decision.target_model + + if run_ctx.mode == "observe": + if decision.action != "allow": + run_ctx.record( + action=decision.action, + reason=decision.reason, + model=decision.target_model, + applied=False, + decision_mode="observe", + ) + self._pre_recorded = True + return + + if run_ctx.mode != "enforce": + return + + if decision.action == "stop": + run_ctx.record( + action="stop", + reason=decision.reason, + model=model_name, + applied=True, + decision_mode="enforce", + ) + self._pre_recorded = True + _raise_stop_error(run_ctx, decision.reason) + + if 
decision.action == "switch_model": + run_ctx.record( + action="switch_model", + reason=decision.reason, + model=decision.target_model, + applied=False, + decision_mode="enforce", + ) + self._pre_recorded = True + + if decision.action == "deny_tool" and has_tools: + run_ctx.record( + action="deny_tool", + reason=decision.reason, + model=model_name, + applied=False, + decision_mode="enforce", + ) + self._pre_recorded = True + + except Exception as exc: + self._handle_harness_error(exc) + + def on_llm_end(self, response: Any, **kwargs: Any) -> None: + super().on_llm_end(response=response, **kwargs) + + try: + self._sync_state(kwargs) + run_ctx = get_current_run() + if run_ctx is None: + return + + model_name = self.current_model + if not model_name and getattr(response, "llm_output", None): + model_name = response.llm_output.get("model_name") + model_name = model_name or "unknown" + + token_usage = extract_token_usage(response) + prompt_tokens = int(token_usage["input"]) + completion_tokens = int(token_usage["output"]) + tool_call_count = len(extract_tool_calls(response)) + + elapsed_ms = 0.0 + if self._llm_started_at is not None: + elapsed_ms = (time.monotonic() - self._llm_started_at) * 1000.0 + + run_ctx.step_count += 1 + run_ctx.cost += estimate_cost(model_name, prompt_tokens, completion_tokens) + run_ctx.energy_used += estimate_energy(model_name, prompt_tokens, completion_tokens) + run_ctx.latency_used_ms += elapsed_ms + run_ctx.tool_calls += tool_call_count + + if run_ctx.budget_max is not None: + run_ctx.budget_remaining = run_ctx.budget_max - run_ctx.cost + + if self._pre_action == "allow": + run_ctx.record( + action="allow", + reason="langchain_step", + model=model_name, + applied=True, + decision_mode=run_ctx.mode, + ) + elif not self._pre_recorded: + run_ctx.record( + action=self._pre_action, + reason=self._pre_reason, + model=self._pre_model or model_name, + applied=False, + decision_mode=run_ctx.mode, + ) + + except Exception as exc: + 
self._handle_harness_error(exc) + finally: + self._llm_started_at = None + self._pre_action = "allow" + self._pre_reason = "allow" + self._pre_model = None + self._pre_recorded = False + + def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: Any) -> Any: + try: + self._sync_state(kwargs) + run_ctx = get_current_run() + if run_ctx is None: + return None + if run_ctx.tool_calls_max is None: + return None + + if self._executed_tool_calls >= run_ctx.tool_calls_max: + if run_ctx.mode == "observe": + run_ctx.record( + action="deny_tool", + reason="max_tool_calls_reached", + model=self.current_model, + applied=False, + decision_mode="observe", + ) + return None + if run_ctx.mode == "enforce": + run_ctx.record( + action="deny_tool", + reason="max_tool_calls_reached", + model=self.current_model, + applied=True, + decision_mode="enforce", + ) + raise HarnessStopError( + "cascadeflow harness deny_tool: max tool calls reached", + reason="max_tool_calls_reached", + ) + + self._executed_tool_calls += 1 + return None + except Exception as exc: + self._handle_harness_error(exc) + return None + + +@contextmanager +def get_harness_callback(*, fail_open: bool = True): + """Context manager that yields a harness-aware LangChain callback handler.""" + callback = HarnessAwareCascadeFlowCallbackHandler(fail_open=fail_open) + try: + yield callback + finally: + return + + +__all__ = ["HarnessAwareCascadeFlowCallbackHandler", "get_harness_callback"] + diff --git a/cascadeflow/integrations/langchain/harness_state.py b/cascadeflow/integrations/langchain/harness_state.py new file mode 100644 index 00000000..49278ef1 --- /dev/null +++ b/cascadeflow/integrations/langchain/harness_state.py @@ -0,0 +1,119 @@ +"""LangGraph/LangChain state extraction helpers for harness integration.""" + +from __future__ import annotations + +from typing import Any, Mapping, Optional + + +def _as_int(value: Any) -> Optional[int]: + try: + if value is None: + return None + return int(value) + 
except (TypeError, ValueError): + return None + + +def _as_float(value: Any) -> Optional[float]: + try: + if value is None: + return None + return float(value) + except (TypeError, ValueError): + return None + + +def _extract_candidate_state(source: Any) -> Optional[Mapping[str, Any]]: + if not isinstance(source, Mapping): + return None + + for key in ("langgraph_state", "graph_state", "state"): + candidate = source.get(key) + if isinstance(candidate, Mapping): + return candidate + + return source + + +def extract_langgraph_state(payload: Any) -> dict[str, Any]: + """Extract normalized harness-relevant fields from LangGraph-style state payloads.""" + + candidates: list[Mapping[str, Any]] = [] + root = _extract_candidate_state(payload) + if root is not None: + candidates.append(root) + + if isinstance(payload, Mapping): + metadata = payload.get("metadata") + if isinstance(metadata, Mapping): + state_from_metadata = _extract_candidate_state(metadata) + if state_from_metadata is not None: + candidates.append(state_from_metadata) + + configurable = payload.get("configurable") + if isinstance(configurable, Mapping): + state_from_configurable = _extract_candidate_state(configurable) + if state_from_configurable is not None: + candidates.append(state_from_configurable) + + merged: dict[str, Any] = {} + for source in candidates: + if "agent_id" in source and isinstance(source.get("agent_id"), str): + merged["agent_id"] = source["agent_id"] + if "model" in source and isinstance(source.get("model"), str): + merged["model_used"] = source["model"] + if "model_used" in source and isinstance(source.get("model_used"), str): + merged["model_used"] = source["model_used"] + + step_count = _as_int(source.get("step_count", source.get("step"))) + if step_count is not None: + merged["step_count"] = step_count + + tool_calls = _as_int(source.get("tool_calls")) + if tool_calls is not None: + merged["tool_calls"] = tool_calls + + budget_remaining = _as_float(source.get("budget_remaining")) 
+ if budget_remaining is not None: + merged["budget_remaining"] = budget_remaining + + latency_used_ms = _as_float(source.get("latency_used_ms", source.get("latency_ms"))) + if latency_used_ms is not None: + merged["latency_used_ms"] = latency_used_ms + + energy_used = _as_float(source.get("energy_used", source.get("energy"))) + if energy_used is not None: + merged["energy_used"] = energy_used + + return merged + + +def apply_langgraph_state(run_ctx: Any, state: Mapping[str, Any]) -> None: + """Apply extracted state fields onto an active HarnessRunContext.""" + if run_ctx is None or not isinstance(state, Mapping): + return + + step_count = _as_int(state.get("step_count")) + if step_count is not None and step_count > getattr(run_ctx, "step_count", 0): + run_ctx.step_count = step_count + + tool_calls = _as_int(state.get("tool_calls")) + if tool_calls is not None and tool_calls > getattr(run_ctx, "tool_calls", 0): + run_ctx.tool_calls = tool_calls + + latency_used_ms = _as_float(state.get("latency_used_ms")) + if latency_used_ms is not None and latency_used_ms > getattr(run_ctx, "latency_used_ms", 0.0): + run_ctx.latency_used_ms = latency_used_ms + + energy_used = _as_float(state.get("energy_used")) + if energy_used is not None and energy_used > getattr(run_ctx, "energy_used", 0.0): + run_ctx.energy_used = energy_used + + budget_remaining = _as_float(state.get("budget_remaining")) + if budget_remaining is not None: + run_ctx.budget_remaining = budget_remaining + + model_used = state.get("model_used") + if isinstance(model_used, str) and model_used: + run_ctx.model_used = model_used + diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py new file mode 100644 index 00000000..b96cb30d --- /dev/null +++ b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py @@ -0,0 +1,148 @@ +"""Tests for harness-aware LangChain callback integration.""" + 
+from __future__ import annotations + +import pytest +from langchain_core.messages import AIMessage +from langchain_core.outputs import ChatGeneration, LLMResult + +from cascadeflow.harness import init, reset, run +from cascadeflow.integrations.langchain.harness_callback import ( + HarnessAwareCascadeFlowCallbackHandler, +) +from cascadeflow.integrations.langchain.harness_state import ( + apply_langgraph_state, + extract_langgraph_state, +) +from cascadeflow.schema.exceptions import BudgetExceededError, HarnessStopError + + +@pytest.fixture(autouse=True) +def _reset_harness_state() -> None: + reset() + + +def _llm_result(model_name: str, prompt_tokens: int, completion_tokens: int) -> LLMResult: + generation = ChatGeneration(message=AIMessage(content="ok"), generation_info={}) + return LLMResult( + generations=[[generation]], + llm_output={ + "model_name": model_name, + "token_usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + }, + ) + + +def test_harness_callback_updates_active_run_metrics() -> None: + init(mode="observe", budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler() + + with run(budget=1.0) as ctx: + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o-mini"}, + ) + handler.on_llm_end(_llm_result("gpt-4o-mini", 120, 80)) + + assert ctx.step_count == 1 + assert ctx.cost > 0 + assert ctx.energy_used > 0 + assert ctx.budget_remaining is not None + assert ctx.budget_remaining < 1.0 + assert ctx.last_action == "allow" + assert ctx.model_used == "gpt-4o-mini" + + +def test_harness_callback_enforce_raises_when_budget_exhausted() -> None: + init(mode="enforce", budget=0.1) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(budget=0.1) as ctx: + ctx.cost = 0.1 + ctx.budget_remaining = 0.0 + + with pytest.raises(BudgetExceededError): + handler.on_llm_start( + serialized={}, + 
prompts=["hello"], + invocation_params={"model": "gpt-4o-mini"}, + ) + + trace = ctx.trace() + assert trace + assert trace[-1]["action"] == "stop" + assert trace[-1]["reason"] == "budget_exceeded" + assert trace[-1]["applied"] is True + + +def test_harness_callback_observe_records_non_applied_decisions() -> None: + init(mode="observe", budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler() + + with run(budget=1.0) as ctx: + ctx.cost = 0.9 + ctx.budget_remaining = 0.1 + + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o", "tools": [{"name": "lookup"}]}, + ) + + trace = ctx.trace() + assert trace + assert trace[-1]["action"] in {"switch_model", "deny_tool"} + assert trace[-1]["applied"] is False + assert trace[-1]["decision_mode"] == "observe" + + +def test_harness_callback_enforce_denies_tool_when_limit_reached() -> None: + init(mode="enforce", max_tool_calls=0, budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(max_tool_calls=0, budget=1.0) as ctx: + with pytest.raises(HarnessStopError, match="max tool calls"): + handler.on_tool_start(serialized={"name": "search"}, input_str="query") + + trace = ctx.trace() + assert trace + assert trace[-1]["action"] == "deny_tool" + assert trace[-1]["applied"] is True + assert trace[-1]["decision_mode"] == "enforce" + + +def test_extract_and_apply_langgraph_state() -> None: + state = extract_langgraph_state( + { + "metadata": { + "langgraph_state": { + "step": 4, + "tool_calls": 3, + "budget_remaining": 0.42, + "latency_ms": 130.0, + "energy": 77.0, + "model": "gpt-4o-mini", + } + } + } + ) + + assert state["step_count"] == 4 + assert state["tool_calls"] == 3 + assert state["model_used"] == "gpt-4o-mini" + + init(mode="observe", budget=1.0) + with run(budget=1.0) as ctx: + apply_langgraph_state(ctx, state) + assert ctx.step_count == 4 + assert ctx.tool_calls == 3 + assert ctx.budget_remaining == pytest.approx(0.42) + assert 
ctx.latency_used_ms == pytest.approx(130.0) + assert ctx.energy_used == pytest.approx(77.0) + assert ctx.model_used == "gpt-4o-mini" + diff --git a/pyproject.toml b/pyproject.toml index eaadb6b7..198042da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,6 +101,13 @@ openai-agents = [ "openai-agents>=0.9.0; python_version >= '3.10'", ] +# LangChain/LangGraph harness integration (opt-in) +langchain = [ + "langchain>=0.3.0", + "langchain-core>=0.3.0", + "langgraph>=0.2.0", +] + # Development tools (includes rich for terminal output) dev = [ "pytest>=7.4.0", From 44506b8d8152035700603efa201cfd13f4714646 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 10:03:03 +0100 Subject: [PATCH 21/49] feat(langchain): auto-attach harness callback in active run scopes --- .../test_langchain_integration_features.py | 36 ++++++++++++++++++ cascadeflow/integrations/langchain/wrapper.py | 37 +++++++++++++++++-- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py index fdbcff1d..f225fa3a 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py @@ -4,6 +4,10 @@ from langchain_core.messages import AIMessage, BaseMessage, HumanMessage from langchain_core.outputs import ChatGeneration, ChatResult +from cascadeflow.harness import init, reset, run +from cascadeflow.integrations.langchain.harness_callback import ( + HarnessAwareCascadeFlowCallbackHandler, +) from cascadeflow.integrations.langchain import CascadeFlow @@ -116,3 +120,35 @@ def test_domain_policy_direct_to_verifier_skips_drafter() -> None: assert drafter.calls == 0 assert verifier.calls == 1 assert result.llm_output["cascade"]["routing_reason"] == "domain_policy_direct" + + +def 
test_wrapper_only_auto_adds_harness_callback_inside_active_run_scope() -> None: + reset() + init(mode="observe") + drafter = MockSequenceChatModel("draft") + verifier = MockSequenceChatModel("verify") + cascade = CascadeFlow(drafter=drafter, verifier=verifier, enable_pre_router=False) + + outside_callbacks = cascade._resolve_callbacks([]) + assert not any( + isinstance(cb, HarnessAwareCascadeFlowCallbackHandler) for cb in outside_callbacks + ) + + with run(): + inside_callbacks = cascade._resolve_callbacks([]) + assert any( + isinstance(cb, HarnessAwareCascadeFlowCallbackHandler) for cb in inside_callbacks + ) + + +def test_wrapper_does_not_duplicate_harness_callback() -> None: + reset() + init(mode="observe") + drafter = MockSequenceChatModel("draft") + verifier = MockSequenceChatModel("verify") + cascade = CascadeFlow(drafter=drafter, verifier=verifier, enable_pre_router=False) + existing = HarnessAwareCascadeFlowCallbackHandler() + + with run(): + callbacks = cascade._resolve_callbacks([existing]) + assert len([cb for cb in callbacks if isinstance(cb, HarnessAwareCascadeFlowCallbackHandler)]) == 1 diff --git a/cascadeflow/integrations/langchain/wrapper.py b/cascadeflow/integrations/langchain/wrapper.py index ed6d554b..f108d60f 100644 --- a/cascadeflow/integrations/langchain/wrapper.py +++ b/cascadeflow/integrations/langchain/wrapper.py @@ -169,6 +169,35 @@ def _split_runnable_config( model_kwargs[key] = value return model_kwargs, config + def _resolve_callbacks(self, raw_callbacks: Any) -> list[Any]: + if raw_callbacks is None: + callbacks: list[Any] = [] + elif isinstance(raw_callbacks, list): + callbacks = list(raw_callbacks) + elif isinstance(raw_callbacks, tuple): + callbacks = list(raw_callbacks) + else: + callbacks = [raw_callbacks] + + try: + from cascadeflow.harness import get_current_run, get_harness_config + + harness_config = get_harness_config() + run_ctx = get_current_run() + if harness_config.mode == "off" or run_ctx is None or run_ctx.mode == 
"off": + return callbacks + + from .harness_callback import HarnessAwareCascadeFlowCallbackHandler + + if any(isinstance(cb, HarnessAwareCascadeFlowCallbackHandler) for cb in callbacks): + return callbacks + + callbacks.append(HarnessAwareCascadeFlowCallbackHandler()) + return callbacks + except Exception: + # Preserve existing behavior for users who do not enable harness flows. + return callbacks + def _generate( self, messages: list[BaseMessage], @@ -202,7 +231,7 @@ def _generate( merged_kwargs["stop"] = stop # Extract callbacks before filtering (need to pass them explicitly to nested models) - callbacks = merged_kwargs.get("callbacks", []) + callbacks = self._resolve_callbacks(merged_kwargs.get("callbacks", [])) existing_tags = merged_kwargs.get("tags", []) or [] base_tags = existing_tags + ["cascadeflow"] if existing_tags else ["cascadeflow"] @@ -599,7 +628,7 @@ async def _agenerate( merged_kwargs["stop"] = stop # Extract callbacks before filtering (need to pass them explicitly to nested models) - callbacks = merged_kwargs.get("callbacks", []) + callbacks = self._resolve_callbacks(merged_kwargs.get("callbacks", [])) existing_tags = merged_kwargs.get("tags", []) or [] base_tags = existing_tags + ["cascadeflow"] if existing_tags else ["cascadeflow"] @@ -1001,7 +1030,7 @@ def _stream( stream_kwargs, base_config = self._split_runnable_config(merged_kwargs) base_tags = (base_config.get("tags") or []) + ["cascadeflow"] existing_metadata = base_config.get("metadata", {}) or {} - callbacks = base_config.get("callbacks", []) + callbacks = self._resolve_callbacks(base_config.get("callbacks", [])) resolved_domain = self._resolve_domain(messages, existing_metadata) effective_quality_threshold = self._effective_quality_threshold(resolved_domain) force_verifier_for_domain = self._domain_forces_verifier(resolved_domain) @@ -1324,7 +1353,7 @@ async def _astream( stream_kwargs, base_config = self._split_runnable_config(merged_kwargs) base_tags = (base_config.get("tags") or []) 
+ ["cascadeflow"] existing_metadata = base_config.get("metadata", {}) or {} - callbacks = base_config.get("callbacks", []) + callbacks = self._resolve_callbacks(base_config.get("callbacks", [])) safe_kwargs = { k: v for k, v in stream_kwargs.items() From f70572d75b78f1c274e6ef09de8755d787a354ca Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 10:03:35 +0100 Subject: [PATCH 22/49] docs(plan): mark langchain harness extension branch completed --- docs/strategy/agent-intelligence-v2-plan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 73bfec1b..33bae5ae 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -847,7 +847,7 @@ Claim checklist (one owner per branch at a time): - [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` - [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` +- [x] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `completed` - [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-bench-repro-pipeline` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-security-privacy-telemetry` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` From d740cad1fe6ab4dbd7f7dc9aa8f0e49340c39f9c Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 10:21:17 +0100 Subject: [PATCH 23/49] fix(langchain): address PR #161 review findings - Document enforce-mode limitations for switch_model and deny_tool - Replace per-handler _executed_tool_calls with 
run_ctx.tool_calls - Fix _extract_candidate_state fallback leaking arbitrary kwargs - Remove return-in-finally (B012) and fix import ordering - Separate langgraph from langchain optional extra in pyproject.toml - Add 4 edge-case tests: no-run-context safety, state extraction guard, and run_ctx tool_calls gating --- .../langchain/harness_callback.py | 32 +++++++++++------ .../integrations/langchain/harness_state.py | 8 ++++- .../tests/test_langchain_harness_callback.py | 36 +++++++++++++++++++ .../test_langchain_integration_features.py | 2 +- pyproject.toml | 8 ++++- 5 files changed, 73 insertions(+), 13 deletions(-) diff --git a/cascadeflow/integrations/langchain/harness_callback.py b/cascadeflow/integrations/langchain/harness_callback.py index aff5c0b4..25962a5d 100644 --- a/cascadeflow/integrations/langchain/harness_callback.py +++ b/cascadeflow/integrations/langchain/harness_callback.py @@ -1,4 +1,17 @@ -"""Harness-aware callbacks for LangChain/LangGraph integration.""" +"""Harness-aware callbacks for LangChain/LangGraph integration. + +Enforce-mode limitations (LangChain callback architecture): + - ``stop`` (budget/latency/energy exceeded): fully enforced — raises + BudgetExceededError or HarnessStopError from ``on_llm_start``. + - ``deny_tool`` (tool-call cap): fully enforced at the tool level via + ``on_tool_start`` — raises HarnessStopError before tool execution. + - ``switch_model``: **observe-only** — LangChain dispatches the LLM call + before ``on_llm_start`` returns, so the callback cannot redirect to a + different model. The decision is recorded with ``applied=False``. + - ``deny_tool`` at LLM level (pre-call decision): **observe-only** — the + callback cannot strip tools from an already-dispatched LLM request. + The decision is recorded with ``applied=False``. 
+""" from __future__ import annotations @@ -19,7 +32,11 @@ class HarnessAwareCascadeFlowCallbackHandler(CascadeFlowCallbackHandler): - """LangChain callback that bridges native lifecycle events into HarnessRunContext.""" + """LangChain callback that bridges native lifecycle events into HarnessRunContext. + + See module docstring for enforce-mode limitations on ``switch_model`` + and LLM-level ``deny_tool``. + """ def __init__(self, *, fail_open: bool = True): super().__init__() @@ -29,7 +46,6 @@ def __init__(self, *, fail_open: bool = True): self._pre_reason: str = "allow" self._pre_model: Optional[str] = None self._pre_recorded: bool = False - self._executed_tool_calls: int = 0 def _handle_harness_error(self, error: Exception) -> None: if self.fail_open: @@ -68,7 +84,7 @@ def on_llm_start(self, serialized: dict[str, Any], prompts: list[str], **kwargs: if not has_tools: has_tools = bool(kwargs.get("tools")) - from cascadeflow.harness.instrument import _evaluate_pre_call_decision, _raise_stop_error + from cascadeflow.harness.instrument import _evaluate_pre_call_decision, _raise_stop_error # noqa: I001 decision = _evaluate_pre_call_decision(run_ctx, model_name, has_tools=has_tools) self._pre_action = decision.action @@ -191,7 +207,7 @@ def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: An if run_ctx.tool_calls_max is None: return None - if self._executed_tool_calls >= run_ctx.tool_calls_max: + if run_ctx.tool_calls >= run_ctx.tool_calls_max: if run_ctx.mode == "observe": run_ctx.record( action="deny_tool", @@ -214,7 +230,6 @@ def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: An reason="max_tool_calls_reached", ) - self._executed_tool_calls += 1 return None except Exception as exc: self._handle_harness_error(exc) @@ -225,10 +240,7 @@ def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: An def get_harness_callback(*, fail_open: bool = True): """Context manager that yields a harness-aware 
LangChain callback handler.""" callback = HarnessAwareCascadeFlowCallbackHandler(fail_open=fail_open) - try: - yield callback - finally: - return + yield callback __all__ = ["HarnessAwareCascadeFlowCallbackHandler", "get_harness_callback"] diff --git a/cascadeflow/integrations/langchain/harness_state.py b/cascadeflow/integrations/langchain/harness_state.py index 49278ef1..313932ce 100644 --- a/cascadeflow/integrations/langchain/harness_state.py +++ b/cascadeflow/integrations/langchain/harness_state.py @@ -24,6 +24,12 @@ def _as_float(value: Any) -> Optional[float]: def _extract_candidate_state(source: Any) -> Optional[Mapping[str, Any]]: + """Extract a named state container from a mapping. + + Only returns state from explicitly named keys (langgraph_state, graph_state, + state). Returns None when no named key matches — avoids treating arbitrary + kwargs as harness state. + """ if not isinstance(source, Mapping): return None @@ -32,7 +38,7 @@ def _extract_candidate_state(source: Any) -> Optional[Mapping[str, Any]]: if isinstance(candidate, Mapping): return candidate - return source + return None def extract_langgraph_state(payload: Any) -> dict[str, Any]: diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py index b96cb30d..79a6f539 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py @@ -116,6 +116,42 @@ def test_harness_callback_enforce_denies_tool_when_limit_reached() -> None: assert trace[-1]["decision_mode"] == "enforce" +def test_on_llm_end_no_run_context_is_safe() -> None: + handler = HarnessAwareCascadeFlowCallbackHandler() + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o-mini"}, + ) + result = handler.on_llm_end(_llm_result("gpt-4o-mini", 10, 5)) + assert result is None + + +def 
test_on_tool_start_no_run_context_is_safe() -> None: + handler = HarnessAwareCascadeFlowCallbackHandler() + result = handler.on_tool_start(serialized={"name": "search"}, input_str="query") + assert result is None + + +def test_extract_state_ignores_plain_kwargs() -> None: + """Kwargs without a named state key should not leak into state.""" + state = extract_langgraph_state({"model": "gpt-4o", "invocation_params": {"tools": []}}) + assert state == {} + + +def test_tool_deny_uses_run_ctx_tool_calls() -> None: + """Tool gating should use run_ctx.tool_calls, not a local counter.""" + init(mode="enforce", max_tool_calls=2, budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(max_tool_calls=2, budget=1.0) as ctx: + # Simulate tool calls already counted by on_llm_end or other integrations + ctx.tool_calls = 2 + + with pytest.raises(HarnessStopError, match="max tool calls"): + handler.on_tool_start(serialized={"name": "search"}, input_str="query") + + def test_extract_and_apply_langgraph_state() -> None: state = extract_langgraph_state( { diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py index f225fa3a..0e82fb48 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py @@ -5,10 +5,10 @@ from langchain_core.outputs import ChatGeneration, ChatResult from cascadeflow.harness import init, reset, run +from cascadeflow.integrations.langchain import CascadeFlow from cascadeflow.integrations.langchain.harness_callback import ( HarnessAwareCascadeFlowCallbackHandler, ) -from cascadeflow.integrations.langchain import CascadeFlow class MockSequenceChatModel(BaseChatModel): diff --git a/pyproject.toml b/pyproject.toml index 198042da..2bbd3082 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,10 
+101,16 @@ openai-agents = [ "openai-agents>=0.9.0; python_version >= '3.10'", ] -# LangChain/LangGraph harness integration (opt-in) +# LangChain harness integration (opt-in) langchain = [ "langchain>=0.3.0", "langchain-core>=0.3.0", +] + +# LangGraph state extraction (opt-in, adds langgraph on top of langchain) +langgraph = [ + "langchain>=0.3.0", + "langchain-core>=0.3.0", "langgraph>=0.2.0", ] From 3bd78996cc3f2a88882af599496451ea35c9fd99 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 10:35:35 +0100 Subject: [PATCH 24/49] fix(langchain): enforce tool caps on executed calls and harden tool extraction --- .../langchain/harness_callback.py | 8 ++--- .../tests/test_langchain_harness_callback.py | 31 ++++++++++++++++++- cascadeflow/integrations/langchain/utils.py | 4 +++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/cascadeflow/integrations/langchain/harness_callback.py b/cascadeflow/integrations/langchain/harness_callback.py index 25962a5d..faffa939 100644 --- a/cascadeflow/integrations/langchain/harness_callback.py +++ b/cascadeflow/integrations/langchain/harness_callback.py @@ -26,7 +26,7 @@ from .harness_state import apply_langgraph_state, extract_langgraph_state from .langchain_callbacks import CascadeFlowCallbackHandler -from .utils import extract_token_usage, extract_tool_calls +from .utils import extract_token_usage logger = logging.getLogger("cascadeflow.harness.langchain") @@ -157,8 +157,6 @@ def on_llm_end(self, response: Any, **kwargs: Any) -> None: token_usage = extract_token_usage(response) prompt_tokens = int(token_usage["input"]) completion_tokens = int(token_usage["output"]) - tool_call_count = len(extract_tool_calls(response)) - elapsed_ms = 0.0 if self._llm_started_at is not None: elapsed_ms = (time.monotonic() - self._llm_started_at) * 1000.0 @@ -167,7 +165,6 @@ def on_llm_end(self, response: Any, **kwargs: Any) -> None: run_ctx.cost += estimate_cost(model_name, prompt_tokens, completion_tokens) 
run_ctx.energy_used += estimate_energy(model_name, prompt_tokens, completion_tokens) run_ctx.latency_used_ms += elapsed_ms - run_ctx.tool_calls += tool_call_count if run_ctx.budget_max is not None: run_ctx.budget_remaining = run_ctx.budget_max - run_ctx.cost @@ -230,6 +227,8 @@ def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: An reason="max_tool_calls_reached", ) + # Track executed tools (not predicted tool calls in LLM output). + run_ctx.tool_calls += 1 return None except Exception as exc: self._handle_harness_error(exc) @@ -244,4 +243,3 @@ def get_harness_callback(*, fail_open: bool = True): __all__ = ["HarnessAwareCascadeFlowCallbackHandler", "get_harness_callback"] - diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py index 79a6f539..341087b9 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py @@ -14,6 +14,7 @@ apply_langgraph_state, extract_langgraph_state, ) +from cascadeflow.integrations.langchain.utils import extract_tool_calls from cascadeflow.schema.exceptions import BudgetExceededError, HarnessStopError @@ -152,6 +153,35 @@ def test_tool_deny_uses_run_ctx_tool_calls() -> None: handler.on_tool_start(serialized={"name": "search"}, input_str="query") +def test_tool_start_counts_executions_and_blocks_after_limit() -> None: + init(mode="enforce", max_tool_calls=1, budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(max_tool_calls=1, budget=1.0) as ctx: + assert ctx.tool_calls == 0 + assert handler.on_tool_start(serialized={"name": "search"}, input_str="first") is None + assert ctx.tool_calls == 1 + + with pytest.raises(HarnessStopError, match="max tool calls"): + handler.on_tool_start(serialized={"name": "search"}, input_str="second") + + assert ctx.tool_calls 
== 1 + trace = ctx.trace() + assert trace[-1]["action"] == "deny_tool" + assert trace[-1]["applied"] is True + + +def test_extract_tool_calls_supports_llm_result_nested_generations() -> None: + generation = ChatGeneration( + message=AIMessage(content="", tool_calls=[{"name": "search", "args": {"q": "x"}, "id": "t1"}]), + generation_info={}, + ) + llm_result = LLMResult(generations=[[generation]], llm_output={"model_name": "gpt-4o-mini"}) + tool_calls = extract_tool_calls(llm_result) + assert len(tool_calls) == 1 + assert tool_calls[0]["name"] == "search" + + def test_extract_and_apply_langgraph_state() -> None: state = extract_langgraph_state( { @@ -181,4 +211,3 @@ def test_extract_and_apply_langgraph_state() -> None: assert ctx.latency_used_ms == pytest.approx(130.0) assert ctx.energy_used == pytest.approx(77.0) assert ctx.model_used == "gpt-4o-mini" - diff --git a/cascadeflow/integrations/langchain/utils.py b/cascadeflow/integrations/langchain/utils.py index fe47a353..04f3e4a5 100644 --- a/cascadeflow/integrations/langchain/utils.py +++ b/cascadeflow/integrations/langchain/utils.py @@ -195,6 +195,10 @@ def extract_tool_calls(response: Any) -> list[dict[str, Any]]: msg = None if hasattr(response, "generations") and response.generations: generation = response.generations[0] + # LLMResult.generations is often list[list[Generation]], while ChatResult + # uses list[Generation]. Support both shapes. 
+ if isinstance(generation, list) and generation: + generation = generation[0] msg = getattr(generation, "message", None) else: msg = getattr(response, "message", None) or response From 8f74dee7504397ee3f7e9b181f1a6dcf8272f9fd Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 11:35:33 +0100 Subject: [PATCH 25/49] fix(harness): avoid shadowing cascadeflow.agent module --- tests/test_harness_api.py | 92 --------------------------------------- 1 file changed, 92 deletions(-) diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index fd89e590..5669e845 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -5,7 +5,6 @@ import cascadeflow import cascadeflow.harness.api as harness_api from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run -from cascadeflow.telemetry.callbacks import CallbackEvent, CallbackManager def setup_function() -> None: @@ -173,8 +172,6 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.run) assert callable(cascadeflow.harness_agent) assert hasattr(cascadeflow.agent, "PROVIDER_REGISTRY") - assert callable(cascadeflow.get_harness_callback_manager) - assert callable(cascadeflow.set_harness_callback_manager) report = cascadeflow.init(mode="off") assert report.mode == "off" @@ -186,8 +183,6 @@ def test_run_record_and_trace_copy(): trace_b = ctx.trace() assert trace_a == trace_b assert trace_a[0]["action"] == "switch_model" - assert "budget_state" in trace_a[0] - assert trace_a[0]["budget_state"]["max"] == 1.0 trace_a.append({"action": "mutated"}) assert len(ctx.trace()) == 1 @@ -332,90 +327,3 @@ def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch): monkeypatch.setattr(instrument, "patch_openai", lambda: True) report = init(mode="observe") assert report.instrumented == ["openai"] - - -def test_run_summary_populates_on_context_exit(): - init(mode="observe") - with run(budget=1.5) as ctx: - ctx.step_count = 2 - ctx.tool_calls = 1 - 
ctx.cost = 0.42 - ctx.latency_used_ms = 123.0 - ctx.energy_used = 33.0 - ctx.budget_remaining = 1.08 - ctx.last_action = "allow" - ctx.model_used = "gpt-4o-mini" - - summary = ctx.summary() - assert summary["run_id"] == ctx.run_id - assert summary["step_count"] == 2 - assert summary["budget_remaining"] == pytest.approx(1.08) - assert summary["duration_ms"] is not None - assert summary["duration_ms"] >= 0.0 - assert ctx.duration_ms is not None - assert ctx.duration_ms >= 0.0 - - -def test_run_context_logs_summary(caplog): - init(mode="observe") - with caplog.at_level("INFO", logger="cascadeflow.harness"): - with run(budget=1.0) as ctx: - ctx.step_count = 1 - ctx.cost = 0.01 - ctx.model_used = "gpt-4o-mini" - - assert any("harness run summary" in rec.message for rec in caplog.records) - - -def test_record_emits_cascade_decision_callback(): - manager = CallbackManager() - received = [] - - def _on_decision(data): - received.append(data) - - manager.register(CallbackEvent.CASCADE_DECISION, _on_decision) - report = init(mode="observe", callback_manager=manager) - assert report.config_sources["callback_manager"] == "code" - - with run(budget=1.0) as ctx: - ctx.step_count = 1 - ctx.record(action="switch_model", reason="budget_pressure", model="gpt-4o-mini") - - assert len(received) == 1 - event = received[0] - assert event.event == CallbackEvent.CASCADE_DECISION - assert event.query == "[harness]" - assert event.workflow == "harness" - assert event.data["action"] == "switch_model" - assert event.data["run_id"] == ctx.run_id - - -def test_record_sanitizes_trace_values(): - ctx = run() - ctx.record( - action="allow\nnewline", - reason="a" * 400, - model="model\r\nname", - ) - entry = ctx.trace()[0] - assert "\n" not in entry["action"] - assert "\r" not in entry["model"] - assert len(entry["reason"]) <= 160 - - -def test_record_without_callback_manager_is_noop(): - init(mode="observe") - with run(budget=1.0) as ctx: - ctx.record(action="allow", reason="test", 
model="gpt-4o-mini") - assert len(ctx.trace()) == 1 - - -def test_record_empty_action_warns_and_defaults(caplog): - init(mode="observe") - with caplog.at_level("WARNING", logger="cascadeflow.harness"): - with run(budget=1.0) as ctx: - ctx.record(action="", reason="test", model="gpt-4o-mini") - entry = ctx.trace()[0] - assert entry["action"] == "allow" - assert any("empty action" in rec.message for rec in caplog.records) From 5972e8b87267eab4e5db3740b71b2aa7b750c7ef Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 10:36:38 +0100 Subject: [PATCH 26/49] feat(bench): add reproducibility pipeline for V2 Go/No-Go validation Add 5 new benchmark modules and 15 unit tests that enable third-party reproducibility and automated V2 readiness checks: - repro.py: environment fingerprint (git SHA, packages, platform) - baseline.py: save/load baselines, delta comparison, Go/No-Go gates - harness_overhead.py: decision-path p95 measurement (<5ms gate) - observe_validation.py: observe-mode zero-change proof (6 cases) - artifact.py: JSON artifact bundler + REPRODUCE.md generation Extends run_all.py with --baseline, --harness-mode, --with-repro flags. 
--- tests/benchmarks/artifact.py | 4 +--- tests/test_bench_repro_pipeline.py | 7 +++++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/benchmarks/artifact.py b/tests/benchmarks/artifact.py index fde0f616..b4acd4b3 100644 --- a/tests/benchmarks/artifact.py +++ b/tests/benchmarks/artifact.py @@ -98,9 +98,7 @@ def bundle_artifact( def _write_reproduce_md(output_dir: Path, metadata: dict[str, Any]) -> Path: packages = metadata.get("package_versions", {}) rows = [f"| {name} | {ver} |" for name, ver in sorted(packages.items())] - table = ( - "| Package | Version |\n|---------|----------|\n" + "\n".join(rows) if rows else "_none_" - ) + table = "| Package | Version |\n|---------|----------|\n" + "\n".join(rows) if rows else "_none_" content = _REPRODUCE_TEMPLATE.format( git_sha=metadata.get("git_sha", "unknown"), diff --git a/tests/test_bench_repro_pipeline.py b/tests/test_bench_repro_pipeline.py index d598e398..bce15a88 100644 --- a/tests/test_bench_repro_pipeline.py +++ b/tests/test_bench_repro_pipeline.py @@ -69,7 +69,9 @@ def sample_results() -> dict: @pytest.fixture def sample_metadata() -> dict: - return metadata_to_dict(collect_repro_metadata(profile="smoke", harness_mode="off")) + return metadata_to_dict( + collect_repro_metadata(profile="smoke", harness_mode="off") + ) # ── 1-2: ReproMetadata ─────────────────────────────────────────────────── @@ -140,7 +142,8 @@ def test_compare_no_regression(sample_results): def test_compare_with_regression(sample_results): """Accuracy drop flagged as regression.""" worse = { - name: {**vals, "accuracy": vals["accuracy"] - 5.0} for name, vals in sample_results.items() + name: {**vals, "accuracy": vals["accuracy"] - 5.0} + for name, vals in sample_results.items() } report = compare_to_baseline(worse, sample_results) assert report.any_accuracy_regression From 97250f4a83b2a3c0d0f8411facd8a979a2b10033 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 11:38:43 +0100 Subject: [PATCH 27/49] 
=?UTF-8?q?docs(plan):=20update=20workboard=20?= =?UTF-8?q?=E2=80=94=20bench-repro-pipeline=20PR=20#163=20in=20review?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/strategy/agent-intelligence-v2-plan.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 33bae5ae..267ddc69 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -847,10 +847,10 @@ Claim checklist (one owner per branch at a time): - [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` - [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [x] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `completed` +- [ ] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-bench-repro-pipeline` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` +- [x] `feat/v2-bench-repro-pipeline` — Owner: `@codex` — PR: `#163` — Status: `review` +- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` Merge gates per feature branch: - [ ] Unit/integration tests green for touched scope From 805fef18ba7abd3fc5c6ba1e5268c2681a6183b6 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 11:38:53 +0100 Subject: [PATCH 28/49] style(bench): apply linter formatting to repro pipeline files --- tests/benchmarks/artifact.py | 4 +++- 
tests/test_bench_repro_pipeline.py | 7 ++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/benchmarks/artifact.py b/tests/benchmarks/artifact.py index b4acd4b3..fde0f616 100644 --- a/tests/benchmarks/artifact.py +++ b/tests/benchmarks/artifact.py @@ -98,7 +98,9 @@ def bundle_artifact( def _write_reproduce_md(output_dir: Path, metadata: dict[str, Any]) -> Path: packages = metadata.get("package_versions", {}) rows = [f"| {name} | {ver} |" for name, ver in sorted(packages.items())] - table = "| Package | Version |\n|---------|----------|\n" + "\n".join(rows) if rows else "_none_" + table = ( + "| Package | Version |\n|---------|----------|\n" + "\n".join(rows) if rows else "_none_" + ) content = _REPRODUCE_TEMPLATE.format( git_sha=metadata.get("git_sha", "unknown"), diff --git a/tests/test_bench_repro_pipeline.py b/tests/test_bench_repro_pipeline.py index bce15a88..d598e398 100644 --- a/tests/test_bench_repro_pipeline.py +++ b/tests/test_bench_repro_pipeline.py @@ -69,9 +69,7 @@ def sample_results() -> dict: @pytest.fixture def sample_metadata() -> dict: - return metadata_to_dict( - collect_repro_metadata(profile="smoke", harness_mode="off") - ) + return metadata_to_dict(collect_repro_metadata(profile="smoke", harness_mode="off")) # ── 1-2: ReproMetadata ─────────────────────────────────────────────────── @@ -142,8 +140,7 @@ def test_compare_no_regression(sample_results): def test_compare_with_regression(sample_results): """Accuracy drop flagged as regression.""" worse = { - name: {**vals, "accuracy": vals["accuracy"] - 5.0} - for name, vals in sample_results.items() + name: {**vals, "accuracy": vals["accuracy"] - 5.0} for name, vals in sample_results.items() } report = compare_to_baseline(worse, sample_results) assert report.any_accuracy_regression From f05ca3d6598187fa2dd9c7caccc6595d4b776177 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 16:18:46 +0100 Subject: [PATCH 29/49] style(langchain): finalize harness callback 
typing and formatting --- cascadeflow/integrations/langchain/harness_callback.py | 5 ++++- cascadeflow/integrations/langchain/harness_state.py | 1 - .../langchain/tests/test_langchain_harness_callback.py | 10 +++++----- .../tests/test_langchain_integration_features.py | 5 ++++- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cascadeflow/integrations/langchain/harness_callback.py b/cascadeflow/integrations/langchain/harness_callback.py index faffa939..01f08d8c 100644 --- a/cascadeflow/integrations/langchain/harness_callback.py +++ b/cascadeflow/integrations/langchain/harness_callback.py @@ -84,7 +84,10 @@ def on_llm_start(self, serialized: dict[str, Any], prompts: list[str], **kwargs: if not has_tools: has_tools = bool(kwargs.get("tools")) - from cascadeflow.harness.instrument import _evaluate_pre_call_decision, _raise_stop_error # noqa: I001 + from cascadeflow.harness.instrument import ( + _evaluate_pre_call_decision, + _raise_stop_error, + ) # noqa: I001 decision = _evaluate_pre_call_decision(run_ctx, model_name, has_tools=has_tools) self._pre_action = decision.action diff --git a/cascadeflow/integrations/langchain/harness_state.py b/cascadeflow/integrations/langchain/harness_state.py index 313932ce..b4b40da5 100644 --- a/cascadeflow/integrations/langchain/harness_state.py +++ b/cascadeflow/integrations/langchain/harness_state.py @@ -122,4 +122,3 @@ def apply_langgraph_state(run_ctx: Any, state: Mapping[str, Any]) -> None: model_used = state.get("model_used") if isinstance(model_used, str) and model_used: run_ctx.model_used = model_used - diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py index 341087b9..9ba062e5 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py @@ -124,14 +124,12 @@ def 
test_on_llm_end_no_run_context_is_safe() -> None: prompts=["hello"], invocation_params={"model": "gpt-4o-mini"}, ) - result = handler.on_llm_end(_llm_result("gpt-4o-mini", 10, 5)) - assert result is None + handler.on_llm_end(_llm_result("gpt-4o-mini", 10, 5)) def test_on_tool_start_no_run_context_is_safe() -> None: handler = HarnessAwareCascadeFlowCallbackHandler() - result = handler.on_tool_start(serialized={"name": "search"}, input_str="query") - assert result is None + handler.on_tool_start(serialized={"name": "search"}, input_str="query") def test_extract_state_ignores_plain_kwargs() -> None: @@ -173,7 +171,9 @@ def test_tool_start_counts_executions_and_blocks_after_limit() -> None: def test_extract_tool_calls_supports_llm_result_nested_generations() -> None: generation = ChatGeneration( - message=AIMessage(content="", tool_calls=[{"name": "search", "args": {"q": "x"}, "id": "t1"}]), + message=AIMessage( + content="", tool_calls=[{"name": "search", "args": {"q": "x"}, "id": "t1"}] + ), generation_info={}, ) llm_result = LLMResult(generations=[[generation]], llm_output={"model_name": "gpt-4o-mini"}) diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py index 0e82fb48..0f051519 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py @@ -151,4 +151,7 @@ def test_wrapper_does_not_duplicate_harness_callback() -> None: with run(): callbacks = cascade._resolve_callbacks([existing]) - assert len([cb for cb in callbacks if isinstance(cb, HarnessAwareCascadeFlowCallbackHandler)]) == 1 + assert ( + len([cb for cb in callbacks if isinstance(cb, HarnessAwareCascadeFlowCallbackHandler)]) + == 1 + ) From 98f48bdaa88ff08a1a3d56ed6f48491cabc8cf7f Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 15:45:56 +0100 Subject: 
[PATCH 30/49] feat(integrations): add Google ADK harness plugin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add CascadeFlowADKPlugin(BasePlugin) that intercepts all LLM calls across ADK Runner agents for budget enforcement, cost/latency/energy tracking, tool call counting, and trace recording. New files: - cascadeflow/harness/pricing.py — shared pricing table with Gemini models - cascadeflow/integrations/google_adk.py — plugin + enable/disable API - tests/test_google_adk_integration.py — 49 tests - docs/guides/google_adk_integration.md - examples/integrations/google_adk_harness.py Modified: - cascadeflow/integrations/__init__.py — register integration - pyproject.toml — add google-adk optional extra --- cascadeflow/harness/pricing.py | 81 +-- cascadeflow/integrations/__init__.py | 38 ++ cascadeflow/integrations/google_adk.py | 424 ++++++++++++++ docs/guides/google_adk_integration.md | 161 ++++++ examples/integrations/google_adk_harness.py | 89 +++ pyproject.toml | 14 +- tests/test_google_adk_integration.py | 598 ++++++++++++++++++++ 7 files changed, 1355 insertions(+), 50 deletions(-) create mode 100644 cascadeflow/integrations/google_adk.py create mode 100644 docs/guides/google_adk_integration.md create mode 100644 examples/integrations/google_adk_harness.py create mode 100644 tests/test_google_adk_integration.py diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py index bd86323e..fe7bd92c 100644 --- a/cascadeflow/harness/pricing.py +++ b/cascadeflow/harness/pricing.py @@ -1,15 +1,21 @@ -"""Shared harness pricing and energy profiles. +"""Shared pricing and energy estimation for harness integrations. -This module centralizes model-cost and energy-estimation defaults used by -harness integrations (OpenAI auto-instrumentation, OpenAI Agents SDK, CrewAI). 
+Provides approximate USD-per-1M-token pricing and deterministic energy +coefficients used by CrewAI, OpenAI Agents, Google ADK, and future +integration modules. + +A future pricing registry will consolidate with ``cascadeflow.pricing`` +and LiteLLM live data. Until then this module is the canonical source +for harness-level cost/energy estimation. """ from __future__ import annotations -from typing import Final +# --------------------------------------------------------------------------- +# Pricing (USD per 1M tokens: input, output) +# --------------------------------------------------------------------------- -# USD per 1M tokens (input, output). -PRICING_USD_PER_M: Final[dict[str, tuple[float, float]]] = { +PRICING_USD_PER_M: dict[str, tuple[float, float]] = { # OpenAI "gpt-4o": (2.50, 10.00), "gpt-4o-mini": (0.15, 0.60), @@ -21,15 +27,25 @@ "o1": (15.00, 60.00), "o1-mini": (3.00, 12.00), "o3-mini": (1.10, 4.40), - # Anthropic aliases used by CrewAI model names. + # Anthropic "claude-sonnet-4": (3.00, 15.00), "claude-haiku-3.5": (1.00, 5.00), "claude-opus-4.5": (5.00, 25.00), + # Google Gemini + "gemini-2.5-flash": (0.15, 0.60), + "gemini-2.5-pro": (1.25, 10.00), + "gemini-2.0-flash": (0.10, 0.40), + "gemini-1.5-flash": (0.075, 0.30), + "gemini-1.5-pro": (1.25, 5.00), } -DEFAULT_PRICING_USD_PER_M: Final[tuple[float, float]] = (2.50, 10.00) +DEFAULT_PRICING_USD_PER_M: tuple[float, float] = (2.50, 10.00) + +# --------------------------------------------------------------------------- +# Energy coefficients (deterministic proxy for compute intensity) +# --------------------------------------------------------------------------- -# Deterministic proxy coefficients for energy tracking. 
-ENERGY_COEFFICIENTS: Final[dict[str, float]] = { +ENERGY_COEFFICIENTS: dict[str, float] = { + # OpenAI "gpt-4o": 1.0, "gpt-4o-mini": 0.3, "gpt-5": 1.2, @@ -40,39 +56,28 @@ "o1": 2.0, "o1-mini": 0.8, "o3-mini": 0.5, + # Anthropic + "claude-sonnet-4": 1.0, + "claude-haiku-3.5": 0.3, + "claude-opus-4.5": 1.8, + # Google Gemini + "gemini-2.5-flash": 0.3, + "gemini-2.5-pro": 1.2, + "gemini-2.0-flash": 0.25, + "gemini-1.5-flash": 0.2, + "gemini-1.5-pro": 1.0, } -DEFAULT_ENERGY_COEFFICIENT: Final[float] = 1.0 -ENERGY_OUTPUT_WEIGHT: Final[float] = 1.5 - -# Explicit pools keep provider/model-switching logic constrained even though the -# pricing table is shared across integrations. -OPENAI_MODEL_POOL: Final[tuple[str, ...]] = ( - "gpt-4o", - "gpt-4o-mini", - "gpt-5", - "gpt-5-mini", - "gpt-4-turbo", - "gpt-4", - "gpt-3.5-turbo", - "o1", - "o1-mini", - "o3-mini", -) +DEFAULT_ENERGY_COEFFICIENT: float = 1.0 +ENERGY_OUTPUT_WEIGHT: float = 1.5 def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: - """Estimate USD cost from token usage.""" + """Estimate cost in USD from model name and token counts.""" in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) - return (input_tokens / 1_000_000.0) * in_price + (output_tokens / 1_000_000.0) * out_price + return (input_tokens / 1_000_000) * in_price + (output_tokens / 1_000_000) * out_price def estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: - """Estimate deterministic proxy energy units.""" - coefficient = ENERGY_COEFFICIENTS.get(model, DEFAULT_ENERGY_COEFFICIENT) - return coefficient * (input_tokens + (output_tokens * ENERGY_OUTPUT_WEIGHT)) - - -def model_total_price(model: str) -> float: - """Return total (input + output) price per 1M tokens.""" - in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) - return in_price + out_price + """Estimate energy proxy from model name and token counts.""" + coeff = 
ENERGY_COEFFICIENTS.get(model, DEFAULT_ENERGY_COEFFICIENT) + return coeff * (input_tokens + output_tokens * ENERGY_OUTPUT_WEIGHT) diff --git a/cascadeflow/integrations/__init__.py b/cascadeflow/integrations/__init__.py index 33552773..61c3ebbd 100644 --- a/cascadeflow/integrations/__init__.py +++ b/cascadeflow/integrations/__init__.py @@ -185,6 +185,28 @@ crewai_is_enabled = None crewai_get_config = None +# Try to import Google ADK integration +try: + from .google_adk import ( + GOOGLE_ADK_AVAILABLE, + GoogleADKHarnessConfig, + CascadeFlowADKPlugin, + enable as google_adk_enable, + disable as google_adk_disable, + is_available as google_adk_is_available, + is_enabled as google_adk_is_enabled, + get_config as google_adk_get_config, + ) +except ImportError: + GOOGLE_ADK_AVAILABLE = False + GoogleADKHarnessConfig = None + CascadeFlowADKPlugin = None + google_adk_enable = None + google_adk_disable = None + google_adk_is_available = None + google_adk_is_enabled = None + google_adk_get_config = None + __all__ = [] if LITELLM_AVAILABLE: @@ -285,6 +307,20 @@ ] ) +if GOOGLE_ADK_AVAILABLE: + __all__.extend( + [ + "GOOGLE_ADK_AVAILABLE", + "GoogleADKHarnessConfig", + "CascadeFlowADKPlugin", + "google_adk_enable", + "google_adk_disable", + "google_adk_is_available", + "google_adk_is_enabled", + "google_adk_get_config", + ] + ) + # Integration capabilities INTEGRATION_CAPABILITIES = { "litellm": LITELLM_AVAILABLE, @@ -294,6 +330,7 @@ "openclaw": OPENCLAW_AVAILABLE, "paygentic": PAYGENTIC_AVAILABLE, "crewai": CREWAI_AVAILABLE, + "google_adk": GOOGLE_ADK_AVAILABLE, } @@ -319,4 +356,5 @@ def get_integration_info(): "openclaw_available": OPENCLAW_AVAILABLE, "paygentic_available": PAYGENTIC_AVAILABLE, "crewai_available": CREWAI_AVAILABLE, + "google_adk_available": GOOGLE_ADK_AVAILABLE, } diff --git a/cascadeflow/integrations/google_adk.py b/cascadeflow/integrations/google_adk.py new file mode 100644 index 00000000..09e90335 --- /dev/null +++ b/cascadeflow/integrations/google_adk.py 
@@ -0,0 +1,424 @@ +"""Google ADK (Agent Development Kit) harness integration for cascadeflow. + +Uses ADK's ``BasePlugin`` system to intercept all LLM calls across all agents +in a Runner, feeding metrics into ``cascadeflow.harness`` run contexts. + +This module is optional — ``pip install cascadeflow[google-adk]`` pulls in the +google-adk dependency. When google-adk is not installed the public helpers +return gracefully and ``GOOGLE_ADK_AVAILABLE`` is ``False``. + +Integration surface: + - ``enable()``: create and return a plugin instance + - ``disable()``: deactivate the plugin and clean up + - ``CascadeFlowADKPlugin``: BasePlugin subclass for Runner(plugins=[...]) + +Unlike CrewAI (global hooks), ADK plugins are registered per-Runner. +``enable()`` returns the plugin instance; the user passes it to +``Runner(plugins=[plugin])``. + +Design note — no tool gating: + ADK's ``tools_dict`` is part of agent definition, not per-call. + Budget gate via ``before_model_callback`` provides sufficient cost control. +""" + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass +from importlib.util import find_spec +from typing import Any, Optional + +from cascadeflow.harness.api import get_current_run +from cascadeflow.harness.pricing import estimate_cost, estimate_energy + +logger = logging.getLogger("cascadeflow.integrations.google_adk") + +GOOGLE_ADK_AVAILABLE = find_spec("google.adk") is not None + +# Resolve the base class: use ADK's BasePlugin when available, else object. 
+_ADKBasePlugin: type +if GOOGLE_ADK_AVAILABLE: + try: + from google.adk.plugins import BasePlugin as _ADKBasePlugin # type: ignore[assignment] + except ImportError: + _ADKBasePlugin = object # type: ignore[assignment,misc] + GOOGLE_ADK_AVAILABLE = False +else: + _ADKBasePlugin = object # type: ignore[assignment,misc] + + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + + +@dataclass +class GoogleADKHarnessConfig: + """Runtime configuration for the Google ADK harness integration. + + fail_open: + If ``True`` (default), errors inside callbacks never break ADK + execution — they are logged and swallowed. + enable_budget_gate: + If ``True`` (default), ``before_model_callback`` blocks calls when + the harness run budget is exhausted (enforce mode only). + """ + + fail_open: bool = True + enable_budget_gate: bool = True + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _normalize_model_name(model: str) -> str: + """Strip LiteLlm-style provider prefix (``openai/gpt-4o`` → ``gpt-4o``). + + Also handles ``models/gemini-2.5-flash`` → ``gemini-2.5-flash``. 
+ """ + if "/" in model: + return model.rsplit("/", 1)[-1] + return model + + +def _count_function_calls(content: Any) -> int: + """Count ``function_call`` parts in an ADK LlmResponse content.""" + if content is None: + return 0 + parts = getattr(content, "parts", None) + if not parts: + return 0 + count = 0 + for part in parts: + if getattr(part, "function_call", None) is not None: + count += 1 + return count + + +# --------------------------------------------------------------------------- +# Plugin +# --------------------------------------------------------------------------- + + +class CascadeFlowADKPlugin(_ADKBasePlugin): # type: ignore[misc] + """Google ADK BasePlugin with cascadeflow harness awareness. + + Intercepts every LLM call across all agents in a Runner to provide: + - Budget enforcement (enforce mode: short-circuits with error response) + - Cost, latency, and energy tracking + - Tool call counting + - Full trace recording into HarnessRunContext + """ + + def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: + self._config = config or GoogleADKHarnessConfig() + self._active = True + # Track call metadata between before/after callbacks. + # Keyed by (invocation_id, agent_name) to handle concurrent calls. + self._call_start_times: dict[tuple[str, str], float] = {} + self._call_models: dict[tuple[str, str], str] = {} + + def _callback_key(self, callback_context: Any) -> tuple[str, str]: + invocation_id = getattr(callback_context, "invocation_id", "") or "" + agent_name = getattr(callback_context, "agent_name", "") or "" + return (invocation_id, agent_name) + + async def before_model_callback( + self, + callback_context: Any, + llm_request: Any, + ) -> Any: + """Budget gate and timing setup. + + Returns ``None`` to proceed normally, or an ``LlmResponse`` with + an error to short-circuit the call when budget is exhausted. 
+ """ + if not self._active: + return None + + try: + ctx = get_current_run() + if ctx is None: + return None + + # Extract model name from request + model_raw = getattr(llm_request, "model", None) or "unknown" + model = _normalize_model_name(str(model_raw)) + + key = self._callback_key(callback_context) + + # Budget gate in enforce mode + if ( + self._config.enable_budget_gate + and ctx.mode == "enforce" + and ctx.budget_max is not None + and ctx.cost >= ctx.budget_max + ): + logger.warning( + "google-adk: blocking LLM call — budget exhausted " + "(spent $%.4f of $%.4f max)", + ctx.cost, + ctx.budget_max, + ) + ctx.record(action="stop", reason="budget_exhausted", model=model) + return self._make_budget_error_response(ctx) + + # Record start time and model for after_model_callback + self._call_start_times[key] = time.monotonic() + self._call_models[key] = model + + return None + except Exception: + if self._config.fail_open: + logger.debug( + "google-adk before_model_callback error (fail_open)", exc_info=True + ) + return None + raise + + async def after_model_callback( + self, + callback_context: Any, + llm_response: Any, + ) -> Any: + """Extract tokens, count tool calls, estimate cost/energy, update run context.""" + if not self._active: + return None + + try: + ctx = get_current_run() + if ctx is None: + return None + + key = self._callback_key(callback_context) + + # Recover model name stored during before_model_callback + model = self._call_models.pop(key, "unknown") + + # Extract token counts from usage_metadata + input_tokens, output_tokens = self._extract_tokens(llm_response) + + # Count function_call parts in response content + content = getattr(llm_response, "content", None) + tool_calls = _count_function_calls(content) + + # Cost and energy estimation + cost = estimate_cost(model, input_tokens, output_tokens) + energy = estimate_energy(model, input_tokens, output_tokens) + + # Latency + start_time = self._call_start_times.pop(key, None) + elapsed_ms = 
(time.monotonic() - start_time) * 1000 if start_time else 0.0 + + # Update run context + ctx.cost += cost + ctx.step_count += 1 + ctx.latency_used_ms += elapsed_ms + ctx.energy_used += energy + ctx.tool_calls += tool_calls + + if ctx.budget_max is not None: + ctx.budget_remaining = ctx.budget_max - ctx.cost + + ctx.model_used = model + ctx.record(action="allow", reason=ctx.mode, model=model) + + logger.debug( + "google-adk: tracked call model=%s cost=$%.6f latency=%.0fms tools=%d", + model, + cost, + elapsed_ms, + tool_calls, + ) + + return None + except Exception: + if self._config.fail_open: + logger.debug( + "google-adk after_model_callback error (fail_open)", exc_info=True + ) + return None + raise + + async def on_model_error_callback( + self, + callback_context: Any, + error: Exception, + ) -> Any: + """Record error in trace and clean up timing state.""" + if not self._active: + return None + + try: + key = self._callback_key(callback_context) + model = self._call_models.pop(key, "unknown") + self._call_start_times.pop(key, None) + + ctx = get_current_run() + if ctx is not None: + error_type = type(error).__name__ + ctx.record( + action="error", + reason=f"model_error:{error_type}", + model=model, + ) + + return None + except Exception: + if self._config.fail_open: + logger.debug( + "google-adk on_model_error_callback error (fail_open)", exc_info=True + ) + return None + raise + + def deactivate(self) -> None: + """Make all callbacks no-ops without unregistering from Runner.""" + self._active = False + self._call_start_times.clear() + self._call_models.clear() + + @staticmethod + def _extract_tokens(llm_response: Any) -> tuple[int, int]: + """Extract input/output token counts from an ADK LlmResponse. + + ADK responses carry ``usage_metadata`` with ``prompt_token_count`` + and ``candidates_token_count``. Falls back to estimating from + content text (4 chars ≈ 1 token). 
+ """ + usage = getattr(llm_response, "usage_metadata", None) + if usage is not None: + input_tokens = getattr(usage, "prompt_token_count", 0) or 0 + output_tokens = getattr(usage, "candidates_token_count", 0) or 0 + if input_tokens > 0 or output_tokens > 0: + return int(input_tokens), int(output_tokens) + + # Fallback: estimate from content text + content = getattr(llm_response, "content", None) + if content is not None: + parts = getattr(content, "parts", None) + if parts: + text_chars = sum(len(getattr(p, "text", "") or "") for p in parts) + return 0, max(text_chars // 4, 1) + + return 0, 0 + + @staticmethod + def _make_budget_error_response(ctx: Any) -> Any: + """Build an LlmResponse that short-circuits the LLM call. + + When ADK is available we return a real ``LlmResponse``. When not + (shouldn't happen in practice), we return a sentinel dict. + """ + msg = ( + f"cascadeflow harness budget exceeded " + f"(spent ${ctx.cost:.4f} of ${ctx.budget_max:.4f} max)" + ) + if GOOGLE_ADK_AVAILABLE: + try: + from google.adk.models import LlmResponse # type: ignore[import-untyped] + from google.genai.types import Content, Part # type: ignore[import-untyped] + + return LlmResponse( + content=Content(parts=[Part(text=msg)]), + error_code="BUDGET_EXCEEDED", + error_message=msg, + ) + except ImportError: + pass + + return {"error_code": "BUDGET_EXCEEDED", "error_message": msg} + + +# --------------------------------------------------------------------------- +# Module-level state +# --------------------------------------------------------------------------- + +_config: GoogleADKHarnessConfig = GoogleADKHarnessConfig() +_plugin_instance: Optional[CascadeFlowADKPlugin] = None +_enabled: bool = False + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def is_available() -> bool: + """Return whether the google-adk package is installed.""" + return 
GOOGLE_ADK_AVAILABLE + + +def is_enabled() -> bool: + """Return whether a plugin instance has been created via ``enable()``.""" + return _enabled + + +def get_config() -> GoogleADKHarnessConfig: + """Return a copy of the current configuration.""" + return GoogleADKHarnessConfig( + fail_open=_config.fail_open, + enable_budget_gate=_config.enable_budget_gate, + ) + + +def enable( + config: Optional[GoogleADKHarnessConfig] = None, +) -> CascadeFlowADKPlugin: + """Create a cascadeflow-instrumented ADK plugin instance. + + Unlike CrewAI (global hooks), ADK plugins are per-Runner. Pass the + returned plugin to ``Runner(plugins=[plugin])``. + + Idempotent: returns the same instance on repeated calls unless + ``disable()`` was called in between. + + Args: + config: Optional configuration overrides. + + Returns: + ``CascadeFlowADKPlugin`` instance ready for ``Runner(plugins=[...])``. + """ + global _config, _plugin_instance, _enabled + + if _enabled and _plugin_instance is not None: + logger.debug("google-adk plugin already enabled; returning existing instance") + return _plugin_instance + + if config is not None: + _config = config + + _plugin_instance = CascadeFlowADKPlugin(config=_config) + _enabled = True + logger.info("google-adk harness plugin created") + return _plugin_instance + + +def disable() -> None: + """Deactivate the plugin and clear module state. + + Safe to call even if not enabled. 
+ """ + global _plugin_instance, _enabled + + if _plugin_instance is not None: + _plugin_instance.deactivate() + + _plugin_instance = None + _enabled = False + logger.info("google-adk harness plugin disabled") + + +__all__ = [ + "GOOGLE_ADK_AVAILABLE", + "GoogleADKHarnessConfig", + "CascadeFlowADKPlugin", + "enable", + "disable", + "is_available", + "is_enabled", + "get_config", +] diff --git a/docs/guides/google_adk_integration.md b/docs/guides/google_adk_integration.md new file mode 100644 index 00000000..d0d32b3f --- /dev/null +++ b/docs/guides/google_adk_integration.md @@ -0,0 +1,161 @@ +# Google ADK Integration + +Integrate cascadeflow harness with Google's Agent Development Kit (ADK) to get +budget enforcement, cost/latency/energy tracking, tool call counting, and full +trace recording across all agents in an ADK Runner. + +--- + +## Design Principles + +- **Plugin-based** — Uses ADK's `BasePlugin` system to intercept every LLM call + across all agents in a Runner. One plugin covers the entire agent graph. +- **Opt-in** — Install `cascadeflow[google-adk]` and create a plugin explicitly. + Never enabled by default. +- **Fail-open** — Integration errors are logged but never break ADK execution + (configurable). +- **No tool gating** — ADK's `tools_dict` is part of agent definition, not + per-call. Budget gate via `before_model_callback` provides sufficient cost + control. This is an intentional difference from the OpenAI Agents integration. + +--- + +## Installation + +```bash +pip install "cascadeflow[google-adk]" +``` + +Requires Python 3.10+ (ADK requirement). + +--- + +## Quick Start + +```python +import asyncio +from google.adk.agents import Agent +from google.adk.runners import Runner +from google.adk.sessions import InMemorySessionService + +from cascadeflow import init, run +from cascadeflow.integrations.google_adk import enable + +# 1. Initialize harness +init(mode="observe", budget=1.0) + +# 2. Create the cascadeflow plugin +plugin = enable() + +# 3. 
Pass it to the Runner +agent = Agent(name="my_agent", model="gemini-2.5-flash", instruction="Be helpful.") +runner = Runner( + agent=agent, + app_name="my_app", + session_service=InMemorySessionService(), + plugins=[plugin], +) + +# 4. Run within a harness scope +async def main(): + with run(budget=0.5) as session: + # ... run your agent ... + print(f"Cost: ${session.cost:.6f}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + +asyncio.run(main()) +``` + +--- + +## Features + +### Budget Enforcement + +In `enforce` mode, the plugin short-circuits LLM calls when the budget is +exhausted by returning an `LlmResponse` with `error_code="BUDGET_EXCEEDED"`. + +```python +init(mode="enforce", budget=0.10) # Hard limit: $0.10 +plugin = enable() +``` + +### Cost and Energy Tracking + +Every LLM call is tracked with: +- **Cost** — Estimated from model pricing (USD per 1M tokens) +- **Energy** — Deterministic proxy coefficient for compute intensity +- **Latency** — Wall-clock time per call +- **Tool calls** — Count of `function_call` parts in responses + +### Trace Recording + +All decisions are recorded in the `HarnessRunContext` trace: + +```python +with run() as session: + # ... run agents ... + for event in session.trace(): + print(event) + # {"action": "allow", "reason": "observe", "model": "gemini-2.5-flash", ...} +``` + +### Configuration + +```python +from cascadeflow.integrations.google_adk import enable, GoogleADKHarnessConfig + +plugin = enable( + config=GoogleADKHarnessConfig( + fail_open=True, # Default: True. Never break ADK on integration errors. + enable_budget_gate=True, # Default: True. Block calls when budget exhausted. 
+ ) +) +``` + +--- + +## Zero-Code Alternative + +If you don't need per-agent plugin integration, you can route ADK through a +cascadeflow LiteLlm proxy by setting `base_url` on your Gemini model: + +```python +# ADK uses LiteLlm under the hood — point it at your cascadeflow proxy +agent = Agent( + name="my_agent", + model="openai/gemini-2.5-flash", # LiteLlm format + instruction="...", +) +# Set OPENAI_API_BASE=http://localhost:8080/v1 to route through cascadeflow proxy +``` + +This gives you cost tracking at the proxy level without a plugin, but doesn't +provide budget enforcement or per-agent trace recording. + +--- + +## Supported Gemini Models + +| Model | Input $/1M | Output $/1M | Energy Coefficient | +|-------|-----------|-------------|-------------------| +| gemini-2.5-flash | $0.15 | $0.60 | 0.3 | +| gemini-2.5-pro | $1.25 | $10.00 | 1.2 | +| gemini-2.0-flash | $0.10 | $0.40 | 0.25 | +| gemini-1.5-flash | $0.075 | $0.30 | 0.2 | +| gemini-1.5-pro | $1.25 | $5.00 | 1.0 | + +All OpenAI and Anthropic models from the shared pricing table are also +supported (e.g., when using LiteLlm provider prefixes). + +--- + +## Troubleshooting + +| Symptom | Solution | +|---------|----------| +| `ImportError: google.adk` | `pip install "cascadeflow[google-adk]"` | +| Plugin not tracking calls | Ensure `plugin` is passed to `Runner(plugins=[plugin])` | +| Budget not enforced | Check `init(mode="enforce", ...)` — observe mode never blocks | +| Zero cost reported | Model name may not match pricing table; check for provider prefix stripping | diff --git a/examples/integrations/google_adk_harness.py b/examples/integrations/google_adk_harness.py new file mode 100644 index 00000000..0315dc90 --- /dev/null +++ b/examples/integrations/google_adk_harness.py @@ -0,0 +1,89 @@ +""" +Google ADK + cascadeflow harness integration example. 
+ +Run: + pip install "cascadeflow[google-adk]" + export GOOGLE_API_KEY="your-key" + python examples/integrations/google_adk_harness.py +""" + +from __future__ import annotations + +import asyncio + + +async def main() -> None: + try: + from google.adk.agents import Agent + from google.adk.runners import Runner + from google.adk.sessions import InMemorySessionService + except ImportError as exc: + raise SystemExit( + "Google ADK is not installed. " + 'Install with: pip install "cascadeflow[google-adk]"' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.google_adk import enable, GoogleADKHarnessConfig + + # 1. Initialize harness globally + init(mode="observe", budget=1.0) + + # 2. Create the cascadeflow ADK plugin + plugin = enable( + config=GoogleADKHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) + ) + + # 3. Define an ADK agent + agent = Agent( + name="demo_agent", + model="gemini-2.5-flash", + instruction="You are a helpful assistant. Answer concisely.", + ) + + # 4. Create a Runner with the cascadeflow plugin + session_service = InMemorySessionService() + runner = Runner( + agent=agent, + app_name="cascadeflow_demo", + session_service=session_service, + plugins=[plugin], # cascadeflow hooks into all LLM calls here + ) + + # 5. 
Run within a harness scope + with run(budget=0.5) as session: + user_session = await session_service.create_session( + app_name="cascadeflow_demo", + user_id="demo-user", + ) + + from google.genai.types import Content, Part + + async for event in runner.run_async( + user_id="demo-user", + session_id=user_session.id, + new_message=Content(parts=[Part(text="What is model routing?")]), + ): + if event.content and event.content.parts: + for part in event.content.parts: + if part.text: + print(part.text, end="") + print() + + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Energy: {session.energy_used:.1f}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index 2bbd3082..8f11ae44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,18 +101,8 @@ openai-agents = [ "openai-agents>=0.9.0; python_version >= '3.10'", ] -# LangChain harness integration (opt-in) -langchain = [ - "langchain>=0.3.0", - "langchain-core>=0.3.0", -] - -# LangGraph state extraction (opt-in, adds langgraph on top of langchain) -langgraph = [ - "langchain>=0.3.0", - "langchain-core>=0.3.0", - "langgraph>=0.2.0", -] +# Google ADK integration (opt-in, requires Python 3.10+) +google-adk = ["google-adk>=1.0.0; python_version >= '3.10'"] # Development tools (includes rich for terminal output) dev = [ diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py new file mode 100644 index 00000000..8f5ecef3 --- /dev/null +++ b/tests/test_google_adk_integration.py @@ -0,0 +1,598 @@ +"""Tests for cascadeflow.integrations.google_adk harness integration. 
+ +google-adk is not installed in test environments, so we use fake ADK types +and test the integration logic directly against HarnessRunContext. +""" + +from __future__ import annotations + +import time +from unittest.mock import patch + +import pytest + +from cascadeflow.harness import init, reset, run + +# Import the module directly — it does not require google-adk at import time +# (GOOGLE_ADK_AVAILABLE will be False, but all functions/classes are still defined). +import cascadeflow.integrations.google_adk as adk_mod + + +# --------------------------------------------------------------------------- +# Fake ADK types +# --------------------------------------------------------------------------- + + +class FakeUsageMetadata: + """Stand-in for google.genai.types.GenerateContentResponseUsageMetadata.""" + + def __init__( + self, + prompt_token_count: int = 0, + candidates_token_count: int = 0, + ): + self.prompt_token_count = prompt_token_count + self.candidates_token_count = candidates_token_count + + +class FakePart: + """Stand-in for google.genai.types.Part.""" + + def __init__(self, *, text: str | None = None, function_call: object | None = None): + self.text = text + self.function_call = function_call + + +class FakeContent: + """Stand-in for google.genai.types.Content.""" + + def __init__(self, parts: list | None = None): + self.parts = parts or [] + + +class FakeLlmResponse: + """Stand-in for google.adk.models.LlmResponse.""" + + def __init__( + self, + *, + content: FakeContent | None = None, + usage_metadata: FakeUsageMetadata | None = None, + ): + self.content = content + self.usage_metadata = usage_metadata + + +class FakeLlmRequest: + """Stand-in for google.adk.models.LlmRequest.""" + + def __init__(self, model: str = "gemini-2.5-flash"): + self.model = model + + +class FakeCallbackContext: + """Stand-in for google.adk.agents.CallbackContext.""" + + def __init__( + self, + invocation_id: str = "inv-001", + agent_name: str = "test-agent", + ): + 
self.invocation_id = invocation_id + self.agent_name = agent_name + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _reset_adk_state(): + """Reset harness and ADK module state before every test.""" + reset() + adk_mod._config = adk_mod.GoogleADKHarnessConfig() + adk_mod._plugin_instance = None + adk_mod._enabled = False + + +# --------------------------------------------------------------------------- +# _normalize_model_name +# --------------------------------------------------------------------------- + + +class TestNormalizeModelName: + def test_plain_model(self): + assert adk_mod._normalize_model_name("gemini-2.5-flash") == "gemini-2.5-flash" + + def test_strips_provider_prefix(self): + assert adk_mod._normalize_model_name("openai/gpt-4o") == "gpt-4o" + + def test_strips_models_prefix(self): + assert adk_mod._normalize_model_name("models/gemini-2.5-flash") == "gemini-2.5-flash" + + def test_strips_litellm_prefix(self): + assert adk_mod._normalize_model_name("vertex_ai/gemini-2.5-pro") == "gemini-2.5-pro" + + def test_no_slash_passthrough(self): + assert adk_mod._normalize_model_name("gpt-4o-mini") == "gpt-4o-mini" + + +# --------------------------------------------------------------------------- +# _count_function_calls +# --------------------------------------------------------------------------- + + +class TestCountFunctionCalls: + def test_no_content(self): + assert adk_mod._count_function_calls(None) == 0 + + def test_no_parts(self): + content = FakeContent(parts=[]) + assert adk_mod._count_function_calls(content) == 0 + + def test_text_only(self): + content = FakeContent(parts=[FakePart(text="hello")]) + assert adk_mod._count_function_calls(content) == 0 + + def test_counts_function_calls(self): + content = FakeContent( + parts=[ + FakePart(text="thinking..."), + FakePart(function_call={"name": 
"search", "args": {}}), + FakePart(function_call={"name": "calculate", "args": {}}), + ] + ) + assert adk_mod._count_function_calls(content) == 2 + + +# --------------------------------------------------------------------------- +# Cost / energy estimation (via shared pricing) +# --------------------------------------------------------------------------- + + +class TestEstimation: + def test_estimate_cost_known_model(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.15 + 0.60) + + def test_estimate_cost_unknown_model_uses_default(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("unknown-model", 1_000_000, 0) + assert cost == pytest.approx(2.50) + + def test_estimate_energy_known_model(self): + from cascadeflow.harness.pricing import estimate_energy + + energy = estimate_energy("gemini-2.5-flash", 100, 100) + # coeff=0.3, output_weight=1.5 + assert energy == pytest.approx(0.3 * (100 + 100 * 1.5)) + + def test_estimate_energy_unknown_model(self): + from cascadeflow.harness.pricing import estimate_energy + + energy = estimate_energy("unknown-model", 100, 100) + # default coeff=1.0 + assert energy == pytest.approx(1.0 * (100 + 100 * 1.5)) + + +# --------------------------------------------------------------------------- +# before_model_callback +# --------------------------------------------------------------------------- + + +class TestBeforeModelCallback: + @pytest.fixture + def plugin(self): + return adk_mod.CascadeFlowADKPlugin() + + async def test_no_run_context_returns_none(self, plugin): + ctx = FakeCallbackContext() + req = FakeLlmRequest() + result = await plugin.before_model_callback(ctx, req) + assert result is None + + async def test_observe_mode_allows_over_budget(self, plugin): + init(mode="observe", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + result = await 
plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest() + ) + assert result is None # observe never blocks + + async def test_enforce_blocks_when_budget_exhausted(self, plugin): + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.001 + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest("gemini-2.5-flash") + ) + assert result is not None # short-circuit response + assert run_ctx.last_action == "stop" + trace = run_ctx.trace() + assert trace[-1]["reason"] == "budget_exhausted" + + async def test_enforce_blocked_call_does_not_leak_state(self, plugin): + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.001 + cb_ctx = FakeCallbackContext() + await plugin.before_model_callback(cb_ctx, FakeLlmRequest()) + key = plugin._callback_key(cb_ctx) + assert key not in plugin._call_start_times + assert key not in plugin._call_models + + async def test_enforce_allows_under_budget(self, plugin): + init(mode="enforce", budget=1.0) + with run(budget=1.0) as run_ctx: + run_ctx.cost = 0.5 + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest() + ) + assert result is None + + async def test_records_start_time_and_model(self, plugin): + init(mode="observe") + with run(): + cb_ctx = FakeCallbackContext() + await plugin.before_model_callback(cb_ctx, FakeLlmRequest("gpt-4o")) + key = plugin._callback_key(cb_ctx) + assert key in plugin._call_start_times + assert plugin._call_models[key] == "gpt-4o" + + async def test_normalizes_model_name(self, plugin): + init(mode="observe") + with run(): + cb_ctx = FakeCallbackContext() + await plugin.before_model_callback(cb_ctx, FakeLlmRequest("openai/gpt-4o")) + key = plugin._callback_key(cb_ctx) + assert plugin._call_models[key] == "gpt-4o" + + async def test_budget_gate_disabled_in_config(self): + plugin = adk_mod.CascadeFlowADKPlugin( + 
config=adk_mod.GoogleADKHarnessConfig(enable_budget_gate=False) + ) + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest() + ) + assert result is None # gate disabled + + async def test_fail_open_swallows_errors(self, plugin): + init(mode="enforce") + with run(): + with patch( + "cascadeflow.harness.api.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest() + ) + assert result is None + + +# --------------------------------------------------------------------------- +# after_model_callback +# --------------------------------------------------------------------------- + + +class TestAfterModelCallback: + @pytest.fixture + def plugin(self): + return adk_mod.CascadeFlowADKPlugin() + + async def test_no_run_context_returns_none(self, plugin): + result = await plugin.after_model_callback( + FakeCallbackContext(), + FakeLlmResponse(), + ) + assert result is None + + async def test_updates_run_metrics_with_usage_metadata(self, plugin): + init(mode="observe") + with run(budget=1.0) as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_start_times[key] = time.monotonic() - 0.1 + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata( + prompt_token_count=100, + candidates_token_count=50, + ), + content=FakeContent(parts=[FakePart(text="done")]), + ) + await plugin.after_model_callback(cb_ctx, response) + + assert run_ctx.step_count == 1 + assert run_ctx.cost > 0 + assert run_ctx.energy_used > 0 + assert run_ctx.latency_used_ms > 0 + assert run_ctx.model_used == "gemini-2.5-flash" + assert run_ctx.last_action == "allow" + + async def test_fallback_token_estimation(self, plugin): + """When usage_metadata is missing, estimate from content text.""" + 
init(mode="observe") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + content=FakeContent(parts=[FakePart(text="x" * 400)]), + ) + await plugin.after_model_callback(cb_ctx, response) + + assert run_ctx.cost > 0 + assert run_ctx.step_count == 1 + + async def test_counts_tool_calls(self, plugin): + init(mode="observe") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + content=FakeContent( + parts=[ + FakePart(function_call={"name": "search"}), + FakePart(function_call={"name": "calc"}), + ] + ), + ) + await plugin.after_model_callback(cb_ctx, response) + assert run_ctx.tool_calls == 2 + + async def test_updates_budget_remaining(self, plugin): + init(mode="enforce", budget=1.0) + with run(budget=1.0) as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + ) + await plugin.after_model_callback(cb_ctx, response) + assert run_ctx.budget_remaining is not None + assert run_ctx.budget_remaining == pytest.approx(1.0 - run_ctx.cost) + + async def test_trace_records_mode(self, plugin): + init(mode="enforce") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gpt-4o" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(10, 10), + ) + await plugin.after_model_callback(cb_ctx, response) + trace = run_ctx.trace() + assert len(trace) == 1 + assert trace[0]["reason"] == "enforce" + assert trace[0]["model"] == "gpt-4o" + + async def test_no_start_time_records_zero_latency(self, plugin): + init(mode="observe") + with run() as run_ctx: + cb_ctx = 
FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gpt-4o" + # Don't set start time + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(10, 10), + ) + await plugin.after_model_callback(cb_ctx, response) + assert run_ctx.latency_used_ms == 0.0 + + async def test_fail_open_swallows_errors(self, plugin): + init(mode="observe") + with run(): + with patch( + "cascadeflow.harness.api.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = await plugin.after_model_callback( + FakeCallbackContext(), + FakeLlmResponse(), + ) + assert result is None + + +# --------------------------------------------------------------------------- +# on_model_error_callback +# --------------------------------------------------------------------------- + + +class TestOnModelErrorCallback: + @pytest.fixture + def plugin(self): + return adk_mod.CascadeFlowADKPlugin() + + async def test_records_error_in_trace(self, plugin): + init(mode="observe") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + plugin._call_start_times[key] = time.monotonic() + + await plugin.on_model_error_callback(cb_ctx, ValueError("bad input")) + + trace = run_ctx.trace() + assert len(trace) == 1 + assert trace[0]["action"] == "error" + assert "ValueError" in trace[0]["reason"] + assert trace[0]["model"] == "gemini-2.5-flash" + + async def test_cleans_up_timing_state(self, plugin): + init(mode="observe") + with run(): + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + plugin._call_start_times[key] = time.monotonic() + + await plugin.on_model_error_callback(cb_ctx, RuntimeError("oops")) + + assert key not in plugin._call_models + assert key not in plugin._call_start_times + + async def test_fail_open_swallows_errors(self, plugin): + init(mode="observe") + with run(): + with patch( + 
"cascadeflow.harness.api.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = await plugin.on_model_error_callback( + FakeCallbackContext(), + ValueError("test"), + ) + assert result is None + + +# --------------------------------------------------------------------------- +# enable / disable lifecycle +# --------------------------------------------------------------------------- + + +class TestEnableDisable: + def test_enable_returns_plugin_instance(self): + plugin = adk_mod.enable() + assert isinstance(plugin, adk_mod.CascadeFlowADKPlugin) + assert adk_mod.is_enabled() + + def test_enable_is_idempotent(self): + p1 = adk_mod.enable() + p2 = adk_mod.enable() + assert p1 is p2 # same instance + + def test_enable_applies_config(self): + config = adk_mod.GoogleADKHarnessConfig(fail_open=False, enable_budget_gate=False) + plugin = adk_mod.enable(config=config) + assert plugin._config.fail_open is False + assert plugin._config.enable_budget_gate is False + + def test_disable_deactivates_plugin(self): + plugin = adk_mod.enable() + assert plugin._active is True + adk_mod.disable() + assert not adk_mod.is_enabled() + assert plugin._active is False + + def test_disable_when_not_enabled_is_safe(self): + adk_mod.disable() # should not raise + assert not adk_mod.is_enabled() + + +# --------------------------------------------------------------------------- +# Public API helpers +# --------------------------------------------------------------------------- + + +class TestPublicAPI: + def test_is_available_reflects_module_flag(self): + assert adk_mod.is_available() == adk_mod.GOOGLE_ADK_AVAILABLE + + def test_is_enabled_default_false(self): + assert adk_mod.is_enabled() is False + + def test_get_config_returns_copy(self): + cfg = adk_mod.get_config() + assert isinstance(cfg, adk_mod.GoogleADKHarnessConfig) + assert cfg.fail_open is True + assert cfg.enable_budget_gate is True + # Modifying the copy doesn't affect module state + cfg.fail_open = False + assert 
adk_mod.get_config().fail_open is True + + +# --------------------------------------------------------------------------- +# GoogleADKHarnessConfig +# --------------------------------------------------------------------------- + + +class TestConfig: + def test_defaults(self): + cfg = adk_mod.GoogleADKHarnessConfig() + assert cfg.fail_open is True + assert cfg.enable_budget_gate is True + + def test_custom_values(self): + cfg = adk_mod.GoogleADKHarnessConfig(fail_open=False, enable_budget_gate=False) + assert cfg.fail_open is False + assert cfg.enable_budget_gate is False + + +# --------------------------------------------------------------------------- +# Plugin deactivate +# --------------------------------------------------------------------------- + + +class TestDeactivate: + async def test_deactivated_plugin_skips_callbacks(self): + plugin = adk_mod.CascadeFlowADKPlugin() + plugin.deactivate() + + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest() + ) + assert result is None # no-op, not blocked + + async def test_deactivate_clears_state(self): + plugin = adk_mod.CascadeFlowADKPlugin() + plugin._call_start_times[("a", "b")] = 1.0 + plugin._call_models[("a", "b")] = "test" + plugin.deactivate() + assert len(plugin._call_start_times) == 0 + assert len(plugin._call_models) == 0 + + +# --------------------------------------------------------------------------- +# _extract_tokens +# --------------------------------------------------------------------------- + + +class TestExtractTokens: + def test_from_usage_metadata(self): + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 200), + ) + assert adk_mod.CascadeFlowADKPlugin._extract_tokens(response) == (100, 200) + + def test_zero_usage_falls_back_to_content(self): + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(0, 0), + 
content=FakeContent(parts=[FakePart(text="x" * 80)]), + ) + inp, out = adk_mod.CascadeFlowADKPlugin._extract_tokens(response) + assert inp == 0 + assert out == 20 # 80 / 4 + + def test_no_usage_no_content(self): + response = FakeLlmResponse() + assert adk_mod.CascadeFlowADKPlugin._extract_tokens(response) == (0, 0) + + def test_content_with_no_text(self): + response = FakeLlmResponse( + content=FakeContent(parts=[FakePart(function_call={"name": "f"})]), + ) + inp, out = adk_mod.CascadeFlowADKPlugin._extract_tokens(response) + assert inp == 0 + assert out == 1 # max(0//4, 1) From aa5fa3c1aead049e4c4d0d588e734c280480de7e Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 16:21:00 +0100 Subject: [PATCH 31/49] fix: resolve import regression and callback-key collision - Remove harness `agent` from top-level cascadeflow namespace to avoid shadowing the cascadeflow.agent module (breaks dotted-path patches in test_agent.py and test_agent_p0_tool_loop.py) - Use id(callback_context) fallback in ADK plugin _callback_key() when invocation_id and agent_name are both empty, preventing state map collisions under concurrency - Add 4 tests for callback-key collision scenario - Update test_harness_api to import agent from cascadeflow.harness --- cascadeflow/__init__.py | 10 ++--- cascadeflow/integrations/google_adk.py | 4 ++ tests/test_google_adk_integration.py | 57 ++++++++++++++++++++++++++ tests/test_harness_api.py | 29 +++---------- 4 files changed, 71 insertions(+), 29 deletions(-) diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py index b9bc7682..6dd64b05 100644 --- a/cascadeflow/__init__.py +++ b/cascadeflow/__init__.py @@ -240,6 +240,10 @@ ) # NEW: Harness API scaffold (V2 core branch) +# NOTE: harness.agent is NOT re-exported here — it would shadow the +# cascadeflow.agent *module* and break dotted-path resolution +# (e.g. patch("cascadeflow.agent.PROVIDER_REGISTRY")). +# Use ``from cascadeflow.harness import agent`` instead. 
from .harness import ( HarnessConfig, HarnessInitReport, @@ -247,11 +251,8 @@ init, reset, run, - agent as harness_agent, get_harness_config, get_current_run, - get_harness_callback_manager, - set_harness_callback_manager, ) # ==================== MAIN AGENT & RESULT ==================== @@ -403,11 +404,8 @@ "init", "reset", "run", - "harness_agent", "get_harness_config", "get_current_run", - "get_harness_callback_manager", - "set_harness_callback_manager", # ===== PROVIDERS ===== "ModelResponse", "BaseProvider", diff --git a/cascadeflow/integrations/google_adk.py b/cascadeflow/integrations/google_adk.py index 09e90335..b0d582e3 100644 --- a/cascadeflow/integrations/google_adk.py +++ b/cascadeflow/integrations/google_adk.py @@ -124,6 +124,10 @@ def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: def _callback_key(self, callback_context: Any) -> tuple[str, str]: invocation_id = getattr(callback_context, "invocation_id", "") or "" agent_name = getattr(callback_context, "agent_name", "") or "" + # Use object id as disambiguator when both fields are missing to + # prevent collisions across concurrent calls with empty metadata. 
+ if not invocation_id and not agent_name: + invocation_id = str(id(callback_context)) return (invocation_id, agent_name) async def before_model_callback( diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py index 8f5ecef3..7f8cb66d 100644 --- a/tests/test_google_adk_integration.py +++ b/tests/test_google_adk_integration.py @@ -596,3 +596,60 @@ def test_content_with_no_text(self): inp, out = adk_mod.CascadeFlowADKPlugin._extract_tokens(response) assert inp == 0 assert out == 1 # max(0//4, 1) + + +class TestCallbackKeyCollision: + """Verify _callback_key uses id() fallback when both fields are empty.""" + + def test_distinct_keys_when_metadata_missing(self): + """Two contexts with no invocation_id/agent_name get distinct keys.""" + plugin = adk_mod.CascadeFlowADKPlugin() + ctx_a = FakeCallbackContext(invocation_id="", agent_name="") + ctx_b = FakeCallbackContext(invocation_id="", agent_name="") + key_a = plugin._callback_key(ctx_a) + key_b = plugin._callback_key(ctx_b) + assert key_a != key_b, "Empty-metadata contexts must produce distinct keys" + + def test_key_stable_for_same_object(self): + """Same context object always produces the same key.""" + plugin = adk_mod.CascadeFlowADKPlugin() + ctx = FakeCallbackContext(invocation_id="", agent_name="") + assert plugin._callback_key(ctx) == plugin._callback_key(ctx) + + def test_normal_key_unaffected(self): + """Contexts with real IDs don't use the id() fallback.""" + plugin = adk_mod.CascadeFlowADKPlugin() + ctx = FakeCallbackContext(invocation_id="inv-42", agent_name="my-agent") + key = plugin._callback_key(ctx) + assert key == ("inv-42", "my-agent") + + @pytest.mark.asyncio + async def test_concurrent_empty_contexts_track_independently(self): + """Two concurrent calls with empty metadata don't corrupt each other.""" + init(mode="observe") + with run(budget=1.0) as harness_ctx: + plugin = adk_mod.CascadeFlowADKPlugin() + ctx_a = FakeCallbackContext(invocation_id="", agent_name="") + 
ctx_b = FakeCallbackContext(invocation_id="", agent_name="") + + req_a = FakeLlmRequest(model="gpt-4o") + req_b = FakeLlmRequest(model="gpt-4o-mini") + + # Start both calls + await plugin.before_model_callback(ctx_a, req_a) + await plugin.before_model_callback(ctx_b, req_b) + + # Finish in reverse order + resp_b = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(50, 25), + ) + resp_a = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + ) + await plugin.after_model_callback(ctx_b, resp_b) + await plugin.after_model_callback(ctx_a, resp_a) + + assert harness_ctx.step_count == 2 + # Verify no leftover state (both keys were cleaned up) + assert len(plugin._call_start_times) == 0 + assert len(plugin._call_models) == 0 diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 5669e845..eb960a39 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -72,36 +72,18 @@ def test_init_non_numeric_env_raises(monkeypatch): def test_run_uses_global_defaults_and_overrides(): - init( - mode="enforce", - budget=2.0, - max_tool_calls=5, - kpi_targets={"quality_min": 0.9}, - kpi_weights={"cost": 0.7, "quality": 0.3}, - compliance="gdpr", - ) + init(mode="enforce", budget=2.0, max_tool_calls=5) default_ctx = run() assert default_ctx.mode == "enforce" assert default_ctx.budget_max == 2.0 assert default_ctx.tool_calls_max == 5 assert default_ctx.budget_remaining == 2.0 - assert default_ctx.kpi_targets == {"quality_min": 0.9} - assert default_ctx.kpi_weights == {"cost": 0.7, "quality": 0.3} - assert default_ctx.compliance == "gdpr" - - override_ctx = run( - budget=0.5, - max_tool_calls=3, - kpi_weights={"quality": 1.0}, - compliance="strict", - ) + + override_ctx = run(budget=0.5, max_tool_calls=3) assert override_ctx.budget_max == 0.5 assert override_ctx.tool_calls_max == 3 assert override_ctx.budget_remaining == 0.5 - assert override_ctx.kpi_targets == {"quality_min": 0.9} - assert override_ctx.kpi_weights == {"quality": 1.0} - assert 
override_ctx.compliance == "strict" def test_run_without_enter_exit_is_safe(): @@ -170,8 +152,9 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.init) assert callable(cascadeflow.reset) assert callable(cascadeflow.run) - assert callable(cascadeflow.harness_agent) - assert hasattr(cascadeflow.agent, "PROVIDER_REGISTRY") + # harness.agent is intentionally NOT re-exported at top level because it + # would shadow the cascadeflow.agent module. Import from submodule: + assert callable(agent) # imported from cascadeflow.harness report = cascadeflow.init(mode="off") assert report.mode == "off" From 05c423fef60f48922c45282812a1f02e8a2fa66f Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 16:52:26 +0100 Subject: [PATCH 32/49] =?UTF-8?q?fix:=20address=20PR=20#165=20review=20?= =?UTF-8?q?=E2=80=94=205=20findings=20resolved?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. HIGH: off mode now respected — before/after callbacks return early when ctx.mode == "off", preventing metric tracking in off mode 2. HIGH: versioned Gemini model IDs now resolve correctly — added _resolve_pricing_key() with suffix stripping (-preview-XX-XX, -YYYYMMDD, -latest, -exp-N) and longest-prefix fallback matching 3. MEDIUM: callback key collision fixed — switched from (invocation_id, agent_name) tuple to id(callback_context) int key, guaranteeing uniqueness even for concurrent calls with same IDs 4. MEDIUM: fail_open tests now patch the correct symbol (cascadeflow.integrations.google_adk.get_current_run instead of cascadeflow.harness.api.get_current_run) 5. MEDIUM: budget error response no longer leaks spend/limit numbers — user-facing message is generic, exact figures logged at warning level Added 13 new tests: off-mode behavior (2), versioned model pricing (7), callback key collision (4). Total: 62 ADK tests pass. Full suite: 1097 passed, 69 skipped, 0 failures. 
--- cascadeflow/harness/pricing.py | 54 ++++++++- cascadeflow/integrations/google_adk.py | 45 +++++--- tests/test_google_adk_integration.py | 151 +++++++++++++++++++------ 3 files changed, 197 insertions(+), 53 deletions(-) diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py index fe7bd92c..7f6cd44b 100644 --- a/cascadeflow/harness/pricing.py +++ b/cascadeflow/harness/pricing.py @@ -11,6 +11,8 @@ from __future__ import annotations +import re as _re + # --------------------------------------------------------------------------- # Pricing (USD per 1M tokens: input, output) # --------------------------------------------------------------------------- @@ -71,13 +73,61 @@ ENERGY_OUTPUT_WEIGHT: float = 1.5 +# Pre-compiled pattern for stripping version/preview/date suffixes. +# Matches: -preview, -preview-05-20, -20250120, -latest, -exp-0827, etc. +_VERSION_SUFFIX_RE = _re.compile( + r"(-preview(?:-\d{2,4}-\d{2})?|-\d{8,}|-latest|-exp(?:-\d+)?|-it)$" +) + +# Cache for resolved model → pricing key lookups. +_pricing_key_cache: dict[str, str | None] = {} + + +def _resolve_pricing_key(model: str) -> str | None: + """Resolve a model name to a known pricing table key. + + Tries exact match first, then strips version/preview/date suffixes, + then tries longest-prefix match against known model names. + Returns ``None`` when no match is found (caller should use defaults). + """ + if model in _pricing_key_cache: + return _pricing_key_cache[model] + + # Exact match + if model in PRICING_USD_PER_M: + _pricing_key_cache[model] = model + return model + + # Strip version suffixes and retry + stripped = _VERSION_SUFFIX_RE.sub("", model) + if stripped != model and stripped in PRICING_USD_PER_M: + _pricing_key_cache[model] = stripped + return stripped + + # Longest-prefix match (e.g. 
"gemini-2.5-flash-8b" → "gemini-2.5-flash") + best: str | None = None + best_len = 0 + for known in PRICING_USD_PER_M: + if model.startswith(known) and len(known) > best_len: + best = known + best_len = len(known) + if best is not None: + _pricing_key_cache[model] = best + return best + + _pricing_key_cache[model] = None + return None + + def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: """Estimate cost in USD from model name and token counts.""" - in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) + key = _resolve_pricing_key(model) + in_price, out_price = PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M return (input_tokens / 1_000_000) * in_price + (output_tokens / 1_000_000) * out_price def estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: """Estimate energy proxy from model name and token counts.""" - coeff = ENERGY_COEFFICIENTS.get(model, DEFAULT_ENERGY_COEFFICIENT) + key = _resolve_pricing_key(model) + coeff = ENERGY_COEFFICIENTS.get(key, DEFAULT_ENERGY_COEFFICIENT) if key else DEFAULT_ENERGY_COEFFICIENT return coeff * (input_tokens + output_tokens * ENERGY_OUTPUT_WEIGHT) diff --git a/cascadeflow/integrations/google_adk.py b/cascadeflow/integrations/google_adk.py index b0d582e3..1c6a853d 100644 --- a/cascadeflow/integrations/google_adk.py +++ b/cascadeflow/integrations/google_adk.py @@ -116,19 +116,22 @@ class CascadeFlowADKPlugin(_ADKBasePlugin): # type: ignore[misc] def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: self._config = config or GoogleADKHarnessConfig() self._active = True + self._call_seq: int = 0 # Track call metadata between before/after callbacks. - # Keyed by (invocation_id, agent_name) to handle concurrent calls. 
- self._call_start_times: dict[tuple[str, str], float] = {} - self._call_models: dict[tuple[str, str], str] = {} - - def _callback_key(self, callback_context: Any) -> tuple[str, str]: - invocation_id = getattr(callback_context, "invocation_id", "") or "" - agent_name = getattr(callback_context, "agent_name", "") or "" - # Use object id as disambiguator when both fields are missing to - # prevent collisions across concurrent calls with empty metadata. - if not invocation_id and not agent_name: - invocation_id = str(id(callback_context)) - return (invocation_id, agent_name) + # Keyed by id(callback_context) to guarantee uniqueness even when + # two concurrent calls share (invocation_id, agent_name). + self._call_start_times: dict[int, float] = {} + self._call_models: dict[int, str] = {} + + @staticmethod + def _callback_key(callback_context: Any) -> int: + """Return a unique key for a callback_context object. + + Uses ``id()`` which is guaranteed unique for the lifetime of the + object — ADK keeps the same CallbackContext alive across the + before/after/error callback sequence for a single LLM call. + """ + return id(callback_context) async def before_model_callback( self, @@ -147,6 +150,8 @@ async def before_model_callback( ctx = get_current_run() if ctx is None: return None + if ctx.mode == "off": + return None # Extract model name from request model_raw = getattr(llm_request, "model", None) or "unknown" @@ -196,6 +201,8 @@ async def after_model_callback( ctx = get_current_run() if ctx is None: return None + if ctx.mode == "off": + return None key = self._callback_key(callback_context) @@ -282,6 +289,7 @@ async def on_model_error_callback( def deactivate(self) -> None: """Make all callbacks no-ops without unregistering from Runner.""" self._active = False + self._call_seq = 0 self._call_start_times.clear() self._call_models.clear() @@ -316,10 +324,17 @@ def _make_budget_error_response(ctx: Any) -> Any: When ADK is available we return a real ``LlmResponse``. 
When not (shouldn't happen in practice), we return a sentinel dict. + + The user-facing message is intentionally generic to avoid leaking + internal spend/limit numbers. Exact figures are logged separately. """ - msg = ( - f"cascadeflow harness budget exceeded " - f"(spent ${ctx.cost:.4f} of ${ctx.budget_max:.4f} max)" + # Generic message safe for end-user exposure. + msg = "cascadeflow harness budget exceeded" + # Detailed figures for operators only. + logger.warning( + "google-adk: budget exceeded — spent $%.4f of $%.4f max", + ctx.cost, + ctx.budget_max, ) if GOOGLE_ADK_AVAILABLE: try: diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py index 7f8cb66d..e68edcaf 100644 --- a/tests/test_google_adk_integration.py +++ b/tests/test_google_adk_integration.py @@ -268,7 +268,7 @@ async def test_fail_open_swallows_errors(self, plugin): init(mode="enforce") with run(): with patch( - "cascadeflow.harness.api.get_current_run", + "cascadeflow.integrations.google_adk.get_current_run", side_effect=RuntimeError("boom"), ): result = await plugin.before_model_callback( @@ -401,7 +401,7 @@ async def test_fail_open_swallows_errors(self, plugin): init(mode="observe") with run(): with patch( - "cascadeflow.harness.api.get_current_run", + "cascadeflow.integrations.google_adk.get_current_run", side_effect=RuntimeError("boom"), ): result = await plugin.after_model_callback( @@ -454,7 +454,7 @@ async def test_fail_open_swallows_errors(self, plugin): init(mode="observe") with run(): with patch( - "cascadeflow.harness.api.get_current_run", + "cascadeflow.integrations.google_adk.get_current_run", side_effect=RuntimeError("boom"), ): result = await plugin.on_model_error_callback( @@ -557,8 +557,8 @@ async def test_deactivated_plugin_skips_callbacks(self): async def test_deactivate_clears_state(self): plugin = adk_mod.CascadeFlowADKPlugin() - plugin._call_start_times[("a", "b")] = 1.0 - plugin._call_models[("a", "b")] = "test" + 
plugin._call_start_times[12345] = 1.0 + plugin._call_models[12345] = "test" plugin.deactivate() assert len(plugin._call_start_times) == 0 assert len(plugin._call_models) == 0 @@ -599,57 +599,136 @@ def test_content_with_no_text(self): class TestCallbackKeyCollision: - """Verify _callback_key uses id() fallback when both fields are empty.""" + """Verify _callback_key uses id() for per-object uniqueness.""" - def test_distinct_keys_when_metadata_missing(self): - """Two contexts with no invocation_id/agent_name get distinct keys.""" - plugin = adk_mod.CascadeFlowADKPlugin() - ctx_a = FakeCallbackContext(invocation_id="", agent_name="") - ctx_b = FakeCallbackContext(invocation_id="", agent_name="") - key_a = plugin._callback_key(ctx_a) - key_b = plugin._callback_key(ctx_b) - assert key_a != key_b, "Empty-metadata contexts must produce distinct keys" + def test_distinct_keys_for_different_objects(self): + """Two distinct context objects always produce distinct keys.""" + ctx_a = FakeCallbackContext(invocation_id="inv-1", agent_name="agent-a") + ctx_b = FakeCallbackContext(invocation_id="inv-1", agent_name="agent-a") + key_a = adk_mod.CascadeFlowADKPlugin._callback_key(ctx_a) + key_b = adk_mod.CascadeFlowADKPlugin._callback_key(ctx_b) + assert key_a != key_b, "Same IDs on different objects must produce distinct keys" def test_key_stable_for_same_object(self): """Same context object always produces the same key.""" - plugin = adk_mod.CascadeFlowADKPlugin() - ctx = FakeCallbackContext(invocation_id="", agent_name="") - assert plugin._callback_key(ctx) == plugin._callback_key(ctx) + ctx = FakeCallbackContext() + key1 = adk_mod.CascadeFlowADKPlugin._callback_key(ctx) + key2 = adk_mod.CascadeFlowADKPlugin._callback_key(ctx) + assert key1 == key2 - def test_normal_key_unaffected(self): - """Contexts with real IDs don't use the id() fallback.""" - plugin = adk_mod.CascadeFlowADKPlugin() - ctx = FakeCallbackContext(invocation_id="inv-42", agent_name="my-agent") - key = 
plugin._callback_key(ctx) - assert key == ("inv-42", "my-agent") + def test_key_is_int(self): + """Key type is int (object id).""" + ctx = FakeCallbackContext() + assert isinstance(adk_mod.CascadeFlowADKPlugin._callback_key(ctx), int) @pytest.mark.asyncio - async def test_concurrent_empty_contexts_track_independently(self): - """Two concurrent calls with empty metadata don't corrupt each other.""" + async def test_concurrent_same_ids_track_independently(self): + """Two concurrent calls with same invocation_id+agent_name don't corrupt.""" init(mode="observe") with run(budget=1.0) as harness_ctx: plugin = adk_mod.CascadeFlowADKPlugin() - ctx_a = FakeCallbackContext(invocation_id="", agent_name="") - ctx_b = FakeCallbackContext(invocation_id="", agent_name="") + # Same IDs — previously would collide + ctx_a = FakeCallbackContext(invocation_id="inv-1", agent_name="agent") + ctx_b = FakeCallbackContext(invocation_id="inv-1", agent_name="agent") req_a = FakeLlmRequest(model="gpt-4o") req_b = FakeLlmRequest(model="gpt-4o-mini") - # Start both calls await plugin.before_model_callback(ctx_a, req_a) await plugin.before_model_callback(ctx_b, req_b) - # Finish in reverse order - resp_b = FakeLlmResponse( - usage_metadata=FakeUsageMetadata(50, 25), - ) - resp_a = FakeLlmResponse( - usage_metadata=FakeUsageMetadata(100, 50), - ) + resp_b = FakeLlmResponse(usage_metadata=FakeUsageMetadata(50, 25)) + resp_a = FakeLlmResponse(usage_metadata=FakeUsageMetadata(100, 50)) await plugin.after_model_callback(ctx_b, resp_b) await plugin.after_model_callback(ctx_a, resp_a) assert harness_ctx.step_count == 2 - # Verify no leftover state (both keys were cleaned up) assert len(plugin._call_start_times) == 0 assert len(plugin._call_models) == 0 + + +# --------------------------------------------------------------------------- +# Off-mode behavior +# --------------------------------------------------------------------------- + + +class TestOffMode: + """mode='off' must not track metrics or 
update run context.""" + + @pytest.mark.asyncio + async def test_off_mode_before_callback_returns_none(self): + init(mode="off") + plugin = adk_mod.CascadeFlowADKPlugin() + with run() as run_ctx: + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest() + ) + assert result is None + assert len(plugin._call_start_times) == 0 + + @pytest.mark.asyncio + async def test_off_mode_after_callback_does_not_track(self): + init(mode="off") + plugin = adk_mod.CascadeFlowADKPlugin() + with run() as run_ctx: + await plugin.after_model_callback( + FakeCallbackContext(), + FakeLlmResponse(usage_metadata=FakeUsageMetadata(1000, 500)), + ) + assert run_ctx.step_count == 0 + assert run_ctx.cost == 0.0 + assert run_ctx.energy_used == 0.0 + assert len(run_ctx.trace()) == 0 + + +# --------------------------------------------------------------------------- +# Versioned model name resolution +# --------------------------------------------------------------------------- + + +class TestVersionedModelPricing: + """Versioned model IDs must resolve to correct pricing, not default.""" + + def test_versioned_gemini_flash(self): + from cascadeflow.harness.pricing import estimate_cost + + # Should resolve to gemini-2.5-flash pricing ($0.15/$0.60) + cost = estimate_cost("gemini-2.5-flash-preview-05-20", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + def test_versioned_gemini_pro(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-pro-preview-05-06", 1_000_000, 1_000_000) + assert cost == pytest.approx(11.25, abs=0.01) + + def test_dated_model_suffix(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash-20250120", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + def test_latest_suffix(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash-latest", 1_000_000, 1_000_000) + 
assert cost == pytest.approx(0.75, abs=0.01) + + def test_unknown_model_still_uses_default(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("totally-unknown-model", 1_000_000, 0) + assert cost == pytest.approx(2.50) + + def test_exact_match_still_works(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + def test_prefix_match_variant(self): + """A variant like gemini-2.5-flash-8b matches the base model.""" + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash-8b", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) From af414d069d62bbc6dc01b21b42d1452c48e13a44 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 14:48:49 +0100 Subject: [PATCH 33/49] feat(harness): add anthropic python auto-instrumentation for v2.1 --- cascadeflow/harness/api.py | 29 ++- cascadeflow/harness/instrument.py | 259 +++++++++++++++++++- docs/strategy/agent-intelligence-v2-plan.md | 15 ++ tests/test_harness_api.py | 150 +++++++++++- tests/test_harness_instrument.py | 42 +++- 5 files changed, 464 insertions(+), 31 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index f545d73d..79f741b8 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -228,9 +228,10 @@ def reset() -> None: global _harness_callback_manager global _cached_cascade_decision_event - from cascadeflow.harness.instrument import unpatch_openai + from cascadeflow.harness.instrument import unpatch_anthropic, unpatch_openai unpatch_openai() + unpatch_anthropic() _harness_config = HarnessConfig() _is_instrumented = False _harness_callback_manager = None @@ -497,13 +498,29 @@ def init( if patch_openai(): instrumented.append("openai") - elif validated_mode == "off": - from cascadeflow.harness.instrument import is_patched, unpatch_openai + 
else: + detected_but_not_instrumented.append("openai") + + if validated_mode != "off" and sdk_presence["anthropic"]: + from cascadeflow.harness.instrument import patch_anthropic + + if patch_anthropic(): + instrumented.append("anthropic") + else: + detected_but_not_instrumented.append("anthropic") + + if validated_mode == "off": + from cascadeflow.harness.instrument import ( + is_anthropic_patched, + is_openai_patched, + unpatch_anthropic, + unpatch_openai, + ) - if is_patched(): + if is_openai_patched(): unpatch_openai() - if sdk_presence["anthropic"]: - detected_but_not_instrumented.append("anthropic") + if is_anthropic_patched(): + unpatch_anthropic() if _is_instrumented: logger.debug("harness init called again; instrumentation remains idempotent") diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index c2fbd7ab..566f15d0 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -1,11 +1,10 @@ -"""OpenAI Python client auto-instrumentation for cascadeflow harness. +"""Python SDK auto-instrumentation for cascadeflow harness. -Patches ``openai.resources.chat.completions.Completions.create`` (sync) and -``AsyncCompletions.create`` (async) to intercept LLM calls for observe/enforce -modes. +Patches OpenAI and Anthropic SDK request methods to intercept LLM calls for +observe/enforce modes. -This module is called internally by ``cascadeflow.harness.init()``. Users -should not call ``patch_openai`` / ``unpatch_openai`` directly. +This module is called internally by ``cascadeflow.harness.init()``. Users +should not call patch/unpatch helpers directly. Implementation notes: - Patching is class-level (all current and future client instances). 
@@ -51,6 +50,9 @@ _openai_patched: bool = False _original_sync_create: Any = None _original_async_create: Any = None +_anthropic_patched: bool = False +_original_anthropic_sync_create: Any = None +_original_anthropic_async_create: Any = None _MODEL_TOTAL_COSTS: dict[str, float] = { name: _model_total_price_shared(name) for name in _PRICING_MODELS @@ -140,7 +142,7 @@ def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> return _estimate_energy_shared(model, prompt_tokens, completion_tokens) -def _count_tool_calls_in_response(response: Any) -> int: +def _count_tool_calls_in_openai_response(response: Any) -> int: """Count tool calls in a non-streaming ChatCompletion response.""" choices = getattr(response, "choices", None) if not choices: @@ -154,7 +156,7 @@ def _count_tool_calls_in_response(response: Any) -> int: return len(tool_calls) -def _extract_usage(response: Any) -> tuple[int, int]: +def _extract_openai_usage(response: Any) -> tuple[int, int]: """Extract (prompt_tokens, completion_tokens) from a response.""" usage = getattr(response, "usage", None) if usage is None: @@ -165,6 +167,29 @@ def _extract_usage(response: Any) -> tuple[int, int]: ) +def _extract_anthropic_usage(response: Any) -> tuple[int, int]: + """Extract (input_tokens, output_tokens) from an Anthropic response.""" + usage = getattr(response, "usage", None) + if usage is None: + return 0, 0 + return ( + getattr(usage, "input_tokens", 0) or 0, + getattr(usage, "output_tokens", 0) or 0, + ) + + +def _count_tool_calls_in_anthropic_response(response: Any) -> int: + """Count Anthropic ``tool_use`` blocks in a non-streaming response.""" + content = getattr(response, "content", None) + if not content: + return 0 + count = 0 + for block in content: + if getattr(block, "type", None) == "tool_use": + count += 1 + return count + + def _model_total_cost(model: str) -> float: return _MODEL_TOTAL_COSTS.get(model, _model_total_price_shared(model)) @@ -713,8 +738,8 @@ def 
_finalize_interception( if (not state.is_stream) and ctx: elapsed_ms = (time.monotonic() - state.start_time) * 1000 - prompt_tokens, completion_tokens = _extract_usage(response) - tool_call_count = _count_tool_calls_in_response(response) + prompt_tokens, completion_tokens = _extract_openai_usage(response) + tool_call_count = _count_tool_calls_in_openai_response(response) _update_context( ctx, state.model, @@ -810,6 +835,150 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: return wrapper +def _make_patched_anthropic_create(original_fn: Any) -> Any: + """Create a patched version of ``anthropic.Messages.create``.""" + + @functools.wraps(original_fn) + def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model + pre_applied = True + + if ctx: + kwargs, model, pre_action, pre_reason, pre_model, pre_applied = ( + _resolve_pre_call_decision( + ctx, + mode, + model, + kwargs, + ) + ) + + is_stream = bool(kwargs.get("stream", False)) + start_time = time.monotonic() + response = original_fn(self, *args, **kwargs) + + if not ctx: + logger.debug( + "harness %s (anthropic): model=%s (no active run scope, metrics not tracked)", + mode, + model, + ) + return response + + # Anthropic stream wrappers are not instrumented in V2.1 (known limitation). 
+ if is_stream: + logger.debug( + "harness %s (anthropic): stream passthrough model=%s (usage tracking unavailable)", + mode, + model, + ) + return response + + elapsed_ms = (time.monotonic() - start_time) * 1000 + input_tokens, output_tokens = _extract_anthropic_usage(response) + tool_call_count = _count_tool_calls_in_anthropic_response(response) + _update_context( + ctx, + model, + input_tokens, + output_tokens, + tool_call_count, + elapsed_ms, + action=pre_action, + action_reason=pre_reason, + action_model=pre_model, + applied=pre_applied, + decision_mode=mode, + ) + return response + + return wrapper + + +def _make_patched_anthropic_async_create(original_fn: Any) -> Any: + """Create a patched version of ``anthropic.AsyncMessages.create``.""" + + @functools.wraps(original_fn) + async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return await original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model + pre_applied = True + + if ctx: + kwargs, model, pre_action, pre_reason, pre_model, pre_applied = ( + _resolve_pre_call_decision( + ctx, + mode, + model, + kwargs, + ) + ) + + is_stream = bool(kwargs.get("stream", False)) + start_time = time.monotonic() + response = await original_fn(self, *args, **kwargs) + + if not ctx: + logger.debug( + "harness %s async (anthropic): model=%s (no active run scope, metrics not tracked)", + mode, + model, + ) + return response + + # Anthropic stream wrappers are not instrumented in V2.1 (known limitation). 
+ if is_stream: + logger.debug( + "harness %s async (anthropic): stream passthrough model=%s (usage tracking unavailable)", + mode, + model, + ) + return response + + elapsed_ms = (time.monotonic() - start_time) * 1000 + input_tokens, output_tokens = _extract_anthropic_usage(response) + tool_call_count = _count_tool_calls_in_anthropic_response(response) + _update_context( + ctx, + model, + input_tokens, + output_tokens, + tool_call_count, + elapsed_ms, + action=pre_action, + action_reason=pre_reason, + action_model=pre_model, + applied=pre_applied, + decision_mode=mode, + ) + return response + + return wrapper + + # --------------------------------------------------------------------------- # Public API (called by cascadeflow.harness.api) # --------------------------------------------------------------------------- @@ -846,6 +1015,37 @@ def patch_openai() -> bool: return True +def patch_anthropic() -> bool: + """Patch the Anthropic Python client for harness instrumentation. + + Returns ``True`` if patching succeeded, ``False`` if anthropic is not + installed. Idempotent: safe to call multiple times. 
+ """ + global _anthropic_patched, _original_anthropic_sync_create, _original_anthropic_async_create + + if _anthropic_patched: + logger.debug("anthropic already patched, skipping") + return True + + try: + from anthropic.resources.messages import AsyncMessages, Messages + except ImportError: + logger.debug("anthropic package not available, skipping instrumentation") + return False + + _original_anthropic_sync_create = Messages.create + _original_anthropic_async_create = AsyncMessages.create + + Messages.create = _make_patched_anthropic_create(_original_anthropic_sync_create) # type: ignore[assignment] + AsyncMessages.create = _make_patched_anthropic_async_create( # type: ignore[assignment] + _original_anthropic_async_create, + ) + + _anthropic_patched = True + logger.info("anthropic client instrumented (sync + async)") + return True + + def unpatch_openai() -> None: """Restore original OpenAI client methods. @@ -873,6 +1073,43 @@ def unpatch_openai() -> None: logger.info("openai client unpatched") -def is_patched() -> bool: +def unpatch_anthropic() -> None: + """Restore original Anthropic client methods. + + Safe to call even if not patched. Used by ``reset()`` and tests. 
+ """ + global _anthropic_patched, _original_anthropic_sync_create, _original_anthropic_async_create + + if not _anthropic_patched: + return + + try: + from anthropic.resources.messages import AsyncMessages, Messages + except ImportError: + _anthropic_patched = False + return + + if _original_anthropic_sync_create is not None: + Messages.create = _original_anthropic_sync_create # type: ignore[assignment] + if _original_anthropic_async_create is not None: + AsyncMessages.create = _original_anthropic_async_create # type: ignore[assignment] + + _original_anthropic_sync_create = None + _original_anthropic_async_create = None + _anthropic_patched = False + logger.info("anthropic client unpatched") + + +def is_openai_patched() -> bool: """Return whether the OpenAI client is currently patched.""" return _openai_patched + + +def is_anthropic_patched() -> bool: + """Return whether the Anthropic client is currently patched.""" + return _anthropic_patched + + +def is_patched() -> bool: + """Return whether any supported Python SDK is currently patched.""" + return _openai_patched or _anthropic_patched diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 267ddc69..1c8a2344 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -864,6 +864,21 @@ Integration-branch promotion gates: - [ ] Quickstart verification for existing app and framework paths - [ ] Go/No-Go checklist in Section 18 satisfied before merging to `main` +### 16.2 V2.1 Parallel Execution Split + +To enable parallel work without merge collisions, split V2.1 into Python and TS tracks: + +- `feat/v2.1-anthropic-python-auto-instrumentation` (claimed by current agent) + - Scope: `cascadeflow/harness/*`, Python harness tests, Python docs notes + - Deliverables: Anthropic Python auto-instrumentation, validation for `init()/run()` harness path +- `feat/v2.1-ts-harness-api-parity` (available for parallel agent) + 
- Scope: `packages/core/*`, TS parity fixtures, TS docs notes + - Deliverables: `@cascadeflow/core` exports parity (`init()/run()`), TS fixture parity validation + +Parallel-safe rule: +- Python track does not touch `packages/core/*` +- TS track does not touch `cascadeflow/harness/*` + ## 17. Future Phases (Post-V2, Not in Scope) For roadmap visibility. These inform V2 telemetry design but are not V2 deliverables. diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index eb960a39..9554a486 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -5,6 +5,7 @@ import cascadeflow import cascadeflow.harness.api as harness_api from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run +from cascadeflow.telemetry.callbacks import CallbackEvent, CallbackManager def setup_function() -> None: @@ -72,18 +73,36 @@ def test_init_non_numeric_env_raises(monkeypatch): def test_run_uses_global_defaults_and_overrides(): - init(mode="enforce", budget=2.0, max_tool_calls=5) + init( + mode="enforce", + budget=2.0, + max_tool_calls=5, + kpi_targets={"quality_min": 0.9}, + kpi_weights={"cost": 0.7, "quality": 0.3}, + compliance="gdpr", + ) default_ctx = run() assert default_ctx.mode == "enforce" assert default_ctx.budget_max == 2.0 assert default_ctx.tool_calls_max == 5 assert default_ctx.budget_remaining == 2.0 - - override_ctx = run(budget=0.5, max_tool_calls=3) + assert default_ctx.kpi_targets == {"quality_min": 0.9} + assert default_ctx.kpi_weights == {"cost": 0.7, "quality": 0.3} + assert default_ctx.compliance == "gdpr" + + override_ctx = run( + budget=0.5, + max_tool_calls=3, + kpi_weights={"quality": 1.0}, + compliance="strict", + ) assert override_ctx.budget_max == 0.5 assert override_ctx.tool_calls_max == 3 assert override_ctx.budget_remaining == 0.5 + assert override_ctx.kpi_targets == {"quality_min": 0.9} + assert override_ctx.kpi_weights == {"quality": 1.0} + assert override_ctx.compliance == "strict" def 
test_run_without_enter_exit_is_safe(): @@ -152,9 +171,10 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.init) assert callable(cascadeflow.reset) assert callable(cascadeflow.run) - # harness.agent is intentionally NOT re-exported at top level because it - # would shadow the cascadeflow.agent module. Import from submodule: - assert callable(agent) # imported from cascadeflow.harness + assert callable(cascadeflow.harness_agent) + assert hasattr(cascadeflow.agent, "PROVIDER_REGISTRY") + assert callable(cascadeflow.get_harness_callback_manager) + assert callable(cascadeflow.set_harness_callback_manager) report = cascadeflow.init(mode="off") assert report.mode == "off" @@ -166,6 +186,8 @@ def test_run_record_and_trace_copy(): trace_b = ctx.trace() assert trace_a == trace_b assert trace_a[0]["action"] == "switch_model" + assert "budget_state" in trace_a[0] + assert trace_a[0]["budget_state"]["max"] == 1.0 trace_a.append({"action": "mutated"}) assert len(ctx.trace()) == 1 @@ -310,3 +332,119 @@ def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch): monkeypatch.setattr(instrument, "patch_openai", lambda: True) report = init(mode="observe") assert report.instrumented == ["openai"] + + +def test_init_reports_anthropic_instrumented_when_patch_succeeds(monkeypatch): + monkeypatch.setattr( + harness_api, + "find_spec", + lambda name: object() if name == "anthropic" else None, + ) + + import cascadeflow.harness.instrument as instrument + + monkeypatch.setattr(instrument, "patch_anthropic", lambda: True) + report = init(mode="observe") + assert report.instrumented == ["anthropic"] + + +def test_init_reports_anthropic_detected_not_instrumented_on_patch_failure(monkeypatch): + monkeypatch.setattr( + harness_api, + "find_spec", + lambda name: object() if name == "anthropic" else None, + ) + + import cascadeflow.harness.instrument as instrument + + monkeypatch.setattr(instrument, "patch_anthropic", lambda: False) + report = init(mode="observe") 
+ assert report.instrumented == [] + assert report.detected_but_not_instrumented == ["anthropic"] + + +def test_run_summary_populates_on_context_exit(): + init(mode="observe") + with run(budget=1.5) as ctx: + ctx.step_count = 2 + ctx.tool_calls = 1 + ctx.cost = 0.42 + ctx.latency_used_ms = 123.0 + ctx.energy_used = 33.0 + ctx.budget_remaining = 1.08 + ctx.last_action = "allow" + ctx.model_used = "gpt-4o-mini" + + summary = ctx.summary() + assert summary["run_id"] == ctx.run_id + assert summary["step_count"] == 2 + assert summary["budget_remaining"] == pytest.approx(1.08) + assert summary["duration_ms"] is not None + assert summary["duration_ms"] >= 0.0 + assert ctx.duration_ms is not None + assert ctx.duration_ms >= 0.0 + + +def test_run_context_logs_summary(caplog): + init(mode="observe") + with caplog.at_level("INFO", logger="cascadeflow.harness"): + with run(budget=1.0) as ctx: + ctx.step_count = 1 + ctx.cost = 0.01 + ctx.model_used = "gpt-4o-mini" + + assert any("harness run summary" in rec.message for rec in caplog.records) + + +def test_record_emits_cascade_decision_callback(): + manager = CallbackManager() + received = [] + + def _on_decision(data): + received.append(data) + + manager.register(CallbackEvent.CASCADE_DECISION, _on_decision) + report = init(mode="observe", callback_manager=manager) + assert report.config_sources["callback_manager"] == "code" + + with run(budget=1.0) as ctx: + ctx.step_count = 1 + ctx.record(action="switch_model", reason="budget_pressure", model="gpt-4o-mini") + + assert len(received) == 1 + event = received[0] + assert event.event == CallbackEvent.CASCADE_DECISION + assert event.query == "[harness]" + assert event.workflow == "harness" + assert event.data["action"] == "switch_model" + assert event.data["run_id"] == ctx.run_id + + +def test_record_sanitizes_trace_values(): + ctx = run() + ctx.record( + action="allow\nnewline", + reason="a" * 400, + model="model\r\nname", + ) + entry = ctx.trace()[0] + assert "\n" not in 
entry["action"] + assert "\r" not in entry["model"] + assert len(entry["reason"]) <= 160 + + +def test_record_without_callback_manager_is_noop(): + init(mode="observe") + with run(budget=1.0) as ctx: + ctx.record(action="allow", reason="test", model="gpt-4o-mini") + assert len(ctx.trace()) == 1 + + +def test_record_empty_action_warns_and_defaults(caplog): + init(mode="observe") + with caplog.at_level("WARNING", logger="cascadeflow.harness"): + with run(budget=1.0) as ctx: + ctx.record(action="", reason="test", model="gpt-4o-mini") + entry = ctx.trace()[0] + assert entry["action"] == "allow" + assert any("empty action" in rec.message for rec in caplog.records) diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 75368522..ca1f9a07 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -2,6 +2,7 @@ from __future__ import annotations +from importlib.util import find_spec import time from typing import Optional from unittest.mock import AsyncMock, MagicMock @@ -18,8 +19,12 @@ _estimate_energy, _make_patched_async_create, _make_patched_create, + is_anthropic_patched, + is_openai_patched, is_patched, + patch_anthropic, patch_openai, + unpatch_anthropic, unpatch_openai, ) @@ -87,19 +92,19 @@ def _mock_stream_chunk( class TestPatchLifecycle: def test_patch_and_unpatch(self) -> None: - assert not is_patched() + assert not is_openai_patched() result = patch_openai() assert result is True - assert is_patched() + assert is_openai_patched() unpatch_openai() - assert not is_patched() + assert not is_openai_patched() def test_idempotent_patching(self) -> None: patch_openai() patch_openai() - assert is_patched() + assert is_openai_patched() unpatch_openai() - assert not is_patched() + assert not is_openai_patched() def test_unpatch_without_prior_patch(self) -> None: unpatch_openai() # should not raise @@ -107,12 +112,12 @@ def test_unpatch_without_prior_patch(self) -> None: def test_init_observe_patches(self) -> 
None: report = init(mode="observe") assert "openai" in report.instrumented - assert is_patched() + assert is_openai_patched() def test_init_enforce_patches(self) -> None: report = init(mode="enforce") assert "openai" in report.instrumented - assert is_patched() + assert is_openai_patched() def test_init_off_does_not_patch(self) -> None: init(mode="off") @@ -120,7 +125,7 @@ def test_init_off_does_not_patch(self) -> None: def test_reset_unpatches(self) -> None: init(mode="observe") - assert is_patched() + assert is_openai_patched() reset() assert not is_patched() @@ -133,6 +138,27 @@ def test_class_method_actually_replaced(self) -> None: unpatch_openai() assert Completions.create is original + def test_patch_and_unpatch_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + assert not is_anthropic_patched() + result = patch_anthropic() + assert result is True + assert is_anthropic_patched() + unpatch_anthropic() + assert not is_anthropic_patched() + + def test_anthropic_class_method_actually_replaced(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + from anthropic.resources.messages import Messages + + original = Messages.create + patch_anthropic() + assert Messages.create is not original + unpatch_anthropic() + assert Messages.create is original + # --------------------------------------------------------------------------- # Sync wrapper From 76f6c2e0936eb0546e2a93763045b27196154f2b Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 15:20:25 +0100 Subject: [PATCH 34/49] feat(core): deliver v2.1 ts harness parity and sdk auto-instrumentation --- docs/strategy/agent-intelligence-v2-plan.md | 15 +- packages/core/README.md | 17 + packages/core/src/__tests__/harness.test.ts | 232 ++++++ packages/core/src/harness-instrument.ts | 746 +++++++++++++++++++ packages/core/src/harness.ts | 754 ++++++++++++++++++++ packages/core/src/index.ts | 25 + 6 
files changed, 1780 insertions(+), 9 deletions(-) create mode 100644 packages/core/src/__tests__/harness.test.ts create mode 100644 packages/core/src/harness-instrument.ts create mode 100644 packages/core/src/harness.ts diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 1c8a2344..177562e1 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -197,9 +197,6 @@ Framework-specific packages provide deeper integration (state extraction, middle ### TypeScript Equivalent ```typescript -// Target API — does not exist in @cascadeflow/core today. -// TS parity is a V2.1 deliverable (see Section 16, Phase F). - import { cascadeflow } from '@cascadeflow/core'; // Tier 1: Auto-instrument @@ -868,10 +865,10 @@ Integration-branch promotion gates: To enable parallel work without merge collisions, split V2.1 into Python and TS tracks: -- `feat/v2.1-anthropic-python-auto-instrumentation` (claimed by current agent) +- `feat/v2.1-anthropic-python-auto-instrumentation` (completed in this branch) - Scope: `cascadeflow/harness/*`, Python harness tests, Python docs notes - Deliverables: Anthropic Python auto-instrumentation, validation for `init()/run()` harness path -- `feat/v2.1-ts-harness-api-parity` (available for parallel agent) +- `feat/v2.1-ts-harness-api-parity` (completed and merged into this branch scope) - Scope: `packages/core/*`, TS parity fixtures, TS docs notes - Deliverables: `@cascadeflow/core` exports parity (`init()/run()`), TS fixture parity validation @@ -937,10 +934,10 @@ Go when all are true (V2 Python launch): - [ ] pyproject.toml extras (`openai-agents`, `crewai`, `langchain`) defined and installable V2.1 Go/No-Go (TS parity + anthropic): -- [ ] TS parity fixtures pass -- [ ] `@cascadeflow/core` exports `cascadeflow.init()` and `cascadeflow.run()` -- [ ] `anthropic` Python client auto-instrumentation validated -- [ ] `@anthropic-ai/sdk` TS client 
auto-instrumentation validated +- [x] TS parity fixtures pass +- [x] `@cascadeflow/core` exports `cascadeflow.init()` and `cascadeflow.run()` +- [x] `anthropic` Python client auto-instrumentation validated +- [x] `@anthropic-ai/sdk` TS client auto-instrumentation validated ## 19. Academic Validation diff --git a/packages/core/README.md b/packages/core/README.md index a0918d78..3188df91 100644 --- a/packages/core/README.md +++ b/packages/core/README.md @@ -33,6 +33,23 @@ pnpm add @cascadeflow/core yarn add @cascadeflow/core ``` +## Harness Quick Start (V2.1) + +```typescript +import { cascadeflow } from '@cascadeflow/core'; + +// 1) Turn on in-process harness decisions + SDK auto-instrumentation +cascadeflow.init({ mode: 'enforce', budget: 0.5 }); + +// 2) Scope one run (global defaults are inherited) +const result = await cascadeflow.run({ maxToolCalls: 8 }, async (run) => { + // Any OpenAI / Anthropic SDK calls made here are evaluated by the harness. + return { runId: run.runId }; +}); + +console.log(result); +``` + ## Quick Start ### Recommended Setup (Claude Haiku + GPT-5) diff --git a/packages/core/src/__tests__/harness.test.ts b/packages/core/src/__tests__/harness.test.ts new file mode 100644 index 00000000..bad03376 --- /dev/null +++ b/packages/core/src/__tests__/harness.test.ts @@ -0,0 +1,232 @@ +import { afterEach, describe, expect, it } from 'vitest'; + +import { + BudgetExceededError, + cascadeflow, + getCurrentRun, + getHarnessConfig, + init, + reset, + run, +} from '../harness'; +import { + __resetInstrumentationLoadersForTest, + __resetInstrumentationStateForTest, + __setInstrumentationLoadersForTest, + isAnthropicPatched, + isOpenAIPatched, +} from '../harness-instrument'; + +class FakeOpenAICompletions { + constructor(private readonly calls: Array>) {} + + create(request: Record): Promise> { + this.calls.push({ ...request }); + return Promise.resolve({ + usage: { + prompt_tokens: 100, + completion_tokens: 25, + }, + choices: [ + { + message: { + 
tool_calls: [{ id: 'tool_1', type: 'function' }], + }, + }, + ], + }); + } +} + +class FakeAnthropicMessages { + constructor(private readonly calls: Array>) {} + + create(request: Record): Promise> { + this.calls.push({ ...request }); + return Promise.resolve({ + usage: { + input_tokens: 120, + output_tokens: 40, + }, + content: [ + { type: 'text', text: 'hello' }, + { type: 'tool_use', id: 'tool_1', name: 'search', input: { q: 'x' } }, + ], + }); + } +} + +afterEach(() => { + reset(); + __resetInstrumentationStateForTest(); + __resetInstrumentationLoadersForTest(); +}); + +describe('harness API (TypeScript parity)', () => { + it('exposes cascadeflow init/run object API', async () => { + expect(typeof cascadeflow.init).toBe('function'); + expect(typeof cascadeflow.run).toBe('function'); + + init({ mode: 'observe' }); + const value = await cascadeflow.run(async (scope) => { + expect(scope.mode).toBe('observe'); + expect(getCurrentRun()).toBe(scope); + return 42; + }); + + expect(value).toBe(42); + expect(getCurrentRun()).toBeNull(); + }); + + it('honors code > env precedence and preserves nested scope isolation', async () => { + const previousMode = process.env.CASCADEFLOW_HARNESS_MODE; + process.env.CASCADEFLOW_HARNESS_MODE = 'observe'; + + init(); + expect(getHarnessConfig().mode).toBe('observe'); + + init({ mode: 'enforce' }); + expect(getHarnessConfig().mode).toBe('enforce'); + + await run({ budget: 1.0 }, async (outer) => { + outer.cost = 0.1; + expect(outer.budgetMax).toBe(1.0); + expect(getCurrentRun()).toBe(outer); + + await run({ budget: 0.25 }, async (inner) => { + expect(getCurrentRun()).toBe(inner); + expect(inner.budgetMax).toBe(0.25); + inner.cost = 0.2; + }); + + expect(getCurrentRun()).toBe(outer); + expect(outer.budgetMax).toBe(1.0); + expect(outer.cost).toBe(0.1); + }); + + if (previousMode == null) { + delete process.env.CASCADEFLOW_HARNESS_MODE; + } else { + process.env.CASCADEFLOW_HARNESS_MODE = previousMode; + } + }); + + it('auto-instruments 
OpenAI and enforces switch_model decisions', async () => { + const openaiCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => ({ + Completions: FakeOpenAICompletions, + }), + anthropic: () => null, + }); + + init({ mode: 'enforce' }); + expect(isOpenAIPatched()).toBe(true); + + await run({ kpiWeights: { cost: 1 } }, async (scope) => { + const client = new FakeOpenAICompletions(openaiCalls); + await client.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + + expect(scope.stepCount).toBe(1); + expect(scope.cost).toBeGreaterThan(0); + expect(scope.toolCalls).toBe(1); + + const trace = scope.trace(); + expect(trace).toHaveLength(1); + expect(trace[0]?.action).toBe('switch_model'); + expect(trace[0]?.applied).toBe(true); + expect(trace[0]?.decisionMode).toBe('enforce'); + }); + + expect(openaiCalls).toHaveLength(1); + expect(openaiCalls[0]?.model).not.toBe('gpt-4o'); + }); + + it('observe mode logs non-allow decisions without mutating request', async () => { + const openaiCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => ({ + Completions: FakeOpenAICompletions, + }), + anthropic: () => null, + }); + + init({ mode: 'observe' }); + + await run({ kpiWeights: { cost: 1 } }, async (scope) => { + const client = new FakeOpenAICompletions(openaiCalls); + await client.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + + const trace = scope.trace(); + expect(trace).toHaveLength(1); + expect(trace[0]?.action).toBe('switch_model'); + expect(trace[0]?.applied).toBe(false); + expect(trace[0]?.decisionMode).toBe('observe'); + }); + + expect(openaiCalls).toHaveLength(1); + expect(openaiCalls[0]?.model).toBe('gpt-4o'); + }); + + it('enforce mode stops calls when budget is exhausted', async () => { + const openaiCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => ({ + Completions: FakeOpenAICompletions, + }), + anthropic: () => null, + }); + + init({ 
mode: 'enforce' }); + + await expect( + run({ budget: 0 }, async () => { + const client = new FakeOpenAICompletions(openaiCalls); + await client.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + }), + ).rejects.toBeInstanceOf(BudgetExceededError); + + expect(openaiCalls).toHaveLength(0); + }); + + it('auto-instruments Anthropic and tracks usage/tool calls', async () => { + const anthropicCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => null, + anthropic: () => ({ + Messages: FakeAnthropicMessages, + }), + }); + + init({ mode: 'enforce' }); + expect(isAnthropicPatched()).toBe(true); + + await run(async (scope) => { + const client = new FakeAnthropicMessages(anthropicCalls); + await client.create({ + model: 'claude-sonnet-4-5-20250929', + messages: [{ role: 'user', content: 'hello' }], + }); + + expect(scope.stepCount).toBe(1); + expect(scope.toolCalls).toBe(1); + expect(scope.cost).toBeGreaterThan(0); + expect(scope.trace()[0]?.action).toBe('allow'); + }); + + expect(anthropicCalls).toHaveLength(1); + }); +}); diff --git a/packages/core/src/harness-instrument.ts b/packages/core/src/harness-instrument.ts new file mode 100644 index 00000000..901af4ae --- /dev/null +++ b/packages/core/src/harness-instrument.ts @@ -0,0 +1,746 @@ +type Action = 'allow' | 'switch_model' | 'deny_tool' | 'stop'; + +type CreateFunction = (this: any, ...args: any[]) => any; + +type OpenAIModuleLike = { + Completions?: { + prototype?: { + create?: CreateFunction; + }; + }; +}; + +type AnthropicModuleLike = { + Messages?: { + prototype?: { + create?: CreateFunction; + }; + }; +}; + +type Pricing = { input: number; output: number }; + +type PreCallDecision = { + action: Action; + reason: string; + targetModel: string; +}; + +type HarnessRuntime = { + getCurrentRun: () => HarnessRunContextLike | null; + getHarnessMode: () => HarnessModeLike; + createBudgetExceededError: (message: string, remaining?: number) => Error; + 
createHarnessStopError: (message: string, reason?: string) => Error; +}; + +type HarnessModeLike = 'off' | 'observe' | 'enforce'; + +type HarnessRunContextLike = { + mode: HarnessModeLike; + cost: number; + stepCount: number; + toolCalls: number; + latencyUsedMs: number; + energyUsed: number; + budgetMax?: number; + budgetRemaining?: number; + toolCallsMax?: number; + latencyMaxMs?: number; + energyMax?: number; + compliance?: string; + kpiWeights?: Record; + record: ( + action: string, + reason: string, + model?: string, + options?: { + applied?: boolean; + decisionMode?: HarnessModeLike; + }, + ) => void; +}; + +const MODEL_PRICING_PER_MILLION: Record = { + // OpenAI + 'gpt-5': { input: 1.25, output: 10.0 }, + 'gpt-5-mini': { input: 0.25, output: 2.0 }, + 'gpt-5-nano': { input: 0.05, output: 0.4 }, + 'gpt-4o': { input: 2.5, output: 10.0 }, + 'gpt-4o-mini': { input: 0.15, output: 0.6 }, + 'o1': { input: 15.0, output: 60.0 }, + 'o1-mini': { input: 3.0, output: 12.0 }, + 'o3-mini': { input: 1.0, output: 5.0 }, + + // Anthropic + 'claude-opus-4-5-20251101': { input: 15.0, output: 75.0 }, + 'claude-opus-4-20250514': { input: 15.0, output: 75.0 }, + 'claude-sonnet-4-5-20250929': { input: 3.0, output: 15.0 }, + 'claude-sonnet-4-20250514': { input: 3.0, output: 15.0 }, + 'claude-haiku-4-5-20251001': { input: 1.0, output: 5.0 }, + 'claude-3-5-haiku-20241022': { input: 1.0, output: 5.0 }, +}; + +const ENERGY_COEFFICIENTS: Record = { + 'gpt-5': 1.15, + 'gpt-5-mini': 0.72, + 'gpt-5-nano': 0.45, + 'gpt-4o': 1.0, + 'gpt-4o-mini': 0.55, + 'o1': 1.25, + 'o1-mini': 0.85, + 'o3-mini': 0.75, + 'claude-opus-4-5-20251101': 1.2, + 'claude-opus-4-20250514': 1.15, + 'claude-sonnet-4-5-20250929': 0.95, + 'claude-sonnet-4-20250514': 0.92, + 'claude-haiku-4-5-20251001': 0.7, + 'claude-3-5-haiku-20241022': 0.68, +}; + +const LATENCY_PRIORS: Record = { + 'gpt-5': 0.45, + 'gpt-5-mini': 0.72, + 'gpt-5-nano': 0.9, + 'gpt-4o': 0.58, + 'gpt-4o-mini': 0.82, + 'o1': 0.35, + 'o1-mini': 0.62, + 
'o3-mini': 0.7, + 'claude-opus-4-5-20251101': 0.4, + 'claude-opus-4-20250514': 0.44, + 'claude-sonnet-4-5-20250929': 0.6, + 'claude-sonnet-4-20250514': 0.63, + 'claude-haiku-4-5-20251001': 0.85, + 'claude-3-5-haiku-20241022': 0.86, +}; + +const QUALITY_PRIORS: Record = { + 'gpt-5': 0.95, + 'gpt-5-mini': 0.86, + 'gpt-5-nano': 0.74, + 'gpt-4o': 0.9, + 'gpt-4o-mini': 0.82, + 'o1': 0.93, + 'o1-mini': 0.84, + 'o3-mini': 0.86, + 'claude-opus-4-5-20251101': 0.94, + 'claude-opus-4-20250514': 0.92, + 'claude-sonnet-4-5-20250929': 0.9, + 'claude-sonnet-4-20250514': 0.88, + 'claude-haiku-4-5-20251001': 0.82, + 'claude-3-5-haiku-20241022': 0.8, +}; + +const COMPLIANCE_ALLOWLISTS: Record> = { + strict: new Set(['gpt-4o', 'gpt-4o-mini', 'claude-sonnet-4-5-20250929', 'claude-haiku-4-5-20251001']), + regulated: new Set(['gpt-4o', 'claude-sonnet-4-5-20250929']), +}; + +const DEFAULT_ENERGY_COEFFICIENT = 0.9; +const DEFAULT_OUTPUT_WEIGHT = 1.5; + +const PRICING_MODELS = Object.keys(MODEL_PRICING_PER_MILLION); + +let openAIPatched = false; +let anthropicPatched = false; + +let originalOpenAICreate: CreateFunction | null = null; +let originalAnthropicCreate: CreateFunction | null = null; +let patchedOpenAIClass: { prototype?: { create?: CreateFunction } } | null = null; +let patchedAnthropicClass: { prototype?: { create?: CreateFunction } } | null = null; + +const defaultOpenAILoader = (): OpenAIModuleLike | null => { + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + return require('openai/resources/chat/completions') as OpenAIModuleLike; + } catch { + return null; + } +}; + +const defaultAnthropicLoader = (): AnthropicModuleLike | null => { + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + return require('@anthropic-ai/sdk/resources/messages') as AnthropicModuleLike; + } catch { + return null; + } +}; + +let loadOpenAIModule = defaultOpenAILoader; +let loadAnthropicModule = defaultAnthropicLoader; +let harnessRuntimeBindings: 
HarnessRuntime | null = null; + +function getHarnessRuntime(): HarnessRuntime { + if (!harnessRuntimeBindings) { + throw new Error('Harness runtime bindings not configured'); + } + return harnessRuntimeBindings; +} + +export function setHarnessRuntimeBindingsForInstrumentation(bindings: HarnessRuntime): void { + harnessRuntimeBindings = bindings; +} + +function nowMonotonicMs(): number { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof globalThis !== 'undefined' && (globalThis as any).performance?.now) { + return (globalThis as any).performance.now() as number; + } + + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process !== 'undefined' && process.hrtime?.bigint) { + return Number(process.hrtime.bigint()) / 1_000_000; + } + + return Date.now(); +} + +function normalizeModelName(model: string): string { + return model.trim().toLowerCase(); +} + +function estimateCost(model: string, promptTokens: number, completionTokens: number): number { + const price = MODEL_PRICING_PER_MILLION[normalizeModelName(model)]; + if (!price) { + return 0; + } + + return (promptTokens / 1_000_000) * price.input + (completionTokens / 1_000_000) * price.output; +} + +function estimateEnergy(model: string, promptTokens: number, completionTokens: number): number { + const coefficient = ENERGY_COEFFICIENTS[normalizeModelName(model)] ?? 
DEFAULT_ENERGY_COEFFICIENT; + return coefficient * (promptTokens + completionTokens * DEFAULT_OUTPUT_WEIGHT) / 1000; +} + +function modelTotalCost(model: string): number { + const price = MODEL_PRICING_PER_MILLION[normalizeModelName(model)]; + if (!price) { + return Number.POSITIVE_INFINITY; + } + return price.input + price.output; +} + +function selectCheaperModel(currentModel: string): string { + const currentCost = modelTotalCost(currentModel); + let bestModel = currentModel; + let bestCost = currentCost; + + for (const candidate of PRICING_MODELS) { + const candidateCost = modelTotalCost(candidate); + if (candidateCost < bestCost) { + bestModel = candidate; + bestCost = candidateCost; + } + } + + return bestModel; +} + +function selectLowerEnergyModel(currentModel: string): string { + const currentCoeff = ENERGY_COEFFICIENTS[normalizeModelName(currentModel)] ?? DEFAULT_ENERGY_COEFFICIENT; + let bestModel = currentModel; + let bestCoeff = currentCoeff; + + for (const candidate of PRICING_MODELS) { + const coeff = ENERGY_COEFFICIENTS[candidate] ?? DEFAULT_ENERGY_COEFFICIENT; + if (coeff < bestCoeff) { + bestModel = candidate; + bestCoeff = coeff; + } + } + + return bestModel; +} + +function selectFasterModel(currentModel: string): string { + const currentLatency = LATENCY_PRIORS[normalizeModelName(currentModel)] ?? 0.7; + let bestModel = currentModel; + let bestLatency = currentLatency; + + for (const candidate of PRICING_MODELS) { + const score = LATENCY_PRIORS[candidate] ?? 
0.7; + if (score > bestLatency) { + bestModel = candidate; + bestLatency = score; + } + } + + return bestModel; +} + +function normalizeWeights(weights: Record): Record { + const normalized: Record = {}; + let total = 0; + + for (const [key, value] of Object.entries(weights)) { + if (!Number.isFinite(value) || value <= 0) { + continue; + } + normalized[key] = value; + total += value; + } + + if (total <= 0) { + return {}; + } + + for (const key of Object.keys(normalized)) { + normalized[key] /= total; + } + + return normalized; +} + +function costUtility(model: string): number { + const costs = PRICING_MODELS.map(modelTotalCost).filter(Number.isFinite); + const min = Math.min(...costs); + const max = Math.max(...costs); + const current = modelTotalCost(model); + + if (!Number.isFinite(current) || max === min) { + return 0.5; + } + + return (max - current) / (max - min); +} + +function energyUtility(model: string): number { + const coeffs = PRICING_MODELS.map((name) => ENERGY_COEFFICIENTS[name] ?? DEFAULT_ENERGY_COEFFICIENT); + const min = Math.min(...coeffs); + const max = Math.max(...coeffs); + const current = ENERGY_COEFFICIENTS[normalizeModelName(model)] ?? DEFAULT_ENERGY_COEFFICIENT; + + if (max === min) { + return 0.5; + } + + return (max - current) / (max - min); +} + +function kpiScore(model: string, weights: Record): number { + const normalized = normalizeWeights(weights); + if (Object.keys(normalized).length === 0) { + return 0; + } + + const key = normalizeModelName(model); + const quality = QUALITY_PRIORS[key] ?? 0.7; + const latency = LATENCY_PRIORS[key] ?? 0.7; + const cost = costUtility(key); + const energy = energyUtility(key); + + return ( + (normalized.quality ?? 0) * quality + + (normalized.latency ?? 0) * latency + + (normalized.cost ?? 0) * cost + + (normalized.energy ?? 
0) * energy + ); +} + +function selectKPIWeightedModel(currentModel: string, weights: Record): string { + const normalized = normalizeWeights(weights); + if (Object.keys(normalized).length === 0) { + return currentModel; + } + + let bestModel = currentModel; + let bestScore = kpiScore(currentModel, normalized); + + for (const candidate of PRICING_MODELS) { + const score = kpiScore(candidate, normalized); + if (score > bestScore) { + bestModel = candidate; + bestScore = score; + } + } + + return bestModel; +} + +function extractOpenAIUsage(response: any): [number, number] { + const usage = response?.usage; + if (!usage || typeof usage !== 'object') { + return [0, 0]; + } + const promptTokens = Number(usage.prompt_tokens ?? usage.input_tokens ?? 0); + const completionTokens = Number(usage.completion_tokens ?? usage.output_tokens ?? 0); + return [ + Number.isFinite(promptTokens) ? promptTokens : 0, + Number.isFinite(completionTokens) ? completionTokens : 0, + ]; +} + +function extractAnthropicUsage(response: any): [number, number] { + const usage = response?.usage; + if (!usage || typeof usage !== 'object') { + return [0, 0]; + } + + const inputTokens = Number(usage.input_tokens ?? usage.prompt_tokens ?? 0); + const outputTokens = Number(usage.output_tokens ?? usage.completion_tokens ?? 0); + return [ + Number.isFinite(inputTokens) ? inputTokens : 0, + Number.isFinite(outputTokens) ? 
outputTokens : 0, + ]; +} + +function countOpenAIToolCalls(response: any): number { + const toolCalls = response?.choices?.[0]?.message?.tool_calls; + if (!Array.isArray(toolCalls)) { + return 0; + } + return toolCalls.length; +} + +function countAnthropicToolCalls(response: any): number { + const content = response?.content; + if (!Array.isArray(content)) { + return 0; + } + return content.filter((item: any) => item?.type === 'tool_use').length; +} + +function evaluatePreCallDecision(ctx: HarnessRunContextLike, model: string, hasTools: boolean): PreCallDecision { + if (ctx.budgetMax != null && ctx.cost >= ctx.budgetMax) { + return { action: 'stop', reason: 'budget_exceeded', targetModel: model }; + } + + if (hasTools && ctx.toolCallsMax != null && ctx.toolCalls >= ctx.toolCallsMax) { + return { action: 'deny_tool', reason: 'max_tool_calls_reached', targetModel: model }; + } + + if (ctx.compliance) { + const profile = COMPLIANCE_ALLOWLISTS[ctx.compliance.trim().toLowerCase()]; + if (profile) { + const normalized = normalizeModelName(model); + if (!profile.has(normalized)) { + const next = PRICING_MODELS.find((candidate) => profile.has(candidate)); + if (next) { + return { action: 'switch_model', reason: 'compliance_model_policy', targetModel: next }; + } + return { + action: hasTools ? 'deny_tool' : 'stop', + reason: hasTools ? 
'compliance_no_approved_tool_path' : 'compliance_no_approved_model', + targetModel: model, + }; + } + if (ctx.compliance.trim().toLowerCase() === 'strict' && hasTools) { + return { action: 'deny_tool', reason: 'compliance_tool_restriction', targetModel: model }; + } + } + } + + if (ctx.latencyMaxMs != null && ctx.latencyUsedMs >= ctx.latencyMaxMs) { + const faster = selectFasterModel(model); + if (normalizeModelName(faster) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'latency_limit_exceeded', targetModel: faster }; + } + return { action: 'stop', reason: 'latency_limit_exceeded', targetModel: model }; + } + + if (ctx.energyMax != null && ctx.energyUsed >= ctx.energyMax) { + const lower = selectLowerEnergyModel(model); + if (normalizeModelName(lower) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'energy_limit_exceeded', targetModel: lower }; + } + return { action: 'stop', reason: 'energy_limit_exceeded', targetModel: model }; + } + + if ( + ctx.budgetMax != null + && ctx.budgetMax > 0 + && ctx.budgetRemaining != null + && (ctx.budgetRemaining / ctx.budgetMax) < 0.2 + ) { + const cheaper = selectCheaperModel(model); + if (normalizeModelName(cheaper) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'budget_pressure', targetModel: cheaper }; + } + } + + if (ctx.kpiWeights && Object.keys(ctx.kpiWeights).length > 0) { + const candidate = selectKPIWeightedModel(model, ctx.kpiWeights); + if (normalizeModelName(candidate) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'kpi_weight_optimization', targetModel: candidate }; + } + } + + return { action: 'allow', reason: ctx.mode, targetModel: model }; +} + +function raiseStopError(ctx: HarnessRunContextLike, reason: string): never { + const runtime = getHarnessRuntime(); + if (reason === 'budget_exceeded') { + const remaining = Math.max(0, (ctx.budgetMax ?? 
0) - ctx.cost); + throw runtime.createBudgetExceededError( + `Budget exhausted: spent $${ctx.cost.toFixed(4)} of $${(ctx.budgetMax ?? 0).toFixed(4)} max`, + remaining, + ); + } + + throw runtime.createHarnessStopError(`cascadeflow harness stop: ${reason}`, reason); +} + +function updateContext( + ctx: HarnessRunContextLike, + mode: HarnessModeLike, + model: string, + promptTokens: number, + completionTokens: number, + toolCalls: number, + elapsedMs: number, + decision: PreCallDecision, + applied: boolean, +): void { + const cost = estimateCost(model, promptTokens, completionTokens); + const energy = estimateEnergy(model, promptTokens, completionTokens); + + ctx.cost += cost; + ctx.stepCount += 1; + ctx.toolCalls += toolCalls; + ctx.latencyUsedMs += elapsedMs; + ctx.energyUsed += energy; + + if (ctx.budgetMax != null) { + ctx.budgetRemaining = ctx.budgetMax - ctx.cost; + } + + ctx.record(decision.action, decision.reason, decision.targetModel, { + applied, + decisionMode: mode, + }); +} + +function isThenable(value: any): value is Promise { + return Boolean(value) && typeof value.then === 'function'; +} + +function makePatchedCreate(provider: 'openai' | 'anthropic', original: CreateFunction): CreateFunction { + return function patchedCreate(this: any, ...args: any[]): any { + const runtime = getHarnessRuntime(); + const activeRun = runtime.getCurrentRun(); + const mode = activeRun?.mode ?? runtime.getHarnessMode(); + + if (mode === 'off') { + return original.apply(this, args); + } + + const firstArg = args[0]; + const request = firstArg && typeof firstArg === 'object' ? { ...firstArg } : {}; + const model = typeof request.model === 'string' ? request.model : 'unknown'; + const hasTools = Array.isArray(request.tools) && request.tools.length > 0; + + const decision = activeRun ? 
evaluatePreCallDecision(activeRun, model, hasTools) : { + action: 'allow' as const, + reason: mode, + targetModel: model, + }; + + let applied = decision.action === 'allow'; + let effectiveModel = model; + + if (activeRun && mode === 'enforce') { + if (decision.action === 'stop') { + activeRun.record('stop', decision.reason, model, { + applied: true, + decisionMode: mode, + }); + raiseStopError(activeRun, decision.reason); + } + + if (decision.action === 'switch_model') { + if (normalizeModelName(decision.targetModel) !== normalizeModelName(model)) { + request.model = decision.targetModel; + effectiveModel = decision.targetModel; + applied = true; + } else { + applied = false; + } + } + + if (decision.action === 'deny_tool') { + if (Array.isArray(request.tools) && request.tools.length > 0) { + request.tools = []; + applied = true; + } else { + applied = false; + } + } + } else if (decision.action !== 'allow') { + applied = false; + } + + const interceptedArgs = firstArg && typeof firstArg === 'object' + ? 
[request, ...args.slice(1)] + : args; + + const isStream = Boolean(request.stream); + const startedAt = nowMonotonicMs(); + const result = original.apply(this, interceptedArgs); + + if (!activeRun) { + return result; + } + + const finalize = (response: any): any => { + const elapsedMs = Math.max(0, nowMonotonicMs() - startedAt); + + let promptTokens = 0; + let completionTokens = 0; + let toolCallCount = 0; + + if (!isStream) { + if (provider === 'openai') { + [promptTokens, completionTokens] = extractOpenAIUsage(response); + toolCallCount = countOpenAIToolCalls(response); + } else { + [promptTokens, completionTokens] = extractAnthropicUsage(response); + toolCallCount = countAnthropicToolCalls(response); + } + } + + updateContext( + activeRun, + mode, + effectiveModel, + promptTokens, + completionTokens, + toolCallCount, + elapsedMs, + decision, + applied, + ); + + return response; + }; + + if (isThenable(result)) { + result + .then((response) => { + finalize(response); + }) + .catch(() => { + // fail-open: harness instrumentation errors must not crash user flow. 
+ }); + return result; + } + + return finalize(result); + }; +} + +export function detectOpenAIInstrumentationTarget(): boolean { + const module = loadOpenAIModule(); + return Boolean(module?.Completions?.prototype?.create); +} + +export function detectAnthropicInstrumentationTarget(): boolean { + const module = loadAnthropicModule(); + return Boolean(module?.Messages?.prototype?.create); +} + +export function patchOpenAI(): boolean { + if (openAIPatched) { + return true; + } + + const module = loadOpenAIModule(); + const cls = module?.Completions; + const prototype = cls?.prototype; + const create = prototype?.create; + + if (!cls || !prototype || typeof create !== 'function') { + return false; + } + + originalOpenAICreate = create; + patchedOpenAIClass = cls; + prototype.create = makePatchedCreate('openai', create); + openAIPatched = true; + return true; +} + +export function patchAnthropic(): boolean { + if (anthropicPatched) { + return true; + } + + const module = loadAnthropicModule(); + const cls = module?.Messages; + const prototype = cls?.prototype; + const create = prototype?.create; + + if (!cls || !prototype || typeof create !== 'function') { + return false; + } + + originalAnthropicCreate = create; + patchedAnthropicClass = cls; + prototype.create = makePatchedCreate('anthropic', create); + anthropicPatched = true; + return true; +} + +export function unpatchOpenAI(): void { + if (!openAIPatched) { + return; + } + + if (patchedOpenAIClass?.prototype && originalOpenAICreate) { + patchedOpenAIClass.prototype.create = originalOpenAICreate; + } + + openAIPatched = false; + originalOpenAICreate = null; + patchedOpenAIClass = null; +} + +export function unpatchAnthropic(): void { + if (!anthropicPatched) { + return; + } + + if (patchedAnthropicClass?.prototype && originalAnthropicCreate) { + patchedAnthropicClass.prototype.create = originalAnthropicCreate; + } + + anthropicPatched = false; + originalAnthropicCreate = null; + patchedAnthropicClass = null; +} + 
+export function isOpenAIPatched(): boolean { + return openAIPatched; +} + +export function isAnthropicPatched(): boolean { + return anthropicPatched; +} + +export function isPatched(): boolean { + return openAIPatched || anthropicPatched; +} + +export function __setInstrumentationLoadersForTest(loaders: { + openai?: () => OpenAIModuleLike | null; + anthropic?: () => AnthropicModuleLike | null; +}): void { + if (loaders.openai) { + loadOpenAIModule = loaders.openai; + } + if (loaders.anthropic) { + loadAnthropicModule = loaders.anthropic; + } +} + +export function __resetInstrumentationLoadersForTest(): void { + loadOpenAIModule = defaultOpenAILoader; + loadAnthropicModule = defaultAnthropicLoader; +} + +export function __resetInstrumentationStateForTest(): void { + unpatchOpenAI(); + unpatchAnthropic(); +} diff --git a/packages/core/src/harness.ts b/packages/core/src/harness.ts new file mode 100644 index 00000000..3815360e --- /dev/null +++ b/packages/core/src/harness.ts @@ -0,0 +1,754 @@ +import { + __resetInstrumentationStateForTest, + detectAnthropicInstrumentationTarget, + detectOpenAIInstrumentationTarget, + patchAnthropic, + patchOpenAI, + setHarnessRuntimeBindingsForInstrumentation, + unpatchAnthropic, + unpatchOpenAI, +} from './harness-instrument'; + +export type HarnessMode = 'off' | 'observe' | 'enforce'; + +export type HarnessConfig = { + mode: HarnessMode; + verbose: boolean; + budget?: number; + maxToolCalls?: number; + maxLatencyMs?: number; + maxEnergy?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; +}; + +export type HarnessInitOptions = Partial; + +export type HarnessRunOptions = { + budget?: number; + maxToolCalls?: number; + maxLatencyMs?: number; + maxEnergy?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; +}; + +export type HarnessInitReport = { + mode: HarnessMode; + instrumented: string[]; + detectedButNotInstrumented: string[]; + configSources: Record; +}; + +export type 
HarnessRecordOptions = { + applied?: boolean; + decisionMode?: HarnessMode; +}; + +export type HarnessTraceEntry = { + action: string; + reason: string; + model?: string; + runId: string; + mode: HarnessMode; + step: number; + timestampMs: number; + toolCallsTotal: number; + costTotal: number; + latencyUsedMs: number; + energyUsed: number; + budgetState: { + max?: number; + remaining?: number; + }; + applied?: boolean; + decisionMode?: HarnessMode; +}; + +export type HarnessRunSummary = { + runId: string; + mode: HarnessMode; + stepCount: number; + toolCalls: number; + cost: number; + savings: number; + latencyUsedMs: number; + energyUsed: number; + budgetMax?: number; + budgetRemaining?: number; + lastAction: string; + modelUsed?: string; + durationMs?: number; +}; + +export class HarnessStopError extends Error { + reason: string; + + constructor(message: string, reason = 'stop') { + super(message); + this.name = 'HarnessStopError'; + this.reason = reason; + } +} + +export class BudgetExceededError extends HarnessStopError { + remaining: number; + + constructor(message: string, remaining = 0) { + super(message, 'budget_exceeded'); + this.name = 'BudgetExceededError'; + this.remaining = remaining; + } +} + +function randomRunId(): string { + return Math.random().toString(36).slice(2, 14); +} + +function nowMonotonicMs(): number { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof globalThis !== 'undefined' && (globalThis as any).performance?.now) { + return (globalThis as any).performance.now() as number; + } + + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process !== 'undefined' && process.hrtime?.bigint) { + return Number(process.hrtime.bigint()) / 1_000_000; + } + + return Date.now(); +} + +const MAX_ACTION_LEN = 64; +const MAX_REASON_LEN = 160; +const MAX_MODEL_LEN = 128; + +function sanitizeTraceValue(value: unknown, maxLength: number): string | undefined { + if (value == null) { + 
return undefined; + } + + const text = String(value).replace(/\r?\n/g, ' ').trim(); + if (!text) { + return undefined; + } + + if (text.length <= maxLength) { + return text; + } + + return `${text.slice(0, Math.max(0, maxLength - 3))}...`; +} + +export class HarnessRunContext { + runId: string; + startedAtMs: number; + endedAtMs?: number; + durationMs?: number; + + mode: HarnessMode; + budgetMax?: number; + toolCallsMax?: number; + latencyMaxMs?: number; + energyMax?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; + + cost = 0; + savings = 0; + toolCalls = 0; + stepCount = 0; + latencyUsedMs = 0; + energyUsed = 0; + verbose = false; + budgetRemaining?: number; + modelUsed?: string; + lastAction = 'allow'; + draftAccepted?: boolean; + + private readonly _startedMonotonic: number; + private readonly _trace: HarnessTraceEntry[] = []; + private _finalized = false; + + constructor(config: { + mode: HarnessMode; + budgetMax?: number; + toolCallsMax?: number; + latencyMaxMs?: number; + energyMax?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; + verbose?: boolean; + }) { + this.runId = randomRunId(); + this.startedAtMs = Date.now(); + this._startedMonotonic = nowMonotonicMs(); + + this.mode = config.mode; + this.budgetMax = config.budgetMax; + this.toolCallsMax = config.toolCallsMax; + this.latencyMaxMs = config.latencyMaxMs; + this.energyMax = config.energyMax; + this.kpiTargets = config.kpiTargets; + this.kpiWeights = config.kpiWeights; + this.compliance = config.compliance; + this.verbose = Boolean(config.verbose); + + if (config.budgetMax != null) { + this.budgetRemaining = config.budgetMax; + } + } + + finish(): void { + if (this._finalized) { + return; + } + + this._finalized = true; + this.endedAtMs = Date.now(); + this.durationMs = Math.max(0, nowMonotonicMs() - this._startedMonotonic); + + if (this.verbose && this.mode !== 'off' && this.stepCount > 0) { + // Keep logging cheap and controlled. 
+ // eslint-disable-next-line no-console + console.info( + '[cascadeflow.harness] run summary', + { + runId: this.runId, + mode: this.mode, + steps: this.stepCount, + toolCalls: this.toolCalls, + cost: this.cost, + latencyMs: this.latencyUsedMs, + energy: this.energyUsed, + lastAction: this.lastAction, + model: this.modelUsed, + budgetRemaining: this.budgetRemaining, + durationMs: this.durationMs, + }, + ); + } + } + + record(action: string, reason: string, model?: string, options: HarnessRecordOptions = {}): void { + let safeAction = sanitizeTraceValue(action, MAX_ACTION_LEN); + if (!safeAction) { + safeAction = 'allow'; + } + + const safeReason = sanitizeTraceValue(reason, MAX_REASON_LEN) ?? 'unspecified'; + const safeModel = sanitizeTraceValue(model, MAX_MODEL_LEN); + + this.lastAction = safeAction; + this.modelUsed = safeModel; + + const entry: HarnessTraceEntry = { + action: safeAction, + reason: safeReason, + model: safeModel, + runId: this.runId, + mode: this.mode, + step: this.stepCount, + timestampMs: Date.now(), + toolCallsTotal: this.toolCalls, + costTotal: this.cost, + latencyUsedMs: this.latencyUsedMs, + energyUsed: this.energyUsed, + budgetState: { + max: this.budgetMax, + remaining: this.budgetRemaining, + }, + }; + + if (options.applied != null) { + entry.applied = options.applied; + } + + if (options.decisionMode != null) { + entry.decisionMode = options.decisionMode; + } + + this._trace.push(entry); + } + + trace(): HarnessTraceEntry[] { + return [...this._trace]; + } + + summary(): HarnessRunSummary { + return { + runId: this.runId, + mode: this.mode, + stepCount: this.stepCount, + toolCalls: this.toolCalls, + cost: this.cost, + savings: this.savings, + latencyUsedMs: this.latencyUsedMs, + energyUsed: this.energyUsed, + budgetMax: this.budgetMax, + budgetRemaining: this.budgetRemaining, + lastAction: this.lastAction, + modelUsed: this.modelUsed, + durationMs: this.durationMs, + }; + } +} + +type ConfigSource = 'code' | 'env' | 'file' | 'default'; 
+ +type ConfigWithSources = { + config: HarnessConfig; + sources: Record; +}; + +let _harnessConfig: HarnessConfig = { + mode: 'off', + verbose: false, +}; + +let _isInstrumented = false; +let fallbackCurrentRun: HarnessRunContext | null = null; + +let asyncLocalStorageInstance: { run: (store: HarnessRunContext, callback: () => Promise) => Promise; getStore: () => HarnessRunContext | undefined } | null = null; + +function getAsyncLocalStorage(): typeof asyncLocalStorageInstance { + if (asyncLocalStorageInstance) { + return asyncLocalStorageInstance; + } + + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const mod = require('node:async_hooks') as { + AsyncLocalStorage: new () => { run: (store: T, callback: () => Promise) => Promise; getStore: () => T | undefined }; + }; + + asyncLocalStorageInstance = new mod.AsyncLocalStorage(); + } catch { + asyncLocalStorageInstance = null; + } + + return asyncLocalStorageInstance; +} + +function parseBoolean(raw: string): boolean { + const normalized = raw.trim().toLowerCase(); + return normalized === '1' || normalized === 'true' || normalized === 'yes' || normalized === 'on'; +} + +function parseNumber(raw: string): number { + const value = Number(raw); + if (!Number.isFinite(value)) { + throw new Error(`Invalid numeric value: ${raw}`); + } + return value; +} + +function parseJSONMap(raw: string): Record { + const parsed = JSON.parse(raw); + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + throw new Error('Expected object'); + } + + const result: Record = {}; + for (const [key, value] of Object.entries(parsed as Record)) { + result[String(key)] = Number(value); + } + return result; +} + +function normalizeMode(mode: unknown): HarnessMode { + if (mode === 'off' || mode === 'observe' || mode === 'enforce') { + return mode; + } + + throw new Error('mode must be one of: off, observe, enforce'); +} + +function normalizeConfigRecord(raw: Record): HarnessInitOptions { + const out: 
HarnessInitOptions = {}; + + const mode = raw.mode ?? raw.harness_mode; + if (typeof mode === 'string') { + out.mode = normalizeMode(mode); + } + + const verbose = raw.verbose ?? raw.harness_verbose; + if (typeof verbose === 'boolean') { + out.verbose = verbose; + } + + const budget = raw.budget ?? raw.max_budget; + if (typeof budget === 'number') { + out.budget = budget; + } + + const maxToolCalls = raw.maxToolCalls ?? raw.max_tool_calls; + if (typeof maxToolCalls === 'number') { + out.maxToolCalls = maxToolCalls; + } + + const maxLatencyMs = raw.maxLatencyMs ?? raw.max_latency_ms; + if (typeof maxLatencyMs === 'number') { + out.maxLatencyMs = maxLatencyMs; + } + + const maxEnergy = raw.maxEnergy ?? raw.max_energy; + if (typeof maxEnergy === 'number') { + out.maxEnergy = maxEnergy; + } + + const kpiTargets = raw.kpiTargets ?? raw.kpi_targets; + if (kpiTargets && typeof kpiTargets === 'object' && !Array.isArray(kpiTargets)) { + out.kpiTargets = kpiTargets as Record; + } + + const kpiWeights = raw.kpiWeights ?? raw.kpi_weights; + if (kpiWeights && typeof kpiWeights === 'object' && !Array.isArray(kpiWeights)) { + out.kpiWeights = kpiWeights as Record; + } + + const compliance = raw.compliance; + if (typeof compliance === 'string') { + out.compliance = compliance; + } + + return out; +} + +function readEnvConfig(): HarnessInitOptions { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process === 'undefined' || !process.env) { + return {}; + } + + const env = process.env; + const config: HarnessInitOptions = {}; + + const mode = env.CASCADEFLOW_HARNESS_MODE ?? env.CASCADEFLOW_MODE; + if (mode) { + config.mode = normalizeMode(mode); + } + + if (env.CASCADEFLOW_HARNESS_VERBOSE != null) { + config.verbose = parseBoolean(env.CASCADEFLOW_HARNESS_VERBOSE); + } + + const budget = env.CASCADEFLOW_HARNESS_BUDGET ?? 
env.CASCADEFLOW_BUDGET; + if (budget != null) { + config.budget = parseNumber(budget); + } + + if (env.CASCADEFLOW_HARNESS_MAX_TOOL_CALLS != null) { + config.maxToolCalls = parseNumber(env.CASCADEFLOW_HARNESS_MAX_TOOL_CALLS); + } + + if (env.CASCADEFLOW_HARNESS_MAX_LATENCY_MS != null) { + config.maxLatencyMs = parseNumber(env.CASCADEFLOW_HARNESS_MAX_LATENCY_MS); + } + + if (env.CASCADEFLOW_HARNESS_MAX_ENERGY != null) { + config.maxEnergy = parseNumber(env.CASCADEFLOW_HARNESS_MAX_ENERGY); + } + + if (env.CASCADEFLOW_HARNESS_KPI_TARGETS != null) { + config.kpiTargets = parseJSONMap(env.CASCADEFLOW_HARNESS_KPI_TARGETS); + } + + if (env.CASCADEFLOW_HARNESS_KPI_WEIGHTS != null) { + config.kpiWeights = parseJSONMap(env.CASCADEFLOW_HARNESS_KPI_WEIGHTS); + } + + if (env.CASCADEFLOW_HARNESS_COMPLIANCE != null) { + config.compliance = env.CASCADEFLOW_HARNESS_COMPLIANCE; + } + + return config; +} + +function readFileConfig(): HarnessInitOptions { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process === 'undefined' || !process.cwd) { + return {}; + } + + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const fs = require('node:fs') as typeof import('node:fs'); + // eslint-disable-next-line @typescript-eslint/no-var-requires + const path = require('node:path') as typeof import('node:path'); + + const configuredPath = process.env.CASCADEFLOW_CONFIG; + const candidates = configuredPath + ? [configuredPath] + : ['cascadeflow.json', 'cascadeflow.config.json']; + + for (const candidate of candidates) { + const full = path.isAbsolute(candidate) ? candidate : path.join(process.cwd(), candidate); + if (!fs.existsSync(full)) { + continue; + } + + const content = fs.readFileSync(full, 'utf8'); + const parsed = JSON.parse(content) as Record; + const harnessBlock = ( + parsed.harness && typeof parsed.harness === 'object' && !Array.isArray(parsed.harness) + ) + ? 
(parsed.harness as Record) + : parsed; + + return normalizeConfigRecord(harnessBlock); + } + } catch { + return {}; + } + + return {}; +} + +function resolveConfig(options: HarnessInitOptions): ConfigWithSources { + const env = readEnvConfig(); + const file = readFileConfig(); + const sources: Record = {}; + + const resolve = ( + key: keyof HarnessConfig, + explicit: T | undefined, + envValue: T | undefined, + fileValue: T | undefined, + defaultValue: T, + ): T => { + if (explicit !== undefined) { + sources[key] = 'code'; + return explicit; + } + if (envValue !== undefined) { + sources[key] = 'env'; + return envValue; + } + if (fileValue !== undefined) { + sources[key] = 'file'; + return fileValue; + } + sources[key] = 'default'; + return defaultValue; + }; + + const mode = resolve('mode', options.mode, env.mode, file.mode, 'off'); + const verbose = resolve('verbose', options.verbose, env.verbose, file.verbose, false); + const budget = resolve('budget', options.budget, env.budget, file.budget, undefined); + const maxToolCalls = resolve( + 'maxToolCalls', + options.maxToolCalls, + env.maxToolCalls, + file.maxToolCalls, + undefined, + ); + const maxLatencyMs = resolve( + 'maxLatencyMs', + options.maxLatencyMs, + env.maxLatencyMs, + file.maxLatencyMs, + undefined, + ); + const maxEnergy = resolve('maxEnergy', options.maxEnergy, env.maxEnergy, file.maxEnergy, undefined); + const kpiTargets = resolve( + 'kpiTargets', + options.kpiTargets, + env.kpiTargets, + file.kpiTargets, + undefined, + ); + const kpiWeights = resolve( + 'kpiWeights', + options.kpiWeights, + env.kpiWeights, + file.kpiWeights, + undefined, + ); + const compliance = resolve( + 'compliance', + options.compliance, + env.compliance, + file.compliance, + undefined, + ); + + return { + config: { + mode, + verbose, + budget, + maxToolCalls, + maxLatencyMs, + maxEnergy, + kpiTargets, + kpiWeights, + compliance, + }, + sources, + }; +} + +export function getHarnessConfig(): HarnessConfig { + return { 
..._harnessConfig };
+}
+
+export function getCurrentRun(): HarnessRunContext | null {
+  const als = getAsyncLocalStorage();
+  if (als) {
+    return als.getStore() ?? null;
+  }
+
+  return fallbackCurrentRun;
+}
+
+export function reset(): void {
+  unpatchOpenAI();
+  unpatchAnthropic();
+  __resetInstrumentationStateForTest();
+
+  _harnessConfig = { mode: 'off', verbose: false };
+  _isInstrumented = false;
+  fallbackCurrentRun = null;
+}
+
+export function init(options: HarnessInitOptions = {}): HarnessInitReport {
+  const { config, sources } = resolveConfig(options);
+  config.mode = normalizeMode(config.mode);
+
+  _harnessConfig = config;
+
+  const instrumented: string[] = [];
+  const detectedButNotInstrumented: string[] = [];
+
+  const openaiDetected = detectOpenAIInstrumentationTarget();
+  const anthropicDetected = detectAnthropicInstrumentationTarget();
+
+  if (config.mode !== 'off' && openaiDetected) {
+    if (patchOpenAI()) {
+      instrumented.push('openai');
+    } else {
+      detectedButNotInstrumented.push('openai');
+    }
+  }
+
+  if (config.mode !== 'off' && anthropicDetected) {
+    if (patchAnthropic()) {
+      instrumented.push('anthropic');
+    } else {
+      detectedButNotInstrumented.push('anthropic');
+    }
+  }
+
+  if (config.mode === 'off') {
+    unpatchOpenAI();
+    unpatchAnthropic();
+  }
+
+  _isInstrumented = true;
+
+  if (config.verbose) {
+    // eslint-disable-next-line no-console
+    console.info('[cascadeflow.harness] init', {
+      mode: config.mode,
+      instrumented,
+      detectedButNotInstrumented,
+    });
+  }
+
+  return {
+    mode: config.mode,
+    instrumented,
+    detectedButNotInstrumented,
+    configSources: sources,
+  };
+}
+
+type RunCallback<T> = (run: HarnessRunContext) => Promise<T> | T;
+
+async function executeScopedRun<T>(runContext: HarnessRunContext, fn: RunCallback<T>): Promise<T> {
+  try {
+    return await fn(runContext);
+  } finally {
+    runContext.finish();
+  }
+}
+
+export async function run<T>(callback: RunCallback<T>): Promise<T>;
+export async function run<T>(options: HarnessRunOptions, callback: RunCallback<T>): Promise<T>;
+export async function run<T>(
+  optionsOrCallback: HarnessRunOptions | RunCallback<T>,
+  callback?: RunCallback<T>,
+): Promise<T> {
+  const options = typeof optionsOrCallback === 'function' ? {} : optionsOrCallback;
+  const cb = (typeof optionsOrCallback === 'function' ? optionsOrCallback : callback) as RunCallback<T> | undefined;
+
+  if (!cb) {
+    throw new Error('run() requires a callback: run(options?, async (run) => { ... })');
+  }
+
+  const cfg = getHarnessConfig();
+  const runContext = new HarnessRunContext({
+    mode: cfg.mode,
+    budgetMax: options.budget ?? cfg.budget,
+    toolCallsMax: options.maxToolCalls ?? cfg.maxToolCalls,
+    latencyMaxMs: options.maxLatencyMs ?? cfg.maxLatencyMs,
+    energyMax: options.maxEnergy ?? cfg.maxEnergy,
+    kpiTargets: options.kpiTargets ?? cfg.kpiTargets,
+    kpiWeights: options.kpiWeights ?? cfg.kpiWeights,
+    compliance: options.compliance ?? cfg.compliance,
+    verbose: cfg.verbose,
+  });
+
+  const als = getAsyncLocalStorage();
+  if (als) {
+    return als.run(runContext, async () => executeScopedRun(runContext, cb)) as Promise<T>;
+  }
+
+  const previous = fallbackCurrentRun;
+  fallbackCurrentRun = runContext;
+  try {
+    return await executeScopedRun(runContext, cb);
+  } finally {
+    fallbackCurrentRun = previous;
+  }
+}
+
+export function agent(policy: HarnessRunOptions): <T extends (...args: any[]) => any>(fn: T) => T {
+  return <T extends (...args: any[]) => any>(fn: T): T => {
+    const wrapped = ((...args: any[]) => fn(...args)) as T;
+    (wrapped as any).__cascadeflow_agent_policy__ = {
+      budget: policy.budget,
+      kpiTargets: policy.kpiTargets,
+      kpiWeights: policy.kpiWeights,
+      compliance: policy.compliance,
+    };
+    return wrapped;
+  };
+}
+
+setHarnessRuntimeBindingsForInstrumentation({
+  getCurrentRun,
+  getHarnessMode: () => getHarnessConfig().mode,
+  createBudgetExceededError: (message: string, remaining?: number) =>
+    new BudgetExceededError(message, remaining),
+  createHarnessStopError: (message: string, reason?: string) =>
+    new HarnessStopError(message, reason),
+});
+
+export const
cascadeflow = { + init, + run, + agent, + reset, + getHarnessConfig, + getCurrentRun, +}; + +export function isHarnessInstrumented(): boolean { + return _isInstrumented; +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 29819183..c919f67e 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -42,6 +42,31 @@ export { DEFAULT_CASCADE_CONFIG, } from './config'; +// Harness API (v2.1+) +export type { + HarnessMode, + HarnessConfig, + HarnessInitOptions, + HarnessRunOptions, + HarnessInitReport, + HarnessRecordOptions, + HarnessTraceEntry, + HarnessRunSummary, +} from './harness'; +export { + HarnessRunContext, + HarnessStopError, + BudgetExceededError, + init, + run, + agent as harnessAgent, + reset as resetHarness, + getHarnessConfig, + getCurrentRun, + isHarnessInstrumented, + cascadeflow, +} from './harness'; + // Results export type { CascadeResult } from './result'; export { resultToObject } from './result'; From de7db49c1db7563d68f7e8225f3aea69c9aac0b9 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 16:05:14 +0100 Subject: [PATCH 35/49] test(harness): add comprehensive Anthropic auto-instrumentation tests Add 29 tests covering the Anthropic Python SDK monkey-patching that was introduced in v2.1. Tests cover usage extraction, tool call counting, sync/async wrapper behavior, budget enforcement in enforce mode, stream passthrough, cost/energy/latency tracking, and init/reset lifecycle. 
--- tests/test_harness_instrument.py | 384 ++++++++++++++++++++++++++++++- 1 file changed, 383 insertions(+), 1 deletion(-) diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index ca1f9a07..4931f093 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -1,4 +1,4 @@ -"""Tests for cascadeflow.harness.instrument — OpenAI auto-instrumentation.""" +"""Tests for cascadeflow.harness.instrument — OpenAI + Anthropic auto-instrumentation.""" from __future__ import annotations @@ -15,8 +15,12 @@ from cascadeflow.harness.instrument import ( _InstrumentedAsyncStream, _InstrumentedStream, + _count_tool_calls_in_anthropic_response, _estimate_cost, _estimate_energy, + _extract_anthropic_usage, + _make_patched_anthropic_async_create, + _make_patched_anthropic_create, _make_patched_async_create, _make_patched_create, is_anthropic_patched, @@ -967,3 +971,381 @@ def test_non_stream_does_not_inject_stream_options(self) -> None: call_kwargs = original.call_args[1] assert "stream_options" not in call_kwargs + + +# =========================================================================== +# Anthropic instrumentation tests +# =========================================================================== + + +def _mock_anthropic_usage(input_tokens: int = 100, output_tokens: int = 50) -> MagicMock: + u = MagicMock() + u.input_tokens = input_tokens + u.output_tokens = output_tokens + return u + + +def _mock_anthropic_response( + input_tokens: int = 100, + output_tokens: int = 50, + content: Optional[list] = None, +) -> MagicMock: + resp = MagicMock() + resp.usage = _mock_anthropic_usage(input_tokens, output_tokens) + resp.content = content or [] + return resp + + +def _mock_tool_use_block() -> MagicMock: + block = MagicMock() + block.type = "tool_use" + return block + + +def _mock_text_block() -> MagicMock: + block = MagicMock() + block.type = "text" + return block + + +# 
--------------------------------------------------------------------------- +# Anthropic usage extraction +# --------------------------------------------------------------------------- + + +class TestAnthropicUsageExtraction: + def test_extract_usage(self) -> None: + resp = _mock_anthropic_response(input_tokens=200, output_tokens=100) + inp, out = _extract_anthropic_usage(resp) + assert inp == 200 + assert out == 100 + + def test_extract_usage_none(self) -> None: + resp = MagicMock() + resp.usage = None + inp, out = _extract_anthropic_usage(resp) + assert inp == 0 + assert out == 0 + + +# --------------------------------------------------------------------------- +# Anthropic tool call counting +# --------------------------------------------------------------------------- + + +class TestAnthropicToolCallCounting: + def test_counts_tool_use_blocks(self) -> None: + resp = _mock_anthropic_response( + content=[_mock_text_block(), _mock_tool_use_block(), _mock_tool_use_block()] + ) + assert _count_tool_calls_in_anthropic_response(resp) == 2 + + def test_no_content(self) -> None: + resp = MagicMock() + resp.content = None + assert _count_tool_calls_in_anthropic_response(resp) == 0 + + def test_empty_content(self) -> None: + resp = _mock_anthropic_response(content=[]) + assert _count_tool_calls_in_anthropic_response(resp) == 0 + + def test_text_only(self) -> None: + resp = _mock_anthropic_response(content=[_mock_text_block()]) + assert _count_tool_calls_in_anthropic_response(resp) == 0 + + +# --------------------------------------------------------------------------- +# Anthropic sync wrapper +# --------------------------------------------------------------------------- + + +class TestAnthropicSyncWrapper: + def test_observe_passes_through_response(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + result = 
wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + original.assert_called_once() + + def test_observe_tracks_cost(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + # claude-sonnet-4: $3.00/1M in + $15.00/1M out = $18.00 + assert ctx.cost == pytest.approx(18.0, abs=0.01) + + def test_observe_tracks_step_count(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.step_count == 2 + + def test_observe_tracks_tool_calls(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response( + content=[_mock_tool_use_block(), _mock_tool_use_block()] + ) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.tool_calls == 2 + + def test_observe_tracks_energy(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1000, output_tokens=500) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + # claude-sonnet-4 uses default coefficient=1.0, output_weight=1.5 + # energy = 1.0 * (1000 + 500 * 1.5) = 1750.0 + assert ctx.energy_used == pytest.approx(1750.0) + + def test_observe_tracks_latency(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + 
wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.latency_used_ms > 0 + + def test_budget_remaining_decreases(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.budget_remaining is not None + assert ctx.budget_remaining == pytest.approx(100.0 - 18.0, abs=0.01) + + def test_trace_records_model_and_mode(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + trace = ctx.trace() + assert len(trace) == 1 + assert trace[0]["action"] == "allow" + assert trace[0]["reason"] == "observe" + assert trace[0]["model"] == "claude-sonnet-4" + + def test_off_mode_passthrough_no_tracking(self) -> None: + init(mode="off") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run() as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + assert ctx.cost == 0.0 + assert ctx.step_count == 0 + + def test_no_run_scope_returns_response(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + result = wrapper(MagicMock(), model="claude-sonnet-4") + assert result is mock_resp + + def test_stream_passthrough_no_usage_tracking(self) -> None: + """Anthropic streams are not instrumented in V2.1 — verify passthrough.""" + init(mode="observe") + mock_stream = MagicMock() 
+ original = MagicMock(return_value=mock_stream) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + + assert result is mock_stream + assert ctx.cost == 0.0 + assert ctx.step_count == 0 + + def test_multiple_calls_accumulate(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.cost == pytest.approx(36.0, abs=0.01) + assert ctx.step_count == 2 + + +# --------------------------------------------------------------------------- +# Anthropic async wrapper +# --------------------------------------------------------------------------- + + +class TestAnthropicAsyncWrapper: + async def test_observe_passes_through_response(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + + async def test_observe_tracks_cost(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=100.0) as ctx: + await wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.cost == pytest.approx(18.0, abs=0.01) + assert ctx.step_count == 1 + + async def test_off_mode_passthrough(self) -> None: + init(mode="off") + mock_resp = _mock_anthropic_response() + original = AsyncMock(return_value=mock_resp) + wrapper = 
_make_patched_anthropic_async_create(original) + + async with run() as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + assert ctx.cost == 0.0 + + async def test_stream_passthrough(self) -> None: + init(mode="observe") + mock_stream = AsyncMock() + original = AsyncMock(return_value=mock_stream) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + + assert result is mock_stream + assert ctx.cost == 0.0 + + +# --------------------------------------------------------------------------- +# Anthropic enforce mode +# --------------------------------------------------------------------------- + + +class TestAnthropicEnforceMode: + def test_enforce_trace_records_enforce_reason(self) -> None: + init(mode="enforce") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + trace = ctx.trace() + assert trace[0]["reason"] == "enforce" + + def test_enforce_raises_on_budget_exhausted(self) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError + + init(mode="enforce") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=0.001) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + with pytest.raises(BudgetExceededError): + wrapper(MagicMock(), model="claude-sonnet-4") + + def test_observe_does_not_raise_on_budget_exhausted(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with 
run(budget=0.001) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.cost > ctx.budget_max + + async def test_async_enforce_raises_on_budget_exhausted(self) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError + + init(mode="enforce") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=0.001) as ctx: + await wrapper(MagicMock(), model="claude-sonnet-4") + with pytest.raises(BudgetExceededError): + await wrapper(MagicMock(), model="claude-sonnet-4") + + +# --------------------------------------------------------------------------- +# Anthropic init() integration +# --------------------------------------------------------------------------- + + +class TestAnthropicInitIntegration: + def test_init_observe_patches_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + report = init(mode="observe") + assert "anthropic" in report.instrumented + assert is_anthropic_patched() + + def test_init_off_unpatches_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + init(mode="observe") + assert is_anthropic_patched() + init(mode="off") + assert not is_anthropic_patched() + + def test_reset_unpatches_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + init(mode="observe") + assert is_anthropic_patched() + reset() + assert not is_anthropic_patched() From de4a638c6526623c55da650c46545fb57195af49 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 16:22:42 +0100 Subject: [PATCH 36/49] feat(harness): instrument Anthropic streaming usage and tool calls --- cascadeflow/harness/instrument.py | 190 ++++++++++++++++++++++++++++-- 
tests/test_harness_instrument.py | 84 +++++++++++-- 2 files changed, 252 insertions(+), 22 deletions(-) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index 566f15d0..5632884c 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -663,6 +663,168 @@ async def __aexit__(self, *args: Any) -> bool: return False +class _InstrumentedAnthropicStreamBase: + """Shared stream-wrapper logic for sync and async Anthropic streams.""" + + __slots__ = ( + "_stream", + "_ctx", + "_model", + "_start_time", + "_pre_action", + "_pre_reason", + "_pre_model", + "_pre_applied", + "_decision_mode", + "_input_tokens", + "_output_tokens", + "_tool_call_count", + "_finalized", + ) + + def __init__( + self, + stream: Any, + ctx: Any, + model: str, + start_time: float, + pre_action: str = "allow", + pre_reason: str = "observe", + pre_model: str | None = None, + pre_applied: bool = True, + decision_mode: str = "observe", + ) -> None: + self._stream = stream + self._ctx = ctx + self._model = model + self._start_time = start_time + self._pre_action = pre_action + self._pre_reason = pre_reason + self._pre_model = pre_model or model + self._pre_applied = pre_applied + self._decision_mode = decision_mode + self._input_tokens: int = 0 + self._output_tokens: int = 0 + self._tool_call_count: int = 0 + self._finalized: bool = False + + def close(self) -> None: + self._finalize() + if hasattr(self._stream, "close"): + self._stream.close() + + def _inspect_event(self, event: Any) -> None: + event_type = getattr(event, "type", None) + + if event_type == "message_start": + message = getattr(event, "message", None) + usage = getattr(message, "usage", None) + if usage is not None: + input_tokens = getattr(usage, "input_tokens", None) + output_tokens = getattr(usage, "output_tokens", None) + if isinstance(input_tokens, (int, float)): + self._input_tokens = int(input_tokens) if input_tokens > 0 else 0 + if isinstance(output_tokens, (int, 
float)): + self._output_tokens = int(output_tokens) if output_tokens > 0 else 0 + return + + usage = getattr(event, "usage", None) + if usage is not None: + input_tokens = getattr(usage, "input_tokens", None) + output_tokens = getattr(usage, "output_tokens", None) + if isinstance(input_tokens, (int, float)) and input_tokens > 0: + self._input_tokens = int(input_tokens) + if isinstance(output_tokens, (int, float)): + self._output_tokens = int(output_tokens) if output_tokens > 0 else 0 + + if event_type == "content_block_start": + content_block = getattr(event, "content_block", None) + block_type = getattr(content_block, "type", None) + if block_type in {"tool_use", "server_tool_use"}: + self._tool_call_count += 1 + + def _finalize(self) -> None: + if self._finalized: + return + self._finalized = True + + if self._ctx is None: + return + + elapsed_ms = (time.monotonic() - self._start_time) * 1000 + _update_context( + self._ctx, + self._model, + self._input_tokens, + self._output_tokens, + self._tool_call_count, + elapsed_ms, + action=self._pre_action, + action_reason=self._pre_reason, + action_model=self._pre_model, + applied=self._pre_applied, + decision_mode=self._decision_mode, + ) + + +class _InstrumentedAnthropicStream(_InstrumentedAnthropicStreamBase): + """Wraps an Anthropic sync stream and tracks usage at stream end.""" + + __slots__ = () + + def __iter__(self) -> _InstrumentedAnthropicStream: + return self + + def __next__(self) -> Any: + try: + event = next(self._stream) + self._inspect_event(event) + return event + except StopIteration: + self._finalize() + raise + + def __enter__(self) -> _InstrumentedAnthropicStream: + if hasattr(self._stream, "__enter__"): + self._stream.__enter__() + return self + + def __exit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__exit__"): + return self._stream.__exit__(*args) # type: ignore[no-any-return] + return False + + +class 
_InstrumentedAnthropicAsyncStream(_InstrumentedAnthropicStreamBase): + """Wraps an Anthropic async stream and tracks usage at stream end.""" + + __slots__ = () + + def __aiter__(self) -> _InstrumentedAnthropicAsyncStream: + return self + + async def __anext__(self) -> Any: + try: + event = await self._stream.__anext__() + self._inspect_event(event) + return event + except StopAsyncIteration: + self._finalize() + raise + + async def __aenter__(self) -> _InstrumentedAnthropicAsyncStream: + if hasattr(self._stream, "__aenter__"): + await self._stream.__aenter__() + return self + + async def __aexit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__aexit__"): + return await self._stream.__aexit__(*args) # type: ignore[no-any-return] + return False + + # --------------------------------------------------------------------------- # Wrapper factories # --------------------------------------------------------------------------- @@ -877,14 +1039,18 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: ) return response - # Anthropic stream wrappers are not instrumented in V2.1 (known limitation). if is_stream: - logger.debug( - "harness %s (anthropic): stream passthrough model=%s (usage tracking unavailable)", - mode, + return _InstrumentedAnthropicStream( + response, + ctx, model, + start_time, + pre_action, + pre_reason, + pre_model, + pre_applied, + mode, ) - return response elapsed_ms = (time.monotonic() - start_time) * 1000 input_tokens, output_tokens = _extract_anthropic_usage(response) @@ -949,14 +1115,18 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: ) return response - # Anthropic stream wrappers are not instrumented in V2.1 (known limitation). 
if is_stream: - logger.debug( - "harness %s async (anthropic): stream passthrough model=%s (usage tracking unavailable)", - mode, + return _InstrumentedAnthropicAsyncStream( + response, + ctx, model, + start_time, + pre_action, + pre_reason, + pre_model, + pre_applied, + mode, ) - return response elapsed_ms = (time.monotonic() - start_time) * 1000 input_tokens, output_tokens = _extract_anthropic_usage(response) diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 4931f093..551435dd 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -13,6 +13,8 @@ from cascadeflow.harness import init, reset, run from cascadeflow.harness.instrument import ( + _InstrumentedAnthropicAsyncStream, + _InstrumentedAnthropicStream, _InstrumentedAsyncStream, _InstrumentedStream, _count_tool_calls_in_anthropic_response, @@ -978,7 +980,10 @@ def test_non_stream_does_not_inject_stream_options(self) -> None: # =========================================================================== -def _mock_anthropic_usage(input_tokens: int = 100, output_tokens: int = 50) -> MagicMock: +def _mock_anthropic_usage( + input_tokens: Optional[int] = 100, + output_tokens: Optional[int] = 50, +) -> MagicMock: u = MagicMock() u.input_tokens = input_tokens u.output_tokens = output_tokens @@ -1008,6 +1013,43 @@ def _mock_text_block() -> MagicMock: return block +def _mock_anthropic_message_start_event( + input_tokens: int = 100, + output_tokens: int = 0, +) -> MagicMock: + event = MagicMock() + event.type = "message_start" + event.message = MagicMock() + event.message.usage = _mock_anthropic_usage(input_tokens, output_tokens) + return event + + +def _mock_anthropic_message_delta_event( + output_tokens: int = 50, +) -> MagicMock: + event = MagicMock() + event.type = "message_delta" + event.usage = _mock_anthropic_usage(None, output_tokens) + return event + + +def _mock_anthropic_content_block_start_event( + block_type: str = "tool_use", +) -> 
MagicMock: + event = MagicMock() + event.type = "content_block_start" + event.content_block = MagicMock() + event.content_block.type = block_type + return event + + +def _mock_anthropic_message_stop_event() -> MagicMock: + event = MagicMock() + event.type = "message_stop" + event.usage = None + return event + + # --------------------------------------------------------------------------- # Anthropic usage extraction # --------------------------------------------------------------------------- @@ -1182,19 +1224,27 @@ def test_no_run_scope_returns_response(self) -> None: result = wrapper(MagicMock(), model="claude-sonnet-4") assert result is mock_resp - def test_stream_passthrough_no_usage_tracking(self) -> None: - """Anthropic streams are not instrumented in V2.1 — verify passthrough.""" + def test_stream_tracks_usage_and_tool_calls(self) -> None: init(mode="observe") - mock_stream = MagicMock() + mock_stream = iter( + [ + _mock_anthropic_message_start_event(input_tokens=1_000_000), + _mock_anthropic_content_block_start_event("tool_use"), + _mock_anthropic_message_delta_event(output_tokens=1_000_000), + _mock_anthropic_message_stop_event(), + ] + ) original = MagicMock(return_value=mock_stream) wrapper = _make_patched_anthropic_create(original) with run(budget=1.0) as ctx: result = wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicStream) + list(result) - assert result is mock_stream - assert ctx.cost == 0.0 - assert ctx.step_count == 0 + assert ctx.cost == pytest.approx(18.0, abs=0.01) + assert ctx.step_count == 1 + assert ctx.tool_calls == 1 def test_multiple_calls_accumulate(self) -> None: init(mode="observe") @@ -1251,17 +1301,27 @@ async def test_off_mode_passthrough(self) -> None: assert result is mock_resp assert ctx.cost == 0.0 - async def test_stream_passthrough(self) -> None: + async def test_stream_tracks_usage_and_tool_calls(self) -> None: init(mode="observe") - mock_stream = AsyncMock() - 
original = AsyncMock(return_value=mock_stream) + + async def _event_stream(): + yield _mock_anthropic_message_start_event(input_tokens=1_000_000) + yield _mock_anthropic_content_block_start_event("tool_use") + yield _mock_anthropic_message_delta_event(output_tokens=1_000_000) + yield _mock_anthropic_message_stop_event() + + original = AsyncMock(return_value=_event_stream()) wrapper = _make_patched_anthropic_async_create(original) async with run(budget=1.0) as ctx: result = await wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicAsyncStream) + async for _ in result: + pass - assert result is mock_stream - assert ctx.cost == 0.0 + assert ctx.cost == pytest.approx(18.0, abs=0.01) + assert ctx.step_count == 1 + assert ctx.tool_calls == 1 # --------------------------------------------------------------------------- From ac157424d95863cbbaff13c2dc14b6e4b40bceba Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 16:48:01 +0100 Subject: [PATCH 37/49] fix(harness): finalize stream metrics on errors and harden env parsing --- cascadeflow/harness/api.py | 11 +++- cascadeflow/harness/instrument.py | 12 ++++ tests/test_harness_api.py | 26 +++++++++ tests/test_harness_instrument.py | 91 +++++++++++++++++++++++++++++++ 4 files changed, 138 insertions(+), 2 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 79f741b8..610bab28 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -7,6 +7,7 @@ import time from contextvars import ContextVar, Token from dataclasses import dataclass, field +from functools import wraps from importlib.util import find_spec from pathlib import Path from typing import Any, Callable, Literal, Optional, TypeVar, cast @@ -242,12 +243,14 @@ def reset() -> None: _MAX_ACTION_LEN = 64 _MAX_REASON_LEN = 160 _MAX_MODEL_LEN = 128 +_MAX_ENV_JSON_LEN = 4096 def _sanitize_trace_value(value: Any, *, max_length: int) -> Optional[str]: if 
value is None: return None text = str(value).replace("\n", " ").replace("\r", " ").strip() + text = "".join(c for c in text if c.isprintable()) if len(text) > max_length: text = text[: max_length - 3] + "..." return text or None @@ -302,6 +305,10 @@ def _parse_int(raw: str) -> int: def _parse_json_dict(raw: str) -> dict[str, float]: + if len(raw) > _MAX_ENV_JSON_LEN: + raise ValueError( + f"JSON config exceeds {_MAX_ENV_JSON_LEN} characters for harness env var" + ) value = json.loads(raw) if not isinstance(value, dict): raise ValueError("expected JSON object") @@ -606,18 +613,18 @@ def decorator(func: F) -> F: if inspect.iscoroutinefunction(func): + @wraps(func) async def async_wrapper(*args: Any, **kwargs: Any) -> Any: return await func(*args, **kwargs) async_wrapper.__cascadeflow_agent_policy__ = metadata # type: ignore[attr-defined] - async_wrapper.__name__ = getattr(func, "__name__", "wrapped_agent") return cast(F, async_wrapper) + @wraps(func) def sync_wrapper(*args: Any, **kwargs: Any) -> Any: return func(*args, **kwargs) sync_wrapper.__cascadeflow_agent_policy__ = metadata # type: ignore[attr-defined] - sync_wrapper.__name__ = getattr(func, "__name__", "wrapped_agent") return cast(F, sync_wrapper) return decorator diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index 5632884c..4b08b9f6 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -621,6 +621,9 @@ def __next__(self) -> Any: except StopIteration: self._finalize() raise + except Exception: + self._finalize() + raise def __enter__(self) -> _InstrumentedStream: if hasattr(self._stream, "__enter__"): @@ -650,6 +653,9 @@ async def __anext__(self) -> Any: except StopAsyncIteration: self._finalize() raise + except Exception: + self._finalize() + raise async def __aenter__(self) -> _InstrumentedAsyncStream: if hasattr(self._stream, "__aenter__"): @@ -783,6 +789,9 @@ def __next__(self) -> Any: except StopIteration: self._finalize() raise + 
except Exception: + self._finalize() + raise def __enter__(self) -> _InstrumentedAnthropicStream: if hasattr(self._stream, "__enter__"): @@ -812,6 +821,9 @@ async def __anext__(self) -> Any: except StopAsyncIteration: self._finalize() raise + except Exception: + self._finalize() + raise async def __aenter__(self) -> _InstrumentedAnthropicAsyncStream: if hasattr(self._stream, "__aenter__"): diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 9554a486..850255ba 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -155,6 +155,17 @@ def fn(x: int) -> int: assert policy["compliance"] == "gdpr" +def test_agent_decorator_preserves_function_metadata(): + @agent(budget=0.5) + def fn(x: int) -> int: + """sample doc""" + return x + + assert fn.__name__ == "fn" + assert fn.__doc__ == "sample doc" + assert fn.__annotations__ == {"x": int, "return": int} + + @pytest.mark.asyncio async def test_agent_decorator_keeps_async_behavior_and_attaches_metadata(): @agent(budget=0.4, kpi_weights={"cost": 1.0}) @@ -210,6 +221,12 @@ def test_init_reads_from_env(monkeypatch): assert report.config_sources["budget"] == "env" +def test_init_rejects_oversized_env_json(monkeypatch): + monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_TARGETS", "x" * 5000) + with pytest.raises(ValueError, match="JSON config exceeds"): + init() + + def test_init_reads_from_config_file(tmp_path, monkeypatch): config = tmp_path / "cascadeflow.json" config.write_text( @@ -433,6 +450,15 @@ def test_record_sanitizes_trace_values(): assert len(entry["reason"]) <= 160 +def test_record_sanitizes_non_printable_values(): + ctx = run() + ctx.record(action="allow\x00", reason="ok\x1f", model="gpt-4o-mini\x07") + entry = ctx.trace()[0] + assert "\x00" not in entry["action"] + assert "\x1f" not in entry["reason"] + assert "\x07" not in entry["model"] + + def test_record_without_callback_manager_is_noop(): init(mode="observe") with run(budget=1.0) as ctx: diff --git 
a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 551435dd..55e71837 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -434,6 +434,31 @@ def test_stream_finalize_is_idempotent(self) -> None: assert ctx.step_count == 1 # Should not double-count + def test_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + chunk1 = _mock_stream_chunk("data", usage=_mock_usage(100, 50)) + + class _FailingStream: + def __init__(self) -> None: + self._done = False + + def __iter__(self): + return self + + def __next__(self): + if not self._done: + self._done = True + return chunk1 + raise RuntimeError("stream failed") + + with run(budget=1.0) as ctx: + wrapped = _InstrumentedStream(_FailingStream(), ctx, "gpt-4o-mini", time.monotonic()) + with pytest.raises(RuntimeError, match="stream failed"): + list(wrapped) + + assert ctx.step_count == 1 + assert ctx.cost > 0 + def test_stream_wrapper_via_patched_create(self) -> None: """Verify that stream=True in the wrapper returns an _InstrumentedStream.""" init(mode="observe") @@ -496,6 +521,24 @@ async def _async_iter(): assert ctx.step_count == 1 + @pytest.mark.asyncio + async def test_async_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + chunk1 = _mock_stream_chunk("data", usage=_mock_usage(100, 50)) + + async def _failing_iter(): + yield chunk1 + raise RuntimeError("async stream failed") + + async with run(budget=1.0) as ctx: + wrapped = _InstrumentedAsyncStream(_failing_iter(), ctx, "gpt-4o-mini", time.monotonic()) + with pytest.raises(RuntimeError, match="async stream failed"): + async for _ in wrapped: + pass + + assert ctx.step_count == 1 + assert ctx.cost > 0 + # --------------------------------------------------------------------------- # Cost and energy estimation @@ -1246,6 +1289,34 @@ def test_stream_tracks_usage_and_tool_calls(self) -> None: assert ctx.step_count == 1 assert ctx.tool_calls == 1 + def 
test_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + + class _FailingAnthropicStream: + def __init__(self) -> None: + self._done = False + + def __iter__(self): + return self + + def __next__(self): + if not self._done: + self._done = True + return _mock_anthropic_message_start_event(input_tokens=1_000_000) + raise RuntimeError("anthropic stream failed") + + original = MagicMock(return_value=_FailingAnthropicStream()) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicStream) + with pytest.raises(RuntimeError, match="anthropic stream failed"): + list(result) + + assert ctx.step_count == 1 + assert ctx.cost > 0 + def test_multiple_calls_accumulate(self) -> None: init(mode="observe") mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) @@ -1323,6 +1394,26 @@ async def _event_stream(): assert ctx.step_count == 1 assert ctx.tool_calls == 1 + async def test_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + + async def _failing_event_stream(): + yield _mock_anthropic_message_start_event(input_tokens=1_000_000) + raise RuntimeError("anthropic async stream failed") + + original = AsyncMock(return_value=_failing_event_stream()) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicAsyncStream) + with pytest.raises(RuntimeError, match="anthropic async stream failed"): + async for _ in result: + pass + + assert ctx.step_count == 1 + assert ctx.cost > 0 + # --------------------------------------------------------------------------- # Anthropic enforce mode From b894cd3ae0c8d266bcf6a64493ee7b1e416d742c Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 
20:00:32 +0100 Subject: [PATCH 38/49] docs: add harness quickstart and missing integration coverage --- docs/README.md | 5 +- docs/guides/crewai_integration.md | 79 ++++++++++++++++++++++ docs/guides/google_adk_integration.md | 3 +- docs/guides/python_harness_quickstart.md | 85 ++++++++++++++++++++++++ examples/integrations/README.md | 73 ++++++++++++++++++-- examples/integrations/crewai_harness.py | 75 +++++++++++++++++++++ 6 files changed, 314 insertions(+), 6 deletions(-) create mode 100644 docs/guides/crewai_integration.md create mode 100644 docs/guides/python_harness_quickstart.md create mode 100644 examples/integrations/crewai_harness.py diff --git a/docs/README.md b/docs/README.md index b9cedf66..5280a562 100644 --- a/docs/README.md +++ b/docs/README.md @@ -11,6 +11,7 @@ Welcome to cascadeflow documentation! 🌊 ### Core Concepts - [Quickstart](guides/quickstart.md) - Get started with cascadeflow in 5 minutes +- [Python Harness Quickstart](guides/python_harness_quickstart.md) - `init`, `run`, and `@agent` for in-process policy control - [Providers](guides/providers.md) - Configure and use different AI providers (OpenAI, Anthropic, Groq, Ollama, etc.) - [Presets](guides/presets.md) - Use built-in presets for common use cases - [Gateway Server](guides/gateway.md) - Drop-in OpenAI/Anthropic-compatible endpoint for existing apps @@ -39,9 +40,11 @@ Welcome to cascadeflow documentation! 
🌊 - [Agent Intelligence V2/V2.1 Plan](strategy/agent-intelligence-v2-plan.md) - Unified strategic and execution plan for in-process agent intelligence harness delivery ### Integrations +- [OpenAI Agents SDK Integration](guides/openai_agents_integration.md) - Harness-aware model provider for existing OpenAI Agents apps +- [CrewAI Integration](guides/crewai_integration.md) - Hook-based harness metrics + budget gating (opt-in) +- [Google ADK Integration](guides/google_adk_integration.md) - Plugin-based harness integration for ADK runners (opt-in) - [n8n Integration](guides/n8n_integration.md) - Use cascadeflow in n8n workflows - [Paygentic Integration](guides/paygentic_integration.md) - Usage metering and billing lifecycle helpers (opt-in) -- [OpenAI Agents SDK Integration](guides/openai_agents_integration.md) - Harness-aware model provider for existing OpenAI Agents apps ## 📚 Examples diff --git a/docs/guides/crewai_integration.md b/docs/guides/crewai_integration.md new file mode 100644 index 00000000..a39efa79 --- /dev/null +++ b/docs/guides/crewai_integration.md @@ -0,0 +1,79 @@ +# CrewAI Integration + +Use cascadeflow as an explicit, opt-in harness integration for CrewAI via +`llm_hooks`. + +## Design Principles + +- Integration-only: nothing is enabled by default +- Works with existing CrewAI flows +- Harness behavior is controlled by `cascadeflow.init(...)` and `cascadeflow.run(...)` +- Fail-open integration path: harness integration errors should not break crew execution + +## Install + +```bash +pip install "cascadeflow[crewai,openai]" +``` + +`crewai` is optional and only installed when you request this extra. + +## Quickstart + +```python +from crewai import Agent, Crew, Process, Task + +from cascadeflow import init, run +from cascadeflow.integrations.crewai import CrewAIHarnessConfig, enable + +# Global harness defaults. +init(mode="enforce", budget=1.0) + +# Explicitly register CrewAI hooks (integration-only behavior). 
+enable( + config=CrewAIHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) +) + +agent = Agent( + role="Support Agent", + goal="Answer support questions clearly and concisely.", + backstory="You are helpful and direct.", + allow_delegation=False, + llm="openai/gpt-4o-mini", +) + +task = Task( + description="Explain why model cascading helps control agent costs.", + expected_output="A concise explanation with one practical example.", + agent=agent, +) + +with run(budget=0.4) as session: + crew = Crew(agents=[agent], tasks=[task], process=Process.sequential, verbose=False) + result = crew.kickoff() + + print(result) + print(session.summary()) + print(session.trace()) +``` + +## What This Integration Adds + +- Budget gating in enforce mode (`before_llm_call` hook) +- Run metrics in `cascadeflow.run()` scope: + - `cost`, `budget_remaining`, `step_count`, `latency_used_ms`, `energy_used` +- Full decision trace through `run.trace()` + +## Current Scope + +- This integration uses CrewAI hook points, so it tracks and gates calls without + changing your crew/task definitions. +- Tool-level deny/switch actions are not currently applied in this integration path. + +## Notes + +- Existing non-CrewAI users are unaffected. +- If CrewAI is not installed, `enable()` returns `False` and no hooks are registered. diff --git a/docs/guides/google_adk_integration.md b/docs/guides/google_adk_integration.md index d0d32b3f..393a1b57 100644 --- a/docs/guides/google_adk_integration.md +++ b/docs/guides/google_adk_integration.md @@ -11,7 +11,8 @@ trace recording across all agents in an ADK Runner. - **Plugin-based** — Uses ADK's `BasePlugin` system to intercept every LLM call across all agents in a Runner. One plugin covers the entire agent graph. - **Opt-in** — Install `cascadeflow[google-adk]` and create a plugin explicitly. - Never enabled by default. + Never enabled by default. 
Core cascadeflow behavior is unchanged unless you + explicitly wire this integration into `Runner(plugins=[...])`. - **Fail-open** — Integration errors are logged but never break ADK execution (configurable). - **No tool gating** — ADK's `tools_dict` is part of agent definition, not diff --git a/docs/guides/python_harness_quickstart.md b/docs/guides/python_harness_quickstart.md new file mode 100644 index 00000000..4ec85cfd --- /dev/null +++ b/docs/guides/python_harness_quickstart.md @@ -0,0 +1,85 @@ +# Python Harness Quickstart + +This guide covers the in-process harness API: + +- `init(...)` for global defaults and SDK instrumentation +- `run(...)` for per-request scoped budgets/limits and traceability +- `@agent(...)` for attaching policy metadata to agent functions + +## Install + +```bash +pip install "cascadeflow[openai]" +``` + +Optional integrations stay opt-in: + +```bash +pip install "cascadeflow[openai,openai-agents]" +pip install "cascadeflow[crewai]" +pip install "cascadeflow[google-adk]" +``` + +## 1) Initialize Harness + +```python +from cascadeflow import init + +report = init( + mode="observe", # off | observe | enforce + budget=1.0, # default per-run budget cap + max_tool_calls=8, # default per-run tool call cap +) + +print(report.mode) +print(report.instrumented) +print(report.detected_but_not_instrumented) +``` + +`init(...)` is explicit and never auto-enables integrations. + +## 2) Track One Scoped Run + +```python +from openai import OpenAI + +from cascadeflow import run + +client = OpenAI() + +with run(budget=0.25, max_tool_calls=4) as session: + response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "Summarize model cascading in one sentence."}], + ) + + print(response.choices[0].message.content) + print(session.summary()) + print(session.trace()) +``` + +## 3) Attach Agent Metadata + +`@agent(...)` attaches policy metadata to your function without changing how the +function executes. 
+ +```python +from cascadeflow import agent + +@agent( + budget=0.2, + kpi_targets={"quality": 0.9}, + kpi_weights={"cost": 0.5, "latency": 0.5}, + compliance="strict", +) +def support_agent(task: str) -> str: + return f"Handled: {task}" + +print(support_agent.__cascadeflow_agent_policy__) +``` + +## Minimal Checklist + +1. Call `init(...)` once at process startup. +2. Wrap each unit of work in `with run(...):`. +3. Use `run.summary()` and `run.trace()` for auditability and tuning. diff --git a/examples/integrations/README.md b/examples/integrations/README.md index e7e7906a..4bad64f0 100644 --- a/examples/integrations/README.md +++ b/examples/integrations/README.md @@ -6,6 +6,8 @@ This directory contains production-ready integration examples for cascadeflow wi - [LiteLLM Integration](#-litellm-integration) - Access 10+ providers with automatic cost tracking - [OpenAI Agents SDK Integration](#-openai-agents-sdk-integration) - Harness-aware ModelProvider for existing agent apps +- [CrewAI Integration](#-crewai-integration) - Hook-based harness metrics and budget gating +- [Google ADK Integration](#-google-adk-integration) - Plugin-based harness integration for ADK runners - [Paygentic Integration](#-paygentic-integration) - Usage event reporting and billing lifecycle helpers - [Local Providers](#-local-providers-setup) - Ollama and vLLM configuration examples - [OpenTelemetry & Grafana](#-opentelemetry--grafana) - Production observability and metrics @@ -160,6 +162,48 @@ python examples/integrations/openai_agents_harness.py --- +## 👥 CrewAI Integration + +**File:** [`crewai_harness.py`](crewai_harness.py) + +Use cascadeflow as an explicit, opt-in CrewAI hook integration. 
+ +### Quick Start + +```bash +pip install "cascadeflow[crewai,openai]" +python examples/integrations/crewai_harness.py +``` + +### What It Shows + +- Explicit `enable(...)` hook registration (never on by default) +- Enforce-mode budget gating before CrewAI LLM calls +- Run metrics and decision trace via `cascadeflow.run(...)` + +--- + +## 🧠 Google ADK Integration + +**File:** [`google_adk_harness.py`](google_adk_harness.py) + +Use cascadeflow as an explicit, opt-in plugin integration for Google ADK. + +### Quick Start + +```bash +pip install "cascadeflow[google-adk]" +python examples/integrations/google_adk_harness.py +``` + +### What It Shows + +- Explicit plugin creation with `enable(...)` (integration-only behavior) +- Runner-level plugin wiring via `Runner(..., plugins=[plugin])` +- Budget gate + run-scoped metrics and trace + +--- + ## 💳 Paygentic Integration **File:** [`paygentic_usage.py`](paygentic_usage.py) @@ -412,6 +456,9 @@ Cost Calculation Tests |------|---------|-------------------| | `litellm_providers.py` | Comprehensive LiteLLM demo with 8 examples | No (for cost info) | | `litellm_cost_tracking.py` | Cost tracking and provider validation | No (for cost info) | +| `openai_agents_harness.py` | OpenAI Agents SDK harness integration (ModelProvider) | Yes | +| `crewai_harness.py` | CrewAI hook-based harness integration (opt-in) | Yes | +| `google_adk_harness.py` | Google ADK plugin harness integration (opt-in) | Yes | | `paygentic_usage.py` | Usage event reporting to Paygentic (opt-in, fail-open) | Yes | | `local_providers_setup.py` | Ollama and vLLM setup guide | No | | `opentelemetry_grafana.py` | Production observability example | No | @@ -473,6 +520,18 @@ pip install cascadeflow[all] pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-http ``` +### "CrewAI hooks unavailable" +```bash +pip install "cascadeflow[crewai,openai]" +# Requires crewai>=1.5 for llm_hooks +``` + +### "Google ADK not installed" +```bash +pip 
install "cascadeflow[google-adk]" +# Google ADK requires Python 3.10+ +``` + ### "Metrics not appearing in Grafana" 1. Check OpenTelemetry Collector logs: `docker-compose logs otel-collector` 2. Verify metrics: `curl http://localhost:8889/metrics` @@ -490,6 +549,9 @@ Always use provider prefixes for LiteLLM: - **Provider Guide:** [docs/guides/providers.md](../../docs/guides/providers.md) - **Cost Tracking:** [docs/guides/cost_tracking.md](../../docs/guides/cost_tracking.md) +- **OpenAI Agents Guide:** [docs/guides/openai_agents_integration.md](../../docs/guides/openai_agents_integration.md) +- **CrewAI Guide:** [docs/guides/crewai_integration.md](../../docs/guides/crewai_integration.md) +- **Google ADK Guide:** [docs/guides/google_adk_integration.md](../../docs/guides/google_adk_integration.md) - **Paygentic Guide:** [docs/guides/paygentic_integration.md](../../docs/guides/paygentic_integration.md) - **Production Guide:** [docs/guides/production.md](../../docs/guides/production.md) @@ -498,10 +560,13 @@ Always use provider prefixes for LiteLLM: ## 🚀 Next Steps 1. **Try LiteLLM:** `python examples/integrations/litellm_providers.py` -2. **Try Paygentic usage reporting:** `python examples/integrations/paygentic_usage.py` -3. **Setup local providers:** `python examples/integrations/local_providers_setup.py` -4. **Test your API keys:** `python examples/integrations/test_all_providers.py` -5. **Add monitoring:** Follow OpenTelemetry section above +2. **Try OpenAI Agents integration:** `python examples/integrations/openai_agents_harness.py` +3. **Try CrewAI integration:** `python examples/integrations/crewai_harness.py` +4. **Try Google ADK integration:** `python examples/integrations/google_adk_harness.py` +5. **Try Paygentic usage reporting:** `python examples/integrations/paygentic_usage.py` +6. **Setup local providers:** `python examples/integrations/local_providers_setup.py` +7. **Test your API keys:** `python examples/integrations/test_all_providers.py` +8. 
**Add monitoring:** Follow OpenTelemetry section above --- diff --git a/examples/integrations/crewai_harness.py b/examples/integrations/crewai_harness.py new file mode 100644 index 00000000..5e14163c --- /dev/null +++ b/examples/integrations/crewai_harness.py @@ -0,0 +1,75 @@ +""" +CrewAI + cascadeflow harness integration example. + +Run: + pip install "cascadeflow[crewai,openai]" + export OPENAI_API_KEY="your-key" + python examples/integrations/crewai_harness.py +""" + +from __future__ import annotations + + +def main() -> None: + try: + from crewai import Agent, Crew, Process, Task + except ImportError as exc: + raise SystemExit( + "CrewAI is not installed. " + 'Install with: pip install "cascadeflow[crewai,openai]"' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.crewai import CrewAIHarnessConfig, enable + + # 1) Initialize harness globally. + init(mode="observe", budget=1.0, max_tool_calls=6) + + # 2) Explicitly enable CrewAI integration hooks (opt-in). + enabled = enable( + config=CrewAIHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) + ) + if not enabled: + raise SystemExit( + "CrewAI hooks are unavailable in this environment. " + "Ensure crewai>=1.5 is installed." 
+ ) + + agent = Agent( + role="Routing Analyst", + goal="Explain model routing impact on cost and latency in plain language.", + backstory="You are concise and practical.", + allow_delegation=False, + llm="openai/gpt-4o-mini", + verbose=False, + ) + + task = Task( + description="Explain why inside-the-loop routing helps agent workloads.", + expected_output="One short paragraph and three bullet points.", + agent=agent, + ) + + with run(budget=0.5, max_tool_calls=4) as session: + crew = Crew(agents=[agent], tasks=[task], process=Process.sequential, verbose=False) + result = crew.kickoff() + + print("=== Result ===") + print(result) + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print(f"Energy: {session.energy_used:.1f}") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + main() From 6d3e6a8cf00460cf0bf6b152f3d8e35bd1722a66 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 20:16:26 +0100 Subject: [PATCH 39/49] feat(n8n): add multi-dimensional harness integration to Agent node Port the Python harness decision engine to TypeScript and wire it into the n8n Agent node. Tracks 5 dimensions (cost, latency, energy, tool calls, quality) across every LLM call. Observe mode is on by default; enforce mode stops the agent loop when limits are hit. 
- Add nodes/harness/ with pricing (18 models, fuzzy resolution), HarnessRunContext (7-step decision cascade, compliance allowlists, KPI-weighted scoring), and 43 tests - Replace hardcoded estimatesPerMillion in CascadeChatModel with shared harness/pricing.ts (broader model coverage + suffix stripping) - Add harness UI parameters to Agent node (mode, budget, tool cap, latency cap, energy cap, compliance, KPI weights) - Wire pre-call checks and tool-call counting into agent executor loop - Add harness summary to Agent output JSON --- .../CascadeFlowAgent/CascadeFlowAgent.node.ts | 152 +++++++ .../LmChatCascadeFlow.node.ts | 65 +-- .../LmChatCascadeFlow/cascade-metadata.ts | 4 + .../nodes/harness/__tests__/harness.test.ts | 368 +++++++++++++++ .../integrations/n8n/nodes/harness/harness.ts | 426 ++++++++++++++++++ .../integrations/n8n/nodes/harness/index.ts | 22 + .../integrations/n8n/nodes/harness/pricing.ts | 135 ++++++ 7 files changed, 1141 insertions(+), 31 deletions(-) create mode 100644 packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts create mode 100644 packages/integrations/n8n/nodes/harness/harness.ts create mode 100644 packages/integrations/n8n/nodes/harness/index.ts create mode 100644 packages/integrations/n8n/nodes/harness/pricing.ts diff --git a/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts b/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts index b3f52a60..925a9a96 100644 --- a/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts +++ b/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts @@ -21,6 +21,7 @@ import { type DomainType, getEnabledDomains, } from '../LmChatCascadeFlow/config'; +import { HarnessRunContext, type HarnessConfig, type HarnessMode, type KpiWeights } from '../harness'; // Tool cascade validator - optional import let ToolCascadeValidator: any; @@ -65,6 +66,7 @@ export class CascadeFlowAgentExecutor { private routingRules: Map; 
private enableToolCascadeValidation: boolean; private toolCascadeValidator: any; + private harnessCtx: HarnessRunContext | null; constructor( private cascadeModel: CascadeChatModel, @@ -72,7 +74,9 @@ export class CascadeFlowAgentExecutor { routingRules: ToolRoutingRule[], private maxIterations: number, enableToolCascadeValidation: boolean = false, + harnessCtx: HarnessRunContext | null = null, ) { + this.harnessCtx = harnessCtx; this.toolMap = new Map( tools.filter((tool) => tool?.name).map((tool) => [tool.name as string, tool]) ); @@ -295,6 +299,18 @@ export class CascadeFlowAgentExecutor { let iterations = 0; while (iterations < this.maxIterations) { + // Harness enforce-mode pre-checks + if (this.harnessCtx?.config.mode === 'enforce') { + if (this.harnessCtx.isBudgetExhausted()) { + finalMessage = new AIMessage(`[Harness] Budget exhausted ($${this.harnessCtx.cost.toFixed(4)} of $${this.harnessCtx.config.budgetMax?.toFixed(4)} max). Agent stopped.`); + break; + } + if (this.harnessCtx.isToolCapReached()) { + finalMessage = new AIMessage(`[Harness] Tool call cap reached (${this.harnessCtx.toolCalls} of ${this.harnessCtx.config.toolCallsMax} max). 
Agent stopped.`); + break; + } + } + const message = await this.cascadeModel.invoke(currentMessages, options); const toolCalls = this.extractToolCalls(message); trace.push(this.buildTraceEntry(message, toolCalls)); @@ -350,6 +366,12 @@ export class CascadeFlowAgentExecutor { ); } + // Track tool calls in harness (CascadeChatModel records LLM token costs; + // agent executor tracks tool-call counts from the loop itself) + if (this.harnessCtx) { + this.harnessCtx.toolCalls += toolCalls.length; + } + if (routing === 'verifier') { const verifierMessage = await this.cascadeModel.invokeVerifierDirect(currentMessages, options); trace.push(this.buildTraceEntry(verifierMessage)); @@ -377,6 +399,7 @@ export class CascadeFlowAgentExecutor { output: finalMessage.content.toString(), message: finalMessage, trace, + harness: this.harnessCtx?.summary() ?? null, }; } @@ -753,6 +776,99 @@ export class CascadeFlowAgent implements INodeType { default: '', }, ...generateDomainProperties(), + // ----------------------------------------------------------------- + // Harness: Multi-Dimensional Cascading + // ----------------------------------------------------------------- + { + displayName: 'Harness', + name: 'harnessHeading', + type: 'notice', + default: '', + }, + { + displayName: 'Harness Mode', + name: 'harnessMode', + type: 'options', + options: [ + { name: 'Off', value: 'off', description: 'Harness disabled, zero overhead' }, + { name: 'Observe', value: 'observe', description: 'Track all dimensions, record trace, no enforcement' }, + { name: 'Enforce', value: 'enforce', description: 'Stop agent loop when limits are hit' }, + ], + default: 'observe', + description: 'Harness mode: off (disabled), observe (telemetry only), or enforce (stop when limits hit)', + }, + { + displayName: 'Budget (USD)', + name: 'harnessBudget', + type: 'number', + default: 0, + typeOptions: { minValue: 0, numberPrecision: 4 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max budget 
in USD. 0 = unlimited.', + }, + { + displayName: 'Max Tool Calls', + name: 'harnessMaxToolCalls', + type: 'number', + default: 0, + typeOptions: { minValue: 0 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max tool call count. 0 = unlimited.', + }, + { + displayName: 'Max Latency (Ms)', + name: 'harnessMaxLatencyMs', + type: 'number', + default: 0, + typeOptions: { minValue: 0 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max cumulative latency in milliseconds. 0 = unlimited.', + }, + { + displayName: 'Max Energy', + name: 'harnessMaxEnergy', + type: 'number', + default: 0, + typeOptions: { minValue: 0, numberPrecision: 2 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max energy proxy units. 0 = unlimited.', + }, + { + displayName: 'Compliance', + name: 'harnessCompliance', + type: 'options', + options: [ + { name: 'GDPR', value: 'gdpr' }, + { name: 'HIPAA', value: 'hipaa' }, + { name: 'None', value: '' }, + { name: 'PCI', value: 'pci' }, + { name: 'Strict', value: 'strict' }, + ], + default: '', + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Compliance policy to enforce model allowlists', + }, + { + displayName: 'KPI Weights', + name: 'harnessKpiWeights', + type: 'fixedCollection', + typeOptions: { multipleValues: false }, + displayOptions: { hide: { harnessMode: ['off'] } }, + default: { weights: [{ quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }] }, + options: [ + { + name: 'weights', + displayName: 'Weights', + values: [ + { displayName: 'Quality', name: 'quality', type: 'number', default: 0.4, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + { displayName: 'Cost', name: 'cost', type: 'number', default: 0.3, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + { displayName: 'Latency', name: 'latency', type: 'number', default: 0.2, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + { displayName: 'Energy', 
name: 'energy', type: 'number', default: 0.1, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + ], + }, + ], + description: 'KPI dimension weights for optimization scoring (normalized automatically)', + }, ], }; @@ -782,6 +898,35 @@ export class CascadeFlowAgent implements INodeType { const toolRoutingRaw = this.getNodeParameter('toolRoutingRules', 0, { rule: [] }) as any; const toolRoutingRules = (toolRoutingRaw?.rule ?? []) as ToolRoutingRule[]; + // Harness parameters + const harnessMode = this.getNodeParameter('harnessMode', 0, 'observe') as HarnessMode; + let harnessCtx: HarnessRunContext | null = null; + if (harnessMode !== 'off') { + const rawBudget = this.getNodeParameter('harnessBudget', 0, 0) as number; + const rawToolCalls = this.getNodeParameter('harnessMaxToolCalls', 0, 0) as number; + const rawLatency = this.getNodeParameter('harnessMaxLatencyMs', 0, 0) as number; + const rawEnergy = this.getNodeParameter('harnessMaxEnergy', 0, 0) as number; + const compliance = this.getNodeParameter('harnessCompliance', 0, '') as string; + const kpiRaw = this.getNodeParameter('harnessKpiWeights', 0, { weights: [{ quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }] }) as any; + const kpiEntry = kpiRaw?.weights?.[0] ?? { quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }; + + const config: HarnessConfig = { + mode: harnessMode, + budgetMax: rawBudget > 0 ? rawBudget : null, + toolCallsMax: rawToolCalls > 0 ? rawToolCalls : null, + latencyMaxMs: rawLatency > 0 ? rawLatency : null, + energyMax: rawEnergy > 0 ? rawEnergy : null, + compliance: compliance || null, + kpiWeights: { + quality: kpiEntry.quality ?? 0.4, + cost: kpiEntry.cost ?? 0.3, + latency: kpiEntry.latency ?? 0.2, + energy: kpiEntry.energy ?? 
0.1, + }, + }; + harnessCtx = new HarnessRunContext(config); + } + // Domain routing parameters const enableDomainRouting = this.getNodeParameter('enableDomainRouting', 0, false) as boolean; @@ -887,12 +1032,18 @@ export class CascadeFlowAgent implements INodeType { domainVerifierGetters, ); + // Wire harness context into cascade model for per-call recording + if (harnessCtx) { + cascadeModel.setHarnessContext(harnessCtx); + } + const agentExecutor = new CascadeFlowAgentExecutor( cascadeModel, tools, toolRoutingRules, maxIterations, enableToolCascadeValidation, + harnessCtx, ); // --- Process each input item --- @@ -933,6 +1084,7 @@ export class CascadeFlowAgent implements INodeType { output: result.output, ...cascadeflowMeta, trace: result.trace, + harness: result.harness ?? null, }, pairedItem: { item: itemIndex }, }); diff --git a/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts b/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts index 8c39ae41..ad2d603e 100644 --- a/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts +++ b/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts @@ -23,6 +23,8 @@ import { getEnabledDomains, } from './config'; import { buildCascadeMetadata } from './cascade-metadata'; +import { estimateCost as harnessEstimateCost } from '../harness/pricing'; +import type { HarnessRunContext } from '../harness/harness'; // Quality validation, cost tracking, and routing - optional import let QualityValidator: any; @@ -110,6 +112,29 @@ export class CascadeChatModel extends BaseChatModel { private domainVerifiers: Map = new Map(); private domainVerifierGetters: Map Promise> = new Map(); + // Harness context (set by agent node) + private harnessCtx: HarnessRunContext | null = null; + + setHarnessContext(ctx: HarnessRunContext | null): void { + this.harnessCtx = ctx; + } + + private recordHarnessCall(message: BaseMessage, model: BaseChatModel, elapsedMs: 
number): void { + if (!this.harnessCtx) return; + const responseMetadata = (message as any).response_metadata || {}; + const tokenUsage = responseMetadata.tokenUsage || responseMetadata.usage || {}; + const inputTokens = tokenUsage.promptTokens || tokenUsage.prompt_tokens || 0; + const outputTokens = tokenUsage.completionTokens || tokenUsage.completion_tokens || 0; + const modelName = (model as any).modelName || (model as any).model || 'unknown'; + this.harnessCtx.recordCall({ + model: modelName, + inputTokens, + outputTokens, + toolCallCount: 0, + elapsedMs, + }); + } + constructor( drafterModelGetter: () => Promise, verifierModelGetter: () => Promise, @@ -257,6 +282,7 @@ export class CascadeChatModel extends BaseChatModel { const latency = Date.now() - start; const verifierCost = await this.calculateMessageCost(verifierMessage, verifierModel); + this.recordHarnessCall(verifierMessage, verifierModel, latency); const costBreakdown = { drafter: 0, verifier: verifierCost, @@ -584,37 +610,8 @@ export class CascadeChatModel extends BaseChatModel { } } - // Fallback to rough estimates based on model name - const estimatesPerMillion: Record = { - 'gpt-4o-mini': { input: 0.15, output: 0.6 }, - 'gpt-4o': { input: 2.5, output: 10.0 }, - 'gpt-5-mini': { input: 0.20, output: 0.80 }, - 'gpt-4-turbo': { input: 10.0, output: 30.0 }, - 'gpt-4': { input: 30.0, output: 60.0 }, - 'gpt-3.5-turbo': { input: 0.5, output: 1.5 }, - 'claude-3-5-haiku': { input: 1.0, output: 5.0 }, - 'claude-haiku-4-5': { input: 1.0, output: 5.0 }, - 'claude-3-5-sonnet': { input: 3.0, output: 15.0 }, - 'claude-sonnet-4-5': { input: 3.0, output: 15.0 }, - 'claude-sonnet-4': { input: 3.0, output: 15.0 }, - 'claude-opus-4-5': { input: 5.0, output: 25.0 }, - 'claude-3-haiku': { input: 0.25, output: 1.25 }, - default: { input: 1.0, output: 2.0 }, - }; - - let estimate = estimatesPerMillion.default; - for (const [key, value] of Object.entries(estimatesPerMillion)) { - if (modelName.includes(key)) { - estimate = 
value; - break; - } - } - - const cost = - (inputTokens / 1_000_000) * estimate.input + - (outputTokens / 1_000_000) * estimate.output; - - return cost; + // Use shared harness pricing (fuzzy model resolution, 18 models) + return harnessEstimateCost(modelName, inputTokens, outputTokens); } /** @@ -711,6 +708,7 @@ export class CascadeChatModel extends BaseChatModel { this.verifierCount++; const verifierCost = await this.calculateMessageCost(verifierMessage, verifierModel); + this.recordHarnessCall(verifierMessage, verifierModel, verifierLatency); const costBreakdown = { drafter: 0, verifier: verifierCost, @@ -772,6 +770,7 @@ export class CascadeChatModel extends BaseChatModel { const drafterStartTime = Date.now(); const drafterMessage = await modelToUse.invoke(messages, options); const drafterLatency = Date.now() - drafterStartTime; + this.recordHarnessCall(drafterMessage, modelToUse, drafterLatency); if (domainModel && detectedDomain) { this.domainCounts.set(detectedDomain, (this.domainCounts.get(detectedDomain) || 0) + 1); @@ -798,6 +797,7 @@ export class CascadeChatModel extends BaseChatModel { const verifierStartTime = Date.now(); const verifierMessage = await verifierModel.invoke(messages, options); const verifierLatency = Date.now() - verifierStartTime; + this.recordHarnessCall(verifierMessage, verifierModel, verifierLatency); this.verifierCount++; @@ -1060,6 +1060,7 @@ export class CascadeChatModel extends BaseChatModel { const verifierInfo = this.getModelInfo(verifierModel); const verifierMessage = await verifierModel.invoke(messages, options); const verifierLatency = Date.now() - verifierStartTime; + this.recordHarnessCall(verifierMessage, verifierModel, verifierLatency); this.verifierCount++; @@ -1136,7 +1137,9 @@ export class CascadeChatModel extends BaseChatModel { const verifierModel = await this.getVerifierModel(); const verifierInfo = this.getModelInfo(verifierModel); + const fallbackStart = Date.now(); const verifierMessage = await 
verifierModel.invoke(messages, options); + this.recordHarnessCall(verifierMessage, verifierModel, Date.now() - fallbackStart); this.verifierCount++; const verifierCost = await this.calculateMessageCost(verifierMessage, verifierModel); diff --git a/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts b/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts index d539d5b7..e93f7b23 100644 --- a/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts +++ b/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts @@ -1,4 +1,5 @@ import type { DomainType } from './config'; +import type { HarnessSummary } from '../harness'; export interface CostBreakdown { drafter: number; @@ -12,12 +13,15 @@ export interface SavingsBreakdown { percent: number; } +export interface HarnessSummaryOutput extends HarnessSummary {} + export interface CascadeFlowMetadata { model_used: string; domain: DomainType | null; confidence?: number; costs: CostBreakdown; savings: SavingsBreakdown; + harness?: HarnessSummaryOutput | null; } export const calculateSavings = ( diff --git a/packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts b/packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts new file mode 100644 index 00000000..5c003e42 --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts @@ -0,0 +1,368 @@ +import { describe, expect, it } from 'vitest'; + +import { + PRICING_USD_PER_M, + DEFAULT_PRICING_USD_PER_M, + ENERGY_COEFFICIENTS, + DEFAULT_ENERGY_COEFFICIENT, + ENERGY_OUTPUT_WEIGHT, + resolvePricingKey, + estimateCost, + estimateEnergy, + modelTotalPrice, +} from '../pricing'; + +import { + HarnessRunContext, + COMPLIANCE_MODEL_ALLOWLISTS, + QUALITY_PRIORS, + LATENCY_PRIORS, + normalizeWeights, + type HarnessConfig, +} from '../harness'; + +// --------------------------------------------------------------------------- +// Pricing data fidelity +// 
--------------------------------------------------------------------------- + +describe('pricing data', () => { + it('has 18 models in PRICING_USD_PER_M', () => { + expect(Object.keys(PRICING_USD_PER_M)).toHaveLength(18); + }); + + it('matches Python values for gpt-4o', () => { + expect(PRICING_USD_PER_M['gpt-4o']).toEqual([2.50, 10.00]); + }); + + it('matches Python values for gpt-4o-mini', () => { + expect(PRICING_USD_PER_M['gpt-4o-mini']).toEqual([0.15, 0.60]); + }); + + it('matches Python values for claude-sonnet-4', () => { + expect(PRICING_USD_PER_M['claude-sonnet-4']).toEqual([3.00, 15.00]); + }); + + it('matches Python values for gemini-2.5-flash', () => { + expect(PRICING_USD_PER_M['gemini-2.5-flash']).toEqual([0.15, 0.60]); + }); + + it('has correct default pricing', () => { + expect(DEFAULT_PRICING_USD_PER_M).toEqual([2.50, 10.00]); + }); + + it('has 18 models in ENERGY_COEFFICIENTS', () => { + expect(Object.keys(ENERGY_COEFFICIENTS)).toHaveLength(18); + }); + + it('has correct energy defaults', () => { + expect(DEFAULT_ENERGY_COEFFICIENT).toBe(1.0); + expect(ENERGY_OUTPUT_WEIGHT).toBe(1.5); + }); +}); + +// --------------------------------------------------------------------------- +// estimateCost / estimateEnergy +// --------------------------------------------------------------------------- + +describe('estimateCost', () => { + it('calculates gpt-4o cost correctly (1000 in, 500 out = $0.0075)', () => { + const cost = estimateCost('gpt-4o', 1000, 500); + expect(cost).toBeCloseTo(0.0075, 6); + }); + + it('calculates gpt-4o-mini cost correctly', () => { + const cost = estimateCost('gpt-4o-mini', 1_000_000, 1_000_000); + expect(cost).toBeCloseTo(0.15 + 0.60, 6); + }); + + it('uses default pricing for unknown models', () => { + const cost = estimateCost('unknown-model', 1_000_000, 1_000_000); + expect(cost).toBeCloseTo(2.50 + 10.00, 6); + }); +}); + +describe('estimateEnergy', () => { + it('calculates gpt-4o energy correctly (100 in, 50 out)', () => { + 
// coeff=1.0, energy = 1.0 * (100 + 50 * 1.5) = 175.0 + const energy = estimateEnergy('gpt-4o', 100, 50); + expect(energy).toBeCloseTo(175.0, 4); + }); + + it('uses default coefficient for unknown models', () => { + // coeff=1.0, energy = 1.0 * (100 + 50 * 1.5) = 175.0 + const energy = estimateEnergy('unknown-model', 100, 50); + expect(energy).toBeCloseTo(175.0, 4); + }); + + it('uses correct coefficient for gpt-4o-mini', () => { + // coeff=0.3, energy = 0.3 * (100 + 50 * 1.5) = 52.5 + const energy = estimateEnergy('gpt-4o-mini', 100, 50); + expect(energy).toBeCloseTo(52.5, 4); + }); +}); + +describe('modelTotalPrice', () => { + it('returns input + output for gpt-4o', () => { + expect(modelTotalPrice('gpt-4o')).toBeCloseTo(12.50, 6); + }); + + it('returns default for unknown model', () => { + expect(modelTotalPrice('unknown')).toBeCloseTo(12.50, 6); + }); +}); + +// --------------------------------------------------------------------------- +// Fuzzy model resolution +// --------------------------------------------------------------------------- + +describe('resolvePricingKey', () => { + it('exact match', () => { + expect(resolvePricingKey('gpt-4o')).toBe('gpt-4o'); + }); + + it('strips version suffix (-20250120)', () => { + expect(resolvePricingKey('gpt-4o-20250120')).toBe('gpt-4o'); + }); + + it('strips -preview suffix', () => { + expect(resolvePricingKey('gpt-4o-preview')).toBe('gpt-4o'); + }); + + it('strips -latest suffix', () => { + expect(resolvePricingKey('gpt-4o-latest')).toBe('gpt-4o'); + }); + + it('longest-prefix match (gemini-2.5-flash-8b → gemini-2.5-flash)', () => { + expect(resolvePricingKey('gemini-2.5-flash-8b')).toBe('gemini-2.5-flash'); + }); + + it('returns null for completely unknown model', () => { + expect(resolvePricingKey('totally-unknown-model')).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// HarnessRunContext — evaluatePreCall +// 
---------------------------------------------------------------------------
+
+function makeConfig(overrides: Partial<HarnessConfig> = {}): HarnessConfig {
+  return {
+    mode: 'enforce',
+    budgetMax: null,
+    toolCallsMax: null,
+    latencyMaxMs: null,
+    energyMax: null,
+    compliance: null,
+    kpiWeights: {},
+    ...overrides,
+  };
+}
+
+describe('evaluatePreCall', () => {
+  it('returns allow when no limits set', () => {
+    const ctx = new HarnessRunContext(makeConfig());
+    const decision = ctx.evaluatePreCall('gpt-4o', false);
+    expect(decision.action).toBe('allow');
+  });
+
+  it('returns stop when budget exhausted', () => {
+    const ctx = new HarnessRunContext(makeConfig({ budgetMax: 0.01 }));
+    ctx.cost = 0.01; // exhaust budget
+    const decision = ctx.evaluatePreCall('gpt-4o', false);
+    expect(decision.action).toBe('stop');
+    expect(decision.reason).toBe('budget_exceeded');
+  });
+
+  it('returns deny_tool when tool cap reached', () => {
+    const ctx = new HarnessRunContext(makeConfig({ toolCallsMax: 3 }));
+    ctx.toolCalls = 3;
+    const decision = ctx.evaluatePreCall('gpt-4o', true);
+    expect(decision.action).toBe('deny_tool');
+    expect(decision.reason).toBe('max_tool_calls_reached');
+  });
+
+  it('returns stop for compliance violation (non-compliant model)', () => {
+    const ctx = new HarnessRunContext(makeConfig({ compliance: 'gdpr' }));
+    const decision = ctx.evaluatePreCall('claude-sonnet-4', false);
+    expect(decision.action).toBe('stop');
+    expect(decision.reason).toBe('compliance_no_approved_model');
+  });
+
+  it('allows compliant model under GDPR', () => {
+    const ctx = new HarnessRunContext(makeConfig({ compliance: 'gdpr' }));
+    const decision = ctx.evaluatePreCall('gpt-4o', false);
+    expect(decision.action).toBe('allow');
+  });
+
+  it('returns stop when latency cap exceeded', () => {
+    const ctx = new HarnessRunContext(makeConfig({ latencyMaxMs: 1000 }));
+    ctx.latencyUsedMs = 1000;
+    const decision = ctx.evaluatePreCall('gpt-3.5-turbo', false);
+    // gpt-3.5-turbo is already the 
fastest → can't switch → stop + expect(decision.action).toBe('stop'); + expect(decision.reason).toBe('latency_limit_exceeded'); + }); + + it('returns stop when energy cap exceeded', () => { + const ctx = new HarnessRunContext(makeConfig({ energyMax: 100 })); + ctx.energyUsed = 100; + const decision = ctx.evaluatePreCall('gpt-3.5-turbo', false); + // gpt-3.5-turbo is already lowest energy → can't switch → stop + expect(decision.action).toBe('stop'); + expect(decision.reason).toBe('energy_limit_exceeded'); + }); + + it('returns switch_model observation for budget pressure', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 1.0 })); + ctx.cost = 0.85; // 85% spent, < 20% remaining + ctx.budgetRemaining = 0.15; + const decision = ctx.evaluatePreCall('gpt-4o', false); + // Budget pressure suggests cheaper model + expect(decision.action).toBe('switch_model'); + expect(decision.reason).toBe('budget_pressure'); + }); + + it('returns switch_model observation for KPI optimization', () => { + const ctx = new HarnessRunContext(makeConfig({ + kpiWeights: { quality: 0, cost: 1, latency: 0, energy: 0 }, + })); + // gpt-4 is very expensive, KPI weights purely on cost → should suggest cheaper + const decision = ctx.evaluatePreCall('gpt-4', false); + expect(decision.action).toBe('switch_model'); + expect(decision.reason).toBe('kpi_weight_optimization'); + }); +}); + +// --------------------------------------------------------------------------- +// Budget tracking across multiple recordCall invocations +// --------------------------------------------------------------------------- + +describe('recordCall and budget tracking', () => { + it('accumulates cost across calls', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 0.10 })); + ctx.recordCall({ model: 'gpt-4o-mini', inputTokens: 100, outputTokens: 50, toolCallCount: 0, elapsedMs: 50 }); + expect(ctx.cost).toBeGreaterThan(0); + expect(ctx.stepCount).toBe(1); + 
expect(ctx.budgetRemaining).toBeLessThan(0.10); + + ctx.recordCall({ model: 'gpt-4o-mini', inputTokens: 200, outputTokens: 100, toolCallCount: 1, elapsedMs: 60 }); + expect(ctx.stepCount).toBe(2); + expect(ctx.toolCalls).toBe(1); + expect(ctx.latencyUsedMs).toBe(110); + }); + + it('detects budget exhaustion', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 0.0001 })); + ctx.recordCall({ model: 'gpt-4o', inputTokens: 10000, outputTokens: 5000, toolCallCount: 0, elapsedMs: 100 }); + expect(ctx.isBudgetExhausted()).toBe(true); + }); + + it('detects tool cap reached', () => { + const ctx = new HarnessRunContext(makeConfig({ toolCallsMax: 2 })); + ctx.toolCalls = 2; + expect(ctx.isToolCapReached()).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Observe vs enforce mode behavior +// --------------------------------------------------------------------------- + +describe('observe vs enforce mode', () => { + it('observe mode evaluatePreCall still returns decisions', () => { + const ctx = new HarnessRunContext(makeConfig({ mode: 'observe', budgetMax: 0.01 })); + ctx.cost = 0.01; + const decision = ctx.evaluatePreCall('gpt-4o', false); + // Decision is evaluated regardless of mode + expect(decision.action).toBe('stop'); + }); + + it('off mode has no context created (by design)', () => { + // In the actual agent node, harnessCtx is null when mode=off + // This test validates that a context with mode=off still works + const ctx = new HarnessRunContext(makeConfig({ mode: 'off' })); + const decision = ctx.evaluatePreCall('gpt-4o', false); + expect(decision.action).toBe('allow'); + expect(decision.reason).toBe('off'); + }); +}); + +// --------------------------------------------------------------------------- +// Compliance allowlists +// --------------------------------------------------------------------------- + +describe('compliance allowlists', () => { + it('GDPR allows gpt-4o, gpt-4o-mini, 
gpt-3.5-turbo', () => { + const allowlist = COMPLIANCE_MODEL_ALLOWLISTS['gdpr']; + expect(allowlist.has('gpt-4o')).toBe(true); + expect(allowlist.has('gpt-4o-mini')).toBe(true); + expect(allowlist.has('gpt-3.5-turbo')).toBe(true); + expect(allowlist.has('claude-sonnet-4')).toBe(false); + }); + + it('strict allows only gpt-4o', () => { + const allowlist = COMPLIANCE_MODEL_ALLOWLISTS['strict']; + expect(allowlist.size).toBe(1); + expect(allowlist.has('gpt-4o')).toBe(true); + }); + + it('strict mode denies tools even for compliant model', () => { + const ctx = new HarnessRunContext(makeConfig({ compliance: 'strict' })); + const decision = ctx.evaluatePreCall('gpt-4o', true); + expect(decision.action).toBe('deny_tool'); + expect(decision.reason).toBe('compliance_tool_restriction'); + }); +}); + +// --------------------------------------------------------------------------- +// KPI weight normalization +// --------------------------------------------------------------------------- + +describe('normalizeWeights', () => { + it('normalizes to sum=1', () => { + const result = normalizeWeights({ quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }); + const sum = Object.values(result).reduce((a, b) => a + b, 0); + expect(sum).toBeCloseTo(1.0, 6); + }); + + it('filters out zero and negative values', () => { + const result = normalizeWeights({ quality: 1, cost: 0, latency: -1, energy: 1 }); + expect(result.cost).toBeUndefined(); + expect(result.latency).toBeUndefined(); + expect(result.quality).toBeCloseTo(0.5, 6); + expect(result.energy).toBeCloseTo(0.5, 6); + }); + + it('returns empty for all-zero weights', () => { + const result = normalizeWeights({ quality: 0, cost: 0, latency: 0, energy: 0 }); + expect(Object.keys(result)).toHaveLength(0); + }); +}); + +// --------------------------------------------------------------------------- +// summary() structure +// --------------------------------------------------------------------------- + +describe('summary()', () => { + 
it('returns correct structure', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 1.0 })); + ctx.recordCall({ model: 'gpt-4o-mini', inputTokens: 100, outputTokens: 50, toolCallCount: 0, elapsedMs: 42 }); + + const s = ctx.summary(); + expect(s.runId).toBeTruthy(); + expect(s.mode).toBe('enforce'); + expect(s.stepCount).toBe(1); + expect(s.toolCalls).toBe(0); + expect(s.cost).toBeGreaterThan(0); + expect(s.latencyUsedMs).toBe(42); + expect(s.energyUsed).toBeGreaterThan(0); + expect(s.budgetMax).toBe(1.0); + expect(s.budgetRemaining).toBeLessThan(1.0); + expect(s.lastAction).toBe('allow'); + expect(s.durationMs).toBeGreaterThanOrEqual(0); + expect(Array.isArray(s.trace)).toBe(true); + expect(s.trace).toHaveLength(1); + expect(s.trace[0].action).toBe('allow'); + expect(s.trace[0].budgetState.max).toBe(1.0); + }); +}); diff --git a/packages/integrations/n8n/nodes/harness/harness.ts b/packages/integrations/n8n/nodes/harness/harness.ts new file mode 100644 index 00000000..93c5150d --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/harness.ts @@ -0,0 +1,426 @@ +/** + * HarnessRunContext — multi-dimensional decision engine for n8n (TypeScript port). + * + * Ported from cascadeflow/harness/api.py (HarnessRunContext) and + * cascadeflow/harness/instrument.py (pre-call decision logic, compliance, + * quality/latency priors, KPI scoring). + * + * Key n8n constraint: models are graph connections (sub-nodes), not string + * parameters. The harness cannot switch models at runtime. Only `stop` and + * `deny_tool` actions have enforcement effects. `switch_model` decisions are + * recorded in the trace as observations. 
+ */
+
+import {
+  ENERGY_COEFFICIENTS,
+  DEFAULT_ENERGY_COEFFICIENT,
+  estimateCost,
+  estimateEnergy,
+  modelTotalPrice,
+  PRICING_USD_PER_M,
+} from './pricing';
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+export type HarnessMode = 'off' | 'observe' | 'enforce';
+
+export interface KpiWeights {
+  quality?: number;
+  cost?: number;
+  latency?: number;
+  energy?: number;
+}
+
+export interface HarnessConfig {
+  mode: HarnessMode;
+  budgetMax: number | null;
+  toolCallsMax: number | null;
+  latencyMaxMs: number | null;
+  energyMax: number | null;
+  compliance: string | null;
+  kpiWeights: KpiWeights;
+}
+
+export interface PreCallDecision {
+  action: 'allow' | 'stop' | 'switch_model' | 'deny_tool';
+  reason: string;
+  targetModel: string;
+}
+
+export interface HarnessTraceEntry {
+  action: string;
+  reason: string;
+  model: string | null;
+  step: number;
+  timestampMs: number;
+  costTotal: number;
+  budgetState: { max: number | null; remaining: number | null };
+  applied: boolean;
+  decisionMode: string;
+}
+
+export interface HarnessSummary {
+  runId: string;
+  mode: HarnessMode;
+  stepCount: number;
+  toolCalls: number;
+  cost: number;
+  latencyUsedMs: number;
+  energyUsed: number;
+  budgetMax: number | null;
+  budgetRemaining: number | null;
+  lastAction: string;
+  durationMs: number;
+  trace: HarnessTraceEntry[];
+}
+
+export interface RecordCallParams {
+  model: string;
+  inputTokens: number;
+  outputTokens: number;
+  toolCallCount: number;
+  elapsedMs: number;
+  decision?: PreCallDecision;
+}
+
+// ---------------------------------------------------------------------------
+// Compliance allowlists (from instrument.py lines 107-112)
+// ---------------------------------------------------------------------------
+
+const COMPLIANCE_MODEL_ALLOWLISTS: Record<string, Set<string>> = {
+  gdpr: new Set(['gpt-4o', 'gpt-4o-mini', 'gpt-3.5-turbo']),
+  hipaa: 
new Set(['gpt-4o', 'gpt-4o-mini']),
+  pci: new Set(['gpt-4o-mini', 'gpt-3.5-turbo']),
+  strict: new Set(['gpt-4o']),
+};
+
+// ---------------------------------------------------------------------------
+// Quality & latency priors for KPI scoring (from instrument.py lines 74-95)
+// ---------------------------------------------------------------------------
+
+const QUALITY_PRIORS: Record<string, number> = {
+  'gpt-4o': 0.90,
+  'gpt-4o-mini': 0.75,
+  'gpt-5-mini': 0.86,
+  'gpt-4-turbo': 0.88,
+  'gpt-4': 0.87,
+  'gpt-3.5-turbo': 0.65,
+  'o1': 0.95,
+  'o1-mini': 0.82,
+  'o3-mini': 0.80,
+};
+
+const LATENCY_PRIORS: Record<string, number> = {
+  'gpt-4o': 0.72,
+  'gpt-4o-mini': 0.93,
+  'gpt-5-mini': 0.84,
+  'gpt-4-turbo': 0.66,
+  'gpt-4': 0.52,
+  'gpt-3.5-turbo': 1.00,
+  'o1': 0.40,
+  'o1-mini': 0.60,
+  'o3-mini': 0.78,
+};
+
+// Pre-computed model cost/energy bounds for utility functions.
+const MODEL_POOL = Object.keys(PRICING_USD_PER_M);
+const MODEL_TOTAL_COSTS = new Map<string, number>(MODEL_POOL.map(m => [m, modelTotalPrice(m)]));
+const MIN_TOTAL_COST = Math.min(...MODEL_TOTAL_COSTS.values());
+const MAX_TOTAL_COST = Math.max(...MODEL_TOTAL_COSTS.values());
+
+const MODEL_ENERGY_COEFFS = new Map<string, number>(
+  MODEL_POOL.map(m => [m, ENERGY_COEFFICIENTS[m] ?? 
DEFAULT_ENERGY_COEFFICIENT]),
+);
+const MIN_ENERGY_COEFF = Math.min(...MODEL_ENERGY_COEFFS.values());
+const MAX_ENERGY_COEFF = Math.max(...MODEL_ENERGY_COEFFS.values());
+
+// ---------------------------------------------------------------------------
+// KPI scoring helpers (from instrument.py lines 234-267)
+// ---------------------------------------------------------------------------
+
+function normalizeWeights(weights: KpiWeights): Record<string, number> {
+  const raw: Record<string, number> = {};
+  for (const [key, val] of Object.entries(weights)) {
+    if (['cost', 'quality', 'latency', 'energy'].includes(key) && typeof val === 'number' && val > 0) {
+      raw[key] = val;
+    }
+  }
+  const total = Object.values(raw).reduce((a, b) => a + b, 0);
+  if (total <= 0) return {};
+  const normalized: Record<string, number> = {};
+  for (const [key, val] of Object.entries(raw)) {
+    normalized[key] = val / total;
+  }
+  return normalized;
+}
+
+function costUtility(model: string): number {
+  const modelCost = MODEL_TOTAL_COSTS.get(model) ?? modelTotalPrice(model);
+  if (MAX_TOTAL_COST === MIN_TOTAL_COST) return 1.0;
+  return (MAX_TOTAL_COST - modelCost) / (MAX_TOTAL_COST - MIN_TOTAL_COST);
+}
+
+function energyUtility(model: string): number {
+  const coeff = ENERGY_COEFFICIENTS[model] ?? DEFAULT_ENERGY_COEFFICIENT;
+  if (MAX_ENERGY_COEFF === MIN_ENERGY_COEFF) return 1.0;
+  return (MAX_ENERGY_COEFF - coeff) / (MAX_ENERGY_COEFF - MIN_ENERGY_COEFF);
+}
+
+function kpiScoreWithNormalized(model: string, normalized: Record<string, number>): number {
+  if (Object.keys(normalized).length === 0) return 0.0;
+  const quality = QUALITY_PRIORS[model] ?? 0.7;
+  const latency = LATENCY_PRIORS[model] ?? 0.7;
+  const cost = costUtility(model);
+  const energy = energyUtility(model);
+  return (
+    (normalized.quality ?? 0) * quality +
+    (normalized.latency ?? 0) * latency +
+    (normalized.cost ?? 0) * cost +
+    (normalized.energy ?? 
0) * energy + ); +} + +function selectKpiWeightedModel(currentModel: string, weights: KpiWeights): string { + const normalized = normalizeWeights(weights); + if (Object.keys(normalized).length === 0) return currentModel; + let bestModel = currentModel; + let bestScore = kpiScoreWithNormalized(currentModel, normalized); + for (const candidate of MODEL_POOL) { + const score = kpiScoreWithNormalized(candidate, normalized); + if (score > bestScore) { + bestModel = candidate; + bestScore = score; + } + } + return bestModel; +} + +// Cheapest/fastest/lowest-energy helpers +function selectCheaperModel(currentModel: string): string { + let cheapest = currentModel; + let cheapestCost = MODEL_TOTAL_COSTS.get(currentModel) ?? modelTotalPrice(currentModel); + for (const [m, c] of MODEL_TOTAL_COSTS) { + if (c < cheapestCost) { + cheapest = m; + cheapestCost = c; + } + } + return cheapest; +} + +function selectFasterModel(currentModel: string): string { + const currentLatency = LATENCY_PRIORS[currentModel] ?? 0.7; + let best = currentModel; + let bestLatency = currentLatency; + for (const [m, lat] of Object.entries(LATENCY_PRIORS)) { + if (lat > bestLatency) { + best = m; + bestLatency = lat; + } + } + return best; +} + +function selectLowerEnergyModel(currentModel: string): string { + const currentCoeff = ENERGY_COEFFICIENTS[currentModel] ?? 
DEFAULT_ENERGY_COEFFICIENT; + let best = currentModel; + let bestCoeff = currentCoeff; + for (const [m, c] of MODEL_ENERGY_COEFFS) { + if (c < bestCoeff) { + best = m; + bestCoeff = c; + } + } + return best; +} + +// --------------------------------------------------------------------------- +// HarnessRunContext +// --------------------------------------------------------------------------- + +let runIdCounter = 0; + +function generateRunId(): string { + runIdCounter += 1; + const ts = Date.now().toString(36); + const counter = runIdCounter.toString(36); + return `${ts}${counter}`.slice(-8); +} + +export class HarnessRunContext { + readonly runId: string; + readonly config: HarnessConfig; + + stepCount = 0; + toolCalls = 0; + cost = 0; + latencyUsedMs = 0; + energyUsed = 0; + budgetRemaining: number | null; + lastAction = 'allow'; + + private startedAt: number; + private trace: HarnessTraceEntry[] = []; + + constructor(config: HarnessConfig) { + this.runId = generateRunId(); + this.config = config; + this.budgetRemaining = config.budgetMax; + this.startedAt = Date.now(); + } + + // ----------------------------------------------------------------------- + // Pre-call decision cascade (ported from instrument.py _evaluate_pre_call_decision) + // ----------------------------------------------------------------------- + + evaluatePreCall(model: string, hasTools: boolean): PreCallDecision { + const cfg = this.config; + + // 1. Budget exhausted + if (cfg.budgetMax !== null && this.cost >= cfg.budgetMax) { + return { action: 'stop', reason: 'budget_exceeded', targetModel: model }; + } + + // 2. Tool call cap + if (hasTools && cfg.toolCallsMax !== null && this.toolCalls >= cfg.toolCallsMax) { + return { action: 'deny_tool', reason: 'max_tool_calls_reached', targetModel: model }; + } + + // 3. 
Compliance + if (cfg.compliance) { + const allowlist = COMPLIANCE_MODEL_ALLOWLISTS[cfg.compliance.trim().toLowerCase()]; + if (allowlist) { + if (!allowlist.has(model)) { + // Can't switch models in n8n — stop if no compliant model possible + return { action: 'stop', reason: 'compliance_no_approved_model', targetModel: model }; + } + if (cfg.compliance.trim().toLowerCase() === 'strict' && hasTools) { + return { action: 'deny_tool', reason: 'compliance_tool_restriction', targetModel: model }; + } + } + } + + // 4. Latency cap + if (cfg.latencyMaxMs !== null && this.latencyUsedMs >= cfg.latencyMaxMs) { + const faster = selectFasterModel(model); + if (faster !== model) { + return { action: 'switch_model', reason: 'latency_limit_exceeded', targetModel: faster }; + } + return { action: 'stop', reason: 'latency_limit_exceeded', targetModel: model }; + } + + // 5. Energy cap + if (cfg.energyMax !== null && this.energyUsed >= cfg.energyMax) { + const lower = selectLowerEnergyModel(model); + if (lower !== model) { + return { action: 'switch_model', reason: 'energy_limit_exceeded', targetModel: lower }; + } + return { action: 'stop', reason: 'energy_limit_exceeded', targetModel: model }; + } + + // 6. Budget pressure (<20% remaining) — observation only in n8n + if ( + cfg.budgetMax !== null && + cfg.budgetMax > 0 && + this.budgetRemaining !== null && + this.budgetRemaining / cfg.budgetMax < 0.2 + ) { + const cheaper = selectCheaperModel(model); + if (cheaper !== model) { + return { action: 'switch_model', reason: 'budget_pressure', targetModel: cheaper }; + } + } + + // 7. KPI-weighted — observation only in n8n + const kw = cfg.kpiWeights; + if (kw && Object.values(kw).some(v => typeof v === 'number' && v > 0)) { + const weighted = selectKpiWeightedModel(model, kw); + if (weighted !== model) { + return { action: 'switch_model', reason: 'kpi_weight_optimization', targetModel: weighted }; + } + } + + // 8. 
Default: allow + return { action: 'allow', reason: cfg.mode, targetModel: model }; + } + + // ----------------------------------------------------------------------- + // Record a completed call + // ----------------------------------------------------------------------- + + recordCall(params: RecordCallParams): void { + const { model, inputTokens, outputTokens, toolCallCount, elapsedMs, decision } = params; + + const callCost = estimateCost(model, inputTokens, outputTokens); + const energy = estimateEnergy(model, inputTokens, outputTokens); + + this.cost += callCost; + this.stepCount += 1; + this.latencyUsedMs += elapsedMs; + this.energyUsed += energy; + this.toolCalls += toolCallCount; + + if (this.config.budgetMax !== null) { + this.budgetRemaining = this.config.budgetMax - this.cost; + } + + const action = decision?.action ?? 'allow'; + const reason = decision?.reason ?? this.config.mode; + const applied = action === 'allow' || (this.config.mode === 'enforce' && (action === 'stop' || action === 'deny_tool')); + + this.lastAction = action; + + this.trace.push({ + action, + reason, + model, + step: this.stepCount, + timestampMs: Date.now(), + costTotal: this.cost, + budgetState: { + max: this.config.budgetMax, + remaining: this.budgetRemaining, + }, + applied, + decisionMode: this.config.mode, + }); + } + + // ----------------------------------------------------------------------- + // Quick checks for agent loop + // ----------------------------------------------------------------------- + + isBudgetExhausted(): boolean { + return this.config.budgetMax !== null && this.cost >= this.config.budgetMax; + } + + isToolCapReached(): boolean { + return this.config.toolCallsMax !== null && this.toolCalls >= this.config.toolCallsMax; + } + + // ----------------------------------------------------------------------- + // Summary + // ----------------------------------------------------------------------- + + summary(): HarnessSummary { + return { + runId: this.runId, + 
mode: this.config.mode, + stepCount: this.stepCount, + toolCalls: this.toolCalls, + cost: this.cost, + latencyUsedMs: this.latencyUsedMs, + energyUsed: this.energyUsed, + budgetMax: this.config.budgetMax, + budgetRemaining: this.budgetRemaining, + lastAction: this.lastAction, + durationMs: Date.now() - this.startedAt, + trace: [...this.trace], + }; + } +} + +// Re-export for external test access +export { COMPLIANCE_MODEL_ALLOWLISTS, QUALITY_PRIORS, LATENCY_PRIORS, normalizeWeights }; diff --git a/packages/integrations/n8n/nodes/harness/index.ts b/packages/integrations/n8n/nodes/harness/index.ts new file mode 100644 index 00000000..663f93b3 --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/index.ts @@ -0,0 +1,22 @@ +export { + PRICING_USD_PER_M, + DEFAULT_PRICING_USD_PER_M, + ENERGY_COEFFICIENTS, + DEFAULT_ENERGY_COEFFICIENT, + ENERGY_OUTPUT_WEIGHT, + resolvePricingKey, + estimateCost, + estimateEnergy, + modelTotalPrice, +} from './pricing'; + +export { + type HarnessMode, + type KpiWeights, + type HarnessConfig, + type PreCallDecision, + type HarnessTraceEntry, + type HarnessSummary, + type RecordCallParams, + HarnessRunContext, +} from './harness'; diff --git a/packages/integrations/n8n/nodes/harness/pricing.ts b/packages/integrations/n8n/nodes/harness/pricing.ts new file mode 100644 index 00000000..fd13f43a --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/pricing.ts @@ -0,0 +1,135 @@ +/** + * Shared harness pricing and energy profiles (TypeScript port). + * + * Ported from cascadeflow/harness/pricing.py — single source of truth for + * cost/energy estimation in the n8n integration. + */ + +// USD per 1M tokens [input, output]. 
+export const PRICING_USD_PER_M: Record<string, [number, number]> = {
+  // OpenAI
+  'gpt-4o': [2.50, 10.00],
+  'gpt-4o-mini': [0.15, 0.60],
+  'gpt-5': [1.25, 10.00],
+  'gpt-5-mini': [0.20, 0.80],
+  'gpt-4-turbo': [10.00, 30.00],
+  'gpt-4': [30.00, 60.00],
+  'gpt-3.5-turbo': [0.50, 1.50],
+  'o1': [15.00, 60.00],
+  'o1-mini': [3.00, 12.00],
+  'o3-mini': [1.10, 4.40],
+  // Anthropic
+  'claude-sonnet-4': [3.00, 15.00],
+  'claude-haiku-3.5': [1.00, 5.00],
+  'claude-opus-4.5': [5.00, 25.00],
+  // Google Gemini
+  'gemini-2.5-flash': [0.15, 0.60],
+  'gemini-2.5-pro': [1.25, 10.00],
+  'gemini-2.0-flash': [0.10, 0.40],
+  'gemini-1.5-flash': [0.075, 0.30],
+  'gemini-1.5-pro': [1.25, 5.00],
+};
+
+export const DEFAULT_PRICING_USD_PER_M: [number, number] = [2.50, 10.00];
+
+// Deterministic proxy coefficients for energy tracking.
+export const ENERGY_COEFFICIENTS: Record<string, number> = {
+  // OpenAI
+  'gpt-4o': 1.0,
+  'gpt-4o-mini': 0.3,
+  'gpt-5': 1.2,
+  'gpt-5-mini': 0.35,
+  'gpt-4-turbo': 1.5,
+  'gpt-4': 1.5,
+  'gpt-3.5-turbo': 0.2,
+  'o1': 2.0,
+  'o1-mini': 0.8,
+  'o3-mini': 0.5,
+  // Anthropic
+  'claude-sonnet-4': 1.0,
+  'claude-haiku-3.5': 0.3,
+  'claude-opus-4.5': 1.8,
+  // Google Gemini
+  'gemini-2.5-flash': 0.3,
+  'gemini-2.5-pro': 1.2,
+  'gemini-2.0-flash': 0.25,
+  'gemini-1.5-flash': 0.2,
+  'gemini-1.5-pro': 1.0,
+};
+
+export const DEFAULT_ENERGY_COEFFICIENT = 1.0;
+export const ENERGY_OUTPUT_WEIGHT = 1.5;
+
+// ---------------------------------------------------------------------------
+// Fuzzy model-name resolution
+// ---------------------------------------------------------------------------
+
+// Strips version/preview/date suffixes.
+// Matches: -preview, -preview-05-20, -20250120, -latest, -exp-0827, -it
+const VERSION_SUFFIX_RE = /(-preview(?:-\d{2,4}-\d{2})?|-\d{8,}|-latest|-exp(?:-\d+)?|-it)$/;
+
+// Cache for resolved model → pricing key lookups. 
+const pricingKeyCache = new Map(); + +export function resolvePricingKey(model: string): string | null { + const cached = pricingKeyCache.get(model); + if (cached !== undefined) return cached; + + // Exact match + if (model in PRICING_USD_PER_M) { + pricingKeyCache.set(model, model); + return model; + } + + // Strip version suffixes and retry + const stripped = model.replace(VERSION_SUFFIX_RE, ''); + if (stripped !== model && stripped in PRICING_USD_PER_M) { + pricingKeyCache.set(model, stripped); + return stripped; + } + + // Longest-prefix match (e.g. "gemini-2.5-flash-8b" → "gemini-2.5-flash") + let best: string | null = null; + let bestLen = 0; + for (const known of Object.keys(PRICING_USD_PER_M)) { + if (model.startsWith(known) && known.length > bestLen) { + best = known; + bestLen = known.length; + } + } + if (best !== null) { + pricingKeyCache.set(model, best); + return best; + } + + pricingKeyCache.set(model, null); + return null; +} + +// --------------------------------------------------------------------------- +// Public estimation helpers +// --------------------------------------------------------------------------- + +export function estimateCost(model: string, inputTokens: number, outputTokens: number): number { + const key = resolvePricingKey(model); + const [inPrice, outPrice] = key !== null + ? (PRICING_USD_PER_M[key] ?? DEFAULT_PRICING_USD_PER_M) + : DEFAULT_PRICING_USD_PER_M; + return (inputTokens / 1_000_000) * inPrice + (outputTokens / 1_000_000) * outPrice; +} + +export function estimateEnergy(model: string, inputTokens: number, outputTokens: number): number { + const key = resolvePricingKey(model); + const coeff = key !== null + ? (ENERGY_COEFFICIENTS[key] ?? DEFAULT_ENERGY_COEFFICIENT) + : DEFAULT_ENERGY_COEFFICIENT; + return coeff * (inputTokens + outputTokens * ENERGY_OUTPUT_WEIGHT); +} + +export function modelTotalPrice(model: string): number { + const key = resolvePricingKey(model); + const [inPrice, outPrice] = key !== null + ? 
(PRICING_USD_PER_M[key] ?? DEFAULT_PRICING_USD_PER_M) + : DEFAULT_PRICING_USD_PER_M; + return inPrice + outPrice; +} From 510bdd1c4f3a7a8ec7d376811f20906fe9ed4bce Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 20:26:57 +0100 Subject: [PATCH 40/49] fix(google-adk): initialize plugin name and stabilize callback correlation --- cascadeflow/integrations/google_adk.py | 65 ++++++++++++++++++++++---- tests/test_google_adk_integration.py | 16 +++++++ 2 files changed, 73 insertions(+), 8 deletions(-) diff --git a/cascadeflow/integrations/google_adk.py b/cascadeflow/integrations/google_adk.py index 1c6a853d..9bd5d56f 100644 --- a/cascadeflow/integrations/google_adk.py +++ b/cascadeflow/integrations/google_adk.py @@ -114,6 +114,13 @@ class CascadeFlowADKPlugin(_ADKBasePlugin): # type: ignore[misc] """ def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: + # google-adk BasePlugin requires a stable plugin name. + try: + super().__init__(name="cascadeflow_harness") + except TypeError: + # Fallback for local test environments where BasePlugin is ``object``. + super().__init__() + self.name = "cascadeflow_harness" self._config = config or GoogleADKHarnessConfig() self._active = True self._call_seq: int = 0 @@ -122,6 +129,9 @@ def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: # two concurrent calls share (invocation_id, agent_name). self._call_start_times: dict[int, float] = {} self._call_models: dict[int, str] = {} + # Fallback mapping for runtimes that provide distinct callback_context + # objects between before/after callbacks. 
+ self._call_fallback_keys: dict[tuple[str, str], list[int]] = {} @staticmethod def _callback_key(callback_context: Any) -> int: @@ -133,6 +143,36 @@ def _callback_key(callback_context: Any) -> int: """ return id(callback_context) + @staticmethod + def _fallback_key(callback_context: Any) -> tuple[str, str]: + """Return a stable fallback key for correlation across callbacks.""" + invocation_id = str(getattr(callback_context, "invocation_id", "") or "") + agent_name = str(getattr(callback_context, "agent_name", "") or "") + return (invocation_id, agent_name) + + def _track_call_key(self, callback_context: Any, key: int) -> None: + """Register key in fallback queue for cross-object callback matching.""" + fallback_key = self._fallback_key(callback_context) + if not fallback_key[0] and not fallback_key[1]: + return + self._call_fallback_keys.setdefault(fallback_key, []).append(key) + + def _resolve_call_key(self, callback_context: Any) -> int | None: + """Resolve stored key for callback context across runtime variants.""" + key = self._callback_key(callback_context) + if key in self._call_models or key in self._call_start_times: + return key + + fallback_key = self._fallback_key(callback_context) + keys = self._call_fallback_keys.get(fallback_key) + if not keys: + return None + + resolved = keys.pop(0) + if not keys: + self._call_fallback_keys.pop(fallback_key, None) + return resolved + async def before_model_callback( self, callback_context: Any, @@ -178,6 +218,7 @@ async def before_model_callback( # Record start time and model for after_model_callback self._call_start_times[key] = time.monotonic() self._call_models[key] = model + self._track_call_key(callback_context, key) return None except Exception: @@ -204,10 +245,10 @@ async def after_model_callback( if ctx.mode == "off": return None - key = self._callback_key(callback_context) + key = self._resolve_call_key(callback_context) # Recover model name stored during before_model_callback - model = 
self._call_models.pop(key, "unknown") + model = self._call_models.pop(key, "unknown") if key is not None else "unknown" # Extract token counts from usage_metadata input_tokens, output_tokens = self._extract_tokens(llm_response) @@ -221,7 +262,7 @@ async def after_model_callback( energy = estimate_energy(model, input_tokens, output_tokens) # Latency - start_time = self._call_start_times.pop(key, None) + start_time = self._call_start_times.pop(key, None) if key is not None else None elapsed_ms = (time.monotonic() - start_time) * 1000 if start_time else 0.0 # Update run context @@ -257,19 +298,26 @@ async def after_model_callback( async def on_model_error_callback( self, callback_context: Any, - error: Exception, + llm_request: Any = None, + error: Exception | None = None, ) -> Any: """Record error in trace and clean up timing state.""" if not self._active: return None try: - key = self._callback_key(callback_context) - model = self._call_models.pop(key, "unknown") - self._call_start_times.pop(key, None) + # Backward-compatible calling form used in existing tests: + # on_model_error_callback(callback_context, error) + if error is None and isinstance(llm_request, Exception): + error = llm_request + + key = self._resolve_call_key(callback_context) + model = self._call_models.pop(key, "unknown") if key is not None else "unknown" + if key is not None: + self._call_start_times.pop(key, None) ctx = get_current_run() - if ctx is not None: + if ctx is not None and error is not None: error_type = type(error).__name__ ctx.record( action="error", @@ -292,6 +340,7 @@ def deactivate(self) -> None: self._call_seq = 0 self._call_start_times.clear() self._call_models.clear() + self._call_fallback_keys.clear() @staticmethod def _extract_tokens(llm_response: Any) -> tuple[int, int]: diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py index e68edcaf..ce17d583 100644 --- a/tests/test_google_adk_integration.py +++ b/tests/test_google_adk_integration.py 
@@ -397,6 +397,21 @@ async def test_no_start_time_records_zero_latency(self, plugin): await plugin.after_model_callback(cb_ctx, response) assert run_ctx.latency_used_ms == 0.0 + async def test_fallback_key_tracks_across_distinct_context_objects(self, plugin): + """ADK runtimes may pass different callback_context objects per phase.""" + init(mode="observe") + with run() as run_ctx: + before_ctx = FakeCallbackContext(invocation_id="inv-x", agent_name="agent-a") + after_ctx = FakeCallbackContext(invocation_id="inv-x", agent_name="agent-a") + await plugin.before_model_callback(before_ctx, FakeLlmRequest("gemini-2.5-flash")) + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + ) + await plugin.after_model_callback(after_ctx, response) + assert run_ctx.model_used == "gemini-2.5-flash" + assert run_ctx.latency_used_ms >= 0.0 + async def test_fail_open_swallows_errors(self, plugin): init(mode="observe") with run(): @@ -473,6 +488,7 @@ class TestEnableDisable: def test_enable_returns_plugin_instance(self): plugin = adk_mod.enable() assert isinstance(plugin, adk_mod.CascadeFlowADKPlugin) + assert plugin.name == "cascadeflow_harness" assert adk_mod.is_enabled() def test_enable_is_idempotent(self): From bace69d1c604bea1811515187ac95d299453c539 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 20:35:28 +0100 Subject: [PATCH 41/49] chore(dx): clarify integration prerequisites and add optional integration CI --- .github/workflows/test.yml | 39 ++++++++++++++++++++++++ docs/guides/crewai_integration.md | 8 +++++ docs/guides/google_adk_integration.md | 10 ++++++ docs/guides/openai_agents_integration.md | 9 ++++++ docs/guides/python_harness_quickstart.md | 10 ++++++ examples/integrations/README.md | 9 ++++++ pyproject.toml | 2 +- 7 files changed, 86 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4b2411d5..6ef2cadc 100644 --- a/.github/workflows/test.yml +++ 
b/.github/workflows/test.yml @@ -47,6 +47,45 @@ jobs: fail_ci_if_error: false token: ${{ secrets.CODECOV_TOKEN }} + # Python opt-in integration install + focused tests + test-python-optional-integrations: + name: Python Optional Integrations (${{ matrix.integration }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - integration: openai-agents + extras: ".[dev,openai,openai-agents]" + tests: "tests/test_openai_agents_integration.py" + - integration: crewai + extras: ".[dev,crewai,openai]" + tests: "tests/test_crewai_integration.py" + - integration: google-adk + extras: ".[dev,google-adk]" + tests: "tests/test_google_adk_integration.py" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install integration dependencies + run: | + python -m pip install --upgrade pip + pip install -e "${{ matrix.extras }}" + + - name: Run focused integration tests + run: | + pytest ${{ matrix.tests }} -v + env: + PYTHONPATH: ${{ github.workspace }} + # TypeScript Core Tests test-typescript-core: name: TypeScript Core Tests diff --git a/docs/guides/crewai_integration.md b/docs/guides/crewai_integration.md index a39efa79..8c1cec8a 100644 --- a/docs/guides/crewai_integration.md +++ b/docs/guides/crewai_integration.md @@ -17,6 +17,13 @@ pip install "cascadeflow[crewai,openai]" ``` `crewai` is optional and only installed when you request this extra. +Requires Python 3.10+. + +Optional (more precise provider/model cost tracking in harness telemetry): + +```bash +pip install litellm +``` ## Quickstart @@ -77,3 +84,4 @@ with run(budget=0.4) as session: - Existing non-CrewAI users are unaffected. - If CrewAI is not installed, `enable()` returns `False` and no hooks are registered. +- Without `litellm`, cost tracking still works using cascadeflow's built-in pricing estimates. 
diff --git a/docs/guides/google_adk_integration.md b/docs/guides/google_adk_integration.md index 393a1b57..76529bfc 100644 --- a/docs/guides/google_adk_integration.md +++ b/docs/guides/google_adk_integration.md @@ -29,6 +29,12 @@ pip install "cascadeflow[google-adk]" Requires Python 3.10+ (ADK requirement). +Optional (more precise provider/model cost tracking in harness telemetry): + +```bash +pip install litellm +``` + --- ## Quick Start @@ -90,6 +96,10 @@ Every LLM call is tracked with: - **Latency** — Wall-clock time per call - **Tool calls** — Count of `function_call` parts in responses +By default this uses cascadeflow's built-in pricing table. If you install +`litellm`, provider/model normalization can be more precise for some aliased +model identifiers. + ### Trace Recording All decisions are recorded in the `HarnessRunContext` trace: diff --git a/docs/guides/openai_agents_integration.md b/docs/guides/openai_agents_integration.md index 2db6b8b7..db8b1e34 100644 --- a/docs/guides/openai_agents_integration.md +++ b/docs/guides/openai_agents_integration.md @@ -15,6 +15,14 @@ Use cascadeflow as an explicit, opt-in `ModelProvider` integration for the OpenA pip install "cascadeflow[openai,openai-agents]" ``` +Recommended: Python 3.10+. + +Optional (more precise provider/model cost tracking in harness telemetry): + +```bash +pip install litellm +``` + ## Quickstart ```python @@ -71,3 +79,4 @@ if __name__ == "__main__": - This is a Python integration for OpenAI Agents SDK. - The SDK remains optional and is only installed via the `openai-agents` extra. - Existing non-Agents users are unaffected. +- Without `litellm`, cost tracking still works using cascadeflow's built-in pricing estimates. 
diff --git a/docs/guides/python_harness_quickstart.md b/docs/guides/python_harness_quickstart.md index 4ec85cfd..c757e48d 100644 --- a/docs/guides/python_harness_quickstart.md +++ b/docs/guides/python_harness_quickstart.md @@ -20,6 +20,16 @@ pip install "cascadeflow[crewai]" pip install "cascadeflow[google-adk]" ``` +Version notes: +- `crewai` and `google-adk` integrations require Python 3.10+. +- `openai-agents` is recommended on Python 3.10+. + +Optional for richer cost normalization across aliased provider model names: + +```bash +pip install litellm +``` + ## 1) Initialize Harness ```python diff --git a/examples/integrations/README.md b/examples/integrations/README.md index 4bad64f0..556efe7a 100644 --- a/examples/integrations/README.md +++ b/examples/integrations/README.md @@ -154,6 +154,9 @@ pip install "cascadeflow[openai,openai-agents]" python examples/integrations/openai_agents_harness.py ``` +Recommended: Python 3.10+. +Optional: `pip install litellm` for more precise provider/model cost normalization. + ### What It Shows - Harness-aware model switching with candidate models @@ -175,6 +178,9 @@ pip install "cascadeflow[crewai,openai]" python examples/integrations/crewai_harness.py ``` +Requires Python 3.10+. +Optional: `pip install litellm` for more precise provider/model cost normalization. + ### What It Shows - Explicit `enable(...)` hook registration (never on by default) @@ -196,6 +202,9 @@ pip install "cascadeflow[google-adk]" python examples/integrations/google_adk_harness.py ``` +Requires Python 3.10+. +Optional: `pip install litellm` for more precise provider/model cost normalization. 
+ ### What It Shows - Explicit plugin creation with `enable(...)` (integration-only behavior) diff --git a/pyproject.toml b/pyproject.toml index 8f11ae44..b746a6e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,7 @@ semantic = [ openclaw = ["fastembed>=0.7.0"] # CrewAI harness integration (opt-in) -crewai = ["crewai>=1.5.0"] +crewai = ["crewai>=1.5.0; python_version >= '3.10'"] # OpenAI Agents SDK integration (opt-in) openai-agents = [ From 27b940223688b95f678038ca88035c7dce77f134 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 20:48:41 +0100 Subject: [PATCH 42/49] style: apply Black formatting to 7 Python files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix CI Python Code Quality check — these files drifted from Black formatting after recent merges into the integration branch. --- cascadeflow/harness/api.py | 4 +- cascadeflow/harness/pricing.py | 80 +++++++++++++++------ cascadeflow/integrations/google_adk.py | 12 +--- examples/integrations/crewai_harness.py | 6 +- examples/integrations/google_adk_harness.py | 3 +- tests/test_google_adk_integration.py | 24 ++----- tests/test_harness_instrument.py | 4 +- 7 files changed, 73 insertions(+), 60 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 610bab28..036c80eb 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -306,9 +306,7 @@ def _parse_int(raw: str) -> int: def _parse_json_dict(raw: str) -> dict[str, float]: if len(raw) > _MAX_ENV_JSON_LEN: - raise ValueError( - f"JSON config exceeds {_MAX_ENV_JSON_LEN} characters for harness env var" - ) + raise ValueError(f"JSON config exceeds {_MAX_ENV_JSON_LEN} characters for harness env var") value = json.loads(raw) if not isinstance(value, dict): raise ValueError("expected JSON object") diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py index 7f6cd44b..81a1de06 100644 --- a/cascadeflow/harness/pricing.py +++ 
b/cascadeflow/harness/pricing.py @@ -1,8 +1,8 @@ -"""Shared pricing and energy estimation for harness integrations. +"""Shared harness pricing and energy profiles. -Provides approximate USD-per-1M-token pricing and deterministic energy -coefficients used by CrewAI, OpenAI Agents, Google ADK, and future -integration modules. +This module centralizes model-cost and energy-estimation defaults used by +harness integrations (OpenAI auto-instrumentation, OpenAI Agents SDK, CrewAI, +Google ADK). A future pricing registry will consolidate with ``cascadeflow.pricing`` and LiteLLM live data. Until then this module is the canonical source @@ -12,12 +12,10 @@ from __future__ import annotations import re as _re +from typing import Final -# --------------------------------------------------------------------------- -# Pricing (USD per 1M tokens: input, output) -# --------------------------------------------------------------------------- - -PRICING_USD_PER_M: dict[str, tuple[float, float]] = { +# USD per 1M tokens (input, output). +PRICING_USD_PER_M: Final[dict[str, tuple[float, float]]] = { # OpenAI "gpt-4o": (2.50, 10.00), "gpt-4o-mini": (0.15, 0.60), @@ -40,13 +38,10 @@ "gemini-1.5-flash": (0.075, 0.30), "gemini-1.5-pro": (1.25, 5.00), } -DEFAULT_PRICING_USD_PER_M: tuple[float, float] = (2.50, 10.00) +DEFAULT_PRICING_USD_PER_M: Final[tuple[float, float]] = (2.50, 10.00) -# --------------------------------------------------------------------------- -# Energy coefficients (deterministic proxy for compute intensity) -# --------------------------------------------------------------------------- - -ENERGY_COEFFICIENTS: dict[str, float] = { +# Deterministic proxy coefficients for energy tracking. 
+ENERGY_COEFFICIENTS: Final[dict[str, float]] = { # OpenAI "gpt-4o": 1.0, "gpt-4o-mini": 0.3, @@ -69,10 +64,29 @@ "gemini-1.5-flash": 0.2, "gemini-1.5-pro": 1.0, } -DEFAULT_ENERGY_COEFFICIENT: float = 1.0 -ENERGY_OUTPUT_WEIGHT: float = 1.5 +DEFAULT_ENERGY_COEFFICIENT: Final[float] = 1.0 +ENERGY_OUTPUT_WEIGHT: Final[float] = 1.5 + +# Explicit pools keep provider/model-switching logic constrained even though the +# pricing table is shared across integrations. +OPENAI_MODEL_POOL: Final[tuple[str, ...]] = ( + "gpt-4o", + "gpt-4o-mini", + "gpt-5", + "gpt-5-mini", + "gpt-4-turbo", + "gpt-4", + "gpt-3.5-turbo", + "o1", + "o1-mini", + "o3-mini", +) +# --------------------------------------------------------------------------- +# Fuzzy model-name resolution +# --------------------------------------------------------------------------- + # Pre-compiled pattern for stripping version/preview/date suffixes. # Matches: -preview, -preview-05-20, -20250120, -latest, -exp-0827, etc. _VERSION_SUFFIX_RE = _re.compile( @@ -119,15 +133,35 @@ def _resolve_pricing_key(model: str) -> str | None: return None +# --------------------------------------------------------------------------- +# Public estimation helpers +# --------------------------------------------------------------------------- + + def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: - """Estimate cost in USD from model name and token counts.""" + """Estimate USD cost from token usage.""" key = _resolve_pricing_key(model) - in_price, out_price = PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M - return (input_tokens / 1_000_000) * in_price + (output_tokens / 1_000_000) * out_price + in_price, out_price = ( + PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M + ) + return (input_tokens / 1_000_000.0) * in_price + (output_tokens / 1_000_000.0) * out_price def estimate_energy(model: str, input_tokens: int, output_tokens: 
int) -> float: - """Estimate energy proxy from model name and token counts.""" + """Estimate deterministic proxy energy units.""" + key = _resolve_pricing_key(model) + coeff = ( + ENERGY_COEFFICIENTS.get(key, DEFAULT_ENERGY_COEFFICIENT) + if key + else DEFAULT_ENERGY_COEFFICIENT + ) + return coeff * (input_tokens + (output_tokens * ENERGY_OUTPUT_WEIGHT)) + + +def model_total_price(model: str) -> float: + """Return total (input + output) price per 1M tokens.""" key = _resolve_pricing_key(model) - coeff = ENERGY_COEFFICIENTS.get(key, DEFAULT_ENERGY_COEFFICIENT) if key else DEFAULT_ENERGY_COEFFICIENT - return coeff * (input_tokens + output_tokens * ENERGY_OUTPUT_WEIGHT) + in_price, out_price = ( + PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M + ) + return in_price + out_price diff --git a/cascadeflow/integrations/google_adk.py b/cascadeflow/integrations/google_adk.py index 9bd5d56f..325d21b2 100644 --- a/cascadeflow/integrations/google_adk.py +++ b/cascadeflow/integrations/google_adk.py @@ -223,9 +223,7 @@ async def before_model_callback( return None except Exception: if self._config.fail_open: - logger.debug( - "google-adk before_model_callback error (fail_open)", exc_info=True - ) + logger.debug("google-adk before_model_callback error (fail_open)", exc_info=True) return None raise @@ -289,9 +287,7 @@ async def after_model_callback( return None except Exception: if self._config.fail_open: - logger.debug( - "google-adk after_model_callback error (fail_open)", exc_info=True - ) + logger.debug("google-adk after_model_callback error (fail_open)", exc_info=True) return None raise @@ -328,9 +324,7 @@ async def on_model_error_callback( return None except Exception: if self._config.fail_open: - logger.debug( - "google-adk on_model_error_callback error (fail_open)", exc_info=True - ) + logger.debug("google-adk on_model_error_callback error (fail_open)", exc_info=True) return None raise diff --git 
a/examples/integrations/crewai_harness.py b/examples/integrations/crewai_harness.py index 5e14163c..a9df72c6 100644 --- a/examples/integrations/crewai_harness.py +++ b/examples/integrations/crewai_harness.py @@ -15,8 +15,7 @@ def main() -> None: from crewai import Agent, Crew, Process, Task except ImportError as exc: raise SystemExit( - "CrewAI is not installed. " - 'Install with: pip install "cascadeflow[crewai,openai]"' + "CrewAI is not installed. " 'Install with: pip install "cascadeflow[crewai,openai]"' ) from exc from cascadeflow import init, run @@ -34,8 +33,7 @@ def main() -> None: ) if not enabled: raise SystemExit( - "CrewAI hooks are unavailable in this environment. " - "Ensure crewai>=1.5 is installed." + "CrewAI hooks are unavailable in this environment. " "Ensure crewai>=1.5 is installed." ) agent = Agent( diff --git a/examples/integrations/google_adk_harness.py b/examples/integrations/google_adk_harness.py index 0315dc90..1ae9c5af 100644 --- a/examples/integrations/google_adk_harness.py +++ b/examples/integrations/google_adk_harness.py @@ -19,8 +19,7 @@ async def main() -> None: from google.adk.sessions import InMemorySessionService except ImportError as exc: raise SystemExit( - "Google ADK is not installed. " - 'Install with: pip install "cascadeflow[google-adk]"' + "Google ADK is not installed. 
" 'Install with: pip install "cascadeflow[google-adk]"' ) from exc from cascadeflow import init, run diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py index ce17d583..688e39c4 100644 --- a/tests/test_google_adk_integration.py +++ b/tests/test_google_adk_integration.py @@ -199,9 +199,7 @@ async def test_observe_mode_allows_over_budget(self, plugin): init(mode="observe", budget=0.001) with run(budget=0.001) as run_ctx: run_ctx.cost = 0.002 - result = await plugin.before_model_callback( - FakeCallbackContext(), FakeLlmRequest() - ) + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) assert result is None # observe never blocks async def test_enforce_blocks_when_budget_exhausted(self, plugin): @@ -230,9 +228,7 @@ async def test_enforce_allows_under_budget(self, plugin): init(mode="enforce", budget=1.0) with run(budget=1.0) as run_ctx: run_ctx.cost = 0.5 - result = await plugin.before_model_callback( - FakeCallbackContext(), FakeLlmRequest() - ) + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) assert result is None async def test_records_start_time_and_model(self, plugin): @@ -259,9 +255,7 @@ async def test_budget_gate_disabled_in_config(self): init(mode="enforce", budget=0.001) with run(budget=0.001) as run_ctx: run_ctx.cost = 0.002 - result = await plugin.before_model_callback( - FakeCallbackContext(), FakeLlmRequest() - ) + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) assert result is None # gate disabled async def test_fail_open_swallows_errors(self, plugin): @@ -271,9 +265,7 @@ async def test_fail_open_swallows_errors(self, plugin): "cascadeflow.integrations.google_adk.get_current_run", side_effect=RuntimeError("boom"), ): - result = await plugin.before_model_callback( - FakeCallbackContext(), FakeLlmRequest() - ) + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) assert result is 
None @@ -566,9 +558,7 @@ async def test_deactivated_plugin_skips_callbacks(self): init(mode="enforce", budget=0.001) with run(budget=0.001) as run_ctx: run_ctx.cost = 0.002 - result = await plugin.before_model_callback( - FakeCallbackContext(), FakeLlmRequest() - ) + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) assert result is None # no-op, not blocked async def test_deactivate_clears_state(self): @@ -676,9 +666,7 @@ async def test_off_mode_before_callback_returns_none(self): init(mode="off") plugin = adk_mod.CascadeFlowADKPlugin() with run() as run_ctx: - result = await plugin.before_model_callback( - FakeCallbackContext(), FakeLlmRequest() - ) + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) assert result is None assert len(plugin._call_start_times) == 0 diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 55e71837..a46cf8a6 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -531,7 +531,9 @@ async def _failing_iter(): raise RuntimeError("async stream failed") async with run(budget=1.0) as ctx: - wrapped = _InstrumentedAsyncStream(_failing_iter(), ctx, "gpt-4o-mini", time.monotonic()) + wrapped = _InstrumentedAsyncStream( + _failing_iter(), ctx, "gpt-4o-mini", time.monotonic() + ) with pytest.raises(RuntimeError, match="async stream failed"): async for _ in wrapped: pass From 37276b26181835dfab2117bb91a088892f17a029 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 20:49:34 +0100 Subject: [PATCH 43/49] chore(ci/docs): enforce integration matrix across python versions --- .github/workflows/test.yml | 13 ++++++++++--- docs/INSTALLATION.md | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6ef2cadc..3138b54f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -49,19 +49,26 @@ jobs: # 
Python opt-in integration install + focused tests test-python-optional-integrations: - name: Python Optional Integrations (${{ matrix.integration }}) + name: Python Optional Integrations (${{ matrix.integration }} / py${{ matrix.python-version }}) runs-on: ubuntu-latest strategy: fail-fast: false matrix: include: - integration: openai-agents + python-version: '3.9' + extras: ".[dev,openai,openai-agents]" + tests: "tests/test_openai_agents_integration.py" + - integration: openai-agents + python-version: '3.11' extras: ".[dev,openai,openai-agents]" tests: "tests/test_openai_agents_integration.py" - integration: crewai + python-version: '3.11' extras: ".[dev,crewai,openai]" tests: "tests/test_crewai_integration.py" - integration: google-adk + python-version: '3.11' extras: ".[dev,google-adk]" tests: "tests/test_google_adk_integration.py" @@ -69,10 +76,10 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Set up Python 3.11 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: '3.11' + python-version: ${{ matrix.python-version }} cache: 'pip' - name: Install integration dependencies diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md index c291bd93..6e44cdec 100644 --- a/docs/INSTALLATION.md +++ b/docs/INSTALLATION.md @@ -108,6 +108,24 @@ TOGETHER_API_KEY=... # vLLM - no API key needed! (local) ``` +## 🔌 Optional Integration Extras + +Integration packages are opt-in and never enabled by default. 
+ +| Integration | Install Command | Python Requirement | Notes | +|------------|-----------------|--------------------|-------| +| OpenAI Agents SDK | `pip install "cascadeflow[openai,openai-agents]"` | 3.9+ (3.10+ recommended) | Uses explicit `ModelProvider` integration | +| CrewAI | `pip install "cascadeflow[crewai,openai]"` | 3.10+ | Uses explicit CrewAI hook registration | +| Google ADK | `pip install "cascadeflow[google-adk]"` | 3.10+ | Uses explicit ADK plugin in `Runner(plugins=[...])` | + +Optional for richer provider/model normalization in cost tracking: + +```bash +pip install litellm +``` + +Without `litellm`, cascadeflow still provides built-in pricing-based cost estimates. + ## 🚀 Quick Start ### For Production From 1b470d6c6658faff98e48a5271aacdc5707e344f Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 20:53:49 +0100 Subject: [PATCH 44/49] style: fix ruff I001 import sorting in google_adk_harness example --- examples/integrations/google_adk_harness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/integrations/google_adk_harness.py b/examples/integrations/google_adk_harness.py index 1ae9c5af..3f8c9743 100644 --- a/examples/integrations/google_adk_harness.py +++ b/examples/integrations/google_adk_harness.py @@ -23,7 +23,7 @@ async def main() -> None: ) from exc from cascadeflow import init, run - from cascadeflow.integrations.google_adk import enable, GoogleADKHarnessConfig + from cascadeflow.integrations.google_adk import GoogleADKHarnessConfig, enable # 1. 
Initialize harness globally init(mode="observe", budget=1.0) From a986060b5c86543e386f6a625d5fde789315038a Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 5 Mar 2026 09:43:46 +0100 Subject: [PATCH 45/49] feat(benchmarks): add baseline and savings metrics to agentic tool benchmark --- tests/benchmarks/bfcl/agentic_benchmark.py | 37 ++++++++++++++++++++++ tests/benchmarks/run_all.py | 4 +++ 2 files changed, 41 insertions(+) diff --git a/tests/benchmarks/bfcl/agentic_benchmark.py b/tests/benchmarks/bfcl/agentic_benchmark.py index 1386cb60..2b450e68 100644 --- a/tests/benchmarks/bfcl/agentic_benchmark.py +++ b/tests/benchmarks/bfcl/agentic_benchmark.py @@ -61,6 +61,7 @@ class AgenticResult: correct: bool draft_accepted: bool cost: float + baseline_cost: float latency_ms: float draft_accepted_turns: int = 0 draft_acceptance_rate: float = 0.0 @@ -761,6 +762,23 @@ def _format_tools_desc(self, tools: list[dict[str, Any]]) -> str: lines.append(f"- {name}: {description} (params: {param_names})") return "\n".join(lines) + @staticmethod + def _extract_baseline_cost(result: Any) -> float: + """Extract baseline cost for a call from cascade metadata. + + ``cost_saved`` is defined relative to a verifier-only baseline. 
+ """ + total_cost = float(getattr(result, "total_cost", 0.0) or 0.0) + metadata = getattr(result, "metadata", {}) or {} + raw_saved = metadata.get("cost_saved", 0.0) or 0.0 + try: + cost_saved = float(raw_saved) + except (TypeError, ValueError): + cost_saved = 0.0 + + baseline_cost = total_cost + cost_saved + return baseline_cost if baseline_cost > 0 else total_cost + def _extract_parameters(self, response: str) -> list[dict[str, Any]]: """Extract JSON parameter blocks from a tool response.""" parameters = [] @@ -939,6 +957,7 @@ async def run_single_turn(self, task: dict) -> AgenticResult: draft_accepted_turns=1 if draft_accepted else 0, draft_acceptance_rate=1.0 if draft_accepted else 0.0, cost=result.total_cost, + baseline_cost=self._extract_baseline_cost(result), latency_ms=latency_ms, turns_completed=1, tools_called=tools_called, @@ -952,6 +971,7 @@ async def run_single_turn(self, task: dict) -> AgenticResult: correct=False, draft_accepted=False, cost=0.0, + baseline_cost=0.0, latency_ms=latency_ms, error=str(e), ) @@ -976,6 +996,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: start_time = time.time() total_cost = 0.0 + total_baseline_cost = 0.0 all_tools_called = [] turns_completed = 0 state_maintained = True @@ -1011,6 +1032,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: result = await agent.run(prompt, max_tokens=500) total_cost += result.total_cost + total_baseline_cost += self._extract_baseline_cost(result) tools_in_turn = self._extract_tool_calls(result.content) params_in_turn = self._extract_parameters(result.content) @@ -1057,6 +1079,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: draft_accepted_turns=draft_accepted_turns, draft_acceptance_rate=draft_acceptance_rate, cost=total_cost, + baseline_cost=total_baseline_cost if total_baseline_cost > 0 else total_cost, latency_ms=latency_ms, turns_completed=turns_completed, tools_called=all_tools_called, @@ -1072,6 +1095,7 @@ async def 
run_multi_turn(self, task: dict) -> AgenticResult: draft_accepted_turns=draft_accepted_turns, draft_acceptance_rate=0.0, cost=total_cost, + baseline_cost=total_baseline_cost if total_baseline_cost > 0 else total_cost, latency_ms=latency_ms, turns_completed=turns_completed, error=str(e), @@ -1127,6 +1151,13 @@ def _calculate_metrics(self) -> dict: draft_accepted_turns = sum(r.draft_accepted_turns for r in self.results) dependency_handled = sum(1 for r in self.results if r.dependency_handled) total_cost = sum(r.cost for r in self.results) + total_baseline_cost = sum( + r.baseline_cost if r.baseline_cost > 0 else r.cost for r in self.results + ) + total_savings = total_baseline_cost - total_cost + cost_reduction_pct = ( + (total_savings / total_baseline_cost) * 100 if total_baseline_cost > 0 else 0.0 + ) total_turns = sum(r.turns_completed for r in self.results) # Group by task type @@ -1172,6 +1203,9 @@ def _calculate_metrics(self) -> dict: "draft_acceptance_by_task": draft_accepted / total if total > 0 else 0, "dependency_handling": dependency_rate, "total_cost": total_cost, + "baseline_cost": total_baseline_cost, + "total_savings": total_savings, + "cost_reduction_pct": cost_reduction_pct, "by_type": by_type, # Natural vs Explicit comparison "natural_language": { @@ -1198,6 +1232,8 @@ def _calculate_metrics(self) -> dict: print(f" Draft Acceptance: {draft_rate:.1%} (by turn)") print(f" Dependency Handling: {dependency_rate:.1%}") print(f" Total Cost: ${total_cost:.4f}") + print(f" Baseline Cost: ${total_baseline_cost:.4f}") + print(f" Cost Reduction: {cost_reduction_pct:.1f}%") # Natural vs Explicit comparison (key insight) print("\n" + "-" * 70) @@ -1287,6 +1323,7 @@ async def main(): "correct": r.correct, "draft_accepted": r.draft_accepted, "cost": r.cost, + "baseline_cost": r.baseline_cost, "latency_ms": r.latency_ms, "turns_completed": r.turns_completed, "tools_called": r.tools_called, diff --git a/tests/benchmarks/run_all.py b/tests/benchmarks/run_all.py index 
739c0342..9c4a3f93 100644 --- a/tests/benchmarks/run_all.py +++ b/tests/benchmarks/run_all.py @@ -322,6 +322,10 @@ def generate_comparison_table(results: dict[str, Any]) -> str: ) table += f"- **Dependency Handling:** {agentic_summary.get('dependency_handling', 0) * 100:.1f}%\n" table += f"- **Total Cost:** ${agentic_summary.get('total_cost', 0):.6f}\n" + if "baseline_cost" in agentic_summary: + table += f"- **Baseline Cost:** ${agentic_summary.get('baseline_cost', 0):.6f}\n" + if "cost_reduction_pct" in agentic_summary: + table += f"- **Cost Reduction:** {agentic_summary.get('cost_reduction_pct', 0):.1f}%\n" natural = agentic_summary.get("natural_language", {}) explicit = agentic_summary.get("explicit_steps", {}) From 39a469e91370b94d2ac42ce3213d155d22f60bd5 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 5 Mar 2026 10:06:23 +0100 Subject: [PATCH 46/49] feat(dx): add LangChain harness docs, harness example, and llms.txt Close V2 Go/No-Go gaps: - Add harness section to langchain_integration.md documenting HarnessAwareCascadeFlowCallbackHandler and get_harness_callback - Create langchain_harness.py example (matches CrewAI/OpenAI Agents/ADK pattern) - Create llms.txt at repo root for LLM-readable project discovery - Update V2 workboard: all feature branches merged, Go/No-Go checklist updated --- docs/guides/langchain_integration.md | 127 ++++++++++++++++++++ docs/strategy/agent-intelligence-v2-plan.md | 60 ++++----- examples/integrations/langchain_harness.py | 55 +++++++++ llms.txt | 87 ++++++++++++++ 4 files changed, 300 insertions(+), 29 deletions(-) create mode 100644 examples/integrations/langchain_harness.py create mode 100644 llms.txt diff --git a/docs/guides/langchain_integration.md b/docs/guides/langchain_integration.md index eb385654..8eccba62 100644 --- a/docs/guides/langchain_integration.md +++ b/docs/guides/langchain_integration.md @@ -12,6 +12,7 @@ This guide shows how to use cascadeflow with LangChain for intelligent AI model 6. 
[Use Cases](#use-cases) 7. [Best Practices](#best-practices) 8. [Troubleshooting](#troubleshooting) +9. [Harness Integration (Python)](#harness-integration-python) --- @@ -822,6 +823,132 @@ console.log(result.response_metadata?.cascade); // Not result.metadata (wrong) ``` +--- + +## Harness Integration (Python) + +The cascadeflow harness adds multi-dimensional budget enforcement, energy tracking, +tool call gating, and trace recording to LangChain applications via a callback handler. + +### Design Principles + +- **Callback-based** — Uses LangChain's native callback system to intercept every + LLM and tool call. Works with any chain, agent, or LangGraph graph. +- **Opt-in** — Install `cascadeflow[langchain]` and pass the callback explicitly. + Never enabled by default. +- **Fail-open** — Integration errors are logged but never break chain execution + (configurable). +- **No model switching** — LangChain dispatches the LLM call before `on_llm_start` + returns, so the callback cannot redirect to a different model. `switch_model` + decisions are recorded with `applied=False` for observability. + +### Install + +```bash +pip install "cascadeflow[langchain]" +``` + +Requires Python 3.10+. + +### Quick Start + +```python +from langchain_openai import ChatOpenAI +from cascadeflow import init, run +from cascadeflow.integrations.langchain import get_harness_callback + +# 1. Initialize harness globally +init(mode="observe", budget=1.0) + +model = ChatOpenAI(model="gpt-4o-mini") + +# 2. 
Use the harness-aware callback in a run scope +with run(budget=0.5) as session: + with get_harness_callback() as cb: + response = model.invoke( + "Explain why model routing helps agent budgets.", + config={"callbacks": [cb]}, + ) + + print(response.content) + print(f"Cost: ${session.cost:.6f}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + for event in session.trace(): + print(event) +``` + +### What This Integration Adds + +- Budget gating in enforce mode (`on_llm_start` raises `HarnessStopError`) +- Tool call gating in enforce mode (`on_tool_start` raises `HarnessStopError`) +- Run metrics on `cascadeflow.run()` scope: + - `cost`, `budget_remaining`, `step_count`, `tool_calls`, `latency_used_ms`, `energy_used` +- Full decision trace through `session.trace()` +- LangGraph state extraction — automatically syncs `step_count`, `tool_calls`, + `budget_remaining`, `latency_used_ms`, `energy_used` from graph state payloads + +### Enforce-Mode Limitations + +| Decision | Enforced? 
| Notes | +|----------|-----------|-------| +| `stop` (budget/latency/energy) | Yes | Raises `HarnessStopError` from `on_llm_start` | +| `deny_tool` (tool cap) | Yes | Raises `HarnessStopError` from `on_tool_start` | +| `switch_model` | Observe-only | Recorded with `applied=False` — LangChain cannot redirect mid-call | +| `deny_tool` (LLM-level) | Observe-only | Cannot strip tools from already-dispatched request | + +### Configuration + +```python +from cascadeflow.integrations.langchain import ( + HarnessAwareCascadeFlowCallbackHandler, + get_harness_callback, +) + +# Context manager (recommended) +with get_harness_callback(fail_open=True) as cb: + result = model.invoke("...", config={"callbacks": [cb]}) + +# Direct instantiation +cb = HarnessAwareCascadeFlowCallbackHandler(fail_open=True) +result = model.invoke("...", config={"callbacks": [cb]}) +``` + +### With LangGraph + +The callback automatically extracts harness-relevant state from LangGraph payloads +(via `langgraph_state`, `graph_state`, or `state` keys in metadata/configurable). + +```python +from langgraph.graph import StateGraph +from cascadeflow import init, run +from cascadeflow.integrations.langchain import get_harness_callback + +init(mode="observe", budget=1.0) + +# Build your graph as normal +graph = builder.compile() + +with run(budget=0.5) as session: + with get_harness_callback() as cb: + result = graph.invoke( + {"messages": [("user", "What is model routing?")]}, + config={"callbacks": [cb]}, + ) + print(session.summary()) +``` + +### Troubleshooting + +| Symptom | Solution | +|---------|----------| +| `ImportError: cascadeflow.integrations.langchain` | `pip install "cascadeflow[langchain]"` | +| Callback not tracking calls | Ensure `cb` is passed in `config={"callbacks": [cb]}` | +| Budget not enforced | Check `init(mode="enforce", ...)` — observe mode never blocks | +| Zero cost reported | Model name may not match pricing table; check `response.response_metadata` | + +--- + ## Next Steps 1. 
**Examples**: Check the `examples/` directory for more patterns diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 177562e1..295a713d 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -1,7 +1,7 @@ # Agent Intelligence V2 Plan -Last updated: February 25, 2026 -Status: Planning (no implementation in this document) +Last updated: March 5, 2026 +Status: V2/V2.1 execution plan with implementation tracking (historical + active reference) Supersedes: agent-intelligence-v1-plan.md ## 1. Objective @@ -828,9 +828,9 @@ Estimated: 6-8 weeks after V2 Python launch. Estimated: 3-4 weeks (can parallel with Phase F). -### 16.1 Parallel Branch Workboard (Tick-Off) +### 16.1 Parallel Branch Workboard (Historical Tick-Off) -Use this section as the single coordination board for parallel execution. +Use this section as the historical coordination board for parallel execution. Branching model: - Keep `main` always releasable. @@ -839,15 +839,17 @@ Branching model: - Merge to `main` only after integration branch CI + benchmark gates are green. 
Claim checklist (one owner per branch at a time): -- [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` -- [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress` -- [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` -- [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` -- [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` -- [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [x] `feat/v2-bench-repro-pipeline` — Owner: `@codex` — PR: `#163` — Status: `review` -- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [x] `feat/v2-core-harness-api` — Owner: `@codex` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-enforce-actions` — Owner: `@codex` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-openai-agents-integration` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 7 tests + docs + example +- [x] `feat/v2-crewai-integration` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 44 tests + docs + example +- [x] `feat/v2-langchain-harness-extension` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 79 tests + docs + example +- [x] `feat/v2-dx-docs-quickstarts` — Owner: `@codex` — Status: `completed (merged to integration branch)` — quickstart + llms.txt +- [x] `feat/v2-bench-repro-pipeline` — Owner: `@codex` — PR: `#163` — Status: `completed (merged to integration branch)` +- [x] 
`feat/v2-security-privacy-telemetry` — Owner: `@codex` — PR: `#162` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-google-adk-integration` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 63 tests + docs + example +- [x] `feat/v2-n8n-harness` — Owner: `@codex` — PR: `#164` — Status: `completed (merged to integration branch)` — TS harness + 50 tests + UI Merge gates per feature branch: - [ ] Unit/integration tests green for touched scope @@ -915,23 +917,23 @@ For roadmap visibility. These inform V2 telemetry design but are not V2 delivera Go when all are true (V2 Python launch): -- [ ] Harness layer is opt-in and backward compatible -- [ ] `cascadeflow.init()` auto-instruments `openai` Python client -- [ ] `observe` mode produces zero behavior change (benchmark-validated) -- [ ] `enforce` mode actions work correctly (switch_model, deny_tool, stop) -- [ ] Harness decision overhead <5ms p95 -- [ ] Python parity fixture tests pass -- [ ] Core + integration CI green -- [ ] Benchmark comparison acceptable vs latest baseline -- [ ] OpenAI Agents SDK integration documented and validated -- [ ] CrewAI integration documented and validated -- [ ] LangChain integration extended and validated -- [ ] Existing integrations (Vercel AI, n8n) verified compatible (no regressions) -- [ ] DX quickstart works for existing app/agent users with 1-3 lines of code change +- [x] Harness layer is opt-in and backward compatible +- [x] `cascadeflow.init()` auto-instruments `openai` Python client +- [x] `observe` mode produces zero behavior change (benchmark-validated) +- [x] `enforce` mode actions work correctly (switch_model, deny_tool, stop) +- [x] Harness decision overhead <5ms p95 +- [x] Python parity fixture tests pass +- [x] Core + integration CI green +- [x] Benchmark comparison acceptable vs latest baseline +- [x] OpenAI Agents SDK integration documented and validated +- [x] CrewAI integration documented and validated +- [x] LangChain 
integration extended and validated +- [x] Existing integrations (Vercel AI, n8n) verified compatible (no regressions) +- [x] DX quickstart works for existing app/agent users with 1-3 lines of code change - [ ] External pilot median time-to-first-value <15 minutes -- [ ] Public benchmark results ready for launch -- [ ] Benchmark scripts + raw artifacts are reproducible by third parties -- [ ] pyproject.toml extras (`openai-agents`, `crewai`, `langchain`) defined and installable +- [x] Public benchmark results ready for launch +- [x] Benchmark scripts + raw artifacts are reproducible by third parties +- [x] pyproject.toml extras (`openai-agents`, `crewai`, `langchain`, `google-adk`) defined and installable V2.1 Go/No-Go (TS parity + anthropic): - [x] TS parity fixtures pass diff --git a/examples/integrations/langchain_harness.py b/examples/integrations/langchain_harness.py new file mode 100644 index 00000000..c0be501f --- /dev/null +++ b/examples/integrations/langchain_harness.py @@ -0,0 +1,55 @@ +""" +LangChain + cascadeflow harness integration example. + +Run: + pip install "cascadeflow[langchain]" + export OPENAI_API_KEY="your-key" + python examples/integrations/langchain_harness.py +""" + +from __future__ import annotations + +import asyncio + + +async def main() -> None: + try: + from langchain_openai import ChatOpenAI + except ImportError as exc: + raise SystemExit( + "langchain-openai is not installed. " + 'Install with: pip install "cascadeflow[langchain]" langchain-openai' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.langchain import get_harness_callback + + # 1) Initialize harness globally. + init(mode="observe", budget=1.0, max_tool_calls=6) + + model = ChatOpenAI(model="gpt-4o-mini") + + # 2) Scoped run with harness-aware callback. 
+ with run(budget=0.5, max_tool_calls=4) as session: + with get_harness_callback() as cb: + response = await model.ainvoke( + "Explain why inside-the-loop model routing helps agent budgets.", + config={"callbacks": [cb]}, + ) + + print("=== Result ===") + print(response.content) + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print(f"Energy: {session.energy_used:.1f}") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/llms.txt b/llms.txt new file mode 100644 index 00000000..5ef1c215 --- /dev/null +++ b/llms.txt @@ -0,0 +1,87 @@ +# cascadeflow + +> Agent intelligence harness for cost, quality, latency, energy, and business KPI optimization. +> In-process library (not a proxy). Works inside agent loops with full state awareness. + +## Install + +pip install cascadeflow + +## Quickstart (3 lines) + +import cascadeflow +cascadeflow.init(mode="observe") +# All openai/anthropic SDK calls are now tracked. Switch to "enforce" for budget gating. 
+ +## When to use cascadeflow + +- You run AI agents (LangChain, LangGraph, CrewAI, OpenAI Agents SDK, Google ADK, or custom) +- You want to reduce LLM costs without changing agent code +- You need budget enforcement across multi-step agent runs +- You need to inject business KPIs (quality, cost, latency, energy) into agent decisions +- You need compliance-aware model gating (GDPR, HIPAA, PCI, strict) +- You want full trace recording for auditability and tuning + +## When NOT to use cascadeflow + +- Single one-off LLM calls (overhead not justified) +- You only use one model and don't want routing +- You need a hosted proxy service (cascadeflow is a library, not a SaaS) + +## Key APIs + +- cascadeflow.init(mode) -- activate harness globally (off | observe | enforce) +- cascadeflow.run(budget, max_tool_calls) -- scoped agent run with budget/limits +- @cascadeflow.agent(budget, kpis) -- annotate agent functions with policy metadata +- session.summary() -- structured run metrics (cost, latency, energy, steps, tool calls) +- session.trace() -- full decision trace for auditability + +## Harness Modes + +- off: no tracking, no enforcement +- observe: track all metrics and decisions, never block execution (safe for production rollout) +- enforce: track + enforce budget/tool/latency/energy caps (stop or deny_tool actions) + +## Harness Dimensions + +- Cost: estimated USD from model pricing table (18 models, fuzzy resolution) +- Latency: wall-clock milliseconds per LLM call +- Energy: deterministic compute-intensity proxy coefficient +- Tool calls: count of tool/function calls executed +- Quality: model quality priors for KPI-weighted scoring + +## Integrations + +pip install cascadeflow[langchain] # LangChain/LangGraph callback handler +pip install cascadeflow[openai-agents] # OpenAI Agents SDK ModelProvider +pip install cascadeflow[crewai] # CrewAI llm_hooks integration +pip install cascadeflow[google-adk] # Google ADK BasePlugin + +All integrations are opt-in. 
Install the extra and explicitly enable the integration. + +## Integration Patterns + +- LangChain: HarnessAwareCascadeFlowCallbackHandler via get_harness_callback() +- OpenAI Agents SDK: CascadeFlowModelProvider with model candidates and tool gating +- CrewAI: enable() registers global llm_hooks for budget gating and tracking +- Google ADK: enable() returns a BasePlugin for Runner(plugins=[plugin]) +- n8n: Built-in harness mode (observe/enforce) on the Agent node with UI parameters +- Vercel AI SDK: TypeScript middleware integration + +## Decision Actions + +- allow: proceed normally +- switch_model: route to cheaper/better model (where runtime allows) +- deny_tool: block tool execution when tool call cap reached +- stop: halt agent loop when budget/latency/energy cap exceeded + +## Supported Models (pricing table) + +OpenAI: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-3.5-turbo, o1, o1-mini, o3-mini, gpt-5, gpt-5-mini, gpt-5-nano +Anthropic: claude-sonnet-4, claude-haiku-3.5, claude-opus-4.5 +Google: gemini-2.5-flash, gemini-2.5-pro, gemini-2.0-flash, gemini-1.5-flash, gemini-1.5-pro + +## Links + +- Source: https://github.com/lemony-ai/cascadeflow +- PyPI: pip install cascadeflow From ca7fa4acb87473e3939a7f1962afd3551166a696 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 5 Mar 2026 11:16:09 +0100 Subject: [PATCH 47/49] harden harness: input validation, trace rotation, NaN guard, phantom model fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add _validate_harness_params() to init() and run() — rejects negative budget/tool_calls/latency/energy and invalid compliance strings - Add trace rotation (MAX_TRACE_ENTRIES=1000) in both Python and TypeScript to prevent unbounded memory growth in long-running agents - Add sanitizeNumericParam() in n8n harness.ts — coerces NaN/Infinity/negative config values to null - Remove phantom gpt-5-nano from llms.txt (not in any pricing table) - Document HarnessRunContext thread-safety 
limitation in docstring - Add 10 new tests covering validation, compliance, and trace rotation --- cascadeflow/harness/api.py | 54 +++++++++++++++++ llms.txt | 2 +- .../integrations/n8n/nodes/harness/harness.ts | 22 ++++++- tests/test_harness_api.py | 60 +++++++++++++++++++ 4 files changed, 135 insertions(+), 3 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 036c80eb..95ff4245 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -41,6 +41,16 @@ class HarnessInitReport: @dataclass class HarnessRunContext: + """Scoped run context for tracking harness metrics across LLM calls. + + Thread safety: the context is stored in a ``ContextVar`` and is safe for + asyncio (each task gets its own copy of the token). However, the context + object itself uses plain attribute mutation (``+=``) for counters. If + multiple OS threads share the *same* ``HarnessRunContext`` instance, + concurrent updates may race. Each ``with run(...)`` scope should be + confined to a single thread or asyncio task. 
+ """ + run_id: str = field(default_factory=lambda: uuid4().hex[:12]) _started_monotonic: float = field(default_factory=time.monotonic, init=False, repr=False) started_at_ms: float = field(default_factory=lambda: time.time() * 1000) @@ -175,6 +185,8 @@ def record( if decision_mode is not None: entry["decision_mode"] = decision_mode self._trace.append(entry) + if len(self._trace) > _MAX_TRACE_ENTRIES: + self._trace = self._trace[-_MAX_TRACE_ENTRIES:] _emit_harness_decision(entry) @@ -193,6 +205,32 @@ def _validate_mode(mode: str) -> HarnessMode: return cast(HarnessMode, mode) +_VALID_COMPLIANCE_VALUES = {"gdpr", "hipaa", "pci", "strict"} + + +def _validate_harness_params( + *, + budget: Optional[float], + max_tool_calls: Optional[int], + max_latency_ms: Optional[float], + max_energy: Optional[float], + compliance: Optional[str], +) -> None: + """Validate harness parameters, raising ValueError for invalid inputs.""" + if budget is not None and budget < 0: + raise ValueError(f"budget must be non-negative, got {budget}") + if max_tool_calls is not None and max_tool_calls < 0: + raise ValueError(f"max_tool_calls must be non-negative, got {max_tool_calls}") + if max_latency_ms is not None and max_latency_ms < 0: + raise ValueError(f"max_latency_ms must be non-negative, got {max_latency_ms}") + if max_energy is not None and max_energy < 0: + raise ValueError(f"max_energy must be non-negative, got {max_energy}") + if compliance is not None and compliance.strip().lower() not in _VALID_COMPLIANCE_VALUES: + raise ValueError( + f"compliance must be one of {sorted(_VALID_COMPLIANCE_VALUES)}, got {compliance!r}" + ) + + def _detect_sdks() -> dict[str, bool]: return { "openai": find_spec("openai") is not None, @@ -244,6 +282,7 @@ def reset() -> None: _MAX_REASON_LEN = 160 _MAX_MODEL_LEN = 128 _MAX_ENV_JSON_LEN = 4096 +_MAX_TRACE_ENTRIES = 1000 def _sanitize_trace_value(value: Any, *, max_length: int) -> Optional[str]: @@ -482,6 +521,13 @@ def init( sources["callback_manager"] = 
"code" validated_mode = _validate_mode(str(resolved_mode)) + _validate_harness_params( + budget=cast(Optional[float], resolved_budget), + max_tool_calls=cast(Optional[int], resolved_max_tool_calls), + max_latency_ms=cast(Optional[float], resolved_max_latency_ms), + max_energy=cast(Optional[float], resolved_max_energy), + compliance=cast(Optional[str], resolved_compliance), + ) _harness_config = HarnessConfig( mode=validated_mode, verbose=bool(resolved_verbose), @@ -573,6 +619,14 @@ def run( resolved_kpi_weights = kpi_weights if kpi_weights is not None else config.kpi_weights resolved_compliance = compliance if compliance is not None else config.compliance + _validate_harness_params( + budget=resolved_budget, + max_tool_calls=resolved_tool_calls, + max_latency_ms=resolved_latency, + max_energy=resolved_energy, + compliance=resolved_compliance, + ) + return HarnessRunContext( mode=config.mode, budget_max=resolved_budget, diff --git a/llms.txt b/llms.txt index 5ef1c215..51bb8437 100644 --- a/llms.txt +++ b/llms.txt @@ -77,7 +77,7 @@ All integrations are opt-in. 
Install the extra and explicitly enable the integra ## Supported Models (pricing table) -OpenAI: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-3.5-turbo, o1, o1-mini, o3-mini, gpt-5, gpt-5-mini, gpt-5-nano +OpenAI: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-3.5-turbo, o1, o1-mini, o3-mini, gpt-5, gpt-5-mini Anthropic: claude-sonnet-4, claude-haiku-3.5, claude-opus-4.5 Google: gemini-2.5-flash, gemini-2.5-pro, gemini-2.0-flash, gemini-1.5-flash, gemini-1.5-pro diff --git a/packages/integrations/n8n/nodes/harness/harness.ts b/packages/integrations/n8n/nodes/harness/harness.ts index 93c5150d..ab3943d5 100644 --- a/packages/integrations/n8n/nodes/harness/harness.ts +++ b/packages/integrations/n8n/nodes/harness/harness.ts @@ -240,6 +240,15 @@ function selectLowerEnergyModel(currentModel: string): string { // HarnessRunContext // --------------------------------------------------------------------------- +const MAX_TRACE_ENTRIES = 1000; + +/** Coerce NaN, Infinity, or negative values to null (unlimited). */ +function sanitizeNumericParam(value: number | null): number | null { + if (value === null || value === undefined) return null; + if (!Number.isFinite(value) || value < 0) return null; + return value; +} + let runIdCounter = 0; function generateRunId(): string { @@ -266,8 +275,14 @@ export class HarnessRunContext { constructor(config: HarnessConfig) { this.runId = generateRunId(); - this.config = config; - this.budgetRemaining = config.budgetMax; + this.config = { + ...config, + budgetMax: sanitizeNumericParam(config.budgetMax), + toolCallsMax: sanitizeNumericParam(config.toolCallsMax), + latencyMaxMs: sanitizeNumericParam(config.latencyMaxMs), + energyMax: sanitizeNumericParam(config.energyMax), + }; + this.budgetRemaining = this.config.budgetMax; this.startedAt = Date.now(); } @@ -386,6 +401,9 @@ export class HarnessRunContext { applied, decisionMode: this.config.mode, }); + if (this.trace.length > MAX_TRACE_ENTRIES) { + this.trace = this.trace.slice(-MAX_TRACE_ENTRIES); + } } 
// ----------------------------------------------------------------------- diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 850255ba..f4e7f9cd 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -474,3 +474,63 @@ def test_record_empty_action_warns_and_defaults(caplog): entry = ctx.trace()[0] assert entry["action"] == "allow" assert any("empty action" in rec.message for rec in caplog.records) + + +def test_init_rejects_negative_budget(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", budget=-1.0) + + +def test_init_rejects_negative_max_tool_calls(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", max_tool_calls=-1) + + +def test_init_rejects_negative_max_latency(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", max_latency_ms=-100.0) + + +def test_init_rejects_negative_max_energy(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", max_energy=-0.5) + + +def test_init_rejects_invalid_compliance(): + with pytest.raises(ValueError, match="compliance"): + init(mode="observe", compliance="invalid_mode") + + +def test_run_rejects_negative_budget(): + init(mode="observe") + with pytest.raises(ValueError, match="non-negative"): + run(budget=-0.5) + + +def test_run_rejects_invalid_compliance(): + init(mode="observe") + with pytest.raises(ValueError, match="compliance"): + run(compliance="foobar") + + +def test_init_accepts_zero_budget(): + report = init(mode="observe", budget=0.0) + cfg = get_harness_config() + assert cfg.budget == 0.0 + + +def test_init_accepts_valid_compliance(): + for value in ("gdpr", "hipaa", "pci", "strict"): + reset() + report = init(mode="observe", compliance=value) + cfg = get_harness_config() + assert cfg.compliance == value + + +def test_trace_rotation_limits_entries(): + init(mode="observe") + with run(budget=100.0) as ctx: + for i in range(1050): + ctx.record(action="allow", 
reason="test", model="gpt-4o-mini") + trace = ctx.trace() + assert len(trace) <= 1000 From 9547ab13175fa3d17fc36fa208ca3bb4d0ec9df0 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 5 Mar 2026 14:25:10 +0100 Subject: [PATCH 48/49] docs: reframe positioning as agent runtime intelligence layer + add Mintlify docs site MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 0 — GitHub refresh: - pyproject.toml: update description, keywords, classifier to Production/Stable - __init__.py: replace emoji docstring with harness API focus - llms.txt: expand from 88 to 214 lines (HarnessConfig, pricing, energy, integrations) - README.md: new H1, comparison table, Harness API section, 6 new feature rows - docs/README.md: Mintlify banner, add LangChain to integrations list Phase 1 — Mintlify docs site (docs-site/): - docs.json config (palm theme, 5 tabs, full navigation) - 36 MDX pages: Get Started (4), Harness (8), Integrations (7), API Reference (8), Examples (6), index + changelog + contributing - Logo assets copied from .github/assets/ --- README.md | 58 +++++- cascadeflow/__init__.py | 45 ++--- .../api-reference/python/agent-decorator.mdx | 79 ++++++++ .../api-reference/python/harness-config.mdx | 73 ++++++++ docs-site/api-reference/python/init.mdx | 68 +++++++ .../api-reference/python/run-context.mdx | 76 ++++++++ docs-site/api-reference/python/run.mdx | 83 +++++++++ docs-site/api-reference/typescript/core.mdx | 77 ++++++++ .../api-reference/typescript/langchain.mdx | 77 ++++++++ .../api-reference/typescript/vercel-ai.mdx | 63 +++++++ docs-site/changelog.mdx | 28 +++ docs-site/contributing.mdx | 96 ++++++++++ docs-site/docs.json | 130 +++++++++++++ docs-site/examples/basic-usage.mdx | 81 ++++++++ docs-site/examples/budget-enforcement.mdx | 84 +++++++++ docs-site/examples/compliance-gating.mdx | 89 +++++++++ docs-site/examples/enterprise-patterns.mdx | 127 +++++++++++++ docs-site/examples/kpi-weighted-routing.mdx | 95 
++++++++++ docs-site/examples/multi-agent.mdx | 103 +++++++++++ docs-site/favicon.svg | 8 + docs-site/get-started/how-it-works.mdx | 112 ++++++++++++ docs-site/get-started/installation.mdx | 101 ++++++++++ docs-site/get-started/introduction.mdx | 62 +++++++ docs-site/get-started/quickstart.mdx | 118 ++++++++++++ docs-site/harness/actions.mdx | 99 ++++++++++ docs-site/harness/budget-enforcement.mdx | 83 +++++++++ docs-site/harness/compliance.mdx | 66 +++++++ docs-site/harness/decision-trace.mdx | 102 +++++++++++ docs-site/harness/energy-tracking.mdx | 99 ++++++++++ docs-site/harness/kpi-optimization.mdx | 103 +++++++++++ docs-site/harness/modes.mdx | 78 ++++++++ docs-site/harness/overview.mdx | 80 ++++++++ docs-site/index.mdx | 91 +++++++++ docs-site/integrations/crewai.mdx | 78 ++++++++ docs-site/integrations/google-adk.mdx | 91 +++++++++ docs-site/integrations/langchain.mdx | 106 +++++++++++ docs-site/integrations/n8n.mdx | 70 +++++++ docs-site/integrations/openai-agents.mdx | 77 ++++++++ docs-site/integrations/overview.mdx | 53 ++++++ docs-site/integrations/vercel-ai.mdx | 88 +++++++++ docs-site/logo/cascadeflow-dark.svg | 27 +++ docs-site/logo/cascadeflow-light.svg | 20 ++ docs/README.md | 7 +- llms.txt | 173 +++++++++++++++--- pyproject.toml | 14 +- 45 files changed, 3477 insertions(+), 61 deletions(-) create mode 100644 docs-site/api-reference/python/agent-decorator.mdx create mode 100644 docs-site/api-reference/python/harness-config.mdx create mode 100644 docs-site/api-reference/python/init.mdx create mode 100644 docs-site/api-reference/python/run-context.mdx create mode 100644 docs-site/api-reference/python/run.mdx create mode 100644 docs-site/api-reference/typescript/core.mdx create mode 100644 docs-site/api-reference/typescript/langchain.mdx create mode 100644 docs-site/api-reference/typescript/vercel-ai.mdx create mode 100644 docs-site/changelog.mdx create mode 100644 docs-site/contributing.mdx create mode 100644 docs-site/docs.json create mode 100644 
docs-site/examples/basic-usage.mdx create mode 100644 docs-site/examples/budget-enforcement.mdx create mode 100644 docs-site/examples/compliance-gating.mdx create mode 100644 docs-site/examples/enterprise-patterns.mdx create mode 100644 docs-site/examples/kpi-weighted-routing.mdx create mode 100644 docs-site/examples/multi-agent.mdx create mode 100644 docs-site/favicon.svg create mode 100644 docs-site/get-started/how-it-works.mdx create mode 100644 docs-site/get-started/installation.mdx create mode 100644 docs-site/get-started/introduction.mdx create mode 100644 docs-site/get-started/quickstart.mdx create mode 100644 docs-site/harness/actions.mdx create mode 100644 docs-site/harness/budget-enforcement.mdx create mode 100644 docs-site/harness/compliance.mdx create mode 100644 docs-site/harness/decision-trace.mdx create mode 100644 docs-site/harness/energy-tracking.mdx create mode 100644 docs-site/harness/kpi-optimization.mdx create mode 100644 docs-site/harness/modes.mdx create mode 100644 docs-site/harness/overview.mdx create mode 100644 docs-site/index.mdx create mode 100644 docs-site/integrations/crewai.mdx create mode 100644 docs-site/integrations/google-adk.mdx create mode 100644 docs-site/integrations/langchain.mdx create mode 100644 docs-site/integrations/n8n.mdx create mode 100644 docs-site/integrations/openai-agents.mdx create mode 100644 docs-site/integrations/overview.mdx create mode 100644 docs-site/integrations/vercel-ai.mdx create mode 100644 docs-site/logo/cascadeflow-dark.svg create mode 100644 docs-site/logo/cascadeflow-light.svg diff --git a/README.md b/README.md index 63e9af87..27baf1be 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ cascadeflow Logo -# Smart AI model cascading for cost optimization +# Agent Runtime Intelligence Layer [![PyPI version](https://img.shields.io/pypi/v/cascadeflow?color=blue&label=Python)](https://pypi.org/project/cascadeflow/) [![npm 
version](https://img.shields.io/npm/v/@cascadeflow/core?color=red&label=TypeScript)](https://www.npmjs.com/package/@cascadeflow/core) @@ -17,6 +17,7 @@ [![PyPI Downloads](https://static.pepy.tech/badge/cascadeflow)](https://pepy.tech/project/cascadeflow) [![npm Downloads](https://img.shields.io/npm/dt/@cascadeflow/n8n-nodes-cascadeflow?label=npm%20downloads&color=orange)](https://www.npmjs.com/search?q=%40cascadeflow) [![Tests](https://github.com/lemony-ai/cascadeflow/actions/workflows/test.yml/badge.svg)](https://github.com/lemony-ai/cascadeflow/actions/workflows/test.yml) +[![Docs](https://img.shields.io/badge/docs-cascadeflow.dev-blue)](https://docs.cascadeflow.dev) [![Python Docs](https://img.shields.io/badge/docs-Python-blue)](./docs/) [![TypeScript Docs](https://img.shields.io/badge/docs-TypeScript-red)](./docs/) [![X Follow](https://img.shields.io/twitter/follow/saschabuehrle?style=social)](https://x.com/saschabuehrle) @@ -28,17 +29,15 @@
-**[Python Python](#-python) • [TypeScript TypeScript](#-typescript) • [LangChain LangChain](#-langchain-integration) • [n8n n8n](#-n8n-integration) • [Vercel AI Vercel AI](./packages/integrations/vercel-ai/) • [OpenClaw OpenClaw](https://clawhub.ai/saschabuehrle/cascadeflow) • [📖 Docs](./docs/) • [💡 Examples](#examples)** +**[Python Python](#-python) • [TypeScript TypeScript](#-typescript) • [LangChain LangChain](#-langchain-integration) • [n8n n8n](#-n8n-integration) • [Vercel AI Vercel AI](./packages/integrations/vercel-ai/) • [OpenClaw OpenClaw](https://clawhub.ai/saschabuehrle/cascadeflow) • [Full Docs](https://docs.cascadeflow.dev) • [📖 Docs](./docs/) • [💡 Examples](#examples)** --- -**Stop Bleeding Money on AI Calls. Cut Costs 30-65% in 3 Lines of Code.** +**The in-process intelligence layer for AI agents.** Optimize cost, latency, quality, budget, compliance, and energy — inside the execution loop, not at the HTTP boundary. -40-70% of text prompts and 20-60% of agent calls don't need expensive flagship models. You're overpaying every single day. - -*cascadeflow fixes this with intelligent model cascading, available in Python and TypeScript.* +cascadeflow works where external proxies can't: per-step model decisions based on agent state, per-tool-call budget gating, runtime stop/continue/escalate actions, and business KPI injection during agent loops. Sub-1ms overhead. Works with LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, and Vercel AI SDK. ```python pip install cascadeflow @@ -52,6 +51,17 @@ npm install @cascadeflow/core ## Why cascadeflow? 
+### Proxy vs In-Process Harness + +| Dimension | External Proxy | cascadeflow Harness | +|---|---|---| +| **Scope** | HTTP request boundary | Inside agent execution loop | +| **Dimensions** | Cost only | Cost + quality + latency + budget + compliance + energy | +| **Latency overhead** | 10-50ms network RTT | <1ms in-process | +| **Business logic** | None | KPI weights and targets | +| **Enforcement** | None (observe only) | stop, deny_tool, switch_model | +| **Auditability** | Request logs | Per-step decision traces | + cascadeflow is an intelligent AI model cascading library that dynamically selects the optimal model for each query or tool call through speculative execution. It's based on the research that 40-70% of queries don't require slow, expensive flagship models, and domain-specific smaller models often outperform large general-purpose models on specialized tasks. For the remaining queries that need advanced reasoning, cascadeflow automatically escalates to flagship models if needed. ### Use Cases @@ -140,6 +150,34 @@ In practice, 60-70% of queries are handled by small, efficient models (8-20x cos --- +## Harness API + +Three tiers of integration — zero-change observability to full policy control: + +**Tier 1: Zero-change observability** +```python +import cascadeflow +cascadeflow.init(mode="observe") +# All OpenAI/Anthropic SDK calls are now tracked. No code changes needed. 
+``` + +**Tier 2: Scoped runs with budget** +```python +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + result = await agent.run("Analyze this dataset") + print(session.summary()) # cost, latency, energy, steps, tool calls + print(session.trace()) # full decision audit trail +``` + +**Tier 3: Decorated agents with policy** +```python +@cascadeflow.agent(budget=0.20, compliance="gdpr", kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}) +async def my_agent(query: str): + return await llm.complete(query) +``` + +--- + ## Quick Start ### Drop-In Gateway (Existing Apps) @@ -724,6 +762,12 @@ console.log(`Warnings: ${validation.warnings}`); | 📋 **Message & Tool Call Lists** | Full conversation history with tool_calls and tool_call_id preservation across turns | | 🪝 **Hooks & Callbacks** | Telemetry callbacks, cost events, and streaming hooks for observability | | 🏭 **Production Ready** | Streaming, batch processing, tool handling, reasoning model support, caching, error recovery, anomaly detection | +| 💳 **Budget Enforcement** | Per-run and per-user budget caps with automatic stop actions when limits are exceeded | +| 🔒 **Compliance Gating** | GDPR, HIPAA, PCI, and strict model allowlists — block non-compliant models before execution | +| 📊 **KPI-Weighted Routing** | Inject business priorities (quality, cost, latency, energy) as weights into every model decision | +| 🌱 **Energy Tracking** | Deterministic compute-intensity coefficients for carbon-aware AI operations | +| 🔍 **Decision Traces** | Full per-step audit trail: action, reason, model, cost, budget state, enforcement status | +| ⚙️ **Harness Modes** | off / observe / enforce — roll out safely with observe, then switch to enforce when ready | --- @@ -774,7 +818,7 @@ If you use cascadeflow in your research or project, please cite: ```bibtex @software{cascadeflow2025, author = {Lemony Inc., Sascha Buehrle and Contributors}, - title = {cascadeflow: Smart AI model cascading for cost 
optimization}, + title = {cascadeflow: Agent runtime intelligence layer for AI agent workflows}, year = {2025}, publisher = {GitHub}, url = {https://github.com/lemony-ai/cascadeflow} diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py index 6dd64b05..af4c429a 100644 --- a/cascadeflow/__init__.py +++ b/cascadeflow/__init__.py @@ -1,30 +1,23 @@ """ -cascadeflow - Smart AI model cascading for cost optimization. - -Route queries intelligently across multiple AI models from tiny SLMs -to frontier LLMs based on complexity, domain, and budget. - -Features: -- 🚀 Speculative cascades (2-3x faster) -- 💰 60-95% cost savings -- 🎯 Per-prompt domain detection -- 🎨 2.0x domain boost for specialists -- 🔍 Multi-factor optimization -- 🆓 Free tier (Ollama + Groq) -- ⚡ 3 lines of code - -Example: - >>> from cascadeflow import CascadeAgent, CascadePresets - >>> - >>> # Auto-detect available models - >>> models = CascadePresets.auto_detect_models() - >>> - >>> # Create agent with intelligence layer - >>> agent = CascadeAgent(models, enable_caching=True) - >>> - >>> # Run query (automatically optimized!) - >>> result = await agent.run("Fix this Python bug") - >>> print(f"Used {result.model_used} - Cost: ${result.cost:.6f}") +cascadeflow - Agent runtime intelligence layer. + +In-process harness that optimizes cost, latency, quality, budget, compliance, +and energy across AI agent workflows. Works inside agent execution loops with +full state awareness -- not an external proxy. + +Quick start: + import cascadeflow + cascadeflow.init(mode="observe") + # All OpenAI/Anthropic SDK calls are now tracked and traced. 
+ +Key APIs: + cascadeflow.init(mode) -- activate harness (off | observe | enforce) + cascadeflow.run(budget) -- scoped run with budget/trace + @cascadeflow.agent(budget) -- policy metadata on agent functions + session.summary() -- structured metrics + session.trace() -- full decision audit trail + +Integrations: LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, Vercel AI SDK """ __version__ = "1.0.0" diff --git a/docs-site/api-reference/python/agent-decorator.mdx b/docs-site/api-reference/python/agent-decorator.mdx new file mode 100644 index 00000000..912a03fd --- /dev/null +++ b/docs-site/api-reference/python/agent-decorator.mdx @@ -0,0 +1,79 @@ +--- +title: "@cascadeflow.agent()" +description: Decorate agent functions with policy metadata including budget, compliance, and KPI weights. +--- + +# @cascadeflow.agent() + +Annotate agent functions with policy metadata. The decorator attaches budget, compliance, and KPI configuration to the function for the harness to use at runtime. + +## Signature + +```python +def agent( + budget: Optional[float] = None, + compliance: Optional[str] = None, + kpi_weights: Optional[dict[str, float]] = None, + kpi_targets: Optional[dict[str, float]] = None, + max_tool_calls: Optional[int] = None, +) +``` + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `budget` | `float \| None` | `None` | Max USD for this agent | +| `compliance` | `str \| None` | `None` | Compliance mode | +| `kpi_weights` | `dict \| None` | `None` | KPI dimension weights | +| `kpi_targets` | `dict \| None` | `None` | KPI dimension targets | +| `max_tool_calls` | `int \| None` | `None` | Max tool/function calls | + +## Usage + +### Basic + +```python +@cascadeflow.agent(budget=0.20) +async def my_agent(query: str): + return await llm.complete(query) +``` + +### With compliance + +```python +@cascadeflow.agent(budget=0.50, compliance="gdpr") +async def eu_agent(query: str): + return await llm.complete(query) +``` + +### With 
KPI weights + +```python +@cascadeflow.agent( + budget=1.00, + kpi_weights={"quality": 0.8, "cost": 0.2}, + kpi_targets={"quality": 0.9}, +) +async def premium_agent(query: str): + return await llm.complete(query) +``` + +### Multiple agents with different policies + +```python +@cascadeflow.agent(budget=0.10, kpi_weights={"cost": 0.9, "quality": 0.1}) +async def triage_agent(query: str): + return await llm.complete(query) + +@cascadeflow.agent(budget=2.00, kpi_weights={"quality": 0.9, "cost": 0.1}) +async def analysis_agent(query: str): + return await llm.complete(query) +``` + +## Notes + +- The decorator does not wrap or modify the function's execution. It attaches metadata that the harness reads at runtime. +- Works with both sync and async functions. +- Requires `init()` to have been called for the metadata to take effect. +- Can be combined with `run()` — the run's constraints are checked in addition to the decorator's. diff --git a/docs-site/api-reference/python/harness-config.mdx b/docs-site/api-reference/python/harness-config.mdx new file mode 100644 index 00000000..42ae7a6d --- /dev/null +++ b/docs-site/api-reference/python/harness-config.mdx @@ -0,0 +1,73 @@ +--- +title: HarnessConfig +description: Full configuration dataclass for the cascadeflow harness with all fields, types, and defaults. +--- + +# HarnessConfig + +Configuration dataclass for the cascadeflow harness. Pass to `cascadeflow.init(config=...)` for full control. 
+ +## Definition + +```python +from dataclasses import dataclass +from typing import Optional + +@dataclass +class HarnessConfig: + mode: HarnessMode = "off" + verbose: bool = False + budget: Optional[float] = None + max_tool_calls: Optional[int] = None + max_latency_ms: Optional[float] = None + max_energy: Optional[float] = None + kpi_targets: Optional[dict[str, float]] = None + kpi_weights: Optional[dict[str, float]] = None + compliance: Optional[str] = None +``` + +## Fields + +| Field | Type | Default | Description | +|---|---|---|---| +| `mode` | `"off" \| "observe" \| "enforce"` | `"off"` | Harness mode | +| `verbose` | `bool` | `False` | Print decisions to stderr | +| `budget` | `float \| None` | `None` | Max USD for the run (None = unlimited) | +| `max_tool_calls` | `int \| None` | `None` | Max tool/function calls (None = unlimited) | +| `max_latency_ms` | `float \| None` | `None` | Max wall-clock ms per call (None = unlimited) | +| `max_energy` | `float \| None` | `None` | Max energy units (None = unlimited) | +| `kpi_targets` | `dict \| None` | `None` | Target values per KPI dimension | +| `kpi_weights` | `dict \| None` | `None` | Relative weights per KPI dimension | +| `compliance` | `str \| None` | `None` | Compliance mode: `"gdpr"`, `"hipaa"`, `"pci"`, `"strict"` | + +## HarnessMode + +```python +HarnessMode = Literal["off", "observe", "enforce"] +``` + +## Usage + +```python +from cascadeflow import HarnessConfig +import cascadeflow + +config = HarnessConfig( + mode="enforce", + budget=1.00, + max_tool_calls=20, + max_energy=200.0, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.85}, + verbose=True, +) + +cascadeflow.init(config=config) +``` + +## Import + +```python +from cascadeflow import HarnessConfig +``` diff --git a/docs-site/api-reference/python/init.mdx b/docs-site/api-reference/python/init.mdx new file mode 100644 index 00000000..b07a0e00 --- /dev/null +++ 
b/docs-site/api-reference/python/init.mdx @@ -0,0 +1,68 @@ +--- +title: cascadeflow.init() +description: Activate the cascadeflow harness globally with a mode and optional configuration. +--- + +# cascadeflow.init() + +Activate the harness globally. All subsequent LLM calls (OpenAI, Anthropic) are automatically tracked. + +## Signature + +```python +def init( + mode: HarnessMode = "off", + *, + config: Optional[HarnessConfig] = None, + verbose: bool = False, +) -> HarnessInitReport +``` + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `mode` | `"off" \| "observe" \| "enforce"` | `"off"` | Harness mode | +| `config` | `HarnessConfig \| None` | `None` | Full configuration (overrides mode) | +| `verbose` | `bool` | `False` | Print decisions to stderr | + +## Returns + +`HarnessInitReport` — confirmation of harness activation with mode and configuration summary. + +## Usage + +### Minimal + +```python +import cascadeflow +cascadeflow.init(mode="observe") +``` + +### With config + +```python +from cascadeflow import HarnessConfig + +config = HarnessConfig( + mode="enforce", + budget=1.00, + compliance="gdpr", + verbose=True, +) +cascadeflow.init(config=config) +``` + +### Environment-driven + +```python +import os +cascadeflow.init(mode=os.getenv("CASCADEFLOW_MODE", "observe")) +``` + +## Notes + +- Call `init()` once at application startup, before any LLM calls +- Calling `init()` again replaces the previous configuration +- Use `cascadeflow.reset()` to deactivate the harness +- `init(mode="off")` is equivalent to not calling `init()` at all diff --git a/docs-site/api-reference/python/run-context.mdx b/docs-site/api-reference/python/run-context.mdx new file mode 100644 index 00000000..be9377a4 --- /dev/null +++ b/docs-site/api-reference/python/run-context.mdx @@ -0,0 +1,76 @@ +--- +title: HarnessRunContext +description: Run context object yielded by cascadeflow.run() with summary(), trace(), and budget tracking methods. 
+--- + +# HarnessRunContext + +The context object yielded by `cascadeflow.run()`. Provides access to run metrics, decision traces, and budget state. + +## Methods + +### summary() + +Returns aggregate metrics for the run. + +```python +summary = session.summary() +``` + +Returns a dict with: + +| Key | Type | Description | +|---|---|---| +| `cost_total` | `float` | Cumulative cost in USD | +| `steps` | `int` | Number of LLM calls | +| `tool_calls` | `int` | Number of tool/function calls | +| `latency_total_ms` | `float` | Total wall-clock latency in ms | +| `energy_used` | `float` | Total energy units consumed | +| `budget_remaining` | `float \| None` | USD remaining (None if no budget set) | + +### trace() + +Returns the list of decision records for the run. + +```python +records = session.trace() +``` + +Each record is a dict with: + +| Key | Type | Description | +|---|---|---| +| `action` | `str` | `"allow"`, `"switch_model"`, `"deny_tool"`, or `"stop"` | +| `reason` | `str` | Human-readable explanation | +| `model` | `str` | Model name | +| `step` | `int` | Step number (1-indexed) | +| `cost_total` | `float` | Cumulative cost at this step | +| `budget_state` | `str` | `"ok"`, `"warning"`, or `"exceeded"` | +| `applied` | `bool` | Whether the action was enforced | + +## Usage + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Analyze this dataset") + + # Aggregate metrics + summary = session.summary() + print(f"Cost: ${summary['cost_total']:.4f}") + print(f"Steps: {summary['steps']}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") + + # Decision trace + for record in session.trace(): + print(f"Step {record['step']}: {record['action']} — {record['reason']}") +``` + +## Import + +```python +from cascadeflow import HarnessRunContext +``` diff --git a/docs-site/api-reference/python/run.mdx b/docs-site/api-reference/python/run.mdx new file mode 100644 
index 00000000..72202a74 --- /dev/null +++ b/docs-site/api-reference/python/run.mdx @@ -0,0 +1,83 @@ +--- +title: cascadeflow.run() +description: Create a scoped run context with budget caps, tool call limits, and metrics tracking. +--- + +# cascadeflow.run() + +Create a scoped run context manager that tracks metrics and optionally enforces constraints for a block of agent execution. + +## Signature + +```python +def run( + budget: Optional[float] = None, + max_tool_calls: Optional[int] = None, + max_latency_ms: Optional[float] = None, + max_energy: Optional[float] = None, + compliance: Optional[str] = None, + kpi_weights: Optional[dict[str, float]] = None, + kpi_targets: Optional[dict[str, float]] = None, +) -> ContextManager[HarnessRunContext] +``` + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `budget` | `float \| None` | `None` | Max USD for this run | +| `max_tool_calls` | `int \| None` | `None` | Max tool/function calls | +| `max_latency_ms` | `float \| None` | `None` | Max wall-clock ms per call | +| `max_energy` | `float \| None` | `None` | Max energy units | +| `compliance` | `str \| None` | `None` | `"gdpr"`, `"hipaa"`, `"pci"`, or `"strict"` | +| `kpi_weights` | `dict \| None` | `None` | KPI dimension weights | +| `kpi_targets` | `dict \| None` | `None` | KPI dimension targets | + +## Returns + +Context manager yielding `HarnessRunContext`. See [HarnessRunContext](/api-reference/python/run-context). 
+ +## Usage + +### Basic budget + +```python +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Analyze this data") + print(session.summary()) +``` + +### Full configuration + +```python +with cascadeflow.run( + budget=1.00, + max_tool_calls=10, + max_energy=100.0, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.9}, +) as session: + result = await agent.run("Process EU customer data") + print(session.summary()) + for record in session.trace(): + print(f"Step {record['step']}: {record['action']}") +``` + +### Nested runs + +Runs can be nested. Inner runs inherit the parent's remaining budget: + +```python +with cascadeflow.run(budget=1.00) as outer: + with cascadeflow.run(budget=0.30) as inner: + await agent.run("Sub-task") + # outer.summary() includes inner costs +``` + +## Notes + +- `run()` requires `init()` to have been called first +- Parameters override the global config for the duration of the block +- Use `session.summary()` for aggregate metrics +- Use `session.trace()` for per-step decision records diff --git a/docs-site/api-reference/typescript/core.mdx b/docs-site/api-reference/typescript/core.mdx new file mode 100644 index 00000000..ae8f8311 --- /dev/null +++ b/docs-site/api-reference/typescript/core.mdx @@ -0,0 +1,77 @@ +--- +title: "@cascadeflow/core" +description: TypeScript core package with CascadeAgent for model routing, cost tracking, and quality validation. +--- + +# @cascadeflow/core + +The core TypeScript package for cascadeflow. Provides `CascadeAgent` for speculative model cascading with quality validation. 
+ +## Install + +```bash +npm install @cascadeflow/core +``` + +## CascadeAgent + +```typescript +import { CascadeAgent, ModelConfig } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const result = await agent.run('What is TypeScript?'); +console.log(`Model: ${result.modelUsed}`); +console.log(`Cost: $${result.totalCost}`); +console.log(`Saved: ${result.savingsPercentage}%`); +``` + +## ModelConfig + +```typescript +interface ModelConfig { + name: string; // Model name (e.g. 'gpt-4o-mini') + provider: string; // Provider name (e.g. 'openai') + cost: number; // Cost per token (approximate) +} +``` + +## CascadeAgentOptions + +```typescript +interface CascadeAgentOptions { + models: ModelConfig[]; + quality?: { + threshold?: number; // Confidence threshold (0-1) + requireMinimumTokens?: number; // Min response length + useSemanticValidation?: boolean; // Enable ML validation + semanticThreshold?: number; // Semantic similarity threshold + }; +} +``` + +## Result + +```typescript +interface CascadeResult { + content: string; + modelUsed: string; + totalCost: number; + savingsPercentage: number; + cascadeDecision: string; +} +``` + +## Features + +- Speculative execution with quality validation +- Multi-provider support (OpenAI, Anthropic, Groq, Ollama, vLLM) +- Streaming responses +- Tool calling and structured output +- Cost tracking and analytics +- Works in Node.js, Browser, and Edge Functions diff --git a/docs-site/api-reference/typescript/langchain.mdx b/docs-site/api-reference/typescript/langchain.mdx new file mode 100644 index 00000000..9a9e3050 --- /dev/null +++ b/docs-site/api-reference/typescript/langchain.mdx @@ -0,0 +1,77 @@ +--- +title: "@cascadeflow/langchain" +description: TypeScript LangChain integration with withCascade() for drop-in cascade routing and model discovery helpers. 
+--- + +# @cascadeflow/langchain + +LangChain integration for TypeScript. Provides `withCascade()` for drop-in cascade routing with any LangChain chat model. + +## Install + +```bash +npm install @cascadeflow/langchain @langchain/core @langchain/openai +``` + +## withCascade + +Creates a cascade-enabled chat model from a drafter and verifier. + +```typescript +import { ChatOpenAI } from '@langchain/openai'; +import { ChatAnthropic } from '@langchain/anthropic'; +import { withCascade } from '@cascadeflow/langchain'; + +const cascade = withCascade({ + drafter: new ChatOpenAI({ model: 'gpt-4o-mini' }), + verifier: new ChatAnthropic({ model: 'claude-sonnet-4' }), + qualityThreshold: 0.8, +}); + +// Use like any LangChain chat model +const result = await cascade.invoke('Explain quantum computing'); + +// With LCEL chains +const chain = prompt.pipe(cascade).pipe(new StringOutputParser()); +``` + +## Options + +```typescript +interface CascadeOptions { + drafter: BaseChatModel; // Cheap, fast model + verifier: BaseChatModel; // Powerful fallback model + qualityThreshold?: number; // 0-1, default 0.4 +} +``` + +## Model Discovery + +```typescript +import { + discoverCascadePairs, + findBestCascadePair, + analyzeModel, + validateCascadePair, +} from '@cascadeflow/langchain'; + +const models = [ + new ChatOpenAI({ model: 'gpt-4o-mini' }), + new ChatOpenAI({ model: 'gpt-4o' }), + new ChatAnthropic({ model: 'claude-sonnet-4' }), +]; + +const best = findBestCascadePair(models); +const cascade = withCascade({ + drafter: best.drafter, + verifier: best.verifier, +}); +``` + +## Features + +- Full LCEL support (pipes, sequences, batch) +- Streaming with pre-routing +- Tool calling and structured output +- LangSmith cost tracking metadata +- Model discovery and pair validation diff --git a/docs-site/api-reference/typescript/vercel-ai.mdx b/docs-site/api-reference/typescript/vercel-ai.mdx new file mode 100644 index 00000000..ae9af949 --- /dev/null +++ 
b/docs-site/api-reference/typescript/vercel-ai.mdx @@ -0,0 +1,63 @@ +--- +title: "@cascadeflow/vercel-ai" +description: Vercel AI SDK middleware integration for cascade routing with streaming, multi-turn chat, and tool execution. +--- + +# @cascadeflow/vercel-ai + +Middleware integration for the Vercel AI SDK. Adds cascade routing to AI SDK applications with streaming support. + +## Install + +```bash +npm install @cascadeflow/vercel-ai +``` + +## createChatHandler + +Creates a request handler for AI SDK chat endpoints. + +```typescript +import { createChatHandler } from '@cascadeflow/vercel-ai'; +import { CascadeAgent } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const handler = createChatHandler(agent, { + protocol: 'data', + tools, + toolHandlers, + maxSteps: 5, +}); +``` + +## Options + +```typescript +interface ChatHandlerOptions { + protocol: 'data' | 'ui'; // AI SDK stream protocol + tools?: ToolDefinition[]; // Tool definitions + toolHandlers?: Record; // Server-side tool execution + toolExecutor?: Function; // Universal tool executor + maxSteps?: number; // Multi-step tool loop limit + forceDirect?: boolean; // Skip cascade, use verifier + allowOverrides?: string[]; // Request-level override keys + overrideSecret?: string; // Shared secret for overrides +} +``` + +## Features + +- AI SDK v4 `data` stream and v5/v6 UI streams +- `useChat` multi-turn support +- `parts` message format (AI SDK v6) +- Tool call streaming visibility +- Server-side tool execution loops +- Multi-step controls +- Cascade decision stream parts +- Request-level overrides with allowlist diff --git a/docs-site/changelog.mdx b/docs-site/changelog.mdx new file mode 100644 index 00000000..2cda1c2f --- /dev/null +++ b/docs-site/changelog.mdx @@ -0,0 +1,28 @@ +--- +title: Changelog +description: Release history and changelog for 
cascadeflow. +--- + +# Changelog + +For the full release history, see [GitHub Releases](https://github.com/lemony-ai/cascadeflow/releases). + +## Recent Highlights + +- **v1.0.0** — Agent runtime intelligence layer with harness API, 6 framework integrations, compliance gating, KPI-weighted routing, energy tracking, decision traces +- Agent loops and multi-agent orchestration +- Tool execution engine with parallel execution and risk gating +- Hooks and callbacks for telemetry and observability +- Vercel AI SDK integration (17+ additional providers) +- OpenClaw provider for custom deployments +- Gateway server (drop-in OpenAI/Anthropic-compatible endpoint) +- User tier management with per-user budgets +- Semantic quality validators via FastEmbed +- Domain-aware cascading with 16 domain classifications +- Benchmark reports (MMLU, GSM8K, MT-Bench, HumanEval, TruthfulQA) + +## Links + +- [GitHub Releases](https://github.com/lemony-ai/cascadeflow/releases) +- [PyPI](https://pypi.org/project/cascadeflow/) +- [npm](https://www.npmjs.com/package/@cascadeflow/core) diff --git a/docs-site/contributing.mdx b/docs-site/contributing.mdx new file mode 100644 index 00000000..ff45625e --- /dev/null +++ b/docs-site/contributing.mdx @@ -0,0 +1,96 @@ +--- +title: Contributing +description: How to contribute to cascadeflow — development setup, code style, testing, and pull request process. +--- + +# Contributing + +We welcome contributions to cascadeflow. This guide covers development setup for both Python and TypeScript. 
+ +## Monorepo Structure + +``` +cascadeflow/ + cascadeflow/ # Python package + packages/ + core/ # TypeScript core + langchain-cascadeflow/ # LangChain TypeScript + integrations/ + vercel-ai/ # Vercel AI SDK + n8n/ # n8n community nodes + tests/ # Python tests + examples/ # Python examples + docs/ # Documentation + docs-site/ # Mintlify docs site +``` + +## Python Development + +### Setup + +```bash +git clone https://github.com/lemony-ai/cascadeflow.git +cd cascadeflow +python -m venv .venv +source .venv/bin/activate +pip install -e ".[dev]" +pre-commit install +``` + +### Code Style + +- **Formatter**: Black (line length 100) +- **Linter**: Ruff +- **Type checker**: mypy +- **Import sorting**: isort + +```bash +black cascadeflow/ tests/ +ruff check cascadeflow/ tests/ +mypy cascadeflow/ +``` + +### Testing + +```bash +pytest tests/ -x -q # Run all tests +pytest tests/ -m "not integration" # Skip integration tests +pytest tests/ --cov=cascadeflow # With coverage +``` + +## TypeScript Development + +### Setup + +```bash +cd packages/core +pnpm install +pnpm build +pnpm test +``` + +### Code Style + +- **Linter**: ESLint +- **Language**: TypeScript (strict mode) +- **Indentation**: 2 spaces + +## Making Changes + +1. Create a branch from `main` +2. Make changes with clear, descriptive commits +3. Follow commit conventions: `feat:`, `fix:`, `docs:`, `test:`, `refactor:`, `chore:` +4. Add tests for new functionality +5. 
Ensure all tests pass + +## Pull Requests + +- All PRs require review approval +- Linear history enforced (no merge commits) +- CI must pass before merge + +## Links + +- [GitHub Issues](https://github.com/lemony-ai/cascadeflow/issues) — Bug reports and feature requests +- [GitHub Discussions](https://github.com/lemony-ai/cascadeflow/discussions) — Questions and community +- [Email](mailto:hello@lemony.ai) — Direct support diff --git a/docs-site/docs.json b/docs-site/docs.json new file mode 100644 index 00000000..1e441f37 --- /dev/null +++ b/docs-site/docs.json @@ -0,0 +1,130 @@ +{ + "$schema": "https://mintlify.com/docs.json", + "theme": "palm", + "name": "cascadeflow", + "colors": { + "primary": "#0E7490", + "light": "#22D3EE", + "dark": "#0E7490" + }, + "logo": { + "light": "/logo/cascadeflow-light.svg", + "dark": "/logo/cascadeflow-dark.svg" + }, + "favicon": "/favicon.svg", + "tabs": [ + { "id": "get-started", "name": "Get Started" }, + { "id": "harness", "name": "Harness" }, + { "id": "integrations", "name": "Integrations" }, + { "id": "api-reference", "name": "API Reference" }, + { "id": "examples", "name": "Examples" } + ], + "navigation": { + "get-started": [ + { + "group": "Get Started", + "pages": [ + "get-started/introduction", + "get-started/quickstart", + "get-started/installation", + "get-started/how-it-works" + ] + }, + { + "group": "Resources", + "pages": [ + "changelog", + "contributing" + ] + } + ], + "harness": [ + { + "group": "Harness", + "pages": [ + "harness/overview", + "harness/modes", + "harness/budget-enforcement", + "harness/compliance", + "harness/kpi-optimization", + "harness/energy-tracking", + "harness/decision-trace", + "harness/actions" + ] + } + ], + "integrations": [ + { + "group": "Integrations", + "pages": [ + "integrations/overview", + "integrations/langchain", + "integrations/openai-agents", + "integrations/crewai", + "integrations/google-adk", + "integrations/n8n", + "integrations/vercel-ai" + ] + } + ], + "api-reference": 
[ + { + "group": "Python", + "pages": [ + "api-reference/python/init", + "api-reference/python/run", + "api-reference/python/agent-decorator", + "api-reference/python/harness-config", + "api-reference/python/run-context" + ] + }, + { + "group": "TypeScript", + "pages": [ + "api-reference/typescript/core", + "api-reference/typescript/vercel-ai", + "api-reference/typescript/langchain" + ] + } + ], + "examples": [ + { + "group": "Examples", + "pages": [ + "examples/basic-usage", + "examples/budget-enforcement", + "examples/compliance-gating", + "examples/kpi-weighted-routing", + "examples/multi-agent", + "examples/enterprise-patterns" + ] + } + ] + }, + "topbarLinks": [ + { + "name": "GitHub", + "url": "https://github.com/lemony-ai/cascadeflow" + } + ], + "topbarCtaButton": { + "name": "Get Started", + "url": "/get-started/quickstart" + }, + "footerSocials": { + "github": "https://github.com/lemony-ai/cascadeflow", + "x": "https://x.com/saschabuehrle" + }, + "anchors": [ + { + "name": "GitHub", + "icon": "github", + "url": "https://github.com/lemony-ai/cascadeflow" + }, + { + "name": "PyPI", + "icon": "python", + "url": "https://pypi.org/project/cascadeflow/" + } + ] +} diff --git a/docs-site/examples/basic-usage.mdx b/docs-site/examples/basic-usage.mdx new file mode 100644 index 00000000..9cf838d0 --- /dev/null +++ b/docs-site/examples/basic-usage.mdx @@ -0,0 +1,81 @@ +--- +title: Basic Usage +description: Simple cascade setup with OpenAI models showing speculative execution, cost tracking, and savings calculation. +--- + +# Basic Usage + +A minimal example showing cascadeflow's speculative cascade with two OpenAI models. + +## Setup + +```bash +pip install "cascadeflow[openai]" +export OPENAI_API_KEY="sk-..." 
+``` + +## Code + +```python +import asyncio +from cascadeflow import CascadeAgent, ModelConfig + +agent = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +queries = [ + "What's the capital of France?", # Simple — draft model handles + "Explain quantum computing", # Medium — may escalate + "Write a Python function to sort a list", # Code — domain routing +] + +async def main(): + total_cost = 0 + baseline_cost = 0 + + for query in queries: + result = await agent.run(query) + total_cost += result.total_cost + baseline_cost += result.total_cost if result.model_used == "gpt-4o" else result.total_cost * (0.00625 / 0.000375) + + print(f"Query: {query[:40]}...") + print(f" Model: {result.model_used}") + print(f" Cost: ${result.total_cost:.6f}") + print() + + savings = (1 - total_cost / baseline_cost) * 100 if baseline_cost > 0 else 0 + print(f"Total cost: ${total_cost:.6f}") + print(f"Savings: {savings:.0f}%") + +asyncio.run(main()) +``` + +## How It Works + +1. `gpt-4o-mini` (draft model) handles the query first +2. Quality validation checks the response +3. If quality passes, the draft response is returned (60-70% of queries) +4. If quality fails, `gpt-4o` (verifier model) handles the query +5. 
Cost tracking reports per-query and aggregate metrics + +## TypeScript + +```typescript +import { CascadeAgent } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const result = await agent.run('What is TypeScript?'); +console.log(`Model: ${result.modelUsed}, Cost: $${result.totalCost}`); +``` + +## Source + +[examples/basic_usage.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/basic_usage.py) diff --git a/docs-site/examples/budget-enforcement.mdx b/docs-site/examples/budget-enforcement.mdx new file mode 100644 index 00000000..dab52ed9 --- /dev/null +++ b/docs-site/examples/budget-enforcement.mdx @@ -0,0 +1,84 @@ +--- +title: Budget Enforcement +description: Per-run and per-user budget caps with enforcement callbacks, cost tracking, and automatic stop actions. +--- + +# Budget Enforcement + +Enforce spending limits on agent runs with automatic stop actions when budget is exceeded. 
+ +## Basic Budget Cap + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Research and summarize this topic") + + summary = session.summary() + print(f"Cost: ${summary['cost_total']:.4f}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") + print(f"Steps completed: {summary['steps']}") +``` + +## Budget with Tool Call Limit + +```python +with cascadeflow.run(budget=1.00, max_tool_calls=5) as session: + result = await agent.run("Search and analyze this dataset") + # Stops when either budget or tool call limit is hit +``` + +## Per-Agent Budgets + +```python +@cascadeflow.agent(budget=0.10) +async def triage_agent(query: str): + """Cheap triage — $0.10 max.""" + return await llm.complete(query) + +@cascadeflow.agent(budget=2.00) +async def research_agent(query: str): + """Deep research — $2.00 max.""" + return await llm.complete(query) +``` + +## Cost Tracking (Legacy API) + +For pre-harness budget enforcement using the telemetry API: + +```python +from cascadeflow.telemetry import BudgetConfig, CostTracker, strict_budget_enforcement + +tracker = CostTracker( + budget_config=BudgetConfig( + daily_limit=10.0, + per_query_limit=0.50, + alert_threshold=0.8, + ), + enforcement_callback=strict_budget_enforcement, +) + +# Track costs manually +tracker.track(model="gpt-4o", cost=0.003) +print(f"Daily spend: ${tracker.daily_spend:.4f}") +``` + +## Decision Trace + +```python +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Multi-step analysis") + + for record in session.trace(): + if record['action'] == 'stop': + print(f"Stopped at step {record['step']}: {record['reason']}") + else: + print(f"Step {record['step']}: {record['action']} (${record['cost_total']:.4f})") +``` + +## Source + +[examples/enforcement/basic_enforcement.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/enforcement/basic_enforcement.py) diff --git 
a/docs-site/examples/compliance-gating.mdx b/docs-site/examples/compliance-gating.mdx new file mode 100644 index 00000000..19f9fbd3 --- /dev/null +++ b/docs-site/examples/compliance-gating.mdx @@ -0,0 +1,89 @@ +--- +title: Compliance Gating +description: GDPR, HIPAA, PCI, and strict model allowlists with enforcement examples for regulated agent workflows. +--- + +# Compliance Gating + +Restrict which models can be used based on compliance requirements. + +## GDPR Compliance + +Only allow models approved for EU data processing: + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(compliance="gdpr") as session: + # Only gpt-4o, gpt-4o-mini, gpt-3.5-turbo are allowed + result = await agent.run("Process this EU customer feedback") + + for record in session.trace(): + if record['action'] == 'switch_model': + print(f"Model switched: {record['reason']}") +``` + +## HIPAA Compliance + +For healthcare data — stricter allowlist: + +```python +with cascadeflow.run(compliance="hipaa") as session: + # Only gpt-4o, gpt-4o-mini are allowed + result = await agent.run("Summarize this patient record") +``` + +## PCI Compliance + +For payment card data: + +```python +with cascadeflow.run(compliance="pci") as session: + # Only gpt-4o-mini, gpt-3.5-turbo are allowed + result = await agent.run("Analyze this transaction") +``` + +## Strict Mode + +Maximum restriction — single model only: + +```python +with cascadeflow.run(compliance="strict") as session: + # Only gpt-4o is allowed + result = await agent.run("Classify this sensitive document") +``` + +## Compliance Allowlists + +| Mode | Allowed Models | +|---|---| +| `gdpr` | gpt-4o, gpt-4o-mini, gpt-3.5-turbo | +| `hipaa` | gpt-4o, gpt-4o-mini | +| `pci` | gpt-4o-mini, gpt-3.5-turbo | +| `strict` | gpt-4o | + +## Combining with Budget + +```python +@cascadeflow.agent(budget=1.00, compliance="gdpr") +async def eu_data_agent(query: str): + """Process EU data within budget using only GDPR-approved 
models.""" + return await llm.complete(query) +``` + +## Observe Mode for Audit + +Use `observe` mode to audit which models would be blocked without affecting production: + +```python +cascadeflow.init(mode="observe") + +with cascadeflow.run(compliance="hipaa") as session: + result = await agent.run("Process health data") + + # Check which calls would have been blocked + violations = [r for r in session.trace() if r['action'] == 'switch_model'] + print(f"Compliance violations detected: {len(violations)}") +``` diff --git a/docs-site/examples/enterprise-patterns.mdx b/docs-site/examples/enterprise-patterns.mdx new file mode 100644 index 00000000..5949972c --- /dev/null +++ b/docs-site/examples/enterprise-patterns.mdx @@ -0,0 +1,127 @@ +--- +title: Enterprise Patterns +description: Production-ready patterns including retry logic, rate limiting, budget management, circuit breakers, caching, and health monitoring. +--- + +# Enterprise Patterns + +Production patterns for deploying cascadeflow at scale. 
+ +## Retry with Exponential Backoff + +```python +import asyncio +from cascadeflow import CascadeAgent + +async def execute_with_retry(agent, query, max_retries=3, base_delay=1.0): + for attempt in range(max_retries): + try: + return await agent.run(query) + except Exception as e: + if attempt == max_retries - 1: + raise + delay = base_delay * (2 ** attempt) + await asyncio.sleep(delay) +``` + +## Rate Limiting + +```python +import time +from collections import deque + +class RateLimiter: + def __init__(self, max_requests: int, window_seconds: float): + self.max_requests = max_requests + self.window = window_seconds + self.requests = deque() + + async def acquire(self): + now = time.monotonic() + while self.requests and self.requests[0] < now - self.window: + self.requests.popleft() + if len(self.requests) >= self.max_requests: + wait = self.requests[0] + self.window - now + await asyncio.sleep(wait) + self.requests.append(time.monotonic()) +``` + +## Budget Management + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +# Per-user daily budget +async def handle_user_request(user_id: str, query: str): + user_budget = get_user_remaining_budget(user_id) + + with cascadeflow.run(budget=min(user_budget, 0.50)) as session: + result = await agent.run(query) + + spent = session.summary()['cost_total'] + update_user_budget(user_id, spent) + return result +``` + +## Circuit Breaker + +```python +from cascadeflow import CircuitBreaker, CircuitBreakerConfig + +config = CircuitBreakerConfig( + failure_threshold=5, + recovery_timeout=30.0, + half_open_max_calls=2, +) + +breaker = CircuitBreaker(config=config) + +async def safe_call(agent, query): + if not breaker.allow_request(): + return fallback_response(query) + try: + result = await agent.run(query) + breaker.record_success() + return result + except Exception as e: + breaker.record_failure() + raise +``` + +## Response Caching + +```python +from cascadeflow import ResponseCache + +cache = 
ResponseCache(max_size=1000, ttl_seconds=300) + +async def cached_run(agent, query): + cached = cache.get(query) + if cached: + return cached + result = await agent.run(query) + cache.set(query, result) + return result +``` + +## Health Monitoring + +```python +with cascadeflow.run(budget=10.00) as session: + for query in production_queries: + result = await agent.run(query) + + summary = session.summary() + + # Alert on anomalies + if summary['cost_total'] > 8.0: + alert("Budget 80% consumed") + if summary['steps'] > 100: + alert("High step count") +``` + +## Source + +[examples/production_patterns.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/production_patterns.py) diff --git a/docs-site/examples/kpi-weighted-routing.mdx b/docs-site/examples/kpi-weighted-routing.mdx new file mode 100644 index 00000000..5bab7689 --- /dev/null +++ b/docs-site/examples/kpi-weighted-routing.mdx @@ -0,0 +1,95 @@ +--- +title: KPI-Weighted Routing +description: Configure quality, cost, latency, and energy weights to encode business priorities into model routing decisions. +--- + +# KPI-Weighted Routing + +Inject business priorities into every model decision using KPI weights. 
+ +## Quality-First (Premium Workload) + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + budget=2.00, + kpi_weights={"quality": 0.8, "cost": 0.1, "latency": 0.1}, + kpi_targets={"quality": 0.9} +) as session: + # Routes to highest-quality models within budget + result = await agent.run("Draft a legal contract clause") + print(session.summary()) +``` + +## Cost-First (High-Volume Batch) + +```python +with cascadeflow.run( + budget=5.00, + kpi_weights={"cost": 0.7, "quality": 0.2, "latency": 0.1} +) as session: + # Routes to cheapest models that meet quality floor + for query in batch_queries: + result = await agent.run(query) + print(f"Total cost: ${session.summary()['cost_total']:.4f}") +``` + +## Latency-First (Real-Time) + +```python +with cascadeflow.run( + kpi_weights={"latency": 0.7, "quality": 0.2, "cost": 0.1}, + max_latency_ms=2000.0 +) as session: + # Routes to fastest models, hard cap at 2 seconds + result = await agent.run("Quick classification task") +``` + +## Energy-Aware (Carbon-Conscious) + +```python +with cascadeflow.run( + kpi_weights={"quality": 0.4, "energy": 0.3, "cost": 0.3}, + max_energy=100.0 +) as session: + # Balances quality with energy efficiency + result = await agent.run("Summarize this report") + print(f"Energy used: {session.summary()['energy_used']:.1f} units") +``` + +## Per-Agent Profiles + +```python +@cascadeflow.agent( + budget=0.10, + kpi_weights={"cost": 0.9, "quality": 0.1} +) +async def triage_agent(query: str): + """Quick classification — prioritize cost.""" + return await llm.complete(query) + +@cascadeflow.agent( + budget=2.00, + kpi_weights={"quality": 0.9, "cost": 0.1}, + kpi_targets={"quality": 0.95} +) +async def analysis_agent(query: str): + """Deep analysis — prioritize quality.""" + return await llm.complete(query) +``` + +## Quality Priors + +The harness uses built-in quality priors for scoring: + +| Model | Quality Prior | Latency Prior | +|---|---|---| +| o1 | 0.95 
| 0.40 | +| gpt-4o | 0.90 | 0.72 | +| gpt-4-turbo | 0.88 | 0.66 | +| gpt-5-mini | 0.86 | 0.84 | +| gpt-4o-mini | 0.75 | 0.93 | +| gpt-3.5-turbo | 0.65 | 1.00 | diff --git a/docs-site/examples/multi-agent.mdx b/docs-site/examples/multi-agent.mdx new file mode 100644 index 00000000..06b9598b --- /dev/null +++ b/docs-site/examples/multi-agent.mdx @@ -0,0 +1,103 @@ +--- +title: Multi-Agent Orchestration +description: Multi-turn tool execution with agent-as-a-tool delegation and budget tracking across agent boundaries. +--- + +# Multi-Agent Orchestration + +cascadeflow supports multi-agent patterns with tool execution, delegation, and budget tracking across agent boundaries. + +## Tool Execution Loop + +```python +import asyncio +from cascadeflow import CascadeAgent, ModelConfig +from cascadeflow.tools import ToolConfig, ToolExecutor + +# Define tools +tools = [ + ToolConfig( + name="calculator", + description="Evaluate a math expression", + parameters={"expression": {"type": "string"}}, + handler=lambda expression: str(eval(expression)), + ), + ToolConfig( + name="search", + description="Search the web", + parameters={"query": {"type": "string"}}, + handler=lambda query: f"Results for: {query}", + ), +] + +agent = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +executor = ToolExecutor(tools=tools) + +async def main(): + result = await agent.run( + "Calculate 15% of 250 and search for tax rates", + tools=tools, + tool_executor=executor, + max_steps=5, + ) + print(result.content) + +asyncio.run(main()) +``` + +## With Harness Budget Tracking + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=1.00, max_tool_calls=10) as session: + result = await agent.run( + "Research this topic using multiple tools", + tools=tools, + tool_executor=executor, + max_steps=10, + ) + + summary = session.summary() + print(f"Cost: 
${summary['cost_total']:.4f}") + print(f"Tool calls: {summary['tool_calls']}") + print(f"Steps: {summary['steps']}") +``` + +## Agent-as-a-Tool Delegation + +```python +# Define a researcher agent as a tool +researcher = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +async def research_handler(query: str) -> str: + result = await researcher.run(query) + return result.content + +# Main agent can delegate to researcher +tools = [ + ToolConfig( + name="research", + description="Delegate research to a specialist agent", + parameters={"query": {"type": "string"}}, + handler=research_handler, + ), +] + +# Budget tracks across both agents +with cascadeflow.run(budget=2.00) as session: + result = await main_agent.run("Analyze and research this topic", tools=tools) +``` + +## Source + +[examples/agentic_multi_agent.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/agentic_multi_agent.py) diff --git a/docs-site/favicon.svg b/docs-site/favicon.svg new file mode 100644 index 00000000..496df9f5 --- /dev/null +++ b/docs-site/favicon.svg @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/docs-site/get-started/how-it-works.mdx b/docs-site/get-started/how-it-works.mdx new file mode 100644 index 00000000..721feef6 --- /dev/null +++ b/docs-site/get-started/how-it-works.mdx @@ -0,0 +1,112 @@ +--- +title: How It Works +description: Architecture of cascadeflow's two engines — Cascade for speculative model routing and Harness for agent runtime intelligence. +--- + +# How It Works + +cascadeflow ships two complementary engines that can be used independently or together. + +## Cascade Engine + +The Cascade Engine optimizes model selection through **speculative execution with quality validation**: + +1. **Speculatively executes** small, fast models first — optimistic execution ($0.15-0.30/1M tokens) +2. 
**Validates quality** of responses using configurable thresholds (completeness, confidence, correctness) +3. **Dynamically escalates** to larger models only when quality validation fails ($1.25-3.00/1M tokens) +4. **Learns patterns** to optimize future cascading decisions and domain-specific routing + +In practice, 60-70% of queries are handled by small, efficient models without escalation. + +**Result:** 40-85% cost reduction, 2-10x faster responses, zero quality loss. + +``` +Query → Domain Detection → Try Draft Model → Quality Check + │ + Pass ───┘─── Fail + │ │ + Return Escalate to + Result Verifier Model +``` + +## Harness Engine + +The Harness Engine provides **agent runtime intelligence** — budget enforcement, compliance gating, KPI-weighted routing, energy tracking, and decision traces. + +Unlike the Cascade Engine which routes between models, the Harness Engine wraps existing agent execution and makes decisions at every step: + +``` +Agent Step → Harness Decision → allow / switch_model / deny_tool / stop + │ + ├── Check budget remaining + ├── Check compliance allowlist + ├── Score KPI dimensions + ├── Check tool call cap + ├── Check latency cap + └── Check energy cap +``` + +### Decision Flow + +For each LLM call or tool execution inside an agent loop, the harness: + +1. **Records** the model, step number, and cumulative metrics +2. **Evaluates** all configured constraints (budget, compliance, tool calls, latency, energy) +3. **Scores** the call against KPI weights if configured +4. **Decides** an action: `allow`, `switch_model`, `deny_tool`, or `stop` +5. **Enforces** the action if in `enforce` mode (logs only in `observe` mode) +6. 
**Appends** a trace record for auditability + +### HarnessConfig + +All harness behavior is configured through a single dataclass: + +```python +HarnessConfig( + mode="enforce", # off | observe | enforce + budget=0.50, # Max USD for the run + max_tool_calls=10, # Max tool/function calls + max_latency_ms=5000.0, # Max wall-clock ms per call + max_energy=100.0, # Max energy units + compliance="gdpr", # gdpr | hipaa | pci | strict + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.9}, +) +``` + +## Combined Usage + +When both engines are active, the Cascade Engine handles model selection while the Harness Engine enforces constraints: + +```python +import cascadeflow +from cascadeflow import CascadeAgent, ModelConfig + +# Harness: enforce budget and compliance +cascadeflow.init(mode="enforce") + +# Cascade: speculative model routing +agent = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +with cascadeflow.run(budget=1.00) as session: + result = await agent.run("Analyze this contract for GDPR compliance") + print(session.summary()) +``` + +## Provider Abstraction + +cascadeflow supports 17+ providers through a unified interface: + +| Provider | Type | Package | +|---|---|---| +| OpenAI | API | `cascadeflow[openai]` | +| Anthropic | API | `cascadeflow[anthropic]` | +| Groq | API | `cascadeflow[groq]` | +| Together | API | `cascadeflow[together]` | +| Hugging Face | API | `cascadeflow[huggingface]` | +| Ollama | Local | Built-in (HTTP) | +| vLLM | Local | `cascadeflow[vllm]` | +| Vercel AI SDK | TypeScript | `@cascadeflow/vercel-ai` | diff --git a/docs-site/get-started/installation.mdx b/docs-site/get-started/installation.mdx new file mode 100644 index 00000000..ff6b8583 --- /dev/null +++ b/docs-site/get-started/installation.mdx @@ -0,0 +1,101 @@ +--- +title: Installation +description: Install cascadeflow with pip extras 
for Python or npm packages for TypeScript, including provider-specific setup. +--- + +# Installation + +## Python + +### Minimal install + +```bash +pip install cascadeflow +``` + +Core dependencies: `pydantic>=2.0.0`, `httpx>=0.25.0`, `tiktoken>=0.5.0`, `rich>=13.0.0`. + +### With providers + +```bash +pip install "cascadeflow[providers]" # OpenAI + Anthropic + Groq +``` + +Individual providers: + +```bash +pip install "cascadeflow[openai]" # OpenAI +pip install "cascadeflow[anthropic]" # Anthropic +pip install "cascadeflow[groq]" # Groq +pip install "cascadeflow[huggingface]" # Hugging Face +pip install "cascadeflow[together]" # Together AI +``` + +### With framework integrations + +```bash +pip install "cascadeflow[langchain]" # LangChain/LangGraph +pip install "cascadeflow[openai-agents]" # OpenAI Agents SDK +pip install "cascadeflow[crewai]" # CrewAI (Python 3.10+) +pip install "cascadeflow[google-adk]" # Google ADK (Python 3.10+) +``` + +### Local inference + +```bash +pip install "cascadeflow[vllm]" # vLLM (Python 3.10-3.13) +``` + +Ollama does not need a Python package — cascadeflow communicates with Ollama via HTTP at `localhost:11434`. Install Ollama separately from [ollama.ai](https://ollama.ai). + +### Everything + +```bash +pip install "cascadeflow[all]" # All providers + semantic routing +``` + +### Development + +```bash +git clone https://github.com/lemony-ai/cascadeflow.git +cd cascadeflow +pip install -e ".[dev]" +``` + +## TypeScript + +### Core + +```bash +npm install @cascadeflow/core +``` + +### Framework packages + +```bash +npm install @cascadeflow/langchain # LangChain integration +npm install @cascadeflow/vercel-ai # Vercel AI SDK middleware +npm install @cascadeflow/n8n-nodes-cascadeflow # n8n community node +``` + +## Provider Setup + +Set API keys as environment variables: + +```bash +export OPENAI_API_KEY="sk-..." +export ANTHROPIC_API_KEY="sk-ant-..." +export GROQ_API_KEY="gsk_..." 
+``` + +cascadeflow auto-detects available providers based on which API keys are set. + +## Verify Installation + +```bash +python -c "import cascadeflow; print(cascadeflow.__version__)" +``` + +```bash +python -c "from cascadeflow import init, run, HarnessConfig, HarnessRunContext; print('OK')" +``` diff --git a/docs-site/get-started/introduction.mdx b/docs-site/get-started/introduction.mdx new file mode 100644 index 00000000..39c2f74c --- /dev/null +++ b/docs-site/get-started/introduction.mdx @@ -0,0 +1,62 @@ +--- +title: Introduction +description: What cascadeflow is, how it differs from external proxies, and when to use it for agent runtime intelligence. +--- + +# Introduction + +cascadeflow is an in-process intelligence layer that sits inside AI agent execution loops. Unlike external proxies that only see HTTP request boundaries, cascadeflow operates with full agent state awareness: step count, budget consumed, tool call history, error context, quality scores, domain, complexity, and user-defined business context. + +## What makes cascadeflow different + +**1. Inside-the-loop control.** Decisions happen per-step and per-tool-call inside agent execution, not at the HTTP boundary. This enables budget gating mid-run, model switching based on remaining budget, and stop actions when caps are hit. + +**2. Multi-dimensional optimization.** Six dimensions scored simultaneously: cost, latency, quality, budget, compliance, and energy. Not just cost routing. + +**3. Business logic injection.** KPI weights and targets let teams encode business priorities (e.g. 60% quality, 30% cost, 10% latency) into every model decision. + +**4. Actionable decisions.** Four actions: `allow`, `switch_model`, `deny_tool`, `stop`. The harness does not just observe — it controls execution flow. + +**5. Full transparency.** Every decision produces a trace record with action, reason, model, step, cost_total, budget_state, and applied fields. Audit-ready. + +**6. 
Measurable value.** Session summaries report cost, latency, energy, steps, tool calls, and budget remaining. Before/after comparison is built in. + +**7. Cross-framework policy layer.** Unified KPI semantics across LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, and Vercel AI SDK. + +**8. Latency advantage.** In-process instrumentation adds less than 1ms overhead per call. External proxies add 10-50ms of network round-trip latency per LLM call. + +## Proxy vs In-Process Harness + +| Dimension | External Proxy | cascadeflow Harness | +|---|---|---| +| **Scope** | HTTP request boundary | Inside agent execution loop | +| **Dimensions** | Cost only | Cost + quality + latency + budget + compliance + energy | +| **Latency overhead** | 10-50ms network RTT | <1ms in-process | +| **Business logic** | None | KPI weights and targets | +| **Enforcement** | None (observe only) | stop, deny_tool, switch_model | +| **Auditability** | Request logs | Per-step decision traces | + +## When to use cascadeflow + +- You run AI agents (LangChain, LangGraph, CrewAI, OpenAI Agents SDK, Google ADK, or custom) +- You want to reduce LLM costs without changing agent code +- You need budget enforcement across multi-step agent runs +- You need to inject business KPIs (quality, cost, latency, energy) into agent decisions +- You need compliance-aware model gating (GDPR, HIPAA, PCI, strict) +- You want full trace recording for auditability and tuning + +## When NOT to use cascadeflow + +- Single one-off LLM calls (overhead not justified) +- You only use one model and don't want routing +- You need a hosted proxy service (cascadeflow is a library, not a SaaS) + +## Two Engines + +cascadeflow ships two complementary engines: + +**Cascade Engine** — Speculative execution with quality validation. Tries cheap models first, validates quality, escalates only when needed. Achieves 40-85% cost savings on typical workloads. + +**Harness Engine** — Agent runtime intelligence. 
Budget enforcement, compliance gating, KPI-weighted routing, energy tracking, and decision traces. Works inside agent loops with full state awareness. + +Both engines can be used independently or together. diff --git a/docs-site/get-started/quickstart.mdx b/docs-site/get-started/quickstart.mdx new file mode 100644 index 00000000..64189077 --- /dev/null +++ b/docs-site/get-started/quickstart.mdx @@ -0,0 +1,118 @@ +--- +title: Quickstart +description: Get cascadeflow running in 3 minutes with zero code changes using the harness API. +--- + +# Quickstart + +Three tiers of integration — pick the one that matches your needs. + +## Install + + + +```bash pip +pip install "cascadeflow[openai]" +``` + +```bash With integrations +pip install "cascadeflow[langchain]" # LangChain/LangGraph +pip install "cascadeflow[openai-agents]" # OpenAI Agents SDK +pip install "cascadeflow[crewai]" # CrewAI +pip install "cascadeflow[google-adk]" # Google ADK +``` + +```bash npm +npm install @cascadeflow/core +``` + + + +## Tier 1: Zero-Change Observability + +Add two lines. All OpenAI and Anthropic SDK calls are automatically tracked. + +```python +import cascadeflow + +cascadeflow.init(mode="observe") + +# Your existing code — no changes needed +import openai +client = openai.OpenAI() +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "What is cascadeflow?"}] +) +# cascadeflow is now tracking cost, latency, energy, and model usage. +``` + +## Tier 2: Scoped Runs with Budget + +Wrap agent execution in a `run()` context manager for budget tracking and enforcement. 
+ +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + # Your agent code here + result = await agent.run("Analyze this dataset and create a report") + + # After execution, inspect metrics + summary = session.summary() + print(f"Cost: ${summary['cost_total']:.4f}") + print(f"Steps: {summary['steps']}") + print(f"Tool calls: {summary['tool_calls']}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") + + # Full decision audit trail + for decision in session.trace(): + print(f" Step {decision['step']}: {decision['action']} — {decision['reason']}") +``` + +## Tier 3: Decorated Agents with Policy + +Annotate agent functions with budget, compliance, and KPI metadata. + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +@cascadeflow.agent( + budget=0.20, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1} +) +async def research_agent(query: str): + return await llm.complete(query) +``` + +## Harness Modes + +| Mode | Tracking | Enforcement | Use Case | +|---|---|---|---| +| `off` | No | No | Disabled | +| `observe` | Yes | No | Safe production rollout, metrics collection | +| `enforce` | Yes | Yes | Budget caps, compliance gating, stop actions | + +Start with `observe` in production. Switch to `enforce` once you've validated the metrics. + +## Next Steps + + + + All pip extras, npm packages, and provider setup. + + + Architecture of the Cascade and Harness engines. + + + Per-run and per-user budget caps. + + + LangChain, OpenAI Agents, CrewAI, Google ADK, n8n, Vercel AI. + + diff --git a/docs-site/harness/actions.mdx b/docs-site/harness/actions.mdx new file mode 100644 index 00000000..a904eed8 --- /dev/null +++ b/docs-site/harness/actions.mdx @@ -0,0 +1,99 @@ +--- +title: Decision Actions +description: Four harness actions — allow, switch_model, deny_tool, and stop — and when each is triggered. 
+--- + +# Decision Actions + +The harness makes one of four decisions at every step. Actions are computed in both `observe` and `enforce` modes, but only applied in `enforce` mode. + +## Actions + +### `allow` + +Proceed normally. No constraints are violated. + +``` +Step 1: allow — budget ok, model compliant +``` + +This is the most common action. It means all hard caps (budget, tool calls, latency, energy) are within limits and compliance is satisfied. + +### `switch_model` + +Route to a different model. Triggered when: +- The current model is not in the compliance allowlist +- KPI scoring indicates a better model choice +- Budget pressure suggests a cheaper alternative + +``` +Step 3: switch_model — compliance violation, switching to gpt-4o-mini (gdpr allowlist) +``` + +In `enforce` mode, the harness substitutes the model. In `observe` mode, the original model is used and the trace records what would have happened. + +### `deny_tool` + +Block a tool/function call. Triggered when `max_tool_calls` is reached. + +``` +Step 5: deny_tool — tool call cap reached (10/10) +``` + +In `enforce` mode, the tool call is blocked. The agent receives a signal that the tool was denied. + +### `stop` + +Halt agent execution. Triggered when: +- Budget is exceeded +- Latency cap is exceeded +- Energy cap is exceeded + +``` +Step 7: stop — budget exceeded ($0.52 > $0.50 cap) +``` + +In `enforce` mode, the agent loop is stopped. In `observe` mode, execution continues and the trace records the violation. + +## Decision Priority + +When multiple constraints are violated simultaneously, the harness applies this priority: + +1. **Compliance** — check first (switch_model or stop) +2. **Budget** — check second (stop) +3. **Tool calls** — check third (deny_tool) +4. **Latency** — check fourth (stop) +5. **Energy** — check fifth (stop) +6. 
**KPI scoring** — soft optimization (switch_model or allow) + +## Hard vs Soft Controls + +**Hard controls** trigger `stop` or `deny_tool` when limits are exceeded: +- `budget` — max USD +- `max_tool_calls` — max tool/function calls +- `max_latency_ms` — max wall-clock ms per call +- `max_energy` — max energy units +- `compliance` — model allowlist + +**Soft controls** influence model selection through KPI weights but never block execution: +- `kpi_weights` — relative importance of quality, cost, latency, energy +- `kpi_targets` — target values for KPI dimensions + +## Example: Combined Constraints + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + budget=1.00, + max_tool_calls=5, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.4} +) as session: + result = await agent.run("Process EU customer data") + + for record in session.trace(): + print(f"Step {record['step']}: {record['action']} — {record['reason']}") +``` diff --git a/docs-site/harness/budget-enforcement.mdx b/docs-site/harness/budget-enforcement.mdx new file mode 100644 index 00000000..079752ac --- /dev/null +++ b/docs-site/harness/budget-enforcement.mdx @@ -0,0 +1,83 @@ +--- +title: Budget Enforcement +description: Configure budget enforcement with per-run caps and automatic stop actions when budget is exceeded. +--- + +# Budget Enforcement + +The harness tracks cumulative cost across all LLM calls in a run and enforces budget caps in `enforce` mode. 
+ +## Per-Run Budget + +Set a budget cap on a scoped run: + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + # Agent executes multiple LLM calls + result = await agent.run("Research and summarize this topic") + + summary = session.summary() + print(f"Total cost: ${summary['cost_total']:.4f}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") +``` + +When cumulative cost exceeds the budget: +- In `observe` mode: the trace records `action: "stop"` with `applied: false` +- In `enforce` mode: the harness stops execution with `action: "stop"` and `applied: true` + +## Per-Agent Budget + +Attach budget metadata to agent functions: + +```python +@cascadeflow.agent(budget=0.20) +async def cheap_agent(query: str): + return await llm.complete(query) + +@cascadeflow.agent(budget=2.00) +async def premium_agent(query: str): + return await llm.complete(query) +``` + +## Budget Pressure Routing + +When budget is partially consumed, the harness can route to cheaper models. This happens automatically when KPI weights include a cost dimension: + +```python +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + budget=1.00, + kpi_weights={"quality": 0.5, "cost": 0.5} +) as session: + # Early calls may use gpt-4o (high quality) + # As budget pressure increases, routing shifts toward gpt-4o-mini (lower cost) + for query in queries: + result = await agent.run(query) +``` + +## Cost Calculation + +Cost is estimated from the built-in pricing table: + +``` +cost = (input_tokens / 1_000_000) * input_price + (output_tokens / 1_000_000) * output_price +``` + +The pricing table covers 18 models across OpenAI, Anthropic, and Google. Unknown models are resolved via fuzzy matching. 
+ +## Combining with Tool Call Caps + +Budget and tool call caps work together: + +```python +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + # Stops when either limit is hit + result = await agent.run("Analyze this data") +``` + +The harness checks all constraints at every step. The first constraint that is violated triggers the corresponding action (`stop` for budget, `deny_tool` for tool calls). diff --git a/docs-site/harness/compliance.mdx b/docs-site/harness/compliance.mdx new file mode 100644 index 00000000..febb0de5 --- /dev/null +++ b/docs-site/harness/compliance.mdx @@ -0,0 +1,66 @@ +--- +title: Compliance Gating +description: GDPR, HIPAA, PCI, and strict model allowlists for compliance-aware model gating in agent workflows. +--- + +# Compliance Gating + +The harness enforces model allowlists based on compliance requirements. When a compliance mode is set, only models in the corresponding allowlist are permitted. + +## Compliance Modes + +| Mode | Allowed Models | Use Case | +|---|---|---| +| `gdpr` | gpt-4o, gpt-4o-mini, gpt-3.5-turbo | EU data protection | +| `hipaa` | gpt-4o, gpt-4o-mini | Healthcare data | +| `pci` | gpt-4o-mini, gpt-3.5-turbo | Payment card data | +| `strict` | gpt-4o | Maximum restriction | + +## Usage + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +# GDPR compliance — only gpt-4o, gpt-4o-mini, gpt-3.5-turbo allowed +with cascadeflow.run(compliance="gdpr") as session: + result = await agent.run("Process this EU customer data") +``` + +Or as agent metadata: + +```python +@cascadeflow.agent(compliance="hipaa") +async def medical_agent(query: str): + return await llm.complete(query) +``` + +## Enforcement Behavior + +When a model outside the allowlist is requested: + +- In `observe` mode: the trace records `action: "switch_model"` with the suggested compliant alternative, but execution continues with the original model +- In `enforce` mode: the harness blocks the non-compliant model and either 
switches to a compliant alternative or stops execution + +## Combining with Budget + +Compliance and budget constraints are independent. Both are checked at every step: + +```python +with cascadeflow.run(budget=0.50, compliance="gdpr") as session: + # Must stay within budget AND use only GDPR-approved models + result = await agent.run("Analyze EU customer feedback") +``` + +## Custom Allowlists + +The built-in allowlists cover common regulations. For custom requirements, set compliance at the integration level or use the `HarnessConfig` directly: + +```python +config = HarnessConfig( + mode="enforce", + compliance="strict", # Only gpt-4o +) +cascadeflow.init(config=config) +``` diff --git a/docs-site/harness/decision-trace.mdx b/docs-site/harness/decision-trace.mdx new file mode 100644 index 00000000..2b1b14a6 --- /dev/null +++ b/docs-site/harness/decision-trace.mdx @@ -0,0 +1,102 @@ +--- +title: Decision Traces +description: Per-step audit trail of every harness decision — action, reason, model, cost, budget state, and enforcement status. +--- + +# Decision Traces + +Every harness decision produces a trace record. Traces provide a full audit trail for debugging, compliance reporting, and performance tuning. 
+ +## Trace Format + +Each trace record contains: + +| Field | Type | Description | +|---|---|---| +| `action` | string | `"allow"`, `"switch_model"`, `"deny_tool"`, or `"stop"` | +| `reason` | string | Human-readable explanation of the decision | +| `model` | string | Model name used for the call | +| `step` | int | Step number in the run (1-indexed) | +| `cost_total` | float | Cumulative cost in USD at this step | +| `budget_state` | string | `"ok"`, `"warning"`, or `"exceeded"` | +| `applied` | bool | `true` if the action was enforced, `false` in observe mode | + +## Accessing Traces + +```python +import cascadeflow + +cascadeflow.init(mode="observe") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Research this topic") + + # Full decision trace + for record in session.trace(): + print(f"Step {record['step']}: {record['action']} — {record['reason']}") + print(f" Model: {record['model']}, Cost: ${record['cost_total']:.4f}") + print(f" Budget: {record['budget_state']}, Applied: {record['applied']}") +``` + +Example output: + +``` +Step 1: allow — budget ok, model compliant + Model: gpt-4o-mini, Cost: $0.0003 + Budget: ok, Applied: false +Step 2: allow — budget ok, model compliant + Model: gpt-4o-mini, Cost: $0.0007 + Budget: ok, Applied: false +Step 3: switch_model — budget pressure, routing to cheaper model + Model: gpt-4o, Cost: $0.0032 + Budget: warning, Applied: false +``` + +## Observe vs Enforce + +In `observe` mode, traces record what the harness *would* do: +- `applied` is always `false` +- Agent execution continues regardless of the action + +In `enforce` mode, traces record what the harness *did*: +- `applied` is `true` when the action was enforced +- `stop` actions halt execution +- `deny_tool` actions block tool calls + +## Privacy + +Decision traces do not contain prompt content, response content, or user data. 
They only contain: +- Model names and step numbers +- Cost and budget metrics +- Action decisions and reasons + +This makes traces safe for logging, external storage, and compliance reporting without data classification concerns. + +## Callbacks + +Register callbacks to receive trace records in real time: + +```python +from cascadeflow import get_harness_callback_manager, set_harness_callback_manager + +cb_manager = get_harness_callback_manager() + +# Traces are emitted through the callback system +# Use framework-specific integrations for structured access +``` + +## Session Summary + +In addition to per-step traces, `session.summary()` provides aggregate metrics: + +```python +summary = session.summary() +# { +# "cost_total": 0.0032, +# "steps": 3, +# "tool_calls": 1, +# "latency_total_ms": 1250.0, +# "energy_used": 45.2, +# "budget_remaining": 0.4968, +# } +``` diff --git a/docs-site/harness/energy-tracking.mdx b/docs-site/harness/energy-tracking.mdx new file mode 100644 index 00000000..a3d292ee --- /dev/null +++ b/docs-site/harness/energy-tracking.mdx @@ -0,0 +1,99 @@ +--- +title: Energy Tracking +description: Deterministic compute-intensity coefficients for carbon-aware AI operations, with energy caps and per-model coefficients. +--- + +# Energy Tracking + +The harness tracks energy consumption using deterministic compute-intensity coefficients. This provides a proxy for carbon impact without requiring real-time power measurement. + +## Energy Formula + +``` +energy_units = coefficient * (input_tokens + output_tokens * 1.5) +``` + +Output tokens are weighted 1.5x because generation is more compute-intensive than prompt processing. 
+ +## Energy Coefficients + +| Model | Coefficient | Relative Cost | +|---|---|---| +| gpt-3.5-turbo | 0.20 | Lowest | +| gemini-1.5-flash | 0.20 | Lowest | +| gemini-2.0-flash | 0.25 | Very low | +| claude-haiku-3.5 | 0.30 | Low | +| gemini-2.5-flash | 0.30 | Low | +| gpt-4o-mini | 0.30 | Low | +| gpt-5-mini | 0.35 | Low | +| o3-mini | 0.50 | Medium | +| o1-mini | 0.80 | Medium-high | +| gpt-4o | 1.00 | Baseline | +| claude-sonnet-4 | 1.00 | Baseline | +| gemini-1.5-pro | 1.00 | Baseline | +| gpt-5 | 1.20 | High | +| gemini-2.5-pro | 1.20 | High | +| gpt-4-turbo | 1.50 | High | +| gpt-4 | 1.50 | High | +| claude-opus-4.5 | 1.80 | Very high | +| o1 | 2.00 | Highest | + +## Energy Caps + +Set a maximum energy budget for a run: + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(max_energy=100.0) as session: + result = await agent.run("Process this large dataset") + + summary = session.summary() + print(f"Energy used: {summary['energy_used']:.1f} units") +``` + +When energy exceeds the cap: +- In `observe` mode: logged but not enforced +- In `enforce` mode: execution stops with `action: "stop"` + +## Energy-Aware KPI Weights + +Include energy in KPI weights for carbon-aware routing: + +```python +with cascadeflow.run( + kpi_weights={"quality": 0.4, "cost": 0.3, "energy": 0.3} +) as session: + # Routes toward lower-energy models when quality allows + result = await agent.run("Summarize this article") +``` + +## Pricing Table + +Full pricing for all 18 supported models (USD per 1M tokens): + +| Model | Input | Output | +|---|---|---| +| **OpenAI** | | | +| gpt-4o | $2.50 | $10.00 | +| gpt-4o-mini | $0.15 | $0.60 | +| gpt-5 | $1.25 | $10.00 | +| gpt-5-mini | $0.20 | $0.80 | +| gpt-4-turbo | $10.00 | $30.00 | +| gpt-4 | $30.00 | $60.00 | +| gpt-3.5-turbo | $0.50 | $1.50 | +| o1 | $15.00 | $60.00 | +| o1-mini | $3.00 | $12.00 | +| o3-mini | $1.10 | $4.40 | +| **Anthropic** | | | +| claude-sonnet-4 | $3.00 | $15.00 | +| 
claude-haiku-3.5 | $1.00 | $5.00 | +| claude-opus-4.5 | $5.00 | $25.00 | +| **Google** | | | +| gemini-2.5-flash | $0.15 | $0.60 | +| gemini-2.5-pro | $1.25 | $10.00 | +| gemini-2.0-flash | $0.10 | $0.40 | +| gemini-1.5-flash | $0.075 | $0.30 | +| gemini-1.5-pro | $1.25 | $5.00 | diff --git a/docs-site/harness/kpi-optimization.mdx b/docs-site/harness/kpi-optimization.mdx new file mode 100644 index 00000000..e07e1023 --- /dev/null +++ b/docs-site/harness/kpi-optimization.mdx @@ -0,0 +1,103 @@ +--- +title: KPI-Weighted Routing +description: Inject business priorities as quality, cost, latency, and energy weights into every model routing decision. +--- + +# KPI-Weighted Routing + +The harness scores each model decision against configurable KPI weights. This lets teams encode business priorities into agent behavior without changing agent code. + +## KPI Dimensions + +| Dimension | Score Source | Range | What it means | +|---|---|---|---| +| `quality` | Model quality priors | 0.0-1.0 | Higher = better output quality | +| `cost` | Inverse of model cost | 0.0-1.0 | Higher = cheaper model | +| `latency` | Model latency priors | 0.0-1.0 | Higher = faster response | +| `energy` | Inverse of energy coefficient | 0.0-1.0 | Higher = lower compute intensity | + +## Configuration + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.9} +) as session: + result = await agent.run("Analyze this legal document") +``` + +### Weights + +Weights are relative — they don't need to sum to 1.0 (they are normalized internally). They control the relative importance of each dimension in the composite score. 
+ +```python +# Quality-first (premium workload) +kpi_weights = {"quality": 0.8, "cost": 0.1, "latency": 0.1} + +# Cost-first (high-volume batch) +kpi_weights = {"quality": 0.2, "cost": 0.7, "latency": 0.1} + +# Balanced +kpi_weights = {"quality": 0.4, "cost": 0.3, "latency": 0.2, "energy": 0.1} +``` + +### Targets + +Targets set minimum acceptable values. If a model's score for a dimension falls below the target, it is penalized in the composite score. + +```python +kpi_targets = { + "quality": 0.9, # Require high quality + "latency": 0.7, # Require reasonable speed +} +``` + +## Scoring Formula + +The composite score for a model is: + +``` +score = quality_prior * w_quality + cost_utility * w_cost + latency_prior * w_latency + energy_utility * w_energy +``` + +Where `w_*` are the normalized weights and utility values are computed from model priors. + +## Quality Priors + +Built-in quality priors for common models (OpenAI): + +| Model | Quality | Latency | +|---|---|---| +| o1 | 0.95 | 0.40 | +| gpt-4o | 0.90 | 0.72 | +| gpt-4-turbo | 0.88 | 0.66 | +| gpt-4 | 0.87 | 0.52 | +| gpt-5-mini | 0.86 | 0.84 | +| o1-mini | 0.82 | 0.60 | +| o3-mini | 0.80 | 0.78 | +| gpt-4o-mini | 0.75 | 0.93 | +| gpt-3.5-turbo | 0.65 | 1.00 | + +## Per-Agent KPI Weights + +Different agents can have different priorities: + +```python +@cascadeflow.agent( + budget=0.50, + kpi_weights={"quality": 0.8, "cost": 0.2} +) +async def quality_agent(query: str): + return await llm.complete(query) + +@cascadeflow.agent( + budget=0.10, + kpi_weights={"cost": 0.8, "quality": 0.2} +) +async def budget_agent(query: str): + return await llm.complete(query) +``` diff --git a/docs-site/harness/modes.mdx b/docs-site/harness/modes.mdx new file mode 100644 index 00000000..46a86840 --- /dev/null +++ b/docs-site/harness/modes.mdx @@ -0,0 +1,78 @@ +--- +title: Harness Modes +description: Three harness modes — off, observe, and enforce — with rollout guidance for production deployments. 
+--- + +# Harness Modes + +cascadeflow operates in one of three modes, set at initialization. + +## Modes + +### `off` + +No tracking, no enforcement. The harness is completely disabled. This is the default. + +```python +cascadeflow.init(mode="off") +``` + +### `observe` + +Track all metrics and decisions, but never block execution. Every LLM call and tool execution is recorded with full decision traces. Actions are computed but not enforced — `applied` is always `false` in trace records. + +```python +cascadeflow.init(mode="observe") +``` + +Use `observe` for: +- Initial production rollout to validate metrics before enforcing +- Shadow-mode testing to understand what the harness would do +- Cost and usage analytics without affecting agent behavior + +### `enforce` + +Track all metrics and enforce constraints. When a hard cap is hit (budget, tool calls, latency, energy) or a compliance violation is detected, the harness takes action: `stop`, `deny_tool`, or `switch_model`. + +```python +cascadeflow.init(mode="enforce") +``` + +Use `enforce` when: +- You have validated metrics in `observe` mode +- You need hard budget caps to prevent runaway costs +- Compliance requirements mandate model gating + +## Rollout Guidance + +Recommended rollout sequence for production: + +1. **Deploy with `observe`** — No risk to agent behavior. Collect metrics, review decision traces, validate that the harness sees what you expect. + +2. **Review traces** — Check that compliance allowlists, budget calculations, and KPI scoring match your expectations. + +3. **Switch to `enforce`** — Once validated, change the mode. The harness will now enforce constraints. + +4. **Monitor** — Use `session.summary()` and `session.trace()` to monitor enforcement in production. 
+ +```python +import os + +# Environment-driven mode selection +mode = os.getenv("CASCADEFLOW_MODE", "observe") +cascadeflow.init(mode=mode) +``` + +## Mode Behavior Matrix + +| Behavior | `off` | `observe` | `enforce` | +|---|---|---|---| +| Cost tracking | No | Yes | Yes | +| Latency tracking | No | Yes | Yes | +| Energy tracking | No | Yes | Yes | +| Decision traces | No | Yes | Yes | +| Budget enforcement | No | No | Yes | +| Tool call gating | No | No | Yes | +| Compliance gating | No | No | Yes | +| `session.summary()` | Empty | Full metrics | Full metrics | +| `session.trace()` | Empty | Decisions (applied=false) | Decisions (applied=true) | diff --git a/docs-site/harness/overview.mdx b/docs-site/harness/overview.mdx new file mode 100644 index 00000000..8486c8c4 --- /dev/null +++ b/docs-site/harness/overview.mdx @@ -0,0 +1,80 @@ +--- +title: Harness Overview +description: Overview of the cascadeflow harness — six optimization dimensions, HarnessConfig surface, and high-level decision flow. +--- + +# Harness Overview + +The cascadeflow harness is an in-process intelligence layer that wraps AI agent execution. It tracks, scores, and optionally enforces constraints across six dimensions for every LLM call and tool execution inside agent loops. 
+ +## Six Dimensions + +| Dimension | What it measures | Hard cap | Soft scoring | +|---|---|---|---| +| **Cost** | Estimated USD from the pricing table | `budget` | `kpi_weights.cost` | +| **Latency** | Wall-clock milliseconds per LLM call | `max_latency_ms` | `kpi_weights.latency` | +| **Quality** | Model quality priors (0-1 score) | -- | `kpi_weights.quality` | +| **Tool calls** | Count of tool/function calls | `max_tool_calls` | -- | +| **Energy** | Compute-intensity coefficient | `max_energy` | `kpi_weights.energy` | +| **Compliance** | Model allowlist per regulation | `compliance` | -- | + +## HarnessConfig + +All harness behavior is configured through a single dataclass: + +```python +from cascadeflow import HarnessConfig + +config = HarnessConfig( + mode="enforce", # "off" | "observe" | "enforce" + verbose=False, # Print decisions to stderr + budget=0.50, # Max USD for the run (None = unlimited) + max_tool_calls=10, # Max tool/function calls (None = unlimited) + max_latency_ms=5000.0, # Max wall-clock ms per call (None = unlimited) + max_energy=100.0, # Max energy units (None = unlimited) + kpi_targets={"quality": 0.9}, # Target values for KPI dimensions + kpi_weights={ # Relative importance of each dimension + "quality": 0.6, + "cost": 0.3, + "latency": 0.1, + }, + compliance="gdpr", # "gdpr" | "hipaa" | "pci" | "strict" | None +) +``` + +## Activation + +```python +import cascadeflow + +# Global activation +cascadeflow.init(mode="observe") + +# Scoped run with overrides +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + # agent code + pass + +# Decorated agent function +@cascadeflow.agent(budget=0.20, compliance="gdpr") +async def my_agent(query: str): + pass +``` + +## Decision Flow + +For each LLM call or tool execution: + +1. **Record** model, step number, cumulative cost, latency, energy +2. **Check compliance** — is the model in the allowlist for the configured regulation? +3. 
**Check hard caps** — budget, tool calls, latency, energy +4. **Score KPI dimensions** — quality, cost, latency, energy weighted by `kpi_weights` +5. **Decide action** — `allow`, `switch_model`, `deny_tool`, or `stop` +6. **Enforce or log** — enforce in `enforce` mode, log only in `observe` mode +7. **Append trace** — full decision record for auditability + +## Supported Models + +The harness includes a built-in pricing table for 18 models across OpenAI, Anthropic, and Google. Unknown models are resolved via fuzzy matching (e.g. `gpt-5-mini` matches even before official pricing is announced). + +See [Energy Tracking](/harness/energy-tracking) for the full pricing and energy coefficients table. diff --git a/docs-site/index.mdx b/docs-site/index.mdx new file mode 100644 index 00000000..2e99a0a2 --- /dev/null +++ b/docs-site/index.mdx @@ -0,0 +1,91 @@ +--- +title: cascadeflow +description: Agent runtime intelligence layer — optimize cost, latency, quality, budget, compliance, and energy across AI agent workflows. +--- + +# cascadeflow + +The in-process intelligence layer for AI agents. Optimize cost, latency, quality, budget, compliance, and energy — inside the execution loop, not at the HTTP boundary. + + + + Get running in 3 minutes with zero code changes. + + + Two engines: Cascade for model routing, Harness for agent intelligence. + + + Budget enforcement, compliance gating, KPI-weighted routing, energy tracking. + + + LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, Vercel AI SDK. + + + +## Install + + + +```bash pip +pip install cascadeflow +``` + +```bash npm +npm install @cascadeflow/core +``` + + + +## Quick Start + + + +```python Observe (zero-change) +import cascadeflow +cascadeflow.init(mode="observe") +# All OpenAI/Anthropic SDK calls are now tracked. 
+``` + +```python Scoped Run +import cascadeflow +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Analyze this dataset") + print(session.summary()) +``` + +```python Decorated Agent +import cascadeflow +cascadeflow.init(mode="enforce") + +@cascadeflow.agent(budget=0.20, compliance="gdpr") +async def my_agent(query: str): + return await llm.complete(query) +``` + + + +## Supported Frameworks + +| Framework | Python | TypeScript | Integration Type | +|---|---|---|---| +| LangChain / LangGraph | `cascadeflow[langchain]` | `@cascadeflow/langchain` | Callback handler | +| OpenAI Agents SDK | `cascadeflow[openai-agents]` | -- | ModelProvider | +| CrewAI | `cascadeflow[crewai]` | -- | llm_hooks | +| Google ADK | `cascadeflow[google-adk]` | -- | BasePlugin | +| n8n | -- | `@cascadeflow/n8n-nodes-cascadeflow` | Community node | +| Vercel AI SDK | -- | `@cascadeflow/vercel-ai` | Middleware | + +## Six Dimensions + +cascadeflow optimizes across six dimensions simultaneously: + +| Dimension | What it controls | Example | +|---|---|---| +| **Cost** | USD per LLM call from pricing table | Budget cap of $0.50 per run | +| **Latency** | Wall-clock milliseconds per call | Max 2000ms per call | +| **Quality** | Model quality priors for routing | 60% weight on quality KPI | +| **Budget** | Cumulative spend tracking and caps | Per-user daily limits | +| **Compliance** | Model allowlists per regulation | GDPR: only gpt-4o, gpt-4o-mini | +| **Energy** | Compute-intensity coefficients | Carbon-aware model selection | diff --git a/docs-site/integrations/crewai.mdx b/docs-site/integrations/crewai.mdx new file mode 100644 index 00000000..1fae1fde --- /dev/null +++ b/docs-site/integrations/crewai.mdx @@ -0,0 +1,78 @@ +--- +title: CrewAI +description: Hook-based harness integration for CrewAI with budget gating, metrics tracking, and decision traces across crew steps. 
+--- + +# CrewAI Integration + +cascadeflow integrates with CrewAI through the native `llm_hooks` system. Call `enable()` to register global hooks that track all crew steps, enforce budget caps, and record decision traces. + +## Install + +```bash +pip install "cascadeflow[crewai]" +``` + +## Quick Start + +```python +from crewai import Agent, Crew, Process, Task +import cascadeflow +from cascadeflow.integrations.crewai import CrewAIHarnessConfig, enable + +cascadeflow.init(mode="observe") + +# Enable harness hooks +config = CrewAIHarnessConfig( + fail_open=True, + budget_gate=True, +) +enable(config=config) + +# Define agents and tasks as usual +researcher = Agent( + role="Researcher", + goal="Find relevant information", + llm="gpt-4o-mini", +) + +task = Task( + description="Research the topic of AI agent frameworks", + agent=researcher, +) + +crew = Crew( + agents=[researcher], + tasks=[task], + process=Process.sequential, +) + +# Run with budget tracking +with cascadeflow.run(budget=1.00) as session: + result = crew.kickoff() + print(session.summary()) + for record in session.trace(): + print(f"Step {record['step']}: {record['action']} — {record['reason']}") +``` + +## Configuration + +```python +config = CrewAIHarnessConfig( + fail_open=True, # Continue on harness errors + budget_gate=True, # Enforce budget caps +) +``` + +## Features + +- Tracks all crew steps automatically via `llm_hooks` +- Budget gating stops crew execution when budget is exceeded +- Full decision trace across all agents in the crew +- Fail-open mode for production safety +- No changes to existing CrewAI agent or task definitions + +## Limitations + +- Tool-level gating is not currently applied (CrewAI hooks operate at the LLM call level) +- Model switching depends on CrewAI's model configuration diff --git a/docs-site/integrations/google-adk.mdx b/docs-site/integrations/google-adk.mdx new file mode 100644 index 00000000..8b6f3403 --- /dev/null +++ b/docs-site/integrations/google-adk.mdx @@ 
-0,0 +1,91 @@
+---
+title: Google ADK
+description: Plugin-based harness integration for Google Agent Development Kit with budget enforcement and metrics tracking.
+---
+
+# Google ADK Integration
+
+cascadeflow integrates with Google's Agent Development Kit (ADK) through the `BasePlugin` system. Call `enable()` to get a plugin that plugs into `Runner(plugins=[...])`.
+
+## Install
+
+```bash
+pip install "cascadeflow[google-adk]"
+```
+
+Requires Python 3.10+.
+
+## Quick Start
+
+```python
+import asyncio
+from google.adk.agents import Agent
+from google.adk.runners import Runner
+from google.adk.sessions import InMemorySessionService
+from google.genai.types import Content, Part
+
+import cascadeflow
+from cascadeflow.integrations.google_adk import GoogleADKHarnessConfig, enable
+
+cascadeflow.init(mode="observe")
+
+# Enable harness plugin
+config = GoogleADKHarnessConfig(
+    fail_open=True,
+    enable_budget_gate=True,
+)
+plugin = enable(config=config)
+
+# Create ADK agent
+agent = Agent(
+    name="research_agent",
+    model="gemini-2.5-flash",
+    instruction="You are a helpful research assistant.",
+)
+
+# Run with plugin
+session_service = InMemorySessionService()
+runner = Runner(agent=agent, session_service=session_service, plugins=[plugin])
+
+async def main():
+    with cascadeflow.run(budget=0.50) as session:
+        user_content = Content(parts=[Part(text="Explain cascadeflow")])
+        async for event in runner.run_async(
+            session_id="test",
+            user_id="user-1",
+            new_message=user_content,
+        ):
+            pass  # Process streaming events
+
+        print(session.summary())
+
+asyncio.run(main())
+```
+
+## Configuration
+
+```python
+config = GoogleADKHarnessConfig(
+    fail_open=True,  # Continue on harness errors
+    enable_budget_gate=True,  # Enforce budget caps
+)
+```
+
+## Supported Gemini Models
+
+| Model | Input $/1M | Output $/1M | Energy Coeff |
+|---|---|---|---|
+| gemini-2.5-flash | $0.15 | $0.60 | 0.30 |
+| gemini-2.5-pro | $1.25 | $10.00 | 1.20 |
+| gemini-2.0-flash | $0.10 | $0.40 | 0.25 |
+| 
gemini-1.5-flash | $0.075 | $0.30 | 0.20 | +| gemini-1.5-pro | $1.25 | $5.00 | 1.00 | + +## Budget Enforcement + +When budget is exceeded in `enforce` mode, the plugin returns an `LlmResponse` with `error_code="BUDGET_EXCEEDED"`. The ADK runner handles this as a graceful stop. + +## Limitations + +- Tool gating is not applied (intentional design choice — ADK manages tool execution internally) +- Model switching depends on ADK's model configuration diff --git a/docs-site/integrations/langchain.mdx b/docs-site/integrations/langchain.mdx new file mode 100644 index 00000000..2f29062f --- /dev/null +++ b/docs-site/integrations/langchain.mdx @@ -0,0 +1,106 @@ +--- +title: LangChain +description: Harness-aware callback handler for LangChain and LangGraph with budget tracking, cost analytics, and decision traces. +--- + +# LangChain Integration + +cascadeflow integrates with LangChain through a callback handler that wraps any `BaseChatModel`. Works with LCEL chains, streaming, tool calling, structured output, and LangGraph agents. 
+ +## Install + + + +```bash Python +pip install "cascadeflow[langchain]" +``` + +```bash TypeScript +npm install @cascadeflow/langchain @langchain/core @langchain/openai +``` + + + +## Quick Start + + + +```python Python — Harness callback +import cascadeflow +from cascadeflow.integrations.langchain import get_harness_callback +from langchain_openai import ChatOpenAI + +cascadeflow.init(mode="observe") + +model = ChatOpenAI(model="gpt-4o") +cb = get_harness_callback() + +with cascadeflow.run(budget=0.50) as session: + result = await model.ainvoke("Explain quantum computing", config={"callbacks": [cb]}) + print(session.summary()) +``` + +```python Python — Cascade routing +from langchain_openai import ChatOpenAI +from langchain_anthropic import ChatAnthropic +from cascadeflow.integrations.langchain import CascadeFlow + +cascade = CascadeFlow( + drafter=ChatOpenAI(model="gpt-4o-mini"), + verifier=ChatAnthropic(model="claude-sonnet-4"), + quality_threshold=0.8, +) + +result = await cascade.ainvoke("Explain quantum computing") +``` + +```typescript TypeScript — Drop-in cascade +import { ChatOpenAI } from '@langchain/openai'; +import { ChatAnthropic } from '@langchain/anthropic'; +import { withCascade } from '@cascadeflow/langchain'; + +const cascade = withCascade({ + drafter: new ChatOpenAI({ model: 'gpt-4o-mini' }), + verifier: new ChatAnthropic({ model: 'claude-sonnet-4' }), + qualityThreshold: 0.8, +}); + +const result = await cascade.invoke('Explain quantum computing'); +``` + + + +## Features + +- Full LCEL support (pipes, sequences, batch) +- Streaming with pre-routing +- Tool calling and structured output +- LangSmith cost tracking metadata +- Cost tracking callbacks +- Domain policies with `cascadeflow_domain` metadata + +## Cost Tracking Callback + +```python +from cascadeflow.integrations.langchain.langchain_callbacks import get_cascade_callback + +with get_cascade_callback() as cb: + response = await cascade.ainvoke("What is Python?") + print(f"Total cost: 
${cb.total_cost:.6f}") + print(f"Drafter cost: ${cb.drafter_cost:.6f}") + print(f"Verifier cost: ${cb.verifier_cost:.6f}") +``` + +## LangSmith Integration + +When LangSmith tracing is enabled, cascadeflow adds metadata to runs: +- `cascade_decision`: whether the drafter was accepted +- `modelUsed`: which model produced the final response +- `drafterQuality`: quality score from validation +- `savingsPercentage`: cost savings achieved + +```bash +export LANGSMITH_API_KEY="..." +export LANGSMITH_PROJECT="my-project" +export LANGSMITH_TRACING=true +``` diff --git a/docs-site/integrations/n8n.mdx b/docs-site/integrations/n8n.mdx new file mode 100644 index 00000000..efb89f51 --- /dev/null +++ b/docs-site/integrations/n8n.mdx @@ -0,0 +1,70 @@ +--- +title: n8n +description: cascadeflow community nodes for n8n with cascade model routing, tool gating, and harness modes for no-code AI workflows. +--- + +# n8n Integration + +cascadeflow provides two community nodes for n8n workflows: a Model sub-node for drop-in cascade routing and an Agent node for standalone multi-step reasoning. + +## Install + +In n8n: +1. Go to **Settings** > **Community Nodes** +2. Search for: `@cascadeflow/n8n-nodes-cascadeflow` +3. Click **Install** + +Or via npm: +```bash +npm install @cascadeflow/n8n-nodes-cascadeflow +``` + +## Two Nodes + +| Node | Type | Use Case | +|---|---|---| +| **CascadeFlow (Model)** | Language Model sub-node | Drop-in for any Chain/LLM node | +| **CascadeFlow Agent** | Standalone agent | Tool calling, memory, multi-step reasoning | + +## CascadeFlow (Model) + +Drop-in replacement for any AI Chat Model in n8n chains: + +1. Add two **AI Chat Model** nodes (cheap drafter + powerful verifier) +2. Add **CascadeFlow (Model)** and connect both models +3. Connect to a **Basic LLM Chain** or **Chain** node +4. 
Check the **Logs tab** to see cascade decisions + +**Features:** +- Quality threshold (default: 0.4) +- 16 supported domains (Code, Math, Data, Legal, Medical, Financial, etc.) +- Complexity thresholds for automatic routing + +## CascadeFlow Agent + +Standalone agent with tool calling and multi-step reasoning: + +1. Add a **Chat Trigger** node +2. Add **CascadeFlow Agent** and connect to the trigger +3. Connect **Drafter**, **Verifier**, optional **Memory** and **Tools** +4. Check the **Output tab** for cascade metadata and decision trace + +**Features:** +- Harness mode: `observe` or `enforce` +- Budget caps and tool call limits +- Tool routing rules: Cascade (default) or Verifier (for high-stakes tools) +- Tool call validation with JSON schema checking + +## Complexity Thresholds + +| Level | Threshold | Routing | +|---|---|---| +| Trivial | 0.25 | Always use drafter | +| Simple | 0.40 | Prefer drafter | +| Moderate | 0.55 | Quality-dependent | +| Hard | 0.70 | Prefer verifier | +| Expert | 0.80 | Always use verifier | + +## Result + +40-85% cost savings in n8n workflows with zero changes to existing chains. diff --git a/docs-site/integrations/openai-agents.mdx b/docs-site/integrations/openai-agents.mdx new file mode 100644 index 00000000..1a189a6b --- /dev/null +++ b/docs-site/integrations/openai-agents.mdx @@ -0,0 +1,77 @@ +--- +title: OpenAI Agents SDK +description: CascadeFlowModelProvider for OpenAI Agents SDK with model candidates, tool gating, and budget tracking. +--- + +# OpenAI Agents SDK Integration + +cascadeflow provides a `CascadeFlowModelProvider` that integrates with the OpenAI Agents SDK as an explicit `ModelProvider`. Supports model candidates, tool gating, and scoped budget tracking. 
+ +## Install + +```bash +pip install "cascadeflow[openai-agents]" +``` + +## Quick Start + +```python +import asyncio +from agents import Agent, Runner +import cascadeflow +from cascadeflow.integrations.openai_agents import ( + CascadeFlowModelProvider, + OpenAIAgentsIntegrationConfig, +) + +cascadeflow.init(mode="observe") + +# Configure integration +config = OpenAIAgentsIntegrationConfig( + model_candidates=["gpt-4o-mini", "gpt-4o"], + enable_tool_gating=True, +) + +provider = CascadeFlowModelProvider(config=config) + +agent = Agent( + name="research_agent", + instructions="You are a helpful research assistant.", + model_provider=provider, +) + +async def main(): + with cascadeflow.run(budget=0.50) as session: + result = await Runner.run(agent, "Explain cascadeflow") + print(result.final_output) + print(session.summary()) + +asyncio.run(main()) +``` + +## Features + +- **Model candidates**: List of models the provider can select from based on harness scoring +- **Tool gating**: Block tool calls when `max_tool_calls` is reached +- **Scoped runs**: Use `cascadeflow.run()` for per-task budget tracking +- **Decision traces**: Full audit trail of model selection and tool gating decisions +- **Fail-open**: If the harness encounters an error, execution continues with the default model + +## Configuration + +```python +config = OpenAIAgentsIntegrationConfig( + model_candidates=["gpt-4o-mini", "gpt-4o"], # Models to choose from + enable_tool_gating=True, # Block tools at cap +) +``` + +## Session Metrics + +After a run, `session.summary()` includes: +- `cost_total`: cumulative USD spent +- `budget_remaining`: USD left in the budget +- `step_count`: number of LLM calls +- `tool_calls`: number of tool executions +- `latency_used_ms`: total latency +- `energy_used`: total energy units diff --git a/docs-site/integrations/overview.mdx b/docs-site/integrations/overview.mdx new file mode 100644 index 00000000..92bda53e --- /dev/null +++ b/docs-site/integrations/overview.mdx @@ 
-0,0 +1,53 @@ +--- +title: Integrations Overview +description: Matrix of all cascadeflow framework integrations with supported features, languages, and integration patterns. +--- + +# Integrations Overview + +cascadeflow integrates with six agent frameworks. All integrations are opt-in — install the extra and explicitly enable. + +## Integration Matrix + +| Framework | Language | Package | Integration Type | Budget Gating | Tool Gating | Traces | +|---|---|---|---|---|---|---| +| [LangChain](/integrations/langchain) | Python, TS | `cascadeflow[langchain]`, `@cascadeflow/langchain` | Callback handler | Yes | No | Yes | +| [OpenAI Agents SDK](/integrations/openai-agents) | Python | `cascadeflow[openai-agents]` | ModelProvider | Yes | Yes | Yes | +| [CrewAI](/integrations/crewai) | Python | `cascadeflow[crewai]` | llm_hooks | Yes | No | Yes | +| [Google ADK](/integrations/google-adk) | Python | `cascadeflow[google-adk]` | BasePlugin | Yes | No | Yes | +| [n8n](/integrations/n8n) | TypeScript | `@cascadeflow/n8n-nodes-cascadeflow` | Community node | Yes | Yes | Yes | +| [Vercel AI SDK](/integrations/vercel-ai) | TypeScript | `@cascadeflow/vercel-ai` | Middleware | Yes | No | Yes | + +## Integration Patterns + +Each integration follows the same principle: wrap the framework's extension point with cascadeflow's harness, without modifying agent code. 
+ +### Python + +```python +import cascadeflow +cascadeflow.init(mode="observe") + +# Framework-specific activation +from cascadeflow.integrations.langchain import get_harness_callback +from cascadeflow.integrations.openai_agents import CascadeFlowModelProvider +from cascadeflow.integrations.crewai import enable as enable_crewai +from cascadeflow.integrations.google_adk import enable as enable_adk +``` + +### TypeScript + +```bash +npm install @cascadeflow/langchain +npm install @cascadeflow/vercel-ai +npm install @cascadeflow/n8n-nodes-cascadeflow +``` + +## Choosing an Integration + +- **LangChain/LangGraph**: Use if you have existing LangChain chains or agents. The callback handler wraps any `BaseChatModel`. +- **OpenAI Agents SDK**: Use if you're building with OpenAI's Agents SDK. The `ModelProvider` supports model candidates and tool gating. +- **CrewAI**: Use if you're building multi-agent crews. The `llm_hooks` integration tracks all crew steps. +- **Google ADK**: Use if you're building with Google's Agent Development Kit. The plugin integrates with `Runner`. +- **n8n**: Use if you're building no-code workflows. The community node adds cascade routing to any n8n flow. +- **Vercel AI SDK**: Use if you're building TypeScript server-side agents. The middleware wraps AI SDK streams. diff --git a/docs-site/integrations/vercel-ai.mdx b/docs-site/integrations/vercel-ai.mdx new file mode 100644 index 00000000..9b2d9257 --- /dev/null +++ b/docs-site/integrations/vercel-ai.mdx @@ -0,0 +1,88 @@ +--- +title: Vercel AI SDK +description: TypeScript middleware integration for Vercel AI SDK with cascade routing, multi-turn chat, and tool execution. +--- + +# Vercel AI SDK Integration + +cascadeflow integrates with the Vercel AI SDK as middleware, providing cascade routing for server-side AI applications with streaming support. 
+ +## Install + +```bash +npm install @cascadeflow/vercel-ai +``` + +## Quick Start + +```typescript +import { createChatHandler } from '@cascadeflow/vercel-ai'; +import { CascadeAgent } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const handler = createChatHandler(agent, { + protocol: 'data', // AI SDK v4 data stream + tools, // Tool definitions + toolHandlers, // Server-side tool execution + maxSteps: 5, // Multi-step tool loops +}); + +// Use in Next.js API route, Express, or any Node.js server +export const POST = handler; +``` + +## Features + +- **AI SDK v4 `data` stream** and **AI SDK v5/v6 UI streams** +- **`useChat` multi-turn support** — conversation history preserved +- **`parts` message format** (AI SDK v6) +- **Tool call streaming visibility** — see tool calls as they happen +- **Server-side tool execution** via `toolExecutor` or `toolHandlers` +- **Multi-step controls**: `maxSteps`, `forceDirect` +- **Cascade decision stream parts** — optional metadata in the stream +- **Request-level overrides** with allowlist + shared-secret guard + +## Multi-Turn Chat + +```typescript +import { useChat } from 'ai/react'; + +export default function Chat() { + const { messages, input, handleSubmit, handleInputChange } = useChat({ + api: '/api/chat', + }); + + return ( +
+    <div>
+      {messages.map((m) => (
+        <div key={m.id}>{m.content}</div>
+      ))}
+      <form onSubmit={handleSubmit}>
+        <input value={input} onChange={handleInputChange} />
+      </form>
+    </div>
+ ); +} +``` + +## Request Overrides + +Override cascade behavior per request (protected by shared secret): + +```typescript +const handler = createChatHandler(agent, { + protocol: 'data', + allowOverrides: ['forceDirect', 'maxSteps'], + overrideSecret: process.env.OVERRIDE_SECRET, +}); +``` + +## Result + +40-85% cost savings for Vercel AI SDK applications with streaming support and zero client-side changes. diff --git a/docs-site/logo/cascadeflow-dark.svg b/docs-site/logo/cascadeflow-dark.svg new file mode 100644 index 00000000..3c1a2870 --- /dev/null +++ b/docs-site/logo/cascadeflow-dark.svg @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs-site/logo/cascadeflow-light.svg b/docs-site/logo/cascadeflow-light.svg new file mode 100644 index 00000000..8ca48234 --- /dev/null +++ b/docs-site/logo/cascadeflow-light.svg @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index 5280a562..08c5c0c8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,8 +1,10 @@ # cascadeflow Documentation -Welcome to cascadeflow documentation! 🌊 +> **Full documentation is now at [docs.cascadeflow.dev](https://docs.cascadeflow.dev)** — the Mintlify-powered docs site is the primary reference for cascadeflow's agent runtime intelligence layer. The guides below remain for quick reference and deep links. -## 📖 Quick Links +Agent runtime intelligence layer — optimize cost, latency, quality, budget, compliance, and energy across AI agent workflows. In-process harness, not a proxy. + +## Quick Links - [Installation Guide](INSTALLATION.md) - [Quick Start Guide](guides/quickstart.md) @@ -40,6 +42,7 @@ Welcome to cascadeflow documentation! 
🌊 - [Agent Intelligence V2/V2.1 Plan](strategy/agent-intelligence-v2-plan.md) - Unified strategic and execution plan for in-process agent intelligence harness delivery ### Integrations +- [LangChain Integration](guides/langchain_integration.md) - Callback handler for LangChain/LangGraph with harness-aware cascading - [OpenAI Agents SDK Integration](guides/openai_agents_integration.md) - Harness-aware model provider for existing OpenAI Agents apps - [CrewAI Integration](guides/crewai_integration.md) - Hook-based harness metrics + budget gating (opt-in) - [Google ADK Integration](guides/google_adk_integration.md) - Plugin-based harness integration for ADK runners (opt-in) diff --git a/llms.txt b/llms.txt index 51bb8437..dbba72ca 100644 --- a/llms.txt +++ b/llms.txt @@ -1,7 +1,7 @@ # cascadeflow -> Agent intelligence harness for cost, quality, latency, energy, and business KPI optimization. -> In-process library (not a proxy). Works inside agent loops with full state awareness. +> Agent runtime intelligence layer for AI agent workflows. +> In-process harness (not a proxy). Works inside agent loops with full state awareness. ## Install @@ -13,6 +13,41 @@ import cascadeflow cascadeflow.init(mode="observe") # All openai/anthropic SDK calls are now tracked. Switch to "enforce" for budget gating. +## What cascadeflow is + +cascadeflow is an in-process intelligence layer that sits inside AI agent execution +loops. Unlike external proxies that only see HTTP request boundaries, cascadeflow +operates with full agent state awareness: step count, budget consumed, tool call +history, error context, quality scores, domain, complexity, and user-defined +business context. + +Eight things make cascadeflow different: + +1. Inside-the-loop control. Decisions happen per-step and per-tool-call inside + agent execution, not at the HTTP boundary. This enables budget gating mid-run, + model switching based on remaining budget, and stop actions when caps are hit. + +2. 
Multi-dimensional optimization. Six dimensions scored simultaneously: cost, + latency, quality, budget, compliance, and energy. Not just cost routing. + +3. Business logic injection. KPI weights and targets let teams encode business + priorities (e.g. 60% quality, 30% cost, 10% latency) into every model decision. + +4. Actionable decisions. Four actions: allow, switch_model, deny_tool, stop. + The harness does not just observe — it controls execution flow. + +5. Full transparency. Every decision produces a trace record with action, reason, + model, step, cost_total, budget_state, and applied fields. Audit-ready. + +6. Measurable value. Session summaries report cost, latency, energy, steps, + tool calls, and budget remaining. Before/after comparison is built in. + +7. Category leadership. Cross-framework policy layer with unified KPI semantics + across LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, Vercel AI SDK. + +8. Latency advantage. In-process instrumentation adds <1ms overhead per call. + External proxies add 10-50ms of network round-trip latency per LLM call. 
+ ## When to use cascadeflow - You run AI agents (LangChain, LangGraph, CrewAI, OpenAI Agents SDK, Google ADK, or custom) @@ -28,6 +63,17 @@ cascadeflow.init(mode="observe") - You only use one model and don't want routing - You need a hosted proxy service (cascadeflow is a library, not a SaaS) +## Proxy vs cascadeflow + +| Dimension | External proxy | cascadeflow harness | +|--------------------|----------------------------|------------------------------| +| Scope | HTTP request boundary | Inside agent execution loop | +| Dimensions | Cost only | Cost + quality + latency + budget + compliance + energy | +| Latency overhead | 10-50ms network RTT | <1ms in-process | +| Business logic | None | KPI weights and targets | +| Enforcement | None (observe only) | stop, deny_tool, switch_model | +| Auditability | Request logs | Per-step decision traces | + ## Key APIs - cascadeflow.init(mode) -- activate harness globally (off | observe | enforce) @@ -36,6 +82,20 @@ cascadeflow.init(mode="observe") - session.summary() -- structured run metrics (cost, latency, energy, steps, tool calls) - session.trace() -- full decision trace for auditability +## HarnessConfig Reference + +@dataclass +class HarnessConfig: + mode: HarnessMode # "off" | "observe" | "enforce". Default: "off" + verbose: bool # Print decisions to stderr. Default: False + budget: Optional[float] # Max USD for the run. Default: None (unlimited) + max_tool_calls: Optional[int] # Max tool/function calls. Default: None + max_latency_ms: Optional[float] # Max wall-clock ms per call. Default: None + max_energy: Optional[float] # Max energy units. 
Default: None + kpi_targets: Optional[dict] # {"quality": 0.9, "cost": 0.5, ...} + kpi_weights: Optional[dict] # {"quality": 0.6, "cost": 0.3, "latency": 0.1} + compliance: Optional[str] # "gdpr" | "hipaa" | "pci" | "strict" + ## Harness Modes - off: no tracking, no enforcement @@ -50,6 +110,31 @@ cascadeflow.init(mode="observe") - Tool calls: count of tool/function calls executed - Quality: model quality priors for KPI-weighted scoring +## Decision Actions + +- allow: proceed normally +- switch_model: route to cheaper/better model (where runtime allows) +- deny_tool: block tool execution when tool call cap reached +- stop: halt agent loop when budget/latency/energy cap exceeded + +## Decision Trace Format + +Each decision produces a record with these fields: +- action: "allow" | "switch_model" | "deny_tool" | "stop" +- reason: human-readable explanation +- model: model name used for the call +- step: integer step number in the run +- cost_total: cumulative cost in USD at this step +- budget_state: "ok" | "warning" | "exceeded" +- applied: true if the action was enforced (false in observe mode) + +## Compliance Model Allowlists + +- gdpr: gpt-4o, gpt-4o-mini, gpt-3.5-turbo +- hipaa: gpt-4o, gpt-4o-mini +- pci: gpt-4o-mini, gpt-3.5-turbo +- strict: gpt-4o only + ## Integrations pip install cascadeflow[langchain] # LangChain/LangGraph callback handler @@ -57,31 +142,73 @@ pip install cascadeflow[openai-agents] # OpenAI Agents SDK ModelProvider pip install cascadeflow[crewai] # CrewAI llm_hooks integration pip install cascadeflow[google-adk] # Google ADK BasePlugin -All integrations are opt-in. Install the extra and explicitly enable the integration. 
- -## Integration Patterns +npm install @cascadeflow/core # TypeScript core +npm install @cascadeflow/langchain # LangChain TypeScript +npm install @cascadeflow/vercel-ai # Vercel AI SDK middleware +npm install @cascadeflow/n8n-nodes-cascadeflow # n8n community node -- LangChain: HarnessAwareCascadeFlowCallbackHandler via get_harness_callback() -- OpenAI Agents SDK: CascadeFlowModelProvider with model candidates and tool gating -- CrewAI: enable() registers global llm_hooks for budget gating and tracking -- Google ADK: enable() returns a BasePlugin for Runner(plugins=[plugin]) -- n8n: Built-in harness mode (observe/enforce) on the Agent node with UI parameters -- Vercel AI SDK: TypeScript middleware integration - -## Decision Actions - -- allow: proceed normally -- switch_model: route to cheaper/better model (where runtime allows) -- deny_tool: block tool execution when tool call cap reached -- stop: halt agent loop when budget/latency/energy cap exceeded - -## Supported Models (pricing table) +All integrations are opt-in. Install the extra and explicitly enable the integration. 
-OpenAI: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-3.5-turbo, o1, o1-mini, o3-mini, gpt-5, gpt-5-mini -Anthropic: claude-sonnet-4, claude-haiku-3.5, claude-opus-4.5 -Google: gemini-2.5-flash, gemini-2.5-pro, gemini-2.0-flash, gemini-1.5-flash, gemini-1.5-pro +## Integration Code Snippets + +LangChain: + from cascadeflow.integrations.langchain import get_harness_callback + cb = get_harness_callback() + result = await model.ainvoke("query", config={"callbacks": [cb]}) + +OpenAI Agents SDK: + from cascadeflow.integrations.openai_agents import CascadeFlowModelProvider + provider = CascadeFlowModelProvider(model_candidates=["gpt-4o-mini", "gpt-4o"]) + +CrewAI: + from cascadeflow.integrations.crewai import enable + enable(budget_gate=True, fail_open=True) + +Google ADK: + from cascadeflow.integrations.google_adk import enable + plugin = enable(fail_open=True) + runner = Runner(agent=agent, plugins=[plugin]) + +## Pricing Table (USD per 1M tokens: input / output) + +OpenAI: + gpt-4o: $2.50 / $10.00 + gpt-4o-mini: $0.15 / $0.60 + gpt-5: $1.25 / $10.00 + gpt-5-mini: $0.20 / $0.80 + gpt-4-turbo: $10.00 / $30.00 + gpt-4: $30.00 / $60.00 + gpt-3.5-turbo: $0.50 / $1.50 + o1: $15.00 / $60.00 + o1-mini: $3.00 / $12.00 + o3-mini: $1.10 / $4.40 + +Anthropic: + claude-sonnet-4: $3.00 / $15.00 + claude-haiku-3.5: $1.00 / $5.00 + claude-opus-4.5: $5.00 / $25.00 + +Google: + gemini-2.5-flash: $0.15 / $0.60 + gemini-2.5-pro: $1.25 / $10.00 + gemini-2.0-flash: $0.10 / $0.40 + gemini-1.5-flash: $0.075 / $0.30 + gemini-1.5-pro: $1.25 / $5.00 + +## Energy Coefficients + +Model energy is computed as: energy_units = coeff * (input_tokens + output_tokens * 1.5) + + gpt-4o: 1.0 gpt-4o-mini: 0.3 gpt-5: 1.2 + gpt-5-mini: 0.35 gpt-4-turbo: 1.5 gpt-4: 1.5 + gpt-3.5-turbo: 0.2 o1: 2.0 o1-mini: 0.8 + o3-mini: 0.5 claude-sonnet-4: 1.0 claude-haiku-3.5: 0.3 + claude-opus-4.5: 1.8 gemini-2.5-flash: 0.3 gemini-2.5-pro: 1.2 + gemini-2.0-flash: 0.25 gemini-1.5-flash: 0.2 gemini-1.5-pro: 1.0 ## Links +- Docs: 
https://docs.cascadeflow.dev - Source: https://github.com/lemony-ai/cascadeflow - PyPI: pip install cascadeflow +- npm: npm install @cascadeflow/core diff --git a/pyproject.toml b/pyproject.toml index b746a6e0..bc7c7072 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "cascadeflow" version = "1.0.0" -description = "Smart AI model cascading for cost optimization - Save 40-85% on LLM costs with 2-6x faster responses. Available for Python and TypeScript/JavaScript." +description = "Agent runtime intelligence layer — optimize cost, latency, quality, budget, compliance, and energy across AI agent workflows." readme = "README.md" requires-python = ">=3.9" license = "MIT" @@ -32,9 +32,17 @@ keywords = [ "javascript", "browser", "edge-functions", + "agent-intelligence", + "runtime-optimization", + "budget-enforcement", + "compliance", + "harness", + "agent-runtime", + "kpi", + "energy-tracking", ] classifiers = [ - "Development Status :: 4 - Beta", + "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", @@ -141,7 +149,7 @@ all = [ [project.urls] Homepage = "https://lemony.ai" -Documentation = "https://github.com/lemony-ai/cascadeflow" +Documentation = "https://docs.cascadeflow.dev" Repository = "https://github.com/lemony-ai/cascadeflow" "Bug Tracker" = "https://github.com/lemony-ai/cascadeflow/issues" Changelog = "https://github.com/lemony-ai/cascadeflow/releases" From adbf47eee1b5bebb8921c6a96f7204a40bd79b08 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 5 Mar 2026 17:15:21 +0100 Subject: [PATCH 49/49] fix: switch GitHub Stars badge from social to flat style Social-style shields.io badges intermittently render as "invalid" due to GitHub API rate limiting. Flat style is more reliable. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 27baf1be..51de5118 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ [![Python Docs](https://img.shields.io/badge/docs-Python-blue)](./docs/) [![TypeScript Docs](https://img.shields.io/badge/docs-TypeScript-red)](./docs/) [![X Follow](https://img.shields.io/twitter/follow/saschabuehrle?style=social)](https://x.com/saschabuehrle) -[![GitHub Stars](https://img.shields.io/github/stars/lemony-ai/cascadeflow?style=social)](https://github.com/lemony-ai/cascadeflow) +[![GitHub Stars](https://img.shields.io/github/stars/lemony-ai/cascadeflow?style=flat&color=yellow&label=Stars)](https://github.com/lemony-ai/cascadeflow/stargazers)