From eb3df8952d7c494a2b4f30b12c286f57a0532d94 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 25 Feb 2026 22:30:36 +0100 Subject: [PATCH 01/49] Add core harness API scaffold with context-scoped runtime --- cascadeflow/__init__.py | 4 + cascadeflow/harness/api.py | 78 +++------- docs/strategy/agent-intelligence-v2-plan.md | 10 +- tests/test_harness_api.py | 161 +------------------- 4 files changed, 31 insertions(+), 222 deletions(-) diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py index 1b61a9f3..d49eb644 100644 --- a/cascadeflow/__init__.py +++ b/cascadeflow/__init__.py @@ -401,7 +401,11 @@ "init", "reset", "run", +<<<<<<< HEAD "harness_agent", +======= + "agent", +>>>>>>> 1aba349 (Add core harness API scaffold with context-scoped runtime) "get_harness_config", "get_current_run", # ===== PROVIDERS ===== diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index a71d5f5a..b2bb1033 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -45,24 +45,16 @@ class HarnessRunContext: tool_calls_max: Optional[int] = None latency_max_ms: Optional[float] = None energy_max: Optional[float] = None - kpi_targets: Optional[dict[str, float]] = None - kpi_weights: Optional[dict[str, float]] = None - compliance: Optional[str] = None cost: float = 0.0 savings: float = 0.0 tool_calls: int = 0 - step_count: int = 0 - latency_used_ms: float = 0.0 - energy_used: float = 0.0 budget_remaining: Optional[float] = None model_used: Optional[str] = None last_action: str = "allow" draft_accepted: Optional[bool] = None _trace: list[dict[str, Any]] = field(default_factory=list) - _token: Optional[Token[Optional[HarnessRunContext]]] = field( - default=None, init=False, repr=False - ) + _token: Optional[Token[Optional[HarnessRunContext]]] = field(default=None, init=False, repr=False) def __post_init__(self) -> None: if self.budget_max is not None and self.budget_remaining is None: @@ -86,34 +78,21 @@ async def __aexit__(self, exc_type: Any, 
exc: Any, tb: Any) -> None: def trace(self) -> list[dict[str, Any]]: return list(self._trace) - def record( - self, - action: str, - reason: str, - model: Optional[str] = None, - *, - applied: Optional[bool] = None, - decision_mode: Optional[str] = None, - ) -> None: + def record(self, action: str, reason: str, model: Optional[str] = None) -> None: self.last_action = action self.model_used = model - entry: dict[str, Any] = { - "action": action, - "reason": reason, - "model": model, - "run_id": self.run_id, - } - if applied is not None: - entry["applied"] = applied - if decision_mode is not None: - entry["decision_mode"] = decision_mode - self._trace.append(entry) + self._trace.append( + { + "action": action, + "reason": reason, + "model": model, + "run_id": self.run_id, + } + ) _harness_config: HarnessConfig = HarnessConfig() -_current_run: ContextVar[Optional[HarnessRunContext]] = ContextVar( - "cascadeflow_harness_run", default=None -) +_current_run: ContextVar[Optional[HarnessRunContext]] = ContextVar("cascadeflow_harness_run", default=None) _is_instrumented: bool = False _UNSET = object() @@ -141,17 +120,13 @@ def get_current_run() -> Optional[HarnessRunContext]: def reset() -> None: """ - Reset harness global state and unpatch instrumented clients. + Reset harness global state. Intended for tests and controlled shutdown paths. """ global _harness_config global _is_instrumented - - from cascadeflow.harness.instrument import unpatch_openai - - unpatch_openai() _harness_config = HarnessConfig() _is_instrumented = False _current_run.set(None) @@ -307,7 +282,9 @@ def init( compliance: Optional[str] | object = _UNSET, ) -> HarnessInitReport: """ - Initialize global harness settings and instrument detected SDK clients. + Initialize global harness settings. + + This is a scaffold API for V2 work and intentionally performs no request patching yet. 
""" global _harness_config @@ -326,9 +303,7 @@ def init( resolved_max_latency_ms = _resolve_value( "max_latency_ms", max_latency_ms, env_config, file_config, None, sources ) - resolved_max_energy = _resolve_value( - "max_energy", max_energy, env_config, file_config, None, sources - ) + resolved_max_energy = _resolve_value("max_energy", max_energy, env_config, file_config, None, sources) resolved_kpi_targets = _resolve_value( "kpi_targets", kpi_targets, env_config, file_config, None, sources ) @@ -356,16 +331,8 @@ def init( instrumented: list[str] = [] detected_but_not_instrumented: list[str] = [] - if validated_mode != "off" and sdk_presence["openai"]: - from cascadeflow.harness.instrument import patch_openai - - if patch_openai(): - instrumented.append("openai") - elif validated_mode == "off": - from cascadeflow.harness.instrument import is_patched, unpatch_openai - - if is_patched(): - unpatch_openai() + if sdk_presence["openai"]: + instrumented.append("openai") if sdk_presence["anthropic"]: detected_but_not_instrumented.append("anthropic") @@ -396,9 +363,6 @@ def run( max_tool_calls: Optional[int] = None, max_latency_ms: Optional[float] = None, max_energy: Optional[float] = None, - kpi_targets: Optional[dict[str, float]] = None, - kpi_weights: Optional[dict[str, float]] = None, - compliance: Optional[str] = None, ) -> HarnessRunContext: """ Create a scoped run context. 
@@ -411,9 +375,6 @@ def run( resolved_tool_calls = max_tool_calls if max_tool_calls is not None else config.max_tool_calls resolved_latency = max_latency_ms if max_latency_ms is not None else config.max_latency_ms resolved_energy = max_energy if max_energy is not None else config.max_energy - resolved_kpi_targets = kpi_targets if kpi_targets is not None else config.kpi_targets - resolved_kpi_weights = kpi_weights if kpi_weights is not None else config.kpi_weights - resolved_compliance = compliance if compliance is not None else config.compliance return HarnessRunContext( mode=config.mode, @@ -421,9 +382,6 @@ def run( tool_calls_max=resolved_tool_calls, latency_max_ms=resolved_latency, energy_max=resolved_energy, - kpi_targets=resolved_kpi_targets, - kpi_weights=resolved_kpi_weights, - compliance=resolved_compliance, ) diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 267ddc69..787bab32 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -843,13 +843,13 @@ Branching model: Claim checklist (one owner per branch at a time): - [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` -- [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress` -- [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` -- [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` +- [ ] `feat/v2-openai-auto-instrumentation` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [ ] `feat/v2-enforce-actions` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [ ] `feat/v2-openai-agents-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] 
`feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` +- [ ] `feat/v2-langchain-harness-extension` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [x] `feat/v2-bench-repro-pipeline` — Owner: `@codex` — PR: `#163` — Status: `review` +- [ ] `feat/v2-bench-repro-pipeline` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` Merge gates per feature branch: diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 5669e845..2d8ffcfc 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -1,9 +1,6 @@ -import sys - import pytest import cascadeflow -import cascadeflow.harness.api as harness_api from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run @@ -29,84 +26,19 @@ def test_init_rejects_invalid_mode(): init(mode="invalid") # type: ignore[arg-type] -def test_init_idempotent_logs(monkeypatch, caplog): - monkeypatch.setattr(harness_api, "find_spec", lambda _: None) - with caplog.at_level("DEBUG", logger="cascadeflow.harness"): - init(mode="observe") - init(mode="observe") - assert any("idempotent" in rec.message for rec in caplog.records) - - -def test_env_aliases_and_false_bool(monkeypatch): - monkeypatch.setenv("CASCADEFLOW_MODE", "observe") - monkeypatch.setenv("CASCADEFLOW_BUDGET", "0.33") - monkeypatch.setenv("CASCADEFLOW_HARNESS_VERBOSE", "off") - monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_TOOL_CALLS", "4") - monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_LATENCY_MS", "1200") - monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_ENERGY", "0.01") - monkeypatch.setenv("CASCADEFLOW_HARNESS_COMPLIANCE", "gdpr") - - report = init() - cfg = get_harness_config() - - assert report.mode == "observe" - assert cfg.mode == 
"observe" - assert cfg.budget == 0.33 - assert cfg.verbose is False - assert cfg.max_tool_calls == 4 - assert cfg.max_latency_ms == 1200 - assert cfg.max_energy == 0.01 - assert cfg.compliance == "gdpr" - - -def test_init_invalid_json_env_raises(monkeypatch): - monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_WEIGHTS", "[1,2,3]") - with pytest.raises(ValueError): - init() - - -def test_init_non_numeric_env_raises(monkeypatch): - monkeypatch.setenv("CASCADEFLOW_HARNESS_BUDGET", "abc") - with pytest.raises(ValueError): - init() - - def test_run_uses_global_defaults_and_overrides(): - init( - mode="enforce", - budget=2.0, - max_tool_calls=5, - kpi_targets={"quality_min": 0.9}, - kpi_weights={"cost": 0.7, "quality": 0.3}, - compliance="gdpr", - ) + init(mode="enforce", budget=2.0, max_tool_calls=5) default_ctx = run() assert default_ctx.mode == "enforce" assert default_ctx.budget_max == 2.0 assert default_ctx.tool_calls_max == 5 assert default_ctx.budget_remaining == 2.0 - assert default_ctx.kpi_targets == {"quality_min": 0.9} - assert default_ctx.kpi_weights == {"cost": 0.7, "quality": 0.3} - assert default_ctx.compliance == "gdpr" - - override_ctx = run( - budget=0.5, - max_tool_calls=3, - kpi_weights={"quality": 1.0}, - compliance="strict", - ) + + override_ctx = run(budget=0.5, max_tool_calls=3) assert override_ctx.budget_max == 0.5 assert override_ctx.tool_calls_max == 3 assert override_ctx.budget_remaining == 0.5 - assert override_ctx.kpi_targets == {"quality_min": 0.9} - assert override_ctx.kpi_weights == {"quality": 1.0} - assert override_ctx.compliance == "strict" - - -def test_run_without_enter_exit_is_safe(): - ctx = run() - ctx.__exit__(None, None, None) @pytest.mark.asyncio @@ -126,17 +58,6 @@ async def test_nested_run_context_is_isolated(): assert get_current_run() is None -def test_sync_run_context_isolated(): - init(mode="enforce", budget=1.0) - with run(budget=0.6) as outer: - assert get_current_run() is outer - with run(budget=0.1) as inner: - assert 
get_current_run() is inner - assert inner.budget_max == 0.1 - assert get_current_run() is outer - assert get_current_run() is None - - def test_agent_decorator_keeps_sync_behavior_and_attaches_metadata(): @agent( budget=0.9, @@ -170,8 +91,7 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.init) assert callable(cascadeflow.reset) assert callable(cascadeflow.run) - assert callable(cascadeflow.harness_agent) - assert hasattr(cascadeflow.agent, "PROVIDER_REGISTRY") + assert callable(cascadeflow.agent) report = cascadeflow.init(mode="off") assert report.mode == "off" @@ -190,7 +110,6 @@ def test_run_record_and_trace_copy(): def test_init_reads_from_env(monkeypatch): monkeypatch.setenv("CASCADEFLOW_HARNESS_MODE", "observe") monkeypatch.setenv("CASCADEFLOW_HARNESS_BUDGET", "0.25") - monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_TARGETS", '{"quality_min": 0.9}') monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_WEIGHTS", '{"cost": 1.0}') report = init() @@ -199,7 +118,6 @@ def test_init_reads_from_env(monkeypatch): assert report.mode == "observe" assert cfg.mode == "observe" assert cfg.budget == 0.25 - assert cfg.kpi_targets == {"quality_min": 0.9} assert cfg.kpi_weights == {"cost": 1.0} assert report.config_sources["mode"] == "env" assert report.config_sources["budget"] == "env" @@ -223,56 +141,6 @@ def test_init_reads_from_config_file(tmp_path, monkeypatch): assert report.config_sources["budget"] == "file" -def test_init_reads_top_level_config_file_keys(tmp_path, monkeypatch): - config = tmp_path / "cascadeflow.json" - config.write_text('{"mode":"observe","budget":0.4,"max_tool_calls":2}') - monkeypatch.setenv("CASCADEFLOW_CONFIG", str(config)) - - report = init() - cfg = get_harness_config() - - assert cfg.mode == "observe" - assert cfg.budget == 0.4 - assert cfg.max_tool_calls == 2 - assert report.config_sources["mode"] == "file" - - -def test_init_non_dict_config_file_ignored(tmp_path, monkeypatch): - config = tmp_path / "cascadeflow.json" - 
config.write_text('["not-a-dict"]') - monkeypatch.setenv("CASCADEFLOW_CONFIG", str(config)) - - report = init() - cfg = get_harness_config() - - assert cfg.mode == "off" - assert cfg.budget is None - assert report.config_sources["mode"] == "default" - - -def test_init_file_loader_exception_falls_back_defaults(monkeypatch): - import cascadeflow.config_loader as cl - - monkeypatch.setattr(cl, "find_config", lambda: "broken.json") - - def _raise(_path): - raise RuntimeError("boom") - - monkeypatch.setattr(cl, "load_config", _raise) - - report = init() - cfg = get_harness_config() - assert cfg.mode == "off" - assert report.config_sources["mode"] == "default" - - -def test_init_config_loader_import_failure_falls_back(monkeypatch): - monkeypatch.setitem(sys.modules, "cascadeflow.config_loader", object()) - report = init(mode="observe") - assert report.mode == "observe" - assert report.config_sources["mode"] == "code" - - def test_precedence_code_over_env_over_file(tmp_path, monkeypatch): config = tmp_path / "cascadeflow.json" config.write_text('{"harness":{"mode":"off","budget":9.9}}') @@ -306,24 +174,3 @@ def test_reset_clears_state(): assert cfg.mode == "off" assert cfg.budget is None assert get_current_run() is None - - -def test_init_without_detected_sdks(monkeypatch): - monkeypatch.setattr(harness_api, "find_spec", lambda _: None) - report = init(mode="observe") - assert report.instrumented == [] - assert report.detected_but_not_instrumented == [] - - -def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch): - monkeypatch.setattr( - harness_api, - "find_spec", - lambda name: object() if name == "openai" else None, - ) - - import cascadeflow.harness.instrument as instrument - - monkeypatch.setattr(instrument, "patch_openai", lambda: True) - report = init(mode="observe") - assert report.instrumented == ["openai"] From 8b0d2e01740bafa94674d88f775337b3b5234924 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 25 Feb 2026 22:38:16 +0100 Subject: 
[PATCH 02/49] Harden harness core scaffolding and complete API test coverage --- cascadeflow/harness/api.py | 16 +- cascadeflow/harness/instrument.py | 873 +----------------------------- tests/test_harness_api.py | 134 +++++ 3 files changed, 157 insertions(+), 866 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index b2bb1033..10d0e29a 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -49,6 +49,9 @@ class HarnessRunContext: cost: float = 0.0 savings: float = 0.0 tool_calls: int = 0 + step_count: int = 0 + latency_used_ms: float = 0.0 + energy_used: float = 0.0 budget_remaining: Optional[float] = None model_used: Optional[str] = None last_action: str = "allow" @@ -120,13 +123,17 @@ def get_current_run() -> Optional[HarnessRunContext]: def reset() -> None: """ - Reset harness global state. + Reset harness global state and unpatch instrumented clients. Intended for tests and controlled shutdown paths. """ global _harness_config global _is_instrumented + + from cascadeflow.harness.instrument import unpatch_openai + + unpatch_openai() _harness_config = HarnessConfig() _is_instrumented = False _current_run.set(None) @@ -331,8 +338,11 @@ def init( instrumented: list[str] = [] detected_but_not_instrumented: list[str] = [] - if sdk_presence["openai"]: - instrumented.append("openai") + if validated_mode != "off" and sdk_presence["openai"]: + from cascadeflow.harness.instrument import patch_openai + + if patch_openai(): + instrumented.append("openai") if sdk_presence["anthropic"]: detected_but_not_instrumented.append("anthropic") diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index c2fbd7ab..ad12bbdf 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -1,878 +1,25 @@ -"""OpenAI Python client auto-instrumentation for cascadeflow harness. 
- -Patches ``openai.resources.chat.completions.Completions.create`` (sync) and -``AsyncCompletions.create`` (async) to intercept LLM calls for observe/enforce -modes. - -This module is called internally by ``cascadeflow.harness.init()``. Users -should not call ``patch_openai`` / ``unpatch_openai`` directly. - -Implementation notes: - - Patching is class-level (all current and future client instances). - - Patching is idempotent (safe to call multiple times). - - ``unpatch_openai()`` restores the original methods exactly. - - Streaming responses are wrapped to capture usage after completion. - - ``with_raw_response`` is NOT patched in V2 (known limitation). -""" - from __future__ import annotations -import functools import logging -import time -from dataclasses import dataclass -from typing import Any - -from cascadeflow.harness.pricing import ( - DEFAULT_ENERGY_COEFFICIENT as _DEFAULT_ENERGY_COEFFICIENT, -) -from cascadeflow.harness.pricing import ( - ENERGY_COEFFICIENTS as _ENERGY_COEFFICIENTS, -) -from cascadeflow.harness.pricing import ( - OPENAI_MODEL_POOL as _PRICING_MODELS, -) -from cascadeflow.harness.pricing import ( - estimate_cost as _estimate_cost_shared, -) -from cascadeflow.harness.pricing import ( - estimate_energy as _estimate_energy_shared, -) -from cascadeflow.harness.pricing import ( - model_total_price as _model_total_price_shared, -) - -logger = logging.getLogger("cascadeflow.harness.instrument") - -# --------------------------------------------------------------------------- -# Module-level state for idempotent patch/unpatch -# --------------------------------------------------------------------------- - -_openai_patched: bool = False -_original_sync_create: Any = None -_original_async_create: Any = None - -_MODEL_TOTAL_COSTS: dict[str, float] = { - name: _model_total_price_shared(name) for name in _PRICING_MODELS -} -_CHEAPEST_MODEL: str = min(_MODEL_TOTAL_COSTS, key=_MODEL_TOTAL_COSTS.get) -_MIN_TOTAL_COST: float = 
min(_MODEL_TOTAL_COSTS.values()) -_MAX_TOTAL_COST: float = max(_MODEL_TOTAL_COSTS.values()) - -_OPENAI_ENERGY_COEFFS: dict[str, float] = { - name: _ENERGY_COEFFICIENTS.get(name, _DEFAULT_ENERGY_COEFFICIENT) for name in _PRICING_MODELS -} -_LOWEST_ENERGY_MODEL: str = min(_OPENAI_ENERGY_COEFFS, key=_OPENAI_ENERGY_COEFFS.get) -_MIN_ENERGY_COEFF: float = min(_OPENAI_ENERGY_COEFFS.values()) -_MAX_ENERGY_COEFF: float = max(_OPENAI_ENERGY_COEFFS.values()) - -# Relative priors used by KPI-weighted soft-control scoring. -# These are deterministic heuristics based on internal benchmark runs and -# intended as defaults until provider-specific online scoring is wired in. -_QUALITY_PRIORS: dict[str, float] = { - "gpt-4o": 0.90, - "gpt-4o-mini": 0.75, - "gpt-5-mini": 0.86, - "gpt-4-turbo": 0.88, - "gpt-4": 0.87, - "gpt-3.5-turbo": 0.65, - "o1": 0.95, - "o1-mini": 0.82, - "o3-mini": 0.80, -} -_LATENCY_PRIORS: dict[str, float] = { - "gpt-4o": 0.72, - "gpt-4o-mini": 0.93, - "gpt-5-mini": 0.84, - "gpt-4-turbo": 0.66, - "gpt-4": 0.52, - "gpt-3.5-turbo": 1.00, - "o1": 0.40, - "o1-mini": 0.60, - "o3-mini": 0.78, -} -_LATENCY_CANDIDATES: tuple[str, ...] = tuple( - name for name in _PRICING_MODELS if name in _LATENCY_PRIORS -) -_FASTEST_MODEL: str | None = ( - max(_LATENCY_CANDIDATES, key=lambda name: _LATENCY_PRIORS[name]) - if _LATENCY_CANDIDATES - else None -) - -# OpenAI-model allowlists used by the current OpenAI harness instrumentation. -# Future provider instrumentation should provide provider-specific allowlists. 
-_COMPLIANCE_MODEL_ALLOWLISTS: dict[str, set[str]] = { - "gdpr": {"gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo"}, - "hipaa": {"gpt-4o", "gpt-4o-mini"}, - "pci": {"gpt-4o-mini", "gpt-3.5-turbo"}, - "strict": {"gpt-4o"}, -} - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _ensure_stream_usage(kwargs: dict[str, Any]) -> dict[str, Any]: - """Inject ``stream_options.include_usage=True`` for streaming requests. - - OpenAI only sends usage data in the final stream chunk when this option - is set. Without it the harness would record zero cost for every - streaming call. - """ - if not kwargs.get("stream", False): - return kwargs - stream_options = kwargs.get("stream_options") or {} - if not stream_options.get("include_usage"): - stream_options = {**stream_options, "include_usage": True} - kwargs = {**kwargs, "stream_options": stream_options} - return kwargs - - -def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: - """Estimate cost in USD from model name and token counts.""" - return _estimate_cost_shared(model, prompt_tokens, completion_tokens) - - -def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: - """Estimate energy units (deterministic proxy, not live carbon).""" - return _estimate_energy_shared(model, prompt_tokens, completion_tokens) - - -def _count_tool_calls_in_response(response: Any) -> int: - """Count tool calls in a non-streaming ChatCompletion response.""" - choices = getattr(response, "choices", None) - if not choices: - return 0 - message = getattr(choices[0], "message", None) - if message is None: - return 0 - tool_calls = getattr(message, "tool_calls", None) - if tool_calls is None: - return 0 - return len(tool_calls) - - -def _extract_usage(response: Any) -> tuple[int, int]: - """Extract (prompt_tokens, completion_tokens) from a response.""" - usage = 
getattr(response, "usage", None) - if usage is None: - return 0, 0 - return ( - getattr(usage, "prompt_tokens", 0) or 0, - getattr(usage, "completion_tokens", 0) or 0, - ) - - -def _model_total_cost(model: str) -> float: - return _MODEL_TOTAL_COSTS.get(model, _model_total_price_shared(model)) - - -def _select_cheaper_model(current_model: str) -> str: - if _model_total_cost(_CHEAPEST_MODEL) < _model_total_cost(current_model): - return _CHEAPEST_MODEL - return current_model - - -def _select_faster_model(current_model: str) -> str: - if _FASTEST_MODEL is None: - return current_model - current_latency = _LATENCY_PRIORS.get(current_model, 0.7) - if _LATENCY_PRIORS[_FASTEST_MODEL] > current_latency: - return _FASTEST_MODEL - return current_model - - -def _select_lower_energy_model(current_model: str) -> str: - if _ENERGY_COEFFICIENTS.get( - _LOWEST_ENERGY_MODEL, _DEFAULT_ENERGY_COEFFICIENT - ) < _ENERGY_COEFFICIENTS.get( - current_model, - _DEFAULT_ENERGY_COEFFICIENT, - ): - return _LOWEST_ENERGY_MODEL - return current_model - - -def _normalize_weights(weights: dict[str, float]) -> dict[str, float]: - normalized = { - key: float(value) - for key, value in weights.items() - if key in {"cost", "quality", "latency", "energy"} and float(value) > 0 - } - total = sum(normalized.values()) - if total <= 0: - return {} - return {key: value / total for key, value in normalized.items()} - - -def _cost_utility(model: str) -> float: - model_cost = _model_total_cost(model) - if _MAX_TOTAL_COST == _MIN_TOTAL_COST: - return 1.0 - return (_MAX_TOTAL_COST - model_cost) / (_MAX_TOTAL_COST - _MIN_TOTAL_COST) - - -def _energy_utility(model: str) -> float: - coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) - if _MAX_ENERGY_COEFF == _MIN_ENERGY_COEFF: - return 1.0 - return (_MAX_ENERGY_COEFF - coeff) / (_MAX_ENERGY_COEFF - _MIN_ENERGY_COEFF) - - -def _kpi_score_with_normalized(model: str, normalized: dict[str, float]) -> float: - if not normalized: - return 0.0 - quality = 
_QUALITY_PRIORS.get(model, 0.7) - latency = _LATENCY_PRIORS.get(model, 0.7) - cost = _cost_utility(model) - energy = _energy_utility(model) - return ( - (normalized.get("quality", 0.0) * quality) - + (normalized.get("latency", 0.0) * latency) - + (normalized.get("cost", 0.0) * cost) - + (normalized.get("energy", 0.0) * energy) - ) - - -def _kpi_score(model: str, weights: dict[str, float]) -> float: - normalized = _normalize_weights(weights) - return _kpi_score_with_normalized(model, normalized) - - -def _select_kpi_weighted_model(current_model: str, weights: dict[str, float]) -> str: - normalized = _normalize_weights(weights) - if not normalized: - return current_model - best_model = current_model - best_score = _kpi_score_with_normalized(current_model, normalized) - for candidate in _PRICING_MODELS: - score = _kpi_score_with_normalized(candidate, normalized) - if score > best_score: - best_model = candidate - best_score = score - return best_model - - -def _compliance_allowlist(compliance: str | None) -> set[str] | None: - if not compliance: - return None - return _COMPLIANCE_MODEL_ALLOWLISTS.get(compliance.strip().lower()) - - -def _select_compliant_model(current_model: str, compliance: str) -> str | None: - allowlist = _compliance_allowlist(compliance) - if not allowlist: - return current_model - if current_model in allowlist: - return current_model - available = [name for name in _PRICING_MODELS if name in allowlist] - if not available: - return None - return min(available, key=_model_total_cost) - - -@dataclass(frozen=True) -class _PreCallDecision: - action: str - reason: str - target_model: str - - -def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCallDecision: - if ctx.budget_max is not None and ctx.cost >= ctx.budget_max: - return _PreCallDecision(action="stop", reason="budget_exceeded", target_model=model) - - if has_tools and ctx.tool_calls_max is not None and ctx.tool_calls >= ctx.tool_calls_max: - return _PreCallDecision( - 
action="deny_tool", reason="max_tool_calls_reached", target_model=model - ) - - compliance = getattr(ctx, "compliance", None) - if compliance: - compliant_model = _select_compliant_model(model, str(compliance)) - if compliant_model is None: - if has_tools: - return _PreCallDecision( - action="deny_tool", - reason="compliance_no_approved_tool_path", - target_model=model, - ) - return _PreCallDecision( - action="stop", reason="compliance_no_approved_model", target_model=model - ) - if compliant_model != model: - return _PreCallDecision( - action="switch_model", - reason="compliance_model_policy", - target_model=compliant_model, - ) - if str(compliance).strip().lower() == "strict" and has_tools: - return _PreCallDecision( - action="deny_tool", - reason="compliance_tool_restriction", - target_model=model, - ) - - if ctx.latency_max_ms is not None and ctx.latency_used_ms >= ctx.latency_max_ms: - faster_model = _select_faster_model(model) - if faster_model != model: - return _PreCallDecision( - action="switch_model", - reason="latency_limit_exceeded", - target_model=faster_model, - ) - return _PreCallDecision(action="stop", reason="latency_limit_exceeded", target_model=model) - - if ctx.energy_max is not None and ctx.energy_used >= ctx.energy_max: - lower_energy_model = _select_lower_energy_model(model) - if lower_energy_model != model: - return _PreCallDecision( - action="switch_model", - reason="energy_limit_exceeded", - target_model=lower_energy_model, - ) - return _PreCallDecision(action="stop", reason="energy_limit_exceeded", target_model=model) - - if ( - ctx.budget_max is not None - and ctx.budget_max > 0 - and ctx.budget_remaining is not None - and (ctx.budget_remaining / ctx.budget_max) < 0.2 - ): - cheaper_model = _select_cheaper_model(model) - if cheaper_model != model: - return _PreCallDecision( - action="switch_model", - reason="budget_pressure", - target_model=cheaper_model, - ) - - kpi_weights = getattr(ctx, "kpi_weights", None) - if 
isinstance(kpi_weights, dict) and kpi_weights: - weighted_model = _select_kpi_weighted_model(model, kpi_weights) - if weighted_model != model: - return _PreCallDecision( - action="switch_model", - reason="kpi_weight_optimization", - target_model=weighted_model, - ) - - return _PreCallDecision(action="allow", reason=ctx.mode, target_model=model) - - -def _raise_stop_error(ctx: Any, reason: str) -> None: - from cascadeflow.schema.exceptions import BudgetExceededError, HarnessStopError - - if reason == "budget_exceeded": - remaining = 0.0 - if ctx.budget_max is not None: - remaining = ctx.budget_max - ctx.cost - raise BudgetExceededError( - f"Budget exhausted: spent ${ctx.cost:.4f} of ${ctx.budget_max or 0.0:.4f} max", - remaining=remaining, - ) - raise HarnessStopError(f"cascadeflow harness stop: {reason}", reason=reason) - - -def _resolve_pre_call_decision( - ctx: Any, - mode: str, - model: str, - kwargs: dict[str, Any], -) -> tuple[dict[str, Any], str, str, str, str, bool]: - decision = _evaluate_pre_call_decision(ctx, model, has_tools=bool(kwargs.get("tools"))) - action = decision.action - reason = decision.reason - target_model = decision.target_model - applied = action == "allow" - - if mode == "enforce": - if action == "stop": - ctx.record( - action="stop", - reason=reason, - model=model, - applied=True, - decision_mode=mode, - ) - _raise_stop_error(ctx, reason) - - if action == "switch_model" and target_model != model: - kwargs = {**kwargs, "model": target_model} - model = target_model - applied = True - elif action == "switch_model": - applied = False - - if action == "deny_tool": - if kwargs.get("tools"): - kwargs = {**kwargs, "tools": []} - applied = True - else: - applied = False - elif action != "allow": - logger.debug( - "harness observe decision: action=%s reason=%s model=%s target=%s", - action, - reason, - model, - target_model, - ) - applied = False - - return kwargs, model, action, reason, target_model, applied - - -def _update_context( - ctx: Any, 
- model: str, - prompt_tokens: int, - completion_tokens: int, - tool_call_count: int, - elapsed_ms: float, - *, - action: str = "allow", - action_reason: str | None = None, - action_model: str | None = None, - applied: bool | None = None, - decision_mode: str | None = None, -) -> None: - """Update a HarnessRunContext with call metrics.""" - cost = _estimate_cost(model, prompt_tokens, completion_tokens) - energy = _estimate_energy(model, prompt_tokens, completion_tokens) - - ctx.cost += cost - ctx.step_count += 1 - ctx.latency_used_ms += elapsed_ms - ctx.energy_used += energy - ctx.tool_calls += tool_call_count - - if ctx.budget_max is not None: - ctx.budget_remaining = ctx.budget_max - ctx.cost - - if applied is None: - applied = action == "allow" - if decision_mode is None: - decision_mode = ctx.mode - - if action == "allow": - ctx.record( - action="allow", - reason=ctx.mode, - model=model, - applied=applied, - decision_mode=decision_mode, - ) - return - ctx.record( - action=action, - reason=action_reason or ctx.mode, - model=action_model or model, - applied=applied, - decision_mode=decision_mode, - ) - - -# --------------------------------------------------------------------------- -# Stream wrappers -# --------------------------------------------------------------------------- - - -class _InstrumentedStreamBase: - """Shared stream-wrapper logic for sync and async OpenAI streams.""" - - __slots__ = ( - "_stream", - "_ctx", - "_model", - "_start_time", - "_pre_action", - "_pre_reason", - "_pre_model", - "_pre_applied", - "_decision_mode", - "_usage", - "_tool_call_count", - "_finalized", - ) - - def __init__( - self, - stream: Any, - ctx: Any, - model: str, - start_time: float, - pre_action: str = "allow", - pre_reason: str = "observe", - pre_model: str | None = None, - pre_applied: bool = True, - decision_mode: str = "observe", - ) -> None: - self._stream = stream - self._ctx = ctx - self._model = model - self._start_time = start_time - self._pre_action = 
pre_action - self._pre_reason = pre_reason - self._pre_model = pre_model or model - self._pre_applied = pre_applied - self._decision_mode = decision_mode - self._usage: Any = None - self._tool_call_count: int = 0 - self._finalized: bool = False - - def close(self) -> None: - self._finalize() - if hasattr(self._stream, "close"): - self._stream.close() - - @property - def response(self) -> Any: - return getattr(self._stream, "response", None) - - def _inspect_chunk(self, chunk: Any) -> None: - usage = getattr(chunk, "usage", None) - if usage is not None: - self._usage = usage - - choices = getattr(chunk, "choices", []) - if choices: - delta = getattr(choices[0], "delta", None) - if delta: - tool_calls = getattr(delta, "tool_calls", None) - if tool_calls: - for tc in tool_calls: - # A new tool call has an ``id``; subsequent deltas for - # the same call only have ``index``. - if getattr(tc, "id", None): - self._tool_call_count += 1 - - def _finalize(self) -> None: - if self._finalized: - return - self._finalized = True - - if self._ctx is None: - return - - elapsed_ms = (time.monotonic() - self._start_time) * 1000 - prompt_tokens = 0 - completion_tokens = 0 - if self._usage: - prompt_tokens = getattr(self._usage, "prompt_tokens", 0) or 0 - completion_tokens = getattr(self._usage, "completion_tokens", 0) or 0 - - _update_context( - self._ctx, - self._model, - prompt_tokens, - completion_tokens, - self._tool_call_count, - elapsed_ms, - action=self._pre_action, - action_reason=self._pre_reason, - action_model=self._pre_model, - applied=self._pre_applied, - decision_mode=self._decision_mode, - ) - - -class _InstrumentedStream(_InstrumentedStreamBase): - """Wraps an OpenAI sync ``Stream`` and tracks usage at stream end.""" - - __slots__ = () - - def __iter__(self) -> _InstrumentedStream: - return self - - def __next__(self) -> Any: - try: - chunk = next(self._stream) - self._inspect_chunk(chunk) - return chunk - except StopIteration: - self._finalize() - raise - - def 
__enter__(self) -> _InstrumentedStream: - if hasattr(self._stream, "__enter__"): - self._stream.__enter__() - return self - - def __exit__(self, *args: Any) -> bool: - self._finalize() - if hasattr(self._stream, "__exit__"): - return self._stream.__exit__(*args) # type: ignore[no-any-return] - return False - - -class _InstrumentedAsyncStream(_InstrumentedStreamBase): - """Wraps an OpenAI async ``AsyncStream`` and tracks usage at stream end.""" - - __slots__ = () - - def __aiter__(self) -> _InstrumentedAsyncStream: - return self - - async def __anext__(self) -> Any: - try: - chunk = await self._stream.__anext__() - self._inspect_chunk(chunk) - return chunk - except StopAsyncIteration: - self._finalize() - raise - - async def __aenter__(self) -> _InstrumentedAsyncStream: - if hasattr(self._stream, "__aenter__"): - await self._stream.__aenter__() - return self - - async def __aexit__(self, *args: Any) -> bool: - self._finalize() - if hasattr(self._stream, "__aexit__"): - return await self._stream.__aexit__(*args) # type: ignore[no-any-return] - return False - - -# --------------------------------------------------------------------------- -# Wrapper factories -# --------------------------------------------------------------------------- - - -@dataclass(frozen=True) -class _CallInterceptionState: - kwargs: dict[str, Any] - model: str - pre_action: str - pre_reason: str - pre_model: str - pre_applied: bool - is_stream: bool - start_time: float - - -def _prepare_call_interception( - *, - ctx: Any, - mode: str, - kwargs: dict[str, Any], -) -> _CallInterceptionState: - model: str = kwargs.get("model", "unknown") - pre_action = "allow" - pre_reason = mode - pre_model = model - pre_applied = True - - if ctx: - kwargs, model, pre_action, pre_reason, pre_model, pre_applied = _resolve_pre_call_decision( - ctx, - mode, - model, - kwargs, - ) - - is_stream: bool = bool(kwargs.get("stream", False)) - kwargs = _ensure_stream_usage(kwargs) - - return _CallInterceptionState( - 
kwargs=kwargs, - model=model, - pre_action=pre_action, - pre_reason=pre_reason, - pre_model=pre_model, - pre_applied=pre_applied, - is_stream=is_stream, - start_time=time.monotonic(), - ) - - -def _finalize_interception( - *, - ctx: Any, - mode: str, - state: _CallInterceptionState, - response: Any, - stream_wrapper: type[_InstrumentedStream] | type[_InstrumentedAsyncStream], -) -> Any: - if state.is_stream and ctx: - return stream_wrapper( - response, - ctx, - state.model, - state.start_time, - state.pre_action, - state.pre_reason, - state.pre_model, - state.pre_applied, - mode, - ) - - if (not state.is_stream) and ctx: - elapsed_ms = (time.monotonic() - state.start_time) * 1000 - prompt_tokens, completion_tokens = _extract_usage(response) - tool_call_count = _count_tool_calls_in_response(response) - _update_context( - ctx, - state.model, - prompt_tokens, - completion_tokens, - tool_call_count, - elapsed_ms, - action=state.pre_action, - action_reason=state.pre_reason, - action_model=state.pre_model, - applied=state.pre_applied, - decision_mode=mode, - ) - else: - logger.debug( - "harness %s: model=%s (no active run scope, metrics not tracked)", - mode, - state.model, - ) - - return response - - -def _make_patched_create(original_fn: Any) -> Any: - """Create a patched version of ``Completions.create``.""" - - @functools.wraps(original_fn) - def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: - from cascadeflow.harness.api import get_current_run, get_harness_config - - config = get_harness_config() - ctx = get_current_run() - mode = ctx.mode if ctx else config.mode - - if mode == "off": - return original_fn(self, *args, **kwargs) - - state = _prepare_call_interception(ctx=ctx, mode=mode, kwargs=kwargs) - - logger.debug( - "harness intercept: model=%s stream=%s mode=%s", - state.model, - state.is_stream, - mode, - ) - - response = original_fn(self, *args, **state.kwargs) - - return _finalize_interception( - ctx=ctx, - mode=mode, - state=state, - 
response=response, - stream_wrapper=_InstrumentedStream, - ) - - return wrapper - - -def _make_patched_async_create(original_fn: Any) -> Any: - """Create a patched version of ``AsyncCompletions.create``.""" - - @functools.wraps(original_fn) - async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: - from cascadeflow.harness.api import get_current_run, get_harness_config - - config = get_harness_config() - ctx = get_current_run() - mode = ctx.mode if ctx else config.mode - - if mode == "off": - return await original_fn(self, *args, **kwargs) - - state = _prepare_call_interception(ctx=ctx, mode=mode, kwargs=kwargs) - - logger.debug( - "harness intercept async: model=%s stream=%s mode=%s", - state.model, - state.is_stream, - mode, - ) - - response = await original_fn(self, *args, **state.kwargs) - - return _finalize_interception( - ctx=ctx, - mode=mode, - state=state, - response=response, - stream_wrapper=_InstrumentedAsyncStream, - ) - - return wrapper - - -# --------------------------------------------------------------------------- -# Public API (called by cascadeflow.harness.api) -# --------------------------------------------------------------------------- +logger = logging.getLogger("cascadeflow.harness") def patch_openai() -> bool: - """Patch the OpenAI Python client for harness instrumentation. - - Returns ``True`` if patching succeeded, ``False`` if openai is not - installed. Idempotent: safe to call multiple times. """ - global _openai_patched, _original_sync_create, _original_async_create - - if _openai_patched: - logger.debug("openai already patched, skipping") - return True - - try: - from openai.resources.chat.completions import AsyncCompletions, Completions - except ImportError: - logger.debug("openai package not available, skipping instrumentation") - return False + Placeholder for OpenAI SDK auto-instrumentation. 
- _original_sync_create = Completions.create - _original_async_create = AsyncCompletions.create - - Completions.create = _make_patched_create(_original_sync_create) # type: ignore[assignment] - AsyncCompletions.create = _make_patched_async_create( # type: ignore[assignment] - _original_async_create, - ) + Returns False in the core harness phase because patching is implemented in a + dedicated follow-up branch. + """ - _openai_patched = True - logger.info("openai client instrumented (sync + async)") - return True + logger.debug("openai instrumentation scaffold is not active in this branch") + return False def unpatch_openai() -> None: - """Restore original OpenAI client methods. - - Safe to call even if not patched. Used by ``reset()`` and tests. """ - global _openai_patched, _original_sync_create, _original_async_create - - if not _openai_patched: - return - - try: - from openai.resources.chat.completions import AsyncCompletions, Completions - except ImportError: - _openai_patched = False - return - - if _original_sync_create is not None: - Completions.create = _original_sync_create # type: ignore[assignment] - if _original_async_create is not None: - AsyncCompletions.create = _original_async_create # type: ignore[assignment] - - _original_sync_create = None - _original_async_create = None - _openai_patched = False - logger.info("openai client unpatched") - + Placeholder for removing OpenAI SDK instrumentation. 
+ """ -def is_patched() -> bool: - """Return whether the OpenAI client is currently patched.""" - return _openai_patched + return None diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 2d8ffcfc..43622fae 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -1,6 +1,9 @@ +import sys + import pytest import cascadeflow +import cascadeflow.harness.api as harness_api from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run @@ -26,6 +29,48 @@ def test_init_rejects_invalid_mode(): init(mode="invalid") # type: ignore[arg-type] +def test_init_idempotent_logs(monkeypatch, caplog): + monkeypatch.setattr(harness_api, "find_spec", lambda _: None) + with caplog.at_level("DEBUG", logger="cascadeflow.harness"): + init(mode="observe") + init(mode="observe") + assert any("idempotent" in rec.message for rec in caplog.records) + + +def test_env_aliases_and_false_bool(monkeypatch): + monkeypatch.setenv("CASCADEFLOW_MODE", "observe") + monkeypatch.setenv("CASCADEFLOW_BUDGET", "0.33") + monkeypatch.setenv("CASCADEFLOW_HARNESS_VERBOSE", "off") + monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_TOOL_CALLS", "4") + monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_LATENCY_MS", "1200") + monkeypatch.setenv("CASCADEFLOW_HARNESS_MAX_ENERGY", "0.01") + monkeypatch.setenv("CASCADEFLOW_HARNESS_COMPLIANCE", "gdpr") + + report = init() + cfg = get_harness_config() + + assert report.mode == "observe" + assert cfg.mode == "observe" + assert cfg.budget == 0.33 + assert cfg.verbose is False + assert cfg.max_tool_calls == 4 + assert cfg.max_latency_ms == 1200 + assert cfg.max_energy == 0.01 + assert cfg.compliance == "gdpr" + + +def test_init_invalid_json_env_raises(monkeypatch): + monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_WEIGHTS", "[1,2,3]") + with pytest.raises(ValueError): + init() + + +def test_init_non_numeric_env_raises(monkeypatch): + monkeypatch.setenv("CASCADEFLOW_HARNESS_BUDGET", "abc") + with pytest.raises(ValueError): + 
init() + + def test_run_uses_global_defaults_and_overrides(): init(mode="enforce", budget=2.0, max_tool_calls=5) @@ -41,6 +86,11 @@ def test_run_uses_global_defaults_and_overrides(): assert override_ctx.budget_remaining == 0.5 +def test_run_without_enter_exit_is_safe(): + ctx = run() + ctx.__exit__(None, None, None) + + @pytest.mark.asyncio async def test_nested_run_context_is_isolated(): init(mode="enforce", budget=1.0) @@ -58,6 +108,17 @@ async def test_nested_run_context_is_isolated(): assert get_current_run() is None +def test_sync_run_context_isolated(): + init(mode="enforce", budget=1.0) + with run(budget=0.6) as outer: + assert get_current_run() is outer + with run(budget=0.1) as inner: + assert get_current_run() is inner + assert inner.budget_max == 0.1 + assert get_current_run() is outer + assert get_current_run() is None + + def test_agent_decorator_keeps_sync_behavior_and_attaches_metadata(): @agent( budget=0.9, @@ -110,6 +171,7 @@ def test_run_record_and_trace_copy(): def test_init_reads_from_env(monkeypatch): monkeypatch.setenv("CASCADEFLOW_HARNESS_MODE", "observe") monkeypatch.setenv("CASCADEFLOW_HARNESS_BUDGET", "0.25") + monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_TARGETS", '{"quality_min": 0.9}') monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_WEIGHTS", '{"cost": 1.0}') report = init() @@ -118,6 +180,7 @@ def test_init_reads_from_env(monkeypatch): assert report.mode == "observe" assert cfg.mode == "observe" assert cfg.budget == 0.25 + assert cfg.kpi_targets == {"quality_min": 0.9} assert cfg.kpi_weights == {"cost": 1.0} assert report.config_sources["mode"] == "env" assert report.config_sources["budget"] == "env" @@ -141,6 +204,56 @@ def test_init_reads_from_config_file(tmp_path, monkeypatch): assert report.config_sources["budget"] == "file" +def test_init_reads_top_level_config_file_keys(tmp_path, monkeypatch): + config = tmp_path / "cascadeflow.json" + config.write_text('{"mode":"observe","budget":0.4,"max_tool_calls":2}') + 
monkeypatch.setenv("CASCADEFLOW_CONFIG", str(config)) + + report = init() + cfg = get_harness_config() + + assert cfg.mode == "observe" + assert cfg.budget == 0.4 + assert cfg.max_tool_calls == 2 + assert report.config_sources["mode"] == "file" + + +def test_init_non_dict_config_file_ignored(tmp_path, monkeypatch): + config = tmp_path / "cascadeflow.json" + config.write_text('["not-a-dict"]') + monkeypatch.setenv("CASCADEFLOW_CONFIG", str(config)) + + report = init() + cfg = get_harness_config() + + assert cfg.mode == "off" + assert cfg.budget is None + assert report.config_sources["mode"] == "default" + + +def test_init_file_loader_exception_falls_back_defaults(monkeypatch): + import cascadeflow.config_loader as cl + + monkeypatch.setattr(cl, "find_config", lambda: "broken.json") + + def _raise(_path): + raise RuntimeError("boom") + + monkeypatch.setattr(cl, "load_config", _raise) + + report = init() + cfg = get_harness_config() + assert cfg.mode == "off" + assert report.config_sources["mode"] == "default" + + +def test_init_config_loader_import_failure_falls_back(monkeypatch): + monkeypatch.setitem(sys.modules, "cascadeflow.config_loader", object()) + report = init(mode="observe") + assert report.mode == "observe" + assert report.config_sources["mode"] == "code" + + def test_precedence_code_over_env_over_file(tmp_path, monkeypatch): config = tmp_path / "cascadeflow.json" config.write_text('{"harness":{"mode":"off","budget":9.9}}') @@ -174,3 +287,24 @@ def test_reset_clears_state(): assert cfg.mode == "off" assert cfg.budget is None assert get_current_run() is None + + +def test_init_without_detected_sdks(monkeypatch): + monkeypatch.setattr(harness_api, "find_spec", lambda _: None) + report = init(mode="observe") + assert report.instrumented == [] + assert report.detected_but_not_instrumented == [] + + +def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch): + monkeypatch.setattr( + harness_api, + "find_spec", + lambda name: object() if name == 
"openai" else None, + ) + + import cascadeflow.harness.instrument as instrument + + monkeypatch.setattr(instrument, "patch_openai", lambda: True) + report = init(mode="observe") + assert report.instrumented == ["openai"] From dadd279a3f298414d827a58725bbe9b57919e351 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 25 Feb 2026 22:46:56 +0100 Subject: [PATCH 03/49] feat(harness): implement OpenAI Python client auto-instrumentation Replace the instrument.py scaffold with a full implementation that patches openai.resources.chat.completions.Completions.create (sync) and AsyncCompletions.create (async) for harness observe/enforce modes. Key capabilities: - Class-level patching of sync and async create methods - Streaming wrappers (_InstrumentedStream, _InstrumentedAsyncStream) that capture usage metrics after all chunks are consumed - Cost estimation from a built-in pricing table - Energy estimation using deterministic model coefficients - Tool call counting in both response and streaming chunks - Budget remaining tracking within scoped runs - Idempotent patching with clean unpatch/reset path Context tracking per call: - cost, step_count, latency_used_ms, energy_used, tool_calls - budget_remaining auto-updated when budget_max is set - model_used and decision trace via ctx.record() Added step_count, latency_used_ms, energy_used fields to HarnessRunContext in api.py. Hooked patch_openai into init() and unpatch_openai into reset(). 39 new tests covering: patch lifecycle, sync/async wrappers, sync/async stream wrappers, cost/energy estimation, nested run isolation, and edge cases (no usage, no choices, missing chunks). All 63 harness tests pass (39 instrument + 24 api). 
--- cascadeflow/harness/instrument.py | 529 +++++++++++++++++++- docs/strategy/agent-intelligence-v2-plan.md | 2 +- tests/test_harness_instrument.py | 378 -------------- 3 files changed, 520 insertions(+), 389 deletions(-) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index ad12bbdf..d0ac4187 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -1,25 +1,534 @@ +"""OpenAI Python client auto-instrumentation for cascadeflow harness. + +Patches ``openai.resources.chat.completions.Completions.create`` (sync) and +``AsyncCompletions.create`` (async) to intercept LLM calls for observe/enforce +modes. + +This module is called internally by ``cascadeflow.harness.init()``. Users +should not call ``patch_openai`` / ``unpatch_openai`` directly. + +Implementation notes: + - Patching is class-level (all current and future client instances). + - Patching is idempotent (safe to call multiple times). + - ``unpatch_openai()`` restores the original methods exactly. + - Streaming responses are wrapped to capture usage after completion. + - ``with_raw_response`` is NOT patched in V2 (known limitation). 
+""" + from __future__ import annotations +import functools import logging +import time +from typing import Any + +logger = logging.getLogger("cascadeflow.harness.instrument") + +# --------------------------------------------------------------------------- +# Module-level state for idempotent patch/unpatch +# --------------------------------------------------------------------------- + +_openai_patched: bool = False +_original_sync_create: Any = None +_original_async_create: Any = None + +# --------------------------------------------------------------------------- +# Pricing table (USD per 1M tokens: input, output) +# --------------------------------------------------------------------------- + +_PRICING: dict[str, tuple[float, float]] = { + "gpt-4o": (2.50, 10.00), + "gpt-4o-mini": (0.15, 0.60), + "gpt-5-mini": (0.20, 0.80), + "gpt-4-turbo": (10.00, 30.00), + "gpt-4": (30.00, 60.00), + "gpt-3.5-turbo": (0.50, 1.50), + "o1": (15.00, 60.00), + "o1-mini": (3.00, 12.00), + "o3-mini": (1.10, 4.40), +} +_DEFAULT_PRICING: tuple[float, float] = (2.50, 10.00) + +# --------------------------------------------------------------------------- +# Energy estimation coefficients (deterministic proxy, not live carbon data) +# energy_units = coefficient * (input_tokens + output_tokens * output_weight) +# --------------------------------------------------------------------------- + +_ENERGY_COEFFICIENTS: dict[str, float] = { + "gpt-4o": 1.0, + "gpt-4o-mini": 0.3, + "gpt-5-mini": 0.35, + "gpt-4-turbo": 1.5, + "gpt-4": 1.5, + "gpt-3.5-turbo": 0.2, + "o1": 2.0, + "o1-mini": 0.8, + "o3-mini": 0.5, +} +_DEFAULT_ENERGY_COEFFICIENT: float = 1.0 +_ENERGY_OUTPUT_WEIGHT: float = 1.5 + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: + """Estimate cost in USD from model name and token 
counts.""" + per_million = _PRICING.get(model, _DEFAULT_PRICING) + input_cost = (prompt_tokens / 1_000_000) * per_million[0] + output_cost = (completion_tokens / 1_000_000) * per_million[1] + return input_cost + output_cost + + +def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: + """Estimate energy units (deterministic proxy, not live carbon).""" + coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) + return coeff * (prompt_tokens + completion_tokens * _ENERGY_OUTPUT_WEIGHT) + + +def _count_tool_calls_in_response(response: Any) -> int: + """Count tool calls in a non-streaming ChatCompletion response.""" + choices = getattr(response, "choices", None) + if not choices: + return 0 + message = getattr(choices[0], "message", None) + if message is None: + return 0 + tool_calls = getattr(message, "tool_calls", None) + if tool_calls is None: + return 0 + return len(tool_calls) + + +def _extract_usage(response: Any) -> tuple[int, int]: + """Extract (prompt_tokens, completion_tokens) from a response.""" + usage = getattr(response, "usage", None) + if usage is None: + return 0, 0 + return ( + getattr(usage, "prompt_tokens", 0) or 0, + getattr(usage, "completion_tokens", 0) or 0, + ) + + +def _update_context( + ctx: Any, + model: str, + prompt_tokens: int, + completion_tokens: int, + tool_call_count: int, + elapsed_ms: float, +) -> None: + """Update a HarnessRunContext with call metrics.""" + cost = _estimate_cost(model, prompt_tokens, completion_tokens) + energy = _estimate_energy(model, prompt_tokens, completion_tokens) + + ctx.cost += cost + ctx.step_count += 1 + ctx.latency_used_ms += elapsed_ms + ctx.energy_used += energy + ctx.tool_calls += tool_call_count + + if ctx.budget_max is not None: + ctx.budget_remaining = ctx.budget_max - ctx.cost + + ctx.model_used = model + ctx.record(action="allow", reason="observe", model=model) + + +# --------------------------------------------------------------------------- +# Stream 
wrappers +# --------------------------------------------------------------------------- + + +class _InstrumentedStream: + """Wraps an OpenAI ``Stream`` to capture usage after all chunks are consumed.""" + + __slots__ = ( + "_stream", + "_ctx", + "_model", + "_start_time", + "_usage", + "_tool_call_count", + "_finalized", + ) + + def __init__( + self, + stream: Any, + ctx: Any, + model: str, + start_time: float, + ) -> None: + self._stream = stream + self._ctx = ctx + self._model = model + self._start_time = start_time + self._usage: Any = None + self._tool_call_count: int = 0 + self._finalized: bool = False + + # --- iteration --------------------------------------------------------- + + def __iter__(self) -> _InstrumentedStream: + return self + + def __next__(self) -> Any: + try: + chunk = next(self._stream) + self._inspect_chunk(chunk) + return chunk + except StopIteration: + self._finalize() + raise + + # --- context manager --------------------------------------------------- + + def __enter__(self) -> _InstrumentedStream: + if hasattr(self._stream, "__enter__"): + self._stream.__enter__() + return self + + def __exit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__exit__"): + return self._stream.__exit__(*args) # type: ignore[no-any-return] + return False + + # --- proxied attributes ------------------------------------------------ + + def close(self) -> None: + self._finalize() + if hasattr(self._stream, "close"): + self._stream.close() + + @property + def response(self) -> Any: + return getattr(self._stream, "response", None) + + # --- internals --------------------------------------------------------- + + def _inspect_chunk(self, chunk: Any) -> None: + usage = getattr(chunk, "usage", None) + if usage is not None: + self._usage = usage + + choices = getattr(chunk, "choices", []) + if choices: + delta = getattr(choices[0], "delta", None) + if delta: + tool_calls = getattr(delta, "tool_calls", None) + if tool_calls: + for tc in 
tool_calls: + # A new tool call has an ``id``; subsequent deltas + # for the same call only have ``index``. + if getattr(tc, "id", None): + self._tool_call_count += 1 + + def _finalize(self) -> None: + if self._finalized: + return + self._finalized = True + + if self._ctx is None: + return + + elapsed_ms = (time.monotonic() - self._start_time) * 1000 + prompt_tokens = 0 + completion_tokens = 0 + if self._usage: + prompt_tokens = getattr(self._usage, "prompt_tokens", 0) or 0 + completion_tokens = getattr(self._usage, "completion_tokens", 0) or 0 + + _update_context( + self._ctx, + self._model, + prompt_tokens, + completion_tokens, + self._tool_call_count, + elapsed_ms, + ) + + +class _InstrumentedAsyncStream: + """Wraps an OpenAI ``AsyncStream`` to capture usage after consumption.""" + + __slots__ = ( + "_stream", + "_ctx", + "_model", + "_start_time", + "_usage", + "_tool_call_count", + "_finalized", + ) + + def __init__( + self, + stream: Any, + ctx: Any, + model: str, + start_time: float, + ) -> None: + self._stream = stream + self._ctx = ctx + self._model = model + self._start_time = start_time + self._usage: Any = None + self._tool_call_count: int = 0 + self._finalized: bool = False + + # --- async iteration --------------------------------------------------- -logger = logging.getLogger("cascadeflow.harness") + def __aiter__(self) -> _InstrumentedAsyncStream: + return self + + async def __anext__(self) -> Any: + try: + chunk = await self._stream.__anext__() + self._inspect_chunk(chunk) + return chunk + except StopAsyncIteration: + self._finalize() + raise + + # --- async context manager --------------------------------------------- + + async def __aenter__(self) -> _InstrumentedAsyncStream: + if hasattr(self._stream, "__aenter__"): + await self._stream.__aenter__() + return self + + async def __aexit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__aexit__"): + return await self._stream.__aexit__(*args) # type: ignore[no-any-return] 
+ return False + + # --- proxied attributes ------------------------------------------------ + + def close(self) -> None: + self._finalize() + if hasattr(self._stream, "close"): + self._stream.close() + + @property + def response(self) -> Any: + return getattr(self._stream, "response", None) + + # --- internals --------------------------------------------------------- + + def _inspect_chunk(self, chunk: Any) -> None: + usage = getattr(chunk, "usage", None) + if usage is not None: + self._usage = usage + + choices = getattr(chunk, "choices", []) + if choices: + delta = getattr(choices[0], "delta", None) + if delta: + tool_calls = getattr(delta, "tool_calls", None) + if tool_calls: + for tc in tool_calls: + if getattr(tc, "id", None): + self._tool_call_count += 1 + + def _finalize(self) -> None: + if self._finalized: + return + self._finalized = True + + if self._ctx is None: + return + + elapsed_ms = (time.monotonic() - self._start_time) * 1000 + prompt_tokens = 0 + completion_tokens = 0 + if self._usage: + prompt_tokens = getattr(self._usage, "prompt_tokens", 0) or 0 + completion_tokens = getattr(self._usage, "completion_tokens", 0) or 0 + + _update_context( + self._ctx, + self._model, + prompt_tokens, + completion_tokens, + self._tool_call_count, + elapsed_ms, + ) + + +# --------------------------------------------------------------------------- +# Wrapper factories +# --------------------------------------------------------------------------- + + +def _make_patched_create(original_fn: Any) -> Any: + """Create a patched version of ``Completions.create``.""" + + @functools.wraps(original_fn) + def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + is_stream: bool = 
bool(kwargs.get("stream", False)) + start_time = time.monotonic() + + logger.debug("harness intercept: model=%s stream=%s mode=%s", model, is_stream, mode) + + response = original_fn(self, *args, **kwargs) + + if is_stream and ctx: + return _InstrumentedStream(response, ctx, model, start_time) + elif not is_stream and ctx: + elapsed_ms = (time.monotonic() - start_time) * 1000 + prompt_tokens, completion_tokens = _extract_usage(response) + tool_call_count = _count_tool_calls_in_response(response) + _update_context( + ctx, + model, + prompt_tokens, + completion_tokens, + tool_call_count, + elapsed_ms, + ) + else: + logger.debug( + "harness %s: model=%s (no active run scope, metrics not tracked)", + mode, + model, + ) + + return response + + return wrapper + + +def _make_patched_async_create(original_fn: Any) -> Any: + """Create a patched version of ``AsyncCompletions.create``.""" + + @functools.wraps(original_fn) + async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return await original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + is_stream: bool = bool(kwargs.get("stream", False)) + start_time = time.monotonic() + + logger.debug( + "harness intercept async: model=%s stream=%s mode=%s", + model, + is_stream, + mode, + ) + + response = await original_fn(self, *args, **kwargs) + + if is_stream and ctx: + return _InstrumentedAsyncStream(response, ctx, model, start_time) + elif not is_stream and ctx: + elapsed_ms = (time.monotonic() - start_time) * 1000 + prompt_tokens, completion_tokens = _extract_usage(response) + tool_call_count = _count_tool_calls_in_response(response) + _update_context( + ctx, + model, + prompt_tokens, + completion_tokens, + tool_call_count, + elapsed_ms, + ) + else: + logger.debug( + "harness %s: model=%s (no 
active run scope, metrics not tracked)", + mode, + model, + ) + + return response + + return wrapper + + +# --------------------------------------------------------------------------- +# Public API (called by cascadeflow.harness.api) +# --------------------------------------------------------------------------- def patch_openai() -> bool: - """ - Placeholder for OpenAI SDK auto-instrumentation. + """Patch the OpenAI Python client for harness instrumentation. - Returns False in the core harness phase because patching is implemented in a - dedicated follow-up branch. + Returns ``True`` if patching succeeded, ``False`` if openai is not + installed. Idempotent: safe to call multiple times. """ + global _openai_patched, _original_sync_create, _original_async_create + + if _openai_patched: + logger.debug("openai already patched, skipping") + return True + + try: + from openai.resources.chat.completions import AsyncCompletions, Completions + except ImportError: + logger.debug("openai package not available, skipping instrumentation") + return False - logger.debug("openai instrumentation scaffold is not active in this branch") - return False + _original_sync_create = Completions.create + _original_async_create = AsyncCompletions.create + + Completions.create = _make_patched_create(_original_sync_create) # type: ignore[assignment] + AsyncCompletions.create = _make_patched_async_create( # type: ignore[assignment] + _original_async_create, + ) + + _openai_patched = True + logger.info("openai client instrumented (sync + async)") + return True def unpatch_openai() -> None: + """Restore original OpenAI client methods. + + Safe to call even if not patched. Used by ``reset()`` and tests. """ - Placeholder for removing OpenAI SDK instrumentation. 
- """ + global _openai_patched, _original_sync_create, _original_async_create + + if not _openai_patched: + return + + try: + from openai.resources.chat.completions import AsyncCompletions, Completions + except ImportError: + _openai_patched = False + return + + if _original_sync_create is not None: + Completions.create = _original_sync_create # type: ignore[assignment] + if _original_async_create is not None: + AsyncCompletions.create = _original_async_create # type: ignore[assignment] + + _original_sync_create = None + _original_async_create = None + _openai_patched = False + logger.info("openai client unpatched") + - return None +def is_patched() -> bool: + """Return whether the OpenAI client is currently patched.""" + return _openai_patched diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 787bab32..d17d3df5 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -843,7 +843,7 @@ Branching model: Claim checklist (one owner per branch at a time): - [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` -- [ ] `feat/v2-openai-auto-instrumentation` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-enforce-actions` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-openai-agents-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 75368522..c2092e46 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -8,8 +8,6 @@ import pytest -pytest.importorskip("openai", reason="openai package required for instrumentation 
tests") - from cascadeflow.harness import init, reset, run from cascadeflow.harness.instrument import ( _InstrumentedAsyncStream, @@ -241,8 +239,6 @@ def test_model_used_and_trace(self) -> None: assert trace[0]["action"] == "allow" assert trace[0]["reason"] == "observe" assert trace[0]["model"] == "gpt-4o" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "observe" def test_off_mode_passthrough_no_tracking(self) -> None: init(mode="off") @@ -567,377 +563,3 @@ def test_stream_without_usage_in_any_chunk(self) -> None: assert ctx.cost == 0.0 # No usage data available assert ctx.step_count == 1 # Step still counted - - -# --------------------------------------------------------------------------- -# Fix: init(mode="off") unpatches previously patched client -# --------------------------------------------------------------------------- - - -class TestInitOffUnpatches: - def test_init_off_after_observe_unpatches(self) -> None: - init(mode="observe") - assert is_patched() - init(mode="off") - assert not is_patched() - - def test_init_off_when_not_patched_is_safe(self) -> None: - init(mode="off") - assert not is_patched() - - -# --------------------------------------------------------------------------- -# Fix: enforce mode — budget gate and correct trace reason -# --------------------------------------------------------------------------- - - -class TestEnforceMode: - def test_enforce_trace_records_enforce_reason(self) -> None: - init(mode="enforce") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=10.0) as ctx: - wrapper(MagicMock(), model="gpt-4o") - - trace = ctx.trace() - assert trace[0]["reason"] == "enforce" - - def test_observe_trace_records_observe_reason(self) -> None: - init(mode="observe") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=10.0) as ctx: - 
wrapper(MagicMock(), model="gpt-4o") - - trace = ctx.trace() - assert trace[0]["reason"] == "observe" - - def test_enforce_raises_on_budget_exhausted(self) -> None: - from cascadeflow.schema.exceptions import BudgetExceededError - - init(mode="enforce") - mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=0.001) as ctx: - # First call uses the tiny budget - wrapper(MagicMock(), model="gpt-4o") - # Second call should raise — budget exhausted - with pytest.raises(BudgetExceededError): - wrapper(MagicMock(), model="gpt-4o") - - def test_observe_does_not_raise_on_budget_exhausted(self) -> None: - init(mode="observe") - mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=0.001) as ctx: - wrapper(MagicMock(), model="gpt-4o") - # Second call should NOT raise — observe mode is permissive - wrapper(MagicMock(), model="gpt-4o") - - assert ctx.cost > ctx.budget_max # type: ignore[operator] - trace = ctx.trace() - assert trace[-1]["action"] == "stop" - assert trace[-1]["reason"] == "budget_exceeded" - assert trace[-1]["applied"] is False - assert trace[-1]["decision_mode"] == "observe" - - @pytest.mark.asyncio - async def test_enforce_raises_on_budget_exhausted_async(self) -> None: - from cascadeflow.schema.exceptions import BudgetExceededError - - init(mode="enforce") - mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) - original = AsyncMock(return_value=mock_resp) - wrapper = _make_patched_async_create(original) - - async with run(budget=0.001) as ctx: - await wrapper(MagicMock(), model="gpt-4o") - with pytest.raises(BudgetExceededError): - await wrapper(MagicMock(), model="gpt-4o") - - -# --------------------------------------------------------------------------- 
-# Enforce actions: switch_model, deny_tool, stop -# --------------------------------------------------------------------------- - - -class TestEnforceActions: - def test_enforce_switches_model_under_budget_pressure(self) -> None: - init(mode="enforce") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=1.0) as ctx: - ctx.cost = 0.85 - ctx.budget_remaining = 0.15 - wrapper(MagicMock(), model="gpt-4o") - - assert original.call_args[1]["model"] == "gpt-4o-mini" - trace = ctx.trace() - assert trace[0]["action"] == "switch_model" - assert trace[0]["reason"] == "budget_pressure" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - def test_observe_computes_switch_model_but_does_not_apply(self) -> None: - init(mode="observe") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=1.0) as ctx: - ctx.cost = 0.85 - ctx.budget_remaining = 0.15 - wrapper(MagicMock(), model="gpt-4o") - - assert original.call_args[1]["model"] == "gpt-4o" - trace = ctx.trace() - assert trace[0]["action"] == "switch_model" - assert trace[0]["reason"] == "budget_pressure" - assert trace[0]["model"] == "gpt-4o-mini" - assert trace[0]["applied"] is False - assert trace[0]["decision_mode"] == "observe" - - def test_enforce_denies_tools_when_cap_reached(self) -> None: - init(mode="enforce", max_tool_calls=0) - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(max_tool_calls=0) as ctx: - wrapper( - MagicMock(), - model="gpt-4o", - tools=[{"type": "function", "function": {"name": "t1"}}], - ) - - assert original.call_args[1]["tools"] == [] - trace = ctx.trace() - assert trace[0]["action"] == "deny_tool" - assert trace[0]["reason"] == "max_tool_calls_reached" - assert trace[0]["applied"] is True - assert 
trace[0]["decision_mode"] == "enforce" - - def test_observe_logs_deny_tool_but_keeps_tools(self) -> None: - init(mode="observe", max_tool_calls=0) - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - tools = [{"type": "function", "function": {"name": "t1"}}] - with run(max_tool_calls=0) as ctx: - wrapper(MagicMock(), model="gpt-4o", tools=tools) - - assert original.call_args[1]["tools"] == tools - trace = ctx.trace() - assert trace[0]["action"] == "deny_tool" - assert trace[0]["reason"] == "max_tool_calls_reached" - assert trace[0]["applied"] is False - assert trace[0]["decision_mode"] == "observe" - - def test_enforce_stops_when_latency_limit_exceeded_at_fastest_model(self) -> None: - from cascadeflow.schema.exceptions import HarnessStopError - - init(mode="enforce") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(max_latency_ms=1.0) as ctx: - ctx.latency_used_ms = 5.0 - with pytest.raises(HarnessStopError, match="latency_limit_exceeded"): - wrapper(MagicMock(), model="gpt-3.5-turbo") - - original.assert_not_called() - trace = ctx.trace() - assert trace[0]["action"] == "stop" - assert trace[0]["reason"] == "latency_limit_exceeded" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - def test_enforce_stops_when_energy_limit_exceeded_at_lowest_energy_model(self) -> None: - from cascadeflow.schema.exceptions import HarnessStopError - - init(mode="enforce") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(max_energy=1.0) as ctx: - ctx.energy_used = 5.0 - with pytest.raises(HarnessStopError, match="energy_limit_exceeded"): - wrapper(MagicMock(), model="gpt-3.5-turbo") - - original.assert_not_called() - trace = ctx.trace() - assert trace[0]["action"] == "stop" - assert trace[0]["reason"] == 
"energy_limit_exceeded" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - @pytest.mark.asyncio - async def test_async_enforce_denies_tools_when_cap_reached(self) -> None: - init(mode="enforce", max_tool_calls=0) - mock_resp = _mock_completion() - original = AsyncMock(return_value=mock_resp) - wrapper = _make_patched_async_create(original) - - async with run(max_tool_calls=0) as ctx: - await wrapper( - MagicMock(), - model="gpt-4o", - tools=[{"type": "function", "function": {"name": "t1"}}], - ) - - assert original.call_args[1]["tools"] == [] - trace = ctx.trace() - assert trace[0]["action"] == "deny_tool" - assert trace[0]["reason"] == "max_tool_calls_reached" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - def test_enforce_switches_model_for_compliance_policy(self) -> None: - init(mode="enforce", compliance="strict") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run() as ctx: - wrapper(MagicMock(), model="gpt-4o-mini") - - assert original.call_args[1]["model"] == "gpt-4o" - trace = ctx.trace() - assert trace[0]["action"] == "switch_model" - assert trace[0]["reason"] == "compliance_model_policy" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - def test_enforce_denies_tool_for_strict_compliance(self) -> None: - init(mode="enforce", compliance="strict") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run() as ctx: - wrapper( - MagicMock(), - model="gpt-4o", - tools=[{"type": "function", "function": {"name": "t1"}}], - ) - - assert original.call_args[1]["tools"] == [] - trace = ctx.trace() - assert trace[0]["action"] == "deny_tool" - assert trace[0]["reason"] == "compliance_tool_restriction" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - def 
test_observe_logs_compliance_switch_without_applying(self) -> None: - init(mode="observe", compliance="strict") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run() as ctx: - wrapper(MagicMock(), model="gpt-4o-mini") - - assert original.call_args[1]["model"] == "gpt-4o-mini" - trace = ctx.trace() - assert trace[0]["action"] == "switch_model" - assert trace[0]["reason"] == "compliance_model_policy" - assert trace[0]["model"] == "gpt-4o" - assert trace[0]["applied"] is False - assert trace[0]["decision_mode"] == "observe" - - def test_enforce_switches_model_using_kpi_weights(self) -> None: - init(mode="enforce", kpi_weights={"quality": 1.0}) - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run() as ctx: - wrapper(MagicMock(), model="gpt-3.5-turbo") - - assert original.call_args[1]["model"] == "o1" - trace = ctx.trace() - assert trace[0]["action"] == "switch_model" - assert trace[0]["reason"] == "kpi_weight_optimization" - assert trace[0]["applied"] is True - assert trace[0]["decision_mode"] == "enforce" - - def test_observe_logs_kpi_switch_without_applying(self) -> None: - init(mode="observe", kpi_weights={"quality": 1.0}) - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run() as ctx: - wrapper(MagicMock(), model="gpt-3.5-turbo") - - assert original.call_args[1]["model"] == "gpt-3.5-turbo" - trace = ctx.trace() - assert trace[0]["action"] == "switch_model" - assert trace[0]["reason"] == "kpi_weight_optimization" - assert trace[0]["model"] == "o1" - assert trace[0]["applied"] is False - assert trace[0]["decision_mode"] == "observe" - - -# --------------------------------------------------------------------------- -# Fix: stream_options.include_usage auto-injection -# 
--------------------------------------------------------------------------- - - -class TestStreamUsageInjection: - def test_stream_injects_include_usage(self) -> None: - init(mode="observe") - mock_stream = iter([_mock_stream_chunk("hi", usage=_mock_usage(50, 25))]) - original = MagicMock(return_value=mock_stream) - wrapper = _make_patched_create(original) - - with run(budget=1.0) as ctx: - result = wrapper(MagicMock(), model="gpt-4o-mini", stream=True) - list(result) - - # Check the original was called with stream_options injected - call_kwargs = original.call_args[1] - assert call_kwargs.get("stream_options", {}).get("include_usage") is True - - def test_stream_preserves_existing_stream_options(self) -> None: - init(mode="observe") - mock_stream = iter([_mock_stream_chunk("hi", usage=_mock_usage(50, 25))]) - original = MagicMock(return_value=mock_stream) - wrapper = _make_patched_create(original) - - with run(budget=1.0) as ctx: - result = wrapper( - MagicMock(), - model="gpt-4o-mini", - stream=True, - stream_options={"include_usage": True}, - ) - list(result) - - call_kwargs = original.call_args[1] - assert call_kwargs["stream_options"]["include_usage"] is True - - def test_non_stream_does_not_inject_stream_options(self) -> None: - init(mode="observe") - mock_resp = _mock_completion() - original = MagicMock(return_value=mock_resp) - wrapper = _make_patched_create(original) - - with run(budget=1.0) as ctx: - wrapper(MagicMock(), model="gpt-4o-mini") - - call_kwargs = original.call_args[1] - assert "stream_options" not in call_kwargs From 75ff333ba6bb8afedcc879045e4290a66537d2db Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 07:21:47 +0100 Subject: [PATCH 04/49] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94?= =?UTF-8?q?=20off-mode=20unpatch,=20enforce=20budget=20gate,=20stream=20us?= =?UTF-8?q?age=20injection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - init(mode="off") now calls 
unpatch_openai() if previously patched - Trace records actual mode (observe/enforce) instead of always "observe" - Enforce mode raises BudgetExceededError pre-call when budget exhausted - Auto-inject stream_options.include_usage=True for streaming requests - Add pytest.importorskip("openai") for graceful skip when not installed - 10 new tests covering all four fixes (73 total pass) --- cascadeflow/harness/api.py | 5 ++ cascadeflow/harness/instrument.py | 44 ++++++++- tests/test_harness_instrument.py | 143 ++++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+), 1 deletion(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 10d0e29a..88c9c579 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -343,6 +343,11 @@ def init( if patch_openai(): instrumented.append("openai") + elif validated_mode == "off": + from cascadeflow.harness.instrument import is_patched, unpatch_openai + + if is_patched(): + unpatch_openai() if sdk_presence["anthropic"]: detected_but_not_instrumented.append("anthropic") diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index d0ac4187..c02200f7 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -73,6 +73,22 @@ # --------------------------------------------------------------------------- +def _ensure_stream_usage(kwargs: dict[str, Any]) -> dict[str, Any]: + """Inject ``stream_options.include_usage=True`` for streaming requests. + + OpenAI only sends usage data in the final stream chunk when this option + is set. Without it the harness would record zero cost for every + streaming call. 
+ """ + if not kwargs.get("stream", False): + return kwargs + stream_options = kwargs.get("stream_options") or {} + if not stream_options.get("include_usage"): + stream_options = {**stream_options, "include_usage": True} + kwargs = {**kwargs, "stream_options": stream_options} + return kwargs + + def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: """Estimate cost in USD from model name and token counts.""" per_million = _PRICING.get(model, _DEFAULT_PRICING) @@ -112,6 +128,20 @@ def _extract_usage(response: Any) -> tuple[int, int]: ) +def _check_budget_pre_call(ctx: Any) -> None: + """Raise BudgetExceededError in enforce mode if budget is already exhausted.""" + if ctx.mode != "enforce": + return + if ctx.budget_max is not None and ctx.cost >= ctx.budget_max: + from cascadeflow.schema.exceptions import BudgetExceededError + + remaining = ctx.budget_max - ctx.cost + raise BudgetExceededError( + f"Budget exhausted: spent ${ctx.cost:.4f} of ${ctx.budget_max:.4f} max", + remaining=remaining, + ) + + def _update_context( ctx: Any, model: str, @@ -134,7 +164,7 @@ def _update_context( ctx.budget_remaining = ctx.budget_max - ctx.cost ctx.model_used = model - ctx.record(action="allow", reason="observe", model=model) + ctx.record(action="allow", reason=ctx.mode, model=model) # --------------------------------------------------------------------------- @@ -381,8 +411,14 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: model: str = kwargs.get("model", "unknown") is_stream: bool = bool(kwargs.get("stream", False)) + + if ctx: + _check_budget_pre_call(ctx) + start_time = time.monotonic() + kwargs = _ensure_stream_usage(kwargs) + logger.debug("harness intercept: model=%s stream=%s mode=%s", model, is_stream, mode) response = original_fn(self, *args, **kwargs) @@ -429,8 +465,14 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: model: str = kwargs.get("model", "unknown") is_stream: bool = bool(kwargs.get("stream", False)) 
+ + if ctx: + _check_budget_pre_call(ctx) + start_time = time.monotonic() + kwargs = _ensure_stream_usage(kwargs) + logger.debug( "harness intercept async: model=%s stream=%s mode=%s", model, diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index c2092e46..12f0f938 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -8,6 +8,8 @@ import pytest +pytest.importorskip("openai", reason="openai package required for instrumentation tests") + from cascadeflow.harness import init, reset, run from cascadeflow.harness.instrument import ( _InstrumentedAsyncStream, @@ -563,3 +565,144 @@ def test_stream_without_usage_in_any_chunk(self) -> None: assert ctx.cost == 0.0 # No usage data available assert ctx.step_count == 1 # Step still counted + + +# --------------------------------------------------------------------------- +# Fix: init(mode="off") unpatches previously patched client +# --------------------------------------------------------------------------- + + +class TestInitOffUnpatches: + def test_init_off_after_observe_unpatches(self) -> None: + init(mode="observe") + assert is_patched() + init(mode="off") + assert not is_patched() + + def test_init_off_when_not_patched_is_safe(self) -> None: + init(mode="off") + assert not is_patched() + + +# --------------------------------------------------------------------------- +# Fix: enforce mode — budget gate and correct trace reason +# --------------------------------------------------------------------------- + + +class TestEnforceMode: + def test_enforce_trace_records_enforce_reason(self) -> None: + init(mode="enforce") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=10.0) as ctx: + wrapper(MagicMock(), model="gpt-4o") + + trace = ctx.trace() + assert trace[0]["reason"] == "enforce" + + def test_observe_trace_records_observe_reason(self) -> None: + init(mode="observe") 
+ mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=10.0) as ctx: + wrapper(MagicMock(), model="gpt-4o") + + trace = ctx.trace() + assert trace[0]["reason"] == "observe" + + def test_enforce_raises_on_budget_exhausted(self) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError + + init(mode="enforce") + mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=0.001) as ctx: + # First call uses the tiny budget + wrapper(MagicMock(), model="gpt-4o") + # Second call should raise — budget exhausted + with pytest.raises(BudgetExceededError): + wrapper(MagicMock(), model="gpt-4o") + + def test_observe_does_not_raise_on_budget_exhausted(self) -> None: + init(mode="observe") + mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=0.001) as ctx: + wrapper(MagicMock(), model="gpt-4o") + # Second call should NOT raise — observe mode is permissive + wrapper(MagicMock(), model="gpt-4o") + + assert ctx.cost > ctx.budget_max # type: ignore[operator] + + @pytest.mark.asyncio + async def test_enforce_raises_on_budget_exhausted_async(self) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError + + init(mode="enforce") + mock_resp = _mock_completion(prompt_tokens=1_000_000, completion_tokens=1_000_000) + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_async_create(original) + + async with run(budget=0.001) as ctx: + await wrapper(MagicMock(), model="gpt-4o") + with pytest.raises(BudgetExceededError): + await wrapper(MagicMock(), model="gpt-4o") + + +# --------------------------------------------------------------------------- +# Fix: stream_options.include_usage 
auto-injection +# --------------------------------------------------------------------------- + + +class TestStreamUsageInjection: + def test_stream_injects_include_usage(self) -> None: + init(mode="observe") + mock_stream = iter([_mock_stream_chunk("hi", usage=_mock_usage(50, 25))]) + original = MagicMock(return_value=mock_stream) + wrapper = _make_patched_create(original) + + with run(budget=1.0) as ctx: + result = wrapper(MagicMock(), model="gpt-4o-mini", stream=True) + list(result) + + # Check the original was called with stream_options injected + call_kwargs = original.call_args[1] + assert call_kwargs.get("stream_options", {}).get("include_usage") is True + + def test_stream_preserves_existing_stream_options(self) -> None: + init(mode="observe") + mock_stream = iter([_mock_stream_chunk("hi", usage=_mock_usage(50, 25))]) + original = MagicMock(return_value=mock_stream) + wrapper = _make_patched_create(original) + + with run(budget=1.0) as ctx: + result = wrapper( + MagicMock(), + model="gpt-4o-mini", + stream=True, + stream_options={"include_usage": True}, + ) + list(result) + + call_kwargs = original.call_args[1] + assert call_kwargs["stream_options"]["include_usage"] is True + + def test_non_stream_does_not_inject_stream_options(self) -> None: + init(mode="observe") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="gpt-4o-mini") + + call_kwargs = original.call_args[1] + assert "stream_options" not in call_kwargs From 1f0fad0bf0874a666924630837c9b0ebff39544b Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 25 Feb 2026 22:52:22 +0100 Subject: [PATCH 05/49] Add OpenAI Agents SDK harness integration (opt-in) --- cascadeflow/integrations/openai_agents.py | 224 ++++++++---------- docs/strategy/agent-intelligence-v2-plan.md | 2 +- .../integrations/openai_agents_harness.py | 6 +- pyproject.toml | 8 +- 
tests/test_openai_agents_integration.py | 41 +--- 5 files changed, 105 insertions(+), 176 deletions(-) diff --git a/cascadeflow/integrations/openai_agents.py b/cascadeflow/integrations/openai_agents.py index cbce9b96..ffb0af8d 100644 --- a/cascadeflow/integrations/openai_agents.py +++ b/cascadeflow/integrations/openai_agents.py @@ -15,19 +15,6 @@ from typing import TYPE_CHECKING, Any, AsyncIterator, Optional from cascadeflow.harness import get_current_run -from cascadeflow.harness.pricing import ( - OPENAI_MODEL_POOL, -) -from cascadeflow.harness.pricing import ( - estimate_cost as _estimate_shared_cost, -) -from cascadeflow.harness.pricing import ( - estimate_energy as _estimate_shared_energy, -) -from cascadeflow.harness.pricing import ( - model_total_price as _shared_model_total_price, -) -from cascadeflow.schema.exceptions import BudgetExceededError logger = logging.getLogger("cascadeflow.harness.openai_agents") @@ -39,6 +26,7 @@ from agents.models.interface import Model, ModelProvider, ModelTracing from agents.tool import Tool from openai.types.responses.response_prompt_param import ResponsePromptParam + from openai.types.responses.response_text_config_param import ResponseTextConfigParam else: Model = object ModelProvider = object @@ -47,6 +35,7 @@ ModelResponse = Any Tool = Any ResponsePromptParam = Any + ResponseTextConfigParam = Any @dataclass @@ -69,16 +58,36 @@ class OpenAIAgentsIntegrationConfig: fail_open: bool = True -def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: - return _estimate_shared_cost(model, input_tokens, output_tokens) +# Approximate pricing (USD per 1M tokens: input, output). +_PRICING_USD_PER_M = { + "gpt-4o": (2.50, 10.00), + "gpt-4o-mini": (0.15, 0.60), + "gpt-5": (1.25, 10.00), + "gpt-5-mini": (0.20, 0.80), + "gpt-4-turbo": (10.00, 30.00), +} +_DEFAULT_PRICING_USD_PER_M = (2.50, 10.00) + +# Deterministic proxy coefficients for energy tracking. 
+_ENERGY_COEFFICIENTS = { + "gpt-4o": 1.0, + "gpt-4o-mini": 0.3, + "gpt-5": 1.2, + "gpt-5-mini": 0.35, + "gpt-4-turbo": 1.5, +} +_DEFAULT_ENERGY_COEFFICIENT = 1.0 +_ENERGY_OUTPUT_WEIGHT = 1.5 -def _estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: - return _estimate_shared_energy(model, input_tokens, output_tokens) +def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: + in_price, out_price = _PRICING_USD_PER_M.get(model, _DEFAULT_PRICING_USD_PER_M) + return (input_tokens / 1_000_000.0) * in_price + (output_tokens / 1_000_000.0) * out_price -def _total_model_price(model: str) -> float: - return _shared_model_total_price(model) +def _estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: + coefficient = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) + return coefficient * (input_tokens + (output_tokens * _ENERGY_OUTPUT_WEIGHT)) def _extract_usage_tokens(usage: Any) -> tuple[int, int]: @@ -121,41 +130,6 @@ def _safe_record(action: str, reason: str, model: Optional[str]) -> None: run.record(action=action, reason=reason, model=model) -def _apply_run_metrics( - *, - model_name: str, - response: Any, - elapsed_ms: float, - pre_action: str, - allow_reason: str, -) -> None: - run = get_current_run() - if run is None: - return - - usage = getattr(response, "usage", None) if response is not None else None - input_tokens, output_tokens = _extract_usage_tokens(usage) - tool_calls = _count_tool_calls(getattr(response, "output", None)) if response is not None else 0 - - run.step_count += 1 - run.latency_used_ms += elapsed_ms - run.energy_used += _estimate_energy(model_name, input_tokens, output_tokens) - run.cost += _estimate_cost(model_name, input_tokens, output_tokens) - run.tool_calls += tool_calls - - if run.budget_max is not None: - run.budget_remaining = run.budget_max - run.cost - - if pre_action == "deny_tool": - run.last_action = "deny_tool" - run.model_used = model_name - else: - 
run.record("allow", allow_reason, model_name) - - if run.mode == "enforce" and run.budget_remaining is not None and run.budget_remaining <= 0: - logger.info("openai-agents step exhausted budget; next step will be blocked") - - class CascadeFlowModelProvider(ModelProvider): # type: ignore[misc] """ OpenAI Agents SDK ModelProvider with cascadeflow harness awareness. @@ -185,15 +159,13 @@ def _create_default_provider(self) -> Any: return OpenAIProvider() - def _initial_model_candidate(self, requested_model: Optional[str]) -> str: - if requested_model: - return requested_model - if self._config.model_candidates: - return self._config.model_candidates[0] - return "gpt-4o-mini" - def _resolve_model(self, requested_model: Optional[str]) -> str: - candidate = self._initial_model_candidate(requested_model) + if requested_model: + candidate = requested_model + elif self._config.model_candidates: + candidate = self._config.model_candidates[0] + else: + candidate = "gpt-4o-mini" run = get_current_run() if run is None: @@ -203,10 +175,7 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: if run.budget_remaining is not None and run.budget_remaining <= 0: run.record("stop", "budget_exceeded", candidate) - raise BudgetExceededError( - "cascadeflow harness budget exceeded", - remaining=run.budget_remaining, - ) + raise RuntimeError("cascadeflow harness budget exceeded") if not self._config.model_candidates or run.budget_max is None or run.budget_max <= 0: return candidate @@ -216,13 +185,9 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: # Under budget pressure, switch to the cheapest configured candidate. 
if run.budget_remaining / run.budget_max < 0.2: - compatible_candidates = [ - name for name in self._config.model_candidates if name in OPENAI_MODEL_POOL - ] - candidates = compatible_candidates or self._config.model_candidates cheapest = min( - candidates, - key=_total_model_price, + self._config.model_candidates, + key=lambda name: sum(_PRICING_USD_PER_M.get(name, _DEFAULT_PRICING_USD_PER_M)), ) if cheapest != candidate: run.record("switch_model", "budget_pressure", cheapest) @@ -231,32 +196,8 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: return candidate def get_model(self, model_name: str | None) -> Model: - fallback_model = self._initial_model_candidate(model_name) - selected_model = fallback_model - - try: - selected_model = self._resolve_model(model_name) - except BudgetExceededError: - raise - except Exception: - if not self._config.fail_open: - raise - logger.exception( - "openai-agents model resolution failed; falling back to requested model (fail-open)" - ) - selected_model = fallback_model - - try: - base_model = self._base_provider.get_model(selected_model) - except Exception: - if not self._config.fail_open: - raise - logger.exception( - "openai-agents provider.get_model failed; retrying with fallback model (fail-open)" - ) - selected_model = fallback_model - base_model = self._base_provider.get_model(selected_model) - + selected_model = self._resolve_model(model_name) + base_model = self._base_provider.get_model(selected_model) return _CascadeFlowWrappedModel( base_model=base_model, model_name=selected_model, @@ -305,18 +246,36 @@ def _update_run_metrics( elapsed_ms: float, pre_action: str, ) -> None: - _apply_run_metrics( - model_name=self._model_name, - response=response, - elapsed_ms=elapsed_ms, - pre_action=pre_action, - allow_reason="openai_agents_step", - ) + run = get_current_run() + if run is None: + return + + usage = getattr(response, "usage", None) + input_tokens, output_tokens = _extract_usage_tokens(usage) + 
tool_calls = _count_tool_calls(getattr(response, "output", None)) + + run.step_count += 1 + run.latency_used_ms += elapsed_ms + run.energy_used += _estimate_energy(self._model_name, input_tokens, output_tokens) + run.cost += _estimate_cost(self._model_name, input_tokens, output_tokens) + run.tool_calls += tool_calls + + if run.budget_max is not None: + run.budget_remaining = run.budget_max - run.cost + + if pre_action == "deny_tool": + run.last_action = "deny_tool" + run.model_used = self._model_name + else: + run.record("allow", "openai_agents_step", self._model_name) + + if run.mode == "enforce" and run.budget_remaining is not None and run.budget_remaining <= 0: + run.record("stop", "budget_exceeded", self._model_name) async def get_response( self, system_instructions: str | None, - input: str | list[Any], # noqa: A002 - required by OpenAI Agents SDK Model interface + input_data: str | list[Any], model_settings: ModelSettings, tools: list[Tool], output_schema: Any | None, @@ -332,7 +291,7 @@ async def get_response( response = await self._base_model.get_response( system_instructions=system_instructions, - input=input, + input=input_data, model_settings=model_settings, tools=gated_tools, output_schema=output_schema, @@ -346,9 +305,7 @@ async def get_response( elapsed_ms = (time.monotonic() - started_at) * 1000.0 try: - self._update_run_metrics( - response=response, elapsed_ms=elapsed_ms, pre_action=pre_action - ) + self._update_run_metrics(response=response, elapsed_ms=elapsed_ms, pre_action=pre_action) except Exception: if self._config.fail_open: logger.exception("openai-agents harness metric update failed (fail-open)") @@ -360,7 +317,7 @@ async def get_response( def stream_response( self, system_instructions: str | None, - input: str | list[Any], # noqa: A002 - required by OpenAI Agents SDK Model interface + input_data: str | list[Any], model_settings: ModelSettings, tools: list[Tool], output_schema: Any | None, @@ -370,13 +327,14 @@ def stream_response( 
previous_response_id: str | None, conversation_id: str | None, prompt: ResponsePromptParam | None, + text_format: ResponseTextConfigParam | None, ) -> AsyncIterator[Any]: gated_tools, pre_action = self._gate_tools(tools) started_at = time.monotonic() stream = self._base_model.stream_response( system_instructions=system_instructions, - input=input, + input=input_data, model_settings=model_settings, tools=gated_tools, output_schema=output_schema, @@ -385,6 +343,7 @@ def stream_response( previous_response_id=previous_response_id, conversation_id=conversation_id, prompt=prompt, + text_format=text_format, ) return _CascadeFlowStreamWrapper( stream=stream, @@ -441,13 +400,31 @@ async def _finalize(self) -> None: response = self._last_response try: - _apply_run_metrics( - model_name=self._model_name, - response=response, - elapsed_ms=elapsed_ms, - pre_action=self._pre_action, - allow_reason="openai_agents_stream_step", - ) + if response is None: + run.step_count += 1 + run.latency_used_ms += elapsed_ms + if self._pre_action == "deny_tool": + run.record("deny_tool", "max_tool_calls_reached", self._model_name) + else: + run.record("allow", "openai_agents_stream_step", self._model_name) + return + + usage = getattr(response, "usage", None) + input_tokens, output_tokens = _extract_usage_tokens(usage) + tool_calls = _count_tool_calls(getattr(response, "output", None)) + + run.step_count += 1 + run.latency_used_ms += elapsed_ms + run.energy_used += _estimate_energy(self._model_name, input_tokens, output_tokens) + run.cost += _estimate_cost(self._model_name, input_tokens, output_tokens) + run.tool_calls += tool_calls + if run.budget_max is not None: + run.budget_remaining = run.budget_max - run.cost + + if self._pre_action == "deny_tool": + run.record("deny_tool", "max_tool_calls_reached", self._model_name) + else: + run.record("allow", "openai_agents_stream_step", self._model_name) except Exception: if self._fail_open: logger.exception("openai-agents stream metric update failed 
(fail-open)") @@ -476,12 +453,3 @@ def create_openai_agents_provider( def is_openai_agents_sdk_available() -> bool: return OPENAI_AGENTS_SDK_AVAILABLE - - -__all__ = [ - "OPENAI_AGENTS_SDK_AVAILABLE", - "OpenAIAgentsIntegrationConfig", - "CascadeFlowModelProvider", - "create_openai_agents_provider", - "is_openai_agents_sdk_available", -] diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index d17d3df5..0d815af6 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -845,7 +845,7 @@ Claim checklist (one owner per branch at a time): - [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` - [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-enforce-actions` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-openai-agents-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-langchain-harness-extension` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` diff --git a/examples/integrations/openai_agents_harness.py b/examples/integrations/openai_agents_harness.py index ac9d6c68..69ea6bcd 100644 --- a/examples/integrations/openai_agents_harness.py +++ b/examples/integrations/openai_agents_harness.py @@ -17,7 +17,7 @@ async def main() -> None: except ImportError as exc: raise SystemExit( "OpenAI Agents SDK is not installed. 
" - 'Install with: pip install "cascadeflow[openai,openai-agents]"' + "Install with: pip install \"cascadeflow[openai,openai-agents]\"" ) from exc from cascadeflow import init, run @@ -44,9 +44,7 @@ async def main() -> None: run_config = RunConfig(model_provider=provider) with run(budget=0.5, max_tool_calls=3) as session: - result = await Runner.run( - agent, "Summarize why model routing helps agent budgets.", run_config=run_config - ) + result = await Runner.run(agent, "Summarize why model routing helps agent budgets.", run_config=run_config) print("=== Result ===") print(result.final_output) diff --git a/pyproject.toml b/pyproject.toml index eaadb6b7..8cd6ede5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,14 +92,8 @@ semantic = [ # OpenClaw integration (auto-enables FastEmbed for semantic routing) openclaw = ["fastembed>=0.7.0"] -# CrewAI harness integration (opt-in) -crewai = ["crewai>=1.5.0"] - # OpenAI Agents SDK integration (opt-in) -openai-agents = [ - "openai-agents>=0.8.4; python_version < '3.10'", - "openai-agents>=0.9.0; python_version >= '3.10'", -] +openai-agents = ["openai-agents>=0.9.0"] # Development tools (includes rich for terminal output) dev = [ diff --git a/tests/test_openai_agents_integration.py b/tests/test_openai_agents_integration.py index b2644036..2886e6f2 100644 --- a/tests/test_openai_agents_integration.py +++ b/tests/test_openai_agents_integration.py @@ -6,7 +6,6 @@ CascadeFlowModelProvider, OpenAIAgentsIntegrationConfig, ) -from cascadeflow.schema.exceptions import BudgetExceededError def setup_function() -> None: @@ -80,7 +79,7 @@ def get_model(self, model_name): def _response_call_kwargs(): return { "system_instructions": None, - "input": "hello", + "input_data": "hello", "model_settings": None, "tools": [], "output_schema": None, @@ -105,8 +104,6 @@ async def test_metrics_updated_from_get_response(): with run(budget=2.0) as ctx: await wrapped.get_response(**_response_call_kwargs()) - assert model.last_kwargs is not None - 
assert model.last_kwargs["input"] == "hello" assert ctx.step_count == 1 assert ctx.tool_calls == 1 assert ctx.cost > 0 @@ -152,35 +149,6 @@ def test_switches_to_cheapest_candidate_under_budget_pressure(): assert ctx.last_action == "switch_model" -def test_budget_exceeded_raises_cascadeflow_budget_error(): - init(mode="enforce", budget=1.0) - - response = _FakeResponse() - model = _FakeModel(response=response) - provider = CascadeFlowModelProvider(base_provider=_FakeBaseProvider(model)) - - with run(budget=1.0) as ctx: - ctx.budget_remaining = 0.0 - with pytest.raises(BudgetExceededError): - provider.get_model("gpt-4o-mini") - - -def test_fail_open_falls_back_when_model_resolution_errors(monkeypatch): - response = _FakeResponse() - model = _FakeModel(response=response) - base_provider = _FakeBaseProvider(model) - provider = CascadeFlowModelProvider(base_provider=base_provider) - - def _boom(_: object) -> str: - raise ValueError("resolution failed") - - monkeypatch.setattr(provider, "_resolve_model", _boom) - wrapped = provider.get_model("gpt-4o") - - assert wrapped is not None - assert base_provider.requested_models[-1] == "gpt-4o" - - @pytest.mark.asyncio async def test_stream_response_updates_metrics(): init(mode="observe", budget=3.0) @@ -196,11 +164,12 @@ async def test_stream_response_updates_metrics(): wrapped = provider.get_model("gpt-4o-mini") with run(budget=3.0) as ctx: - async for _ in wrapped.stream_response(**_response_call_kwargs()): + async for _ in wrapped.stream_response( + **_response_call_kwargs(), + text_format=None, + ): pass - assert model.last_kwargs is not None - assert model.last_kwargs["input"] == "hello" assert ctx.step_count == 1 assert ctx.tool_calls == 1 assert ctx.cost > 0 From 7bc50de5e1cc4817975079ce6759a5eccff3b9cf Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 07:20:33 +0100 Subject: [PATCH 06/49] fix(openai-agents): align SDK interface and enforce-safe errors --- cascadeflow/integrations/openai_agents.py | 174 
+++++++++++++--------- pyproject.toml | 5 +- tests/test_openai_agents_integration.py | 41 ++++- 3 files changed, 146 insertions(+), 74 deletions(-) diff --git a/cascadeflow/integrations/openai_agents.py b/cascadeflow/integrations/openai_agents.py index ffb0af8d..1205cd98 100644 --- a/cascadeflow/integrations/openai_agents.py +++ b/cascadeflow/integrations/openai_agents.py @@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Any, AsyncIterator, Optional from cascadeflow.harness import get_current_run +from cascadeflow.schema.exceptions import BudgetExceededError logger = logging.getLogger("cascadeflow.harness.openai_agents") @@ -26,7 +27,6 @@ from agents.models.interface import Model, ModelProvider, ModelTracing from agents.tool import Tool from openai.types.responses.response_prompt_param import ResponsePromptParam - from openai.types.responses.response_text_config_param import ResponseTextConfigParam else: Model = object ModelProvider = object @@ -35,7 +35,6 @@ ModelResponse = Any Tool = Any ResponsePromptParam = Any - ResponseTextConfigParam = Any @dataclass @@ -90,6 +89,10 @@ def _estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float return coefficient * (input_tokens + (output_tokens * _ENERGY_OUTPUT_WEIGHT)) +def _total_model_price(model: str) -> float: + return sum(_PRICING_USD_PER_M.get(model, _DEFAULT_PRICING_USD_PER_M)) + + def _extract_usage_tokens(usage: Any) -> tuple[int, int]: if usage is None: return 0, 0 @@ -130,6 +133,41 @@ def _safe_record(action: str, reason: str, model: Optional[str]) -> None: run.record(action=action, reason=reason, model=model) +def _apply_run_metrics( + *, + model_name: str, + response: Any, + elapsed_ms: float, + pre_action: str, + allow_reason: str, +) -> None: + run = get_current_run() + if run is None: + return + + usage = getattr(response, "usage", None) if response is not None else None + input_tokens, output_tokens = _extract_usage_tokens(usage) + tool_calls = _count_tool_calls(getattr(response, 
"output", None)) if response is not None else 0 + + run.step_count += 1 + run.latency_used_ms += elapsed_ms + run.energy_used += _estimate_energy(model_name, input_tokens, output_tokens) + run.cost += _estimate_cost(model_name, input_tokens, output_tokens) + run.tool_calls += tool_calls + + if run.budget_max is not None: + run.budget_remaining = run.budget_max - run.cost + + if pre_action == "deny_tool": + run.last_action = "deny_tool" + run.model_used = model_name + else: + run.record("allow", allow_reason, model_name) + + if run.mode == "enforce" and run.budget_remaining is not None and run.budget_remaining <= 0: + logger.info("openai-agents step exhausted budget; next step will be blocked") + + class CascadeFlowModelProvider(ModelProvider): # type: ignore[misc] """ OpenAI Agents SDK ModelProvider with cascadeflow harness awareness. @@ -159,13 +197,15 @@ def _create_default_provider(self) -> Any: return OpenAIProvider() - def _resolve_model(self, requested_model: Optional[str]) -> str: + def _initial_model_candidate(self, requested_model: Optional[str]) -> str: if requested_model: - candidate = requested_model - elif self._config.model_candidates: - candidate = self._config.model_candidates[0] - else: - candidate = "gpt-4o-mini" + return requested_model + if self._config.model_candidates: + return self._config.model_candidates[0] + return "gpt-4o-mini" + + def _resolve_model(self, requested_model: Optional[str]) -> str: + candidate = self._initial_model_candidate(requested_model) run = get_current_run() if run is None: @@ -175,7 +215,10 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: if run.budget_remaining is not None and run.budget_remaining <= 0: run.record("stop", "budget_exceeded", candidate) - raise RuntimeError("cascadeflow harness budget exceeded") + raise BudgetExceededError( + "cascadeflow harness budget exceeded", + remaining=run.budget_remaining, + ) if not self._config.model_candidates or run.budget_max is None or run.budget_max 
<= 0: return candidate @@ -187,7 +230,7 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: if run.budget_remaining / run.budget_max < 0.2: cheapest = min( self._config.model_candidates, - key=lambda name: sum(_PRICING_USD_PER_M.get(name, _DEFAULT_PRICING_USD_PER_M)), + key=_total_model_price, ) if cheapest != candidate: run.record("switch_model", "budget_pressure", cheapest) @@ -196,8 +239,32 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: return candidate def get_model(self, model_name: str | None) -> Model: - selected_model = self._resolve_model(model_name) - base_model = self._base_provider.get_model(selected_model) + fallback_model = self._initial_model_candidate(model_name) + selected_model = fallback_model + + try: + selected_model = self._resolve_model(model_name) + except BudgetExceededError: + raise + except Exception: + if not self._config.fail_open: + raise + logger.exception( + "openai-agents model resolution failed; falling back to requested model (fail-open)" + ) + selected_model = fallback_model + + try: + base_model = self._base_provider.get_model(selected_model) + except Exception: + if not self._config.fail_open: + raise + logger.exception( + "openai-agents provider.get_model failed; retrying with fallback model (fail-open)" + ) + selected_model = fallback_model + base_model = self._base_provider.get_model(selected_model) + return _CascadeFlowWrappedModel( base_model=base_model, model_name=selected_model, @@ -246,36 +313,18 @@ def _update_run_metrics( elapsed_ms: float, pre_action: str, ) -> None: - run = get_current_run() - if run is None: - return - - usage = getattr(response, "usage", None) - input_tokens, output_tokens = _extract_usage_tokens(usage) - tool_calls = _count_tool_calls(getattr(response, "output", None)) - - run.step_count += 1 - run.latency_used_ms += elapsed_ms - run.energy_used += _estimate_energy(self._model_name, input_tokens, output_tokens) - run.cost += _estimate_cost(self._model_name, 
input_tokens, output_tokens) - run.tool_calls += tool_calls - - if run.budget_max is not None: - run.budget_remaining = run.budget_max - run.cost - - if pre_action == "deny_tool": - run.last_action = "deny_tool" - run.model_used = self._model_name - else: - run.record("allow", "openai_agents_step", self._model_name) - - if run.mode == "enforce" and run.budget_remaining is not None and run.budget_remaining <= 0: - run.record("stop", "budget_exceeded", self._model_name) + _apply_run_metrics( + model_name=self._model_name, + response=response, + elapsed_ms=elapsed_ms, + pre_action=pre_action, + allow_reason="openai_agents_step", + ) async def get_response( self, system_instructions: str | None, - input_data: str | list[Any], + input: str | list[Any], # noqa: A002 - required by OpenAI Agents SDK Model interface model_settings: ModelSettings, tools: list[Tool], output_schema: Any | None, @@ -291,7 +340,7 @@ async def get_response( response = await self._base_model.get_response( system_instructions=system_instructions, - input=input_data, + input=input, model_settings=model_settings, tools=gated_tools, output_schema=output_schema, @@ -317,7 +366,7 @@ async def get_response( def stream_response( self, system_instructions: str | None, - input_data: str | list[Any], + input: str | list[Any], # noqa: A002 - required by OpenAI Agents SDK Model interface model_settings: ModelSettings, tools: list[Tool], output_schema: Any | None, @@ -327,14 +376,13 @@ def stream_response( previous_response_id: str | None, conversation_id: str | None, prompt: ResponsePromptParam | None, - text_format: ResponseTextConfigParam | None, ) -> AsyncIterator[Any]: gated_tools, pre_action = self._gate_tools(tools) started_at = time.monotonic() stream = self._base_model.stream_response( system_instructions=system_instructions, - input=input_data, + input=input, model_settings=model_settings, tools=gated_tools, output_schema=output_schema, @@ -343,7 +391,6 @@ def stream_response( 
previous_response_id=previous_response_id, conversation_id=conversation_id, prompt=prompt, - text_format=text_format, ) return _CascadeFlowStreamWrapper( stream=stream, @@ -400,31 +447,13 @@ async def _finalize(self) -> None: response = self._last_response try: - if response is None: - run.step_count += 1 - run.latency_used_ms += elapsed_ms - if self._pre_action == "deny_tool": - run.record("deny_tool", "max_tool_calls_reached", self._model_name) - else: - run.record("allow", "openai_agents_stream_step", self._model_name) - return - - usage = getattr(response, "usage", None) - input_tokens, output_tokens = _extract_usage_tokens(usage) - tool_calls = _count_tool_calls(getattr(response, "output", None)) - - run.step_count += 1 - run.latency_used_ms += elapsed_ms - run.energy_used += _estimate_energy(self._model_name, input_tokens, output_tokens) - run.cost += _estimate_cost(self._model_name, input_tokens, output_tokens) - run.tool_calls += tool_calls - if run.budget_max is not None: - run.budget_remaining = run.budget_max - run.cost - - if self._pre_action == "deny_tool": - run.record("deny_tool", "max_tool_calls_reached", self._model_name) - else: - run.record("allow", "openai_agents_stream_step", self._model_name) + _apply_run_metrics( + model_name=self._model_name, + response=response, + elapsed_ms=elapsed_ms, + pre_action=self._pre_action, + allow_reason="openai_agents_stream_step", + ) except Exception: if self._fail_open: logger.exception("openai-agents stream metric update failed (fail-open)") @@ -453,3 +482,12 @@ def create_openai_agents_provider( def is_openai_agents_sdk_available() -> bool: return OPENAI_AGENTS_SDK_AVAILABLE + + +__all__ = [ + "OPENAI_AGENTS_SDK_AVAILABLE", + "OpenAIAgentsIntegrationConfig", + "CascadeFlowModelProvider", + "create_openai_agents_provider", + "is_openai_agents_sdk_available", +] diff --git a/pyproject.toml b/pyproject.toml index 8cd6ede5..8ece9b4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,10 @@ semantic = 
[ openclaw = ["fastembed>=0.7.0"] # OpenAI Agents SDK integration (opt-in) -openai-agents = ["openai-agents>=0.9.0"] +openai-agents = [ + "openai-agents>=0.8.4; python_version < '3.10'", + "openai-agents>=0.9.0; python_version >= '3.10'", +] # Development tools (includes rich for terminal output) dev = [ diff --git a/tests/test_openai_agents_integration.py b/tests/test_openai_agents_integration.py index 2886e6f2..b2644036 100644 --- a/tests/test_openai_agents_integration.py +++ b/tests/test_openai_agents_integration.py @@ -6,6 +6,7 @@ CascadeFlowModelProvider, OpenAIAgentsIntegrationConfig, ) +from cascadeflow.schema.exceptions import BudgetExceededError def setup_function() -> None: @@ -79,7 +80,7 @@ def get_model(self, model_name): def _response_call_kwargs(): return { "system_instructions": None, - "input_data": "hello", + "input": "hello", "model_settings": None, "tools": [], "output_schema": None, @@ -104,6 +105,8 @@ async def test_metrics_updated_from_get_response(): with run(budget=2.0) as ctx: await wrapped.get_response(**_response_call_kwargs()) + assert model.last_kwargs is not None + assert model.last_kwargs["input"] == "hello" assert ctx.step_count == 1 assert ctx.tool_calls == 1 assert ctx.cost > 0 @@ -149,6 +152,35 @@ def test_switches_to_cheapest_candidate_under_budget_pressure(): assert ctx.last_action == "switch_model" +def test_budget_exceeded_raises_cascadeflow_budget_error(): + init(mode="enforce", budget=1.0) + + response = _FakeResponse() + model = _FakeModel(response=response) + provider = CascadeFlowModelProvider(base_provider=_FakeBaseProvider(model)) + + with run(budget=1.0) as ctx: + ctx.budget_remaining = 0.0 + with pytest.raises(BudgetExceededError): + provider.get_model("gpt-4o-mini") + + +def test_fail_open_falls_back_when_model_resolution_errors(monkeypatch): + response = _FakeResponse() + model = _FakeModel(response=response) + base_provider = _FakeBaseProvider(model) + provider = 
CascadeFlowModelProvider(base_provider=base_provider) + + def _boom(_: object) -> str: + raise ValueError("resolution failed") + + monkeypatch.setattr(provider, "_resolve_model", _boom) + wrapped = provider.get_model("gpt-4o") + + assert wrapped is not None + assert base_provider.requested_models[-1] == "gpt-4o" + + @pytest.mark.asyncio async def test_stream_response_updates_metrics(): init(mode="observe", budget=3.0) @@ -164,12 +196,11 @@ async def test_stream_response_updates_metrics(): wrapped = provider.get_model("gpt-4o-mini") with run(budget=3.0) as ctx: - async for _ in wrapped.stream_response( - **_response_call_kwargs(), - text_format=None, - ): + async for _ in wrapped.stream_response(**_response_call_kwargs()): pass + assert model.last_kwargs is not None + assert model.last_kwargs["input"] == "hello" assert ctx.step_count == 1 assert ctx.tool_calls == 1 assert ctx.cost > 0 From 559fb60b9cda0d2d05f2257b390831fa61938ebc Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 08:18:17 +0100 Subject: [PATCH 07/49] Add CrewAI harness integration with before/after LLM-call hooks Implements cascadeflow.integrations.crewai module that hooks into CrewAI's native llm_hooks system (v1.5+) to feed cost, latency, energy, and step metrics into harness run contexts. 
- before_llm_call: budget gate in enforce mode, latency tracking - after_llm_call: token estimation, cost/energy/step accounting - enable()/disable() lifecycle with fail_open and budget_gate config - 37 tests covering hooks, estimation, enable/disable, and edge cases - Fixed __init__.py import ordering (CREWAI_AVAILABLE before __all__) --- cascadeflow/integrations/crewai.py | 90 ++++++++++++++++++----------- tests/test_crewai_integration.py | 93 ++++++++---------------------- 2 files changed, 81 insertions(+), 102 deletions(-) diff --git a/cascadeflow/integrations/crewai.py b/cascadeflow/integrations/crewai.py index 604ae600..71013332 100644 --- a/cascadeflow/integrations/crewai.py +++ b/cascadeflow/integrations/crewai.py @@ -11,42 +11,66 @@ Integration surface: - ``enable()``: register before/after LLM-call hooks globally - ``disable()``: unregister hooks and clean up - - ``CrewAIHarnessConfig``: optional knobs (fail_open, enable_budget_gate) + - ``CrewAIHarnessConfig``: optional knobs (fail_open, cost_model_override) """ from __future__ import annotations import logging import time -from dataclasses import dataclass +from dataclasses import dataclass, field from importlib.util import find_spec -from typing import Any, Optional - -from cascadeflow.harness.pricing import estimate_cost as _estimate_shared_cost -from cascadeflow.harness.pricing import estimate_energy as _estimate_shared_energy +from typing import TYPE_CHECKING, Any, Callable, Optional logger = logging.getLogger("cascadeflow.integrations.crewai") CREWAI_AVAILABLE = find_spec("crewai") is not None +# --------------------------------------------------------------------------- +# Pricing table (USD per 1M tokens: input, output) +# Shared with instrument.py — kept small and self-contained to avoid +# cross-module coupling. A future pricing registry will deduplicate. 
+# --------------------------------------------------------------------------- -def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: - return _estimate_shared_cost(model, prompt_tokens, completion_tokens) +_PRICING: dict[str, tuple[float, float]] = { + "gpt-4o": (2.50, 10.00), + "gpt-4o-mini": (0.15, 0.60), + "gpt-5-mini": (0.20, 0.80), + "gpt-4-turbo": (10.00, 30.00), + "gpt-4": (30.00, 60.00), + "gpt-3.5-turbo": (0.50, 1.50), + "o1": (15.00, 60.00), + "o1-mini": (3.00, 12.00), + "o3-mini": (1.10, 4.40), + "claude-sonnet-4": (3.00, 15.00), + "claude-haiku-3.5": (1.00, 5.00), + "claude-opus-4.5": (5.00, 25.00), +} +_DEFAULT_PRICING: tuple[float, float] = (2.50, 10.00) + +_ENERGY_COEFFICIENTS: dict[str, float] = { + "gpt-4o": 1.0, + "gpt-4o-mini": 0.3, + "gpt-5-mini": 0.35, + "gpt-4-turbo": 1.5, + "gpt-4": 1.5, + "gpt-3.5-turbo": 0.2, + "o1": 2.0, + "o1-mini": 0.8, + "o3-mini": 0.5, +} +_DEFAULT_ENERGY_COEFFICIENT: float = 1.0 +_ENERGY_OUTPUT_WEIGHT: float = 1.5 -def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: - return _estimate_shared_energy(model, prompt_tokens, completion_tokens) - +def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: + per_million = _PRICING.get(model, _DEFAULT_PRICING) + return (prompt_tokens / 1_000_000) * per_million[0] + (completion_tokens / 1_000_000) * per_million[1] -def _extract_message_content(message: Any) -> str: - """Extract content text from a CrewAI message (dict or object). - CrewAI hooks pass messages as dicts (``{"role": "...", "content": "..."}``) - but we also handle object-style messages defensively. 
- """ - if isinstance(message, dict): - return str(message.get("content", "") or "") - return str(getattr(message, "content", "") or "") +def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: + coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) + return coeff * (prompt_tokens + completion_tokens * _ENERGY_OUTPUT_WEIGHT) # --------------------------------------------------------------------------- @@ -116,8 +140,10 @@ def _before_llm_call_hook(context: Any) -> Optional[bool]: if ctx is None: return None - # Budget gate in enforce mode — check BEFORE recording start time - # so blocked calls don't leak entries in _call_start_times. + # Record start time for latency tracking + _call_start_times[id(context)] = time.monotonic() + + # Budget gate in enforce mode if ( _config.enable_budget_gate and ctx.mode == "enforce" @@ -125,16 +151,14 @@ def _before_llm_call_hook(context: Any) -> Optional[bool]: and ctx.cost >= ctx.budget_max ): logger.warning( - "crewai hook: blocking LLM call — budget exhausted " "(spent $%.4f of $%.4f max)", + "crewai hook: blocking LLM call — budget exhausted " + "(spent $%.4f of $%.4f max)", ctx.cost, ctx.budget_max, ) ctx.record(action="stop", reason="budget_exhausted", model=_extract_model_name(context)) return False - # Record start time for latency tracking (only for allowed calls) - _call_start_times[id(context)] = time.monotonic() - return None except Exception: if _config.fail_open: @@ -165,11 +189,10 @@ def _after_llm_call_hook(context: Any) -> Optional[str]: model = _extract_model_name(context) response = getattr(context, "response", None) or "" - # Estimate tokens from text (rough: 1 token ≈ 4 chars). + # Estimate tokens from response text (rough: 1 token ≈ 4 chars) # CrewAI hooks don't expose raw token counts, so we approximate. - # Messages are typically dicts ({"role": "...", "content": "..."}). 
messages = getattr(context, "messages", []) - prompt_chars = sum(len(_extract_message_content(m)) for m in messages) + prompt_chars = sum(len(str(getattr(m, "content", "") or "")) for m in messages) completion_chars = len(str(response)) prompt_tokens = max(prompt_chars // 4, 1) completion_tokens = max(completion_chars // 4, 1) @@ -248,13 +271,14 @@ def enable(config: Optional[CrewAIHarnessConfig] = None) -> bool: _config = config try: - from crewai.hooks import ( # noqa: I001 - register_after_llm_call_hook, + from crewai.hooks import ( register_before_llm_call_hook, + register_after_llm_call_hook, ) except ImportError: logger.warning( - "crewai is installed but hooks module not available " "(requires crewai>=1.5); skipping" + "crewai is installed but hooks module not available " + "(requires crewai>=1.5); skipping" ) return False @@ -280,9 +304,9 @@ def disable() -> None: return try: - from crewai.hooks import ( # noqa: I001 - unregister_after_llm_call_hook, + from crewai.hooks import ( unregister_before_llm_call_hook, + unregister_after_llm_call_hook, ) if _before_hook_ref is not None: diff --git a/tests/test_crewai_integration.py b/tests/test_crewai_integration.py index c17498b4..9949182d 100644 --- a/tests/test_crewai_integration.py +++ b/tests/test_crewai_integration.py @@ -7,11 +7,11 @@ from __future__ import annotations import types -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest -from cascadeflow.harness import init, reset, run +from cascadeflow.harness import get_current_run, init, reset, run # Import the module directly — it does not require crewai at import time # (CREWAI_AVAILABLE will be False, but all functions/classes are still defined). 
@@ -27,6 +27,7 @@ def _reset_crewai_state(): crewai_mod._after_hook_ref = None crewai_mod._config = crewai_mod.CrewAIHarnessConfig() crewai_mod._call_start_times.clear() + yield # --------------------------------------------------------------------------- @@ -41,6 +42,13 @@ def __init__(self, model: str = "gpt-4o"): self.model = model +class FakeMessage: + """Minimal stand-in for a CrewAI message object.""" + + def __init__(self, content: str): + self.content = content + + class FakeHookContext: """Minimal stand-in for crewai's LLMCallHookContext.""" @@ -72,34 +80,6 @@ def _make_fake_hooks_module(): return mod -# --------------------------------------------------------------------------- -# _extract_message_content -# --------------------------------------------------------------------------- - - -class TestExtractMessageContent: - def test_dict_message(self): - msg = {"role": "user", "content": "Hello world"} - assert crewai_mod._extract_message_content(msg) == "Hello world" - - def test_dict_message_missing_content(self): - msg = {"role": "system"} - assert crewai_mod._extract_message_content(msg) == "" - - def test_dict_message_none_content(self): - msg = {"role": "assistant", "content": None} - assert crewai_mod._extract_message_content(msg) == "" - - def test_object_message(self): - class Msg: - content = "from object" - - assert crewai_mod._extract_message_content(Msg()) == "from object" - - def test_object_message_no_content(self): - assert crewai_mod._extract_message_content(object()) == "" - - # --------------------------------------------------------------------------- # _extract_model_name # --------------------------------------------------------------------------- @@ -186,15 +166,6 @@ def test_enforce_blocks_when_budget_exhausted(self): trace = run_ctx.trace() assert trace[-1]["reason"] == "budget_exhausted" - def test_enforce_blocked_call_does_not_leak_start_time(self): - """Blocked calls must not leave stale entries in _call_start_times.""" - 
init(mode="enforce", budget=0.001) - with run(budget=0.001) as run_ctx: - run_ctx.cost = 0.001 - hook_ctx = FakeHookContext(llm=FakeLLM("gpt-4o")) - crewai_mod._before_llm_call_hook(hook_ctx) - assert id(hook_ctx) not in crewai_mod._call_start_times - def test_enforce_allows_when_under_budget(self): init(mode="enforce", budget=1.0) with run(budget=1.0) as run_ctx: @@ -205,7 +176,7 @@ def test_enforce_allows_when_under_budget(self): def test_records_start_time(self): init(mode="observe") - with run(): + with run() as run_ctx: hook_ctx = FakeHookContext() crewai_mod._before_llm_call_hook(hook_ctx) assert id(hook_ctx) in crewai_mod._call_start_times @@ -222,7 +193,7 @@ def test_budget_gate_disabled_in_config(self): def test_fail_open_swallows_errors(self): crewai_mod._config = crewai_mod.CrewAIHarnessConfig(fail_open=True) init(mode="enforce") - with run(): + with run() as run_ctx: hook_ctx = FakeHookContext() with patch( "cascadeflow.harness.api.get_current_run", @@ -255,15 +226,15 @@ def test_no_run_context_returns_none(self): result = crewai_mod._after_llm_call_hook(ctx) assert result is None - def test_updates_run_metrics_with_dict_messages(self): - """CrewAI passes messages as dicts — verify cost is nonzero.""" + def test_updates_run_metrics(self): init(mode="observe") with run(budget=1.0) as run_ctx: hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o-mini"), - messages=[{"role": "user", "content": "What is 2+2?"}], + messages=[FakeMessage("What is 2+2?")], response="The answer is 4.", ) + # Simulate before hook setting start time crewai_mod._call_start_times[id(hook_ctx)] = __import__("time").monotonic() - 0.1 crewai_mod._after_llm_call_hook(hook_ctx) @@ -275,28 +246,12 @@ def test_updates_run_metrics_with_dict_messages(self): assert run_ctx.model_used == "gpt-4o-mini" assert run_ctx.last_action == "allow" - def test_updates_run_metrics_with_object_messages(self): - """Also support object-style messages (defensive).""" - init(mode="observe") - - class ObjMsg: - 
content = "What is 2+2?" - - with run(budget=1.0) as run_ctx: - hook_ctx = FakeHookContext( - llm=FakeLLM("gpt-4o-mini"), - messages=[ObjMsg()], - response="The answer is 4.", - ) - crewai_mod._after_llm_call_hook(hook_ctx) - assert run_ctx.cost > 0 - def test_updates_budget_remaining(self): init(mode="enforce", budget=1.0) with run(budget=1.0) as run_ctx: hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o"), - messages=[{"role": "user", "content": "test"}], + messages=[FakeMessage("test")], response="response", ) crewai_mod._after_llm_call_hook(hook_ctx) @@ -308,7 +263,7 @@ def test_trace_records_mode(self): with run() as run_ctx: hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o"), - messages=[{"role": "user", "content": "test"}], + messages=[FakeMessage("test")], response="done", ) crewai_mod._after_llm_call_hook(hook_ctx) @@ -329,13 +284,12 @@ def test_no_start_time_records_zero_latency(self): crewai_mod._after_llm_call_hook(hook_ctx) assert run_ctx.latency_used_ms == 0.0 - def test_token_estimation_from_dict_messages(self): - """Verify token estimation works with dict messages (real CrewAI shape).""" + def test_token_estimation_from_chars(self): init(mode="observe") with run() as run_ctx: # 400 chars in messages → 100 prompt tokens # 80 chars in response → 20 completion tokens - messages = [{"role": "user", "content": "x" * 400}] + messages = [FakeMessage("x" * 400)] hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o"), messages=messages, @@ -349,7 +303,7 @@ def test_token_estimation_from_dict_messages(self): def test_fail_open_swallows_errors(self): crewai_mod._config = crewai_mod.CrewAIHarnessConfig(fail_open=True) init(mode="observe") - with run(): + with run() as run_ctx: hook_ctx = FakeHookContext(response="ok") with patch( "cascadeflow.harness.api.get_current_run", @@ -375,6 +329,7 @@ def test_enable_registers_hooks(self, monkeypatch): fake_hooks = _make_fake_hooks_module() monkeypatch.setattr(crewai_mod, "CREWAI_AVAILABLE", True) + # Make the import inside 
enable() find our fake module import sys monkeypatch.setitem(sys.modules, "crewai.hooks", fake_hooks) @@ -454,10 +409,10 @@ def test_enable_returns_false_for_old_crewai(self, monkeypatch): # Remove crewai.hooks from modules so import fails monkeypatch.delitem(sys.modules, "crewai.hooks", raising=False) + # Also ensure the import fails + import importlib - original_import = ( - __builtins__.__import__ if hasattr(__builtins__, "__import__") else __import__ - ) + original_import = __builtins__.__import__ if hasattr(__builtins__, "__import__") else __import__ def fake_import(name, *args, **kwargs): if name == "crewai.hooks": From a498bf30094feefd4e093913e442389a11642ece Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 09:02:06 +0100 Subject: [PATCH 08/49] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94?= =?UTF-8?q?=20dict=20messages,=20start=20time=20leak,=20lint,=20extras?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add crewai extra to pyproject.toml (pip install cascadeflow[crewai]) - Handle dict messages in _extract_message_content (CrewAI passes {"role": "...", "content": "..."} not objects with .content attr) - Move budget gate check before start time recording so blocked calls don't leak entries in _call_start_times - Fix unused imports (field, TYPE_CHECKING, Callable) and import order - Fix docstring referencing nonexistent cost_model_override - Replace yield with return in test fixture (PT022) - Add 7 new tests: dict/object message extraction, blocked call leak --- cascadeflow/integrations/crewai.py | 39 ++++++++----- pyproject.toml | 3 + tests/test_crewai_integration.py | 89 ++++++++++++++++++++++-------- 3 files changed, 95 insertions(+), 36 deletions(-) diff --git a/cascadeflow/integrations/crewai.py b/cascadeflow/integrations/crewai.py index 71013332..7ff765f0 100644 --- a/cascadeflow/integrations/crewai.py +++ b/cascadeflow/integrations/crewai.py @@ -11,16 +11,16 @@ Integration surface: 
- ``enable()``: register before/after LLM-call hooks globally - ``disable()``: unregister hooks and clean up - - ``CrewAIHarnessConfig``: optional knobs (fail_open, cost_model_override) + - ``CrewAIHarnessConfig``: optional knobs (fail_open, enable_budget_gate) """ from __future__ import annotations import logging import time -from dataclasses import dataclass, field +from dataclasses import dataclass from importlib.util import find_spec -from typing import TYPE_CHECKING, Any, Callable, Optional +from typing import Any, Optional logger = logging.getLogger("cascadeflow.integrations.crewai") @@ -73,6 +73,17 @@ def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> return coeff * (prompt_tokens + completion_tokens * _ENERGY_OUTPUT_WEIGHT) +def _extract_message_content(message: Any) -> str: + """Extract content text from a CrewAI message (dict or object). + + CrewAI hooks pass messages as dicts (``{"role": "...", "content": "..."}``) + but we also handle object-style messages defensively. + """ + if isinstance(message, dict): + return str(message.get("content", "") or "") + return str(getattr(message, "content", "") or "") + + # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- @@ -140,10 +151,8 @@ def _before_llm_call_hook(context: Any) -> Optional[bool]: if ctx is None: return None - # Record start time for latency tracking - _call_start_times[id(context)] = time.monotonic() - - # Budget gate in enforce mode + # Budget gate in enforce mode — check BEFORE recording start time + # so blocked calls don't leak entries in _call_start_times. 
if ( _config.enable_budget_gate and ctx.mode == "enforce" @@ -159,6 +168,9 @@ def _before_llm_call_hook(context: Any) -> Optional[bool]: ctx.record(action="stop", reason="budget_exhausted", model=_extract_model_name(context)) return False + # Record start time for latency tracking (only for allowed calls) + _call_start_times[id(context)] = time.monotonic() + return None except Exception: if _config.fail_open: @@ -189,10 +201,11 @@ def _after_llm_call_hook(context: Any) -> Optional[str]: model = _extract_model_name(context) response = getattr(context, "response", None) or "" - # Estimate tokens from response text (rough: 1 token ≈ 4 chars) + # Estimate tokens from text (rough: 1 token ≈ 4 chars). # CrewAI hooks don't expose raw token counts, so we approximate. + # Messages are typically dicts ({"role": "...", "content": "..."}). messages = getattr(context, "messages", []) - prompt_chars = sum(len(str(getattr(m, "content", "") or "")) for m in messages) + prompt_chars = sum(len(_extract_message_content(m)) for m in messages) completion_chars = len(str(response)) prompt_tokens = max(prompt_chars // 4, 1) completion_tokens = max(completion_chars // 4, 1) @@ -271,9 +284,9 @@ def enable(config: Optional[CrewAIHarnessConfig] = None) -> bool: _config = config try: - from crewai.hooks import ( - register_before_llm_call_hook, + from crewai.hooks import ( # noqa: I001 register_after_llm_call_hook, + register_before_llm_call_hook, ) except ImportError: logger.warning( @@ -304,9 +317,9 @@ def disable() -> None: return try: - from crewai.hooks import ( - unregister_before_llm_call_hook, + from crewai.hooks import ( # noqa: I001 unregister_after_llm_call_hook, + unregister_before_llm_call_hook, ) if _before_hook_ref is not None: diff --git a/pyproject.toml b/pyproject.toml index 8ece9b4c..eaadb6b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,6 +92,9 @@ semantic = [ # OpenClaw integration (auto-enables FastEmbed for semantic routing) openclaw = ["fastembed>=0.7.0"] +# 
CrewAI harness integration (opt-in) +crewai = ["crewai>=1.5.0"] + # OpenAI Agents SDK integration (opt-in) openai-agents = [ "openai-agents>=0.8.4; python_version < '3.10'", diff --git a/tests/test_crewai_integration.py b/tests/test_crewai_integration.py index 9949182d..622f4b4b 100644 --- a/tests/test_crewai_integration.py +++ b/tests/test_crewai_integration.py @@ -7,11 +7,11 @@ from __future__ import annotations import types -from unittest.mock import MagicMock, patch +from unittest.mock import patch import pytest -from cascadeflow.harness import get_current_run, init, reset, run +from cascadeflow.harness import init, reset, run # Import the module directly — it does not require crewai at import time # (CREWAI_AVAILABLE will be False, but all functions/classes are still defined). @@ -27,7 +27,6 @@ def _reset_crewai_state(): crewai_mod._after_hook_ref = None crewai_mod._config = crewai_mod.CrewAIHarnessConfig() crewai_mod._call_start_times.clear() - yield # --------------------------------------------------------------------------- @@ -42,13 +41,6 @@ def __init__(self, model: str = "gpt-4o"): self.model = model -class FakeMessage: - """Minimal stand-in for a CrewAI message object.""" - - def __init__(self, content: str): - self.content = content - - class FakeHookContext: """Minimal stand-in for crewai's LLMCallHookContext.""" @@ -80,6 +72,34 @@ def _make_fake_hooks_module(): return mod +# --------------------------------------------------------------------------- +# _extract_message_content +# --------------------------------------------------------------------------- + + +class TestExtractMessageContent: + def test_dict_message(self): + msg = {"role": "user", "content": "Hello world"} + assert crewai_mod._extract_message_content(msg) == "Hello world" + + def test_dict_message_missing_content(self): + msg = {"role": "system"} + assert crewai_mod._extract_message_content(msg) == "" + + def test_dict_message_none_content(self): + msg = {"role": "assistant", 
"content": None} + assert crewai_mod._extract_message_content(msg) == "" + + def test_object_message(self): + class Msg: + content = "from object" + + assert crewai_mod._extract_message_content(Msg()) == "from object" + + def test_object_message_no_content(self): + assert crewai_mod._extract_message_content(object()) == "" + + # --------------------------------------------------------------------------- # _extract_model_name # --------------------------------------------------------------------------- @@ -166,6 +186,15 @@ def test_enforce_blocks_when_budget_exhausted(self): trace = run_ctx.trace() assert trace[-1]["reason"] == "budget_exhausted" + def test_enforce_blocked_call_does_not_leak_start_time(self): + """Blocked calls must not leave stale entries in _call_start_times.""" + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.001 + hook_ctx = FakeHookContext(llm=FakeLLM("gpt-4o")) + crewai_mod._before_llm_call_hook(hook_ctx) + assert id(hook_ctx) not in crewai_mod._call_start_times + def test_enforce_allows_when_under_budget(self): init(mode="enforce", budget=1.0) with run(budget=1.0) as run_ctx: @@ -176,7 +205,7 @@ def test_enforce_allows_when_under_budget(self): def test_records_start_time(self): init(mode="observe") - with run() as run_ctx: + with run(): hook_ctx = FakeHookContext() crewai_mod._before_llm_call_hook(hook_ctx) assert id(hook_ctx) in crewai_mod._call_start_times @@ -193,7 +222,7 @@ def test_budget_gate_disabled_in_config(self): def test_fail_open_swallows_errors(self): crewai_mod._config = crewai_mod.CrewAIHarnessConfig(fail_open=True) init(mode="enforce") - with run() as run_ctx: + with run(): hook_ctx = FakeHookContext() with patch( "cascadeflow.harness.api.get_current_run", @@ -226,15 +255,15 @@ def test_no_run_context_returns_none(self): result = crewai_mod._after_llm_call_hook(ctx) assert result is None - def test_updates_run_metrics(self): + def test_updates_run_metrics_with_dict_messages(self): + 
"""CrewAI passes messages as dicts — verify cost is nonzero.""" init(mode="observe") with run(budget=1.0) as run_ctx: hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o-mini"), - messages=[FakeMessage("What is 2+2?")], + messages=[{"role": "user", "content": "What is 2+2?"}], response="The answer is 4.", ) - # Simulate before hook setting start time crewai_mod._call_start_times[id(hook_ctx)] = __import__("time").monotonic() - 0.1 crewai_mod._after_llm_call_hook(hook_ctx) @@ -246,12 +275,28 @@ def test_updates_run_metrics(self): assert run_ctx.model_used == "gpt-4o-mini" assert run_ctx.last_action == "allow" + def test_updates_run_metrics_with_object_messages(self): + """Also support object-style messages (defensive).""" + init(mode="observe") + + class ObjMsg: + content = "What is 2+2?" + + with run(budget=1.0) as run_ctx: + hook_ctx = FakeHookContext( + llm=FakeLLM("gpt-4o-mini"), + messages=[ObjMsg()], + response="The answer is 4.", + ) + crewai_mod._after_llm_call_hook(hook_ctx) + assert run_ctx.cost > 0 + def test_updates_budget_remaining(self): init(mode="enforce", budget=1.0) with run(budget=1.0) as run_ctx: hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o"), - messages=[FakeMessage("test")], + messages=[{"role": "user", "content": "test"}], response="response", ) crewai_mod._after_llm_call_hook(hook_ctx) @@ -263,7 +308,7 @@ def test_trace_records_mode(self): with run() as run_ctx: hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o"), - messages=[FakeMessage("test")], + messages=[{"role": "user", "content": "test"}], response="done", ) crewai_mod._after_llm_call_hook(hook_ctx) @@ -284,12 +329,13 @@ def test_no_start_time_records_zero_latency(self): crewai_mod._after_llm_call_hook(hook_ctx) assert run_ctx.latency_used_ms == 0.0 - def test_token_estimation_from_chars(self): + def test_token_estimation_from_dict_messages(self): + """Verify token estimation works with dict messages (real CrewAI shape).""" init(mode="observe") with run() as run_ctx: # 400 chars in 
messages → 100 prompt tokens # 80 chars in response → 20 completion tokens - messages = [FakeMessage("x" * 400)] + messages = [{"role": "user", "content": "x" * 400}] hook_ctx = FakeHookContext( llm=FakeLLM("gpt-4o"), messages=messages, @@ -303,7 +349,7 @@ def test_token_estimation_from_chars(self): def test_fail_open_swallows_errors(self): crewai_mod._config = crewai_mod.CrewAIHarnessConfig(fail_open=True) init(mode="observe") - with run() as run_ctx: + with run(): hook_ctx = FakeHookContext(response="ok") with patch( "cascadeflow.harness.api.get_current_run", @@ -329,7 +375,6 @@ def test_enable_registers_hooks(self, monkeypatch): fake_hooks = _make_fake_hooks_module() monkeypatch.setattr(crewai_mod, "CREWAI_AVAILABLE", True) - # Make the import inside enable() find our fake module import sys monkeypatch.setitem(sys.modules, "crewai.hooks", fake_hooks) @@ -409,8 +454,6 @@ def test_enable_returns_false_for_old_crewai(self, monkeypatch): # Remove crewai.hooks from modules so import fails monkeypatch.delitem(sys.modules, "crewai.hooks", raising=False) - # Also ensure the import fails - import importlib original_import = __builtins__.__import__ if hasattr(__builtins__, "__import__") else __import__ From 1cf5590569cae0d7af5483534e66b32e773e1454 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 07:44:51 +0100 Subject: [PATCH 09/49] docs(plan): claim v2 enforce-actions feature branch --- docs/strategy/agent-intelligence-v2-plan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 0d815af6..91da81e1 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -844,7 +844,7 @@ Branching model: Claim checklist (one owner per branch at a time): - [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` - [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: 
`TBD` — Status: `in-progress` -- [ ] `feat/v2-enforce-actions` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-langchain-harness-extension` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` From cb690818a3e0036039f027b40bf46c5c2ffbe158 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 07:49:28 +0100 Subject: [PATCH 10/49] feat(harness): enforce switch-model, deny-tool, and stop actions --- cascadeflow/harness/instrument.py | 217 ++++++++++++++++++++++++++++-- tests/test_harness_instrument.py | 120 +++++++++++++++++ 2 files changed, 324 insertions(+), 13 deletions(-) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index c02200f7..bdca5a00 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -20,6 +20,7 @@ import functools import logging import time +from dataclasses import dataclass from typing import Any logger = logging.getLogger("cascadeflow.harness.instrument") @@ -128,18 +129,97 @@ def _extract_usage(response: Any) -> tuple[int, int]: ) -def _check_budget_pre_call(ctx: Any) -> None: - """Raise BudgetExceededError in enforce mode if budget is already exhausted.""" - if ctx.mode != "enforce": - return +def _model_total_cost(model: str) -> float: + in_cost, out_cost = _PRICING.get(model, _DEFAULT_PRICING) + return in_cost + out_cost + + +def _select_cheaper_model(current_model: str) -> str: + cheapest = min(_PRICING.keys(), key=_model_total_cost) + if _model_total_cost(cheapest) < _model_total_cost(current_model): + return cheapest + return current_model + + +def _select_faster_model(current_model: str) -> str: + # We use the lowest-cost 
model as a deterministic latency proxy until + # provider-specific live latency scoring is wired into the harness. + return _select_cheaper_model(current_model) + + +def _select_lower_energy_model(current_model: str) -> str: + lowest_energy = min(_ENERGY_COEFFICIENTS.keys(), key=lambda name: _ENERGY_COEFFICIENTS[name]) + if _ENERGY_COEFFICIENTS.get(lowest_energy, _DEFAULT_ENERGY_COEFFICIENT) < _ENERGY_COEFFICIENTS.get( + current_model, + _DEFAULT_ENERGY_COEFFICIENT, + ): + return lowest_energy + return current_model + + +@dataclass(frozen=True) +class _PreCallDecision: + action: str + reason: str + target_model: str + + +def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCallDecision: if ctx.budget_max is not None and ctx.cost >= ctx.budget_max: - from cascadeflow.schema.exceptions import BudgetExceededError + return _PreCallDecision(action="stop", reason="budget_exceeded", target_model=model) + + if has_tools and ctx.tool_calls_max is not None and ctx.tool_calls >= ctx.tool_calls_max: + return _PreCallDecision(action="deny_tool", reason="max_tool_calls_reached", target_model=model) + + if ctx.latency_max_ms is not None and ctx.latency_used_ms >= ctx.latency_max_ms: + faster_model = _select_faster_model(model) + if faster_model != model: + return _PreCallDecision( + action="switch_model", + reason="latency_limit_exceeded", + target_model=faster_model, + ) + return _PreCallDecision(action="stop", reason="latency_limit_exceeded", target_model=model) + + if ctx.energy_max is not None and ctx.energy_used >= ctx.energy_max: + lower_energy_model = _select_lower_energy_model(model) + if lower_energy_model != model: + return _PreCallDecision( + action="switch_model", + reason="energy_limit_exceeded", + target_model=lower_energy_model, + ) + return _PreCallDecision(action="stop", reason="energy_limit_exceeded", target_model=model) + + if ( + ctx.budget_max is not None + and ctx.budget_max > 0 + and ctx.budget_remaining is not None + and 
(ctx.budget_remaining / ctx.budget_max) < 0.2 + ): + cheaper_model = _select_cheaper_model(model) + if cheaper_model != model: + return _PreCallDecision( + action="switch_model", + reason="budget_pressure", + target_model=cheaper_model, + ) + + return _PreCallDecision(action="allow", reason=ctx.mode, target_model=model) + + +def _raise_stop_error(ctx: Any, reason: str) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError - remaining = ctx.budget_max - ctx.cost + if reason == "budget_exceeded": + remaining = 0.0 + if ctx.budget_max is not None: + remaining = ctx.budget_max - ctx.cost raise BudgetExceededError( - f"Budget exhausted: spent ${ctx.cost:.4f} of ${ctx.budget_max:.4f} max", + f"Budget exhausted: spent ${ctx.cost:.4f} of ${ctx.budget_max or 0.0:.4f} max", remaining=remaining, ) + raise RuntimeError(f"cascadeflow harness stop: {reason}") def _update_context( @@ -149,6 +229,10 @@ def _update_context( completion_tokens: int, tool_call_count: int, elapsed_ms: float, + *, + action: str = "allow", + action_reason: str | None = None, + action_model: str | None = None, ) -> None: """Update a HarnessRunContext with call metrics.""" cost = _estimate_cost(model, prompt_tokens, completion_tokens) @@ -163,8 +247,15 @@ def _update_context( if ctx.budget_max is not None: ctx.budget_remaining = ctx.budget_max - ctx.cost - ctx.model_used = model - ctx.record(action="allow", reason=ctx.mode, model=model) + if action == "allow": + ctx.record(action="allow", reason=ctx.mode, model=model) + return + + ctx.record( + action=action, + reason=action_reason or ctx.mode, + model=action_model or model, + ) # --------------------------------------------------------------------------- @@ -180,6 +271,9 @@ class _InstrumentedStream: "_ctx", "_model", "_start_time", + "_pre_action", + "_pre_reason", + "_pre_model", "_usage", "_tool_call_count", "_finalized", @@ -191,11 +285,17 @@ def __init__( ctx: Any, model: str, start_time: float, + pre_action: str = "allow", + 
pre_reason: str = "observe", + pre_model: str | None = None, ) -> None: self._stream = stream self._ctx = ctx self._model = model self._start_time = start_time + self._pre_action = pre_action + self._pre_reason = pre_reason + self._pre_model = pre_model or model self._usage: Any = None self._tool_call_count: int = 0 self._finalized: bool = False @@ -279,6 +379,9 @@ def _finalize(self) -> None: completion_tokens, self._tool_call_count, elapsed_ms, + action=self._pre_action, + action_reason=self._pre_reason, + action_model=self._pre_model, ) @@ -290,6 +393,9 @@ class _InstrumentedAsyncStream: "_ctx", "_model", "_start_time", + "_pre_action", + "_pre_reason", + "_pre_model", "_usage", "_tool_call_count", "_finalized", @@ -301,11 +407,17 @@ def __init__( ctx: Any, model: str, start_time: float, + pre_action: str = "allow", + pre_reason: str = "observe", + pre_model: str | None = None, ) -> None: self._stream = stream self._ctx = ctx self._model = model self._start_time = start_time + self._pre_action = pre_action + self._pre_reason = pre_reason + self._pre_model = pre_model or model self._usage: Any = None self._tool_call_count: int = 0 self._finalized: bool = False @@ -387,6 +499,9 @@ def _finalize(self) -> None: completion_tokens, self._tool_call_count, elapsed_ms, + action=self._pre_action, + action_reason=self._pre_reason, + action_model=self._pre_model, ) @@ -410,10 +525,37 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: return original_fn(self, *args, **kwargs) model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model is_stream: bool = bool(kwargs.get("stream", False)) if ctx: - _check_budget_pre_call(ctx) + decision = _evaluate_pre_call_decision(ctx, model, has_tools=bool(kwargs.get("tools"))) + pre_action = decision.action + pre_reason = decision.reason + pre_model = decision.target_model + + if mode == "enforce": + if decision.action == "stop": + ctx.record(action="stop", reason=decision.reason, 
model=model) + _raise_stop_error(ctx, decision.reason) + + if decision.action == "switch_model" and decision.target_model != model: + kwargs = {**kwargs, "model": decision.target_model} + model = decision.target_model + + if decision.action == "deny_tool" and kwargs.get("tools"): + kwargs = {**kwargs, "tools": []} + + elif decision.action != "allow": + logger.debug( + "harness observe decision: action=%s reason=%s model=%s target=%s", + decision.action, + decision.reason, + model, + decision.target_model, + ) start_time = time.monotonic() @@ -424,7 +566,15 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: response = original_fn(self, *args, **kwargs) if is_stream and ctx: - return _InstrumentedStream(response, ctx, model, start_time) + return _InstrumentedStream( + response, + ctx, + model, + start_time, + pre_action, + pre_reason, + pre_model, + ) elif not is_stream and ctx: elapsed_ms = (time.monotonic() - start_time) * 1000 prompt_tokens, completion_tokens = _extract_usage(response) @@ -436,6 +586,9 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: completion_tokens, tool_call_count, elapsed_ms, + action=pre_action, + action_reason=pre_reason, + action_model=pre_model, ) else: logger.debug( @@ -464,10 +617,37 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: return await original_fn(self, *args, **kwargs) model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model is_stream: bool = bool(kwargs.get("stream", False)) if ctx: - _check_budget_pre_call(ctx) + decision = _evaluate_pre_call_decision(ctx, model, has_tools=bool(kwargs.get("tools"))) + pre_action = decision.action + pre_reason = decision.reason + pre_model = decision.target_model + + if mode == "enforce": + if decision.action == "stop": + ctx.record(action="stop", reason=decision.reason, model=model) + _raise_stop_error(ctx, decision.reason) + + if decision.action == "switch_model" and decision.target_model != model: + 
kwargs = {**kwargs, "model": decision.target_model} + model = decision.target_model + + if decision.action == "deny_tool" and kwargs.get("tools"): + kwargs = {**kwargs, "tools": []} + + elif decision.action != "allow": + logger.debug( + "harness observe decision async: action=%s reason=%s model=%s target=%s", + decision.action, + decision.reason, + model, + decision.target_model, + ) start_time = time.monotonic() @@ -483,7 +663,15 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: response = await original_fn(self, *args, **kwargs) if is_stream and ctx: - return _InstrumentedAsyncStream(response, ctx, model, start_time) + return _InstrumentedAsyncStream( + response, + ctx, + model, + start_time, + pre_action, + pre_reason, + pre_model, + ) elif not is_stream and ctx: elapsed_ms = (time.monotonic() - start_time) * 1000 prompt_tokens, completion_tokens = _extract_usage(response) @@ -495,6 +683,9 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: completion_tokens, tool_call_count, elapsed_ms, + action=pre_action, + action_reason=pre_reason, + action_model=pre_model, ) else: logger.debug( diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 12f0f938..c0bc6caf 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -657,6 +657,126 @@ async def test_enforce_raises_on_budget_exhausted_async(self) -> None: await wrapper(MagicMock(), model="gpt-4o") +# --------------------------------------------------------------------------- +# Enforce actions: switch_model, deny_tool, stop +# --------------------------------------------------------------------------- + + +class TestEnforceActions: + def test_enforce_switches_model_under_budget_pressure(self) -> None: + init(mode="enforce") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=1.0) as ctx: + ctx.cost = 0.85 + ctx.budget_remaining = 0.15 
+ wrapper(MagicMock(), model="gpt-4o") + + assert original.call_args[1]["model"] == "gpt-4o-mini" + trace = ctx.trace() + assert trace[0]["action"] == "switch_model" + assert trace[0]["reason"] == "budget_pressure" + + def test_observe_computes_switch_model_but_does_not_apply(self) -> None: + init(mode="observe") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(budget=1.0) as ctx: + ctx.cost = 0.85 + ctx.budget_remaining = 0.15 + wrapper(MagicMock(), model="gpt-4o") + + assert original.call_args[1]["model"] == "gpt-4o" + trace = ctx.trace() + assert trace[0]["action"] == "switch_model" + assert trace[0]["reason"] == "budget_pressure" + assert trace[0]["model"] == "gpt-4o-mini" + + def test_enforce_denies_tools_when_cap_reached(self) -> None: + init(mode="enforce", max_tool_calls=0) + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(max_tool_calls=0) as ctx: + wrapper(MagicMock(), model="gpt-4o", tools=[{"type": "function", "function": {"name": "t1"}}]) + + assert original.call_args[1]["tools"] == [] + trace = ctx.trace() + assert trace[0]["action"] == "deny_tool" + assert trace[0]["reason"] == "max_tool_calls_reached" + + def test_observe_logs_deny_tool_but_keeps_tools(self) -> None: + init(mode="observe", max_tool_calls=0) + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + tools = [{"type": "function", "function": {"name": "t1"}}] + with run(max_tool_calls=0) as ctx: + wrapper(MagicMock(), model="gpt-4o", tools=tools) + + assert original.call_args[1]["tools"] == tools + trace = ctx.trace() + assert trace[0]["action"] == "deny_tool" + assert trace[0]["reason"] == "max_tool_calls_reached" + + def test_enforce_stops_when_latency_limit_exceeded_at_fastest_model(self) -> None: + init(mode="enforce") + mock_resp = 
_mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(max_latency_ms=1.0) as ctx: + ctx.latency_used_ms = 5.0 + with pytest.raises(RuntimeError, match="latency_limit_exceeded"): + wrapper(MagicMock(), model="gpt-4o-mini") + + original.assert_not_called() + trace = ctx.trace() + assert trace[0]["action"] == "stop" + assert trace[0]["reason"] == "latency_limit_exceeded" + + def test_enforce_stops_when_energy_limit_exceeded_at_lowest_energy_model(self) -> None: + init(mode="enforce") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run(max_energy=1.0) as ctx: + ctx.energy_used = 5.0 + with pytest.raises(RuntimeError, match="energy_limit_exceeded"): + wrapper(MagicMock(), model="gpt-3.5-turbo") + + original.assert_not_called() + trace = ctx.trace() + assert trace[0]["action"] == "stop" + assert trace[0]["reason"] == "energy_limit_exceeded" + + @pytest.mark.asyncio + async def test_async_enforce_denies_tools_when_cap_reached(self) -> None: + init(mode="enforce", max_tool_calls=0) + mock_resp = _mock_completion() + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_async_create(original) + + async with run(max_tool_calls=0) as ctx: + await wrapper( + MagicMock(), + model="gpt-4o", + tools=[{"type": "function", "function": {"name": "t1"}}], + ) + + assert original.call_args[1]["tools"] == [] + trace = ctx.trace() + assert trace[0]["action"] == "deny_tool" + assert trace[0]["reason"] == "max_tool_calls_reached" + + # --------------------------------------------------------------------------- # Fix: stream_options.include_usage auto-injection # --------------------------------------------------------------------------- From d032ba63b38944355f285e28bf487003c74e4591 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 07:53:50 +0100 Subject: [PATCH 11/49] feat(harness): implement enforce 
actions for v2 harness --- cascadeflow/harness/api.py | 12 ++ cascadeflow/harness/instrument.py | 146 ++++++++++++++++++++ docs/strategy/agent-intelligence-v2-plan.md | 2 +- tests/test_harness_api.py | 24 +++- tests/test_harness_instrument.py | 72 ++++++++++ 5 files changed, 252 insertions(+), 4 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 88c9c579..3627c9bb 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -45,6 +45,9 @@ class HarnessRunContext: tool_calls_max: Optional[int] = None latency_max_ms: Optional[float] = None energy_max: Optional[float] = None + kpi_targets: Optional[dict[str, float]] = None + kpi_weights: Optional[dict[str, float]] = None + compliance: Optional[str] = None cost: float = 0.0 savings: float = 0.0 @@ -378,6 +381,9 @@ def run( max_tool_calls: Optional[int] = None, max_latency_ms: Optional[float] = None, max_energy: Optional[float] = None, + kpi_targets: Optional[dict[str, float]] = None, + kpi_weights: Optional[dict[str, float]] = None, + compliance: Optional[str] = None, ) -> HarnessRunContext: """ Create a scoped run context. 
@@ -390,6 +396,9 @@ def run( resolved_tool_calls = max_tool_calls if max_tool_calls is not None else config.max_tool_calls resolved_latency = max_latency_ms if max_latency_ms is not None else config.max_latency_ms resolved_energy = max_energy if max_energy is not None else config.max_energy + resolved_kpi_targets = kpi_targets if kpi_targets is not None else config.kpi_targets + resolved_kpi_weights = kpi_weights if kpi_weights is not None else config.kpi_weights + resolved_compliance = compliance if compliance is not None else config.compliance return HarnessRunContext( mode=config.mode, @@ -397,6 +406,9 @@ def run( tool_calls_max=resolved_tool_calls, latency_max_ms=resolved_latency, energy_max=resolved_energy, + kpi_targets=resolved_kpi_targets, + kpi_weights=resolved_kpi_weights, + compliance=resolved_compliance, ) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index bdca5a00..a9e6a2bd 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -69,6 +69,37 @@ _DEFAULT_ENERGY_COEFFICIENT: float = 1.0 _ENERGY_OUTPUT_WEIGHT: float = 1.5 +# Relative quality/latency priors for KPI-weighted soft-control scoring. 
+_QUALITY_PRIORS: dict[str, float] = { + "gpt-4o": 0.90, + "gpt-4o-mini": 0.75, + "gpt-5-mini": 0.86, + "gpt-4-turbo": 0.88, + "gpt-4": 0.87, + "gpt-3.5-turbo": 0.65, + "o1": 0.95, + "o1-mini": 0.82, + "o3-mini": 0.80, +} +_LATENCY_PRIORS: dict[str, float] = { + "gpt-4o": 0.72, + "gpt-4o-mini": 0.93, + "gpt-5-mini": 0.84, + "gpt-4-turbo": 0.66, + "gpt-4": 0.52, + "gpt-3.5-turbo": 1.00, + "o1": 0.40, + "o1-mini": 0.60, + "o3-mini": 0.78, +} + +_COMPLIANCE_MODEL_ALLOWLISTS: dict[str, set[str]] = { + "gdpr": {"gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo"}, + "hipaa": {"gpt-4o", "gpt-4o-mini"}, + "pci": {"gpt-4o-mini", "gpt-3.5-turbo"}, + "strict": {"gpt-4o"}, +} + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -157,6 +188,87 @@ def _select_lower_energy_model(current_model: str) -> str: return current_model +def _normalize_weights(weights: dict[str, float]) -> dict[str, float]: + normalized = { + key: float(value) + for key, value in weights.items() + if key in {"cost", "quality", "latency", "energy"} and float(value) > 0 + } + total = sum(normalized.values()) + if total <= 0: + return {} + return {key: value / total for key, value in normalized.items()} + + +def _cost_utility(model: str) -> float: + costs = [_model_total_cost(name) for name in _PRICING] + if not costs: + return 0.0 + model_cost = _model_total_cost(model) + min_cost = min(costs) + max_cost = max(costs) + if max_cost == min_cost: + return 1.0 + return (max_cost - model_cost) / (max_cost - min_cost) + + +def _energy_utility(model: str) -> float: + coeffs = list(_ENERGY_COEFFICIENTS.values()) + if not coeffs: + return 0.0 + coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) + min_coeff = min(coeffs) + max_coeff = max(coeffs) + if max_coeff == min_coeff: + return 1.0 + return (max_coeff - coeff) / (max_coeff - min_coeff) + + +def _kpi_score(model: str, weights: dict[str, 
float]) -> float: + normalized = _normalize_weights(weights) + if not normalized: + return -1.0 + quality = _QUALITY_PRIORS.get(model, 0.7) + latency = _LATENCY_PRIORS.get(model, 0.7) + cost = _cost_utility(model) + energy = _energy_utility(model) + return ( + (normalized.get("quality", 0.0) * quality) + + (normalized.get("latency", 0.0) * latency) + + (normalized.get("cost", 0.0) * cost) + + (normalized.get("energy", 0.0) * energy) + ) + + +def _select_kpi_weighted_model(current_model: str, weights: dict[str, float]) -> str: + best_model = current_model + best_score = _kpi_score(current_model, weights) + for candidate in _PRICING: + score = _kpi_score(candidate, weights) + if score > best_score: + best_model = candidate + best_score = score + return best_model + + +def _compliance_allowlist(compliance: str | None) -> set[str] | None: + if not compliance: + return None + return _COMPLIANCE_MODEL_ALLOWLISTS.get(compliance.strip().lower()) + + +def _select_compliant_model(current_model: str, compliance: str) -> str | None: + allowlist = _compliance_allowlist(compliance) + if not allowlist: + return current_model + if current_model in allowlist: + return current_model + available = [name for name in _PRICING if name in allowlist] + if not available: + return None + return min(available, key=_model_total_cost) + + @dataclass(frozen=True) class _PreCallDecision: action: str @@ -171,6 +283,30 @@ def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCa if has_tools and ctx.tool_calls_max is not None and ctx.tool_calls >= ctx.tool_calls_max: return _PreCallDecision(action="deny_tool", reason="max_tool_calls_reached", target_model=model) + compliance = getattr(ctx, "compliance", None) + if compliance: + compliant_model = _select_compliant_model(model, str(compliance)) + if compliant_model is None: + if has_tools: + return _PreCallDecision( + action="deny_tool", + reason="compliance_no_approved_tool_path", + target_model=model, + ) + return 
_PreCallDecision(action="stop", reason="compliance_no_approved_model", target_model=model) + if compliant_model != model: + return _PreCallDecision( + action="switch_model", + reason="compliance_model_policy", + target_model=compliant_model, + ) + if str(compliance).strip().lower() == "strict" and has_tools: + return _PreCallDecision( + action="deny_tool", + reason="compliance_tool_restriction", + target_model=model, + ) + if ctx.latency_max_ms is not None and ctx.latency_used_ms >= ctx.latency_max_ms: faster_model = _select_faster_model(model) if faster_model != model: @@ -205,6 +341,16 @@ def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCa target_model=cheaper_model, ) + kpi_weights = getattr(ctx, "kpi_weights", None) + if isinstance(kpi_weights, dict) and kpi_weights: + weighted_model = _select_kpi_weighted_model(model, kpi_weights) + if weighted_model != model: + return _PreCallDecision( + action="switch_model", + reason="kpi_weight_optimization", + target_model=weighted_model, + ) + return _PreCallDecision(action="allow", reason=ctx.mode, target_model=model) diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 91da81e1..b03d8a58 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -844,7 +844,7 @@ Branching model: Claim checklist (one owner per branch at a time): - [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` - [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress` -- [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` +- [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` - [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: 
`claimed/in-progress/review/merged` - [ ] `feat/v2-langchain-harness-extension` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 43622fae..183a4350 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -72,18 +72,36 @@ def test_init_non_numeric_env_raises(monkeypatch): def test_run_uses_global_defaults_and_overrides(): - init(mode="enforce", budget=2.0, max_tool_calls=5) + init( + mode="enforce", + budget=2.0, + max_tool_calls=5, + kpi_targets={"quality_min": 0.9}, + kpi_weights={"cost": 0.7, "quality": 0.3}, + compliance="gdpr", + ) default_ctx = run() assert default_ctx.mode == "enforce" assert default_ctx.budget_max == 2.0 assert default_ctx.tool_calls_max == 5 assert default_ctx.budget_remaining == 2.0 - - override_ctx = run(budget=0.5, max_tool_calls=3) + assert default_ctx.kpi_targets == {"quality_min": 0.9} + assert default_ctx.kpi_weights == {"cost": 0.7, "quality": 0.3} + assert default_ctx.compliance == "gdpr" + + override_ctx = run( + budget=0.5, + max_tool_calls=3, + kpi_weights={"quality": 1.0}, + compliance="strict", + ) assert override_ctx.budget_max == 0.5 assert override_ctx.tool_calls_max == 3 assert override_ctx.budget_remaining == 0.5 + assert override_ctx.kpi_targets == {"quality_min": 0.9} + assert override_ctx.kpi_weights == {"quality": 1.0} + assert override_ctx.compliance == "strict" def test_run_without_enter_exit_is_safe(): diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index c0bc6caf..3a4d9519 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -776,6 +776,78 @@ async def test_async_enforce_denies_tools_when_cap_reached(self) -> None: assert trace[0]["action"] == "deny_tool" assert trace[0]["reason"] == "max_tool_calls_reached" + def test_enforce_switches_model_for_compliance_policy(self) -> None: + init(mode="enforce", compliance="strict") + mock_resp = 
_mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run() as ctx: + wrapper(MagicMock(), model="gpt-4o-mini") + + assert original.call_args[1]["model"] == "gpt-4o" + trace = ctx.trace() + assert trace[0]["action"] == "switch_model" + assert trace[0]["reason"] == "compliance_model_policy" + + def test_enforce_denies_tool_for_strict_compliance(self) -> None: + init(mode="enforce", compliance="strict") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run() as ctx: + wrapper(MagicMock(), model="gpt-4o", tools=[{"type": "function", "function": {"name": "t1"}}]) + + assert original.call_args[1]["tools"] == [] + trace = ctx.trace() + assert trace[0]["action"] == "deny_tool" + assert trace[0]["reason"] == "compliance_tool_restriction" + + def test_observe_logs_compliance_switch_without_applying(self) -> None: + init(mode="observe", compliance="strict") + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run() as ctx: + wrapper(MagicMock(), model="gpt-4o-mini") + + assert original.call_args[1]["model"] == "gpt-4o-mini" + trace = ctx.trace() + assert trace[0]["action"] == "switch_model" + assert trace[0]["reason"] == "compliance_model_policy" + assert trace[0]["model"] == "gpt-4o" + + def test_enforce_switches_model_using_kpi_weights(self) -> None: + init(mode="enforce", kpi_weights={"quality": 1.0}) + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run() as ctx: + wrapper(MagicMock(), model="gpt-3.5-turbo") + + assert original.call_args[1]["model"] == "o1" + trace = ctx.trace() + assert trace[0]["action"] == "switch_model" + assert trace[0]["reason"] == "kpi_weight_optimization" + + def test_observe_logs_kpi_switch_without_applying(self) -> None: + init(mode="observe", 
kpi_weights={"quality": 1.0}) + mock_resp = _mock_completion() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_create(original) + + with run() as ctx: + wrapper(MagicMock(), model="gpt-3.5-turbo") + + assert original.call_args[1]["model"] == "gpt-3.5-turbo" + trace = ctx.trace() + assert trace[0]["action"] == "switch_model" + assert trace[0]["reason"] == "kpi_weight_optimization" + assert trace[0]["model"] == "o1" + # --------------------------------------------------------------------------- # Fix: stream_options.include_usage auto-injection From bcee09caa0925db9921da3f2386bc16ed4b66bba Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 08:58:04 +0100 Subject: [PATCH 12/49] fix(harness): clarify observe traces and hard-stop semantics --- cascadeflow/harness/api.py | 29 +++-- cascadeflow/harness/instrument.py | 177 ++++++++++++++++++++---------- tests/test_harness_instrument.py | 41 ++++++- 3 files changed, 177 insertions(+), 70 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 3627c9bb..9d003ee1 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -84,17 +84,28 @@ async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None: def trace(self) -> list[dict[str, Any]]: return list(self._trace) - def record(self, action: str, reason: str, model: Optional[str] = None) -> None: + def record( + self, + action: str, + reason: str, + model: Optional[str] = None, + *, + applied: Optional[bool] = None, + decision_mode: Optional[str] = None, + ) -> None: self.last_action = action self.model_used = model - self._trace.append( - { - "action": action, - "reason": reason, - "model": model, - "run_id": self.run_id, - } - ) + entry: dict[str, Any] = { + "action": action, + "reason": reason, + "model": model, + "run_id": self.run_id, + } + if applied is not None: + entry["applied"] = applied + if decision_mode is not None: + entry["decision_mode"] = decision_mode + 
self._trace.append(entry) _harness_config: HarnessConfig = HarnessConfig() diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index a9e6a2bd..251d2497 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -69,7 +69,9 @@ _DEFAULT_ENERGY_COEFFICIENT: float = 1.0 _ENERGY_OUTPUT_WEIGHT: float = 1.5 -# Relative quality/latency priors for KPI-weighted soft-control scoring. +# Relative priors used by KPI-weighted soft-control scoring. +# These are deterministic heuristics based on internal benchmark runs and +# intended as defaults until provider-specific online scoring is wired in. _QUALITY_PRIORS: dict[str, float] = { "gpt-4o": 0.90, "gpt-4o-mini": 0.75, @@ -93,6 +95,8 @@ "o3-mini": 0.78, } +# OpenAI-model allowlists used by the current OpenAI harness instrumentation. +# Future provider instrumentation should provide provider-specific allowlists. _COMPLIANCE_MODEL_ALLOWLISTS: dict[str, set[str]] = { "gdpr": {"gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo"}, "hipaa": {"gpt-4o", "gpt-4o-mini"}, @@ -173,9 +177,14 @@ def _select_cheaper_model(current_model: str) -> str: def _select_faster_model(current_model: str) -> str: - # We use the lowest-cost model as a deterministic latency proxy until - # provider-specific live latency scoring is wired into the harness. 
- return _select_cheaper_model(current_model) + latency_candidates = [name for name in _PRICING if name in _LATENCY_PRIORS] + if not latency_candidates: + return current_model + fastest = max(latency_candidates, key=lambda name: _LATENCY_PRIORS[name]) + current_latency = _LATENCY_PRIORS.get(current_model, 0.7) + if _LATENCY_PRIORS[fastest] > current_latency: + return fastest + return current_model def _select_lower_energy_model(current_model: str) -> str: @@ -227,7 +236,7 @@ def _energy_utility(model: str) -> float: def _kpi_score(model: str, weights: dict[str, float]) -> float: normalized = _normalize_weights(weights) if not normalized: - return -1.0 + return 0.0 quality = _QUALITY_PRIORS.get(model, 0.7) latency = _LATENCY_PRIORS.get(model, 0.7) cost = _cost_utility(model) @@ -355,7 +364,7 @@ def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCa def _raise_stop_error(ctx: Any, reason: str) -> None: - from cascadeflow.schema.exceptions import BudgetExceededError + from cascadeflow.schema.exceptions import BudgetExceededError, HarnessStopError if reason == "budget_exceeded": remaining = 0.0 @@ -365,7 +374,56 @@ def _raise_stop_error(ctx: Any, reason: str) -> None: f"Budget exhausted: spent ${ctx.cost:.4f} of ${ctx.budget_max or 0.0:.4f} max", remaining=remaining, ) - raise RuntimeError(f"cascadeflow harness stop: {reason}") + raise HarnessStopError(f"cascadeflow harness stop: {reason}", reason=reason) + + +def _resolve_pre_call_decision( + ctx: Any, + mode: str, + model: str, + kwargs: dict[str, Any], +) -> tuple[dict[str, Any], str, str, str, str, bool]: + decision = _evaluate_pre_call_decision(ctx, model, has_tools=bool(kwargs.get("tools"))) + action = decision.action + reason = decision.reason + target_model = decision.target_model + applied = action == "allow" + + if mode == "enforce": + if action == "stop": + ctx.record( + action="stop", + reason=reason, + model=model, + applied=True, + decision_mode=mode, + ) + _raise_stop_error(ctx, 
reason) + + if action == "switch_model" and target_model != model: + kwargs = {**kwargs, "model": target_model} + model = target_model + applied = True + elif action == "switch_model": + applied = False + + if action == "deny_tool": + if kwargs.get("tools"): + kwargs = {**kwargs, "tools": []} + applied = True + else: + applied = False + elif action != "allow": + logger.debug( + "harness observe decision: action=%s reason=%s model=%s target=%s", + action, + reason, + model, + target_model, + ) + applied = False + + return kwargs, model, action, reason, target_model, applied def _update_context( @@ -379,6 +437,8 @@ def _update_context( action: str = "allow", action_reason: str | None = None, action_model: str | None = None, + applied: bool | None = None, + decision_mode: str | None = None, ) -> None: """Update a HarnessRunContext with call metrics.""" cost = _estimate_cost(model, prompt_tokens, completion_tokens) @@ -393,14 +453,27 @@ def _update_context( if ctx.budget_max is not None: ctx.budget_remaining = ctx.budget_max - ctx.cost + if applied is None: + applied = action == "allow" + if decision_mode is None: + decision_mode = ctx.mode + if action == "allow": - ctx.record(action="allow", reason=ctx.mode, model=model) + ctx.record( + action="allow", + reason=ctx.mode, + model=model, + applied=applied, + decision_mode=decision_mode, + ) return ctx.record( action=action, reason=action_reason or ctx.mode, model=action_model or model, + applied=applied, + decision_mode=decision_mode, ) @@ -420,6 +493,8 @@ class _InstrumentedStream: "_pre_action", "_pre_reason", "_pre_model", + "_pre_applied", + "_decision_mode", "_usage", "_tool_call_count", "_finalized", @@ -434,6 +509,8 @@ def __init__( pre_action: str = "allow", pre_reason: str = "observe", pre_model: str | None = None, + pre_applied: bool = True, + decision_mode: str = "observe", ) -> None: self._stream = stream self._ctx = ctx @@ -442,6 +519,8 @@ def __init__( self._pre_action = pre_action self._pre_reason = 
pre_reason self._pre_model = pre_model or model + self._pre_applied = pre_applied + self._decision_mode = decision_mode self._usage: Any = None self._tool_call_count: int = 0 self._finalized: bool = False @@ -528,6 +607,8 @@ def _finalize(self) -> None: action=self._pre_action, action_reason=self._pre_reason, action_model=self._pre_model, + applied=self._pre_applied, + decision_mode=self._decision_mode, ) @@ -542,6 +623,8 @@ class _InstrumentedAsyncStream: "_pre_action", "_pre_reason", "_pre_model", + "_pre_applied", + "_decision_mode", "_usage", "_tool_call_count", "_finalized", @@ -556,6 +639,8 @@ def __init__( pre_action: str = "allow", pre_reason: str = "observe", pre_model: str | None = None, + pre_applied: bool = True, + decision_mode: str = "observe", ) -> None: self._stream = stream self._ctx = ctx @@ -564,6 +649,8 @@ def __init__( self._pre_action = pre_action self._pre_reason = pre_reason self._pre_model = pre_model or model + self._pre_applied = pre_applied + self._decision_mode = decision_mode self._usage: Any = None self._tool_call_count: int = 0 self._finalized: bool = False @@ -648,6 +735,8 @@ def _finalize(self) -> None: action=self._pre_action, action_reason=self._pre_reason, action_model=self._pre_model, + applied=self._pre_applied, + decision_mode=self._decision_mode, ) @@ -674,34 +763,16 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: pre_action = "allow" pre_reason = mode pre_model = model + pre_applied = True is_stream: bool = bool(kwargs.get("stream", False)) if ctx: - decision = _evaluate_pre_call_decision(ctx, model, has_tools=bool(kwargs.get("tools"))) - pre_action = decision.action - pre_reason = decision.reason - pre_model = decision.target_model - - if mode == "enforce": - if decision.action == "stop": - ctx.record(action="stop", reason=decision.reason, model=model) - _raise_stop_error(ctx, decision.reason) - - if decision.action == "switch_model" and decision.target_model != model: - kwargs = {**kwargs, "model": 
decision.target_model} - model = decision.target_model - - if decision.action == "deny_tool" and kwargs.get("tools"): - kwargs = {**kwargs, "tools": []} - - elif decision.action != "allow": - logger.debug( - "harness observe decision: action=%s reason=%s model=%s target=%s", - decision.action, - decision.reason, - model, - decision.target_model, - ) + kwargs, model, pre_action, pre_reason, pre_model, pre_applied = _resolve_pre_call_decision( + ctx, + mode, + model, + kwargs, + ) start_time = time.monotonic() @@ -720,6 +791,8 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: pre_action, pre_reason, pre_model, + pre_applied, + mode, ) elif not is_stream and ctx: elapsed_ms = (time.monotonic() - start_time) * 1000 @@ -735,6 +808,8 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: action=pre_action, action_reason=pre_reason, action_model=pre_model, + applied=pre_applied, + decision_mode=mode, ) else: logger.debug( @@ -766,34 +841,16 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: pre_action = "allow" pre_reason = mode pre_model = model + pre_applied = True is_stream: bool = bool(kwargs.get("stream", False)) if ctx: - decision = _evaluate_pre_call_decision(ctx, model, has_tools=bool(kwargs.get("tools"))) - pre_action = decision.action - pre_reason = decision.reason - pre_model = decision.target_model - - if mode == "enforce": - if decision.action == "stop": - ctx.record(action="stop", reason=decision.reason, model=model) - _raise_stop_error(ctx, decision.reason) - - if decision.action == "switch_model" and decision.target_model != model: - kwargs = {**kwargs, "model": decision.target_model} - model = decision.target_model - - if decision.action == "deny_tool" and kwargs.get("tools"): - kwargs = {**kwargs, "tools": []} - - elif decision.action != "allow": - logger.debug( - "harness observe decision async: action=%s reason=%s model=%s target=%s", - decision.action, - decision.reason, - model, - decision.target_model, - ) + kwargs, 
model, pre_action, pre_reason, pre_model, pre_applied = _resolve_pre_call_decision( + ctx, + mode, + model, + kwargs, + ) start_time = time.monotonic() @@ -817,6 +874,8 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: pre_action, pre_reason, pre_model, + pre_applied, + mode, ) elif not is_stream and ctx: elapsed_ms = (time.monotonic() - start_time) * 1000 @@ -832,6 +891,8 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: action=pre_action, action_reason=pre_reason, action_model=pre_model, + applied=pre_applied, + decision_mode=mode, ) else: logger.debug( diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 3a4d9519..28fdc7b7 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -241,6 +241,8 @@ def test_model_used_and_trace(self) -> None: assert trace[0]["action"] == "allow" assert trace[0]["reason"] == "observe" assert trace[0]["model"] == "gpt-4o" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "observe" def test_off_mode_passthrough_no_tracking(self) -> None: init(mode="off") @@ -641,6 +643,11 @@ def test_observe_does_not_raise_on_budget_exhausted(self) -> None: wrapper(MagicMock(), model="gpt-4o") assert ctx.cost > ctx.budget_max # type: ignore[operator] + trace = ctx.trace() + assert trace[-1]["action"] == "stop" + assert trace[-1]["reason"] == "budget_exceeded" + assert trace[-1]["applied"] is False + assert trace[-1]["decision_mode"] == "observe" @pytest.mark.asyncio async def test_enforce_raises_on_budget_exhausted_async(self) -> None: @@ -678,6 +685,8 @@ def test_enforce_switches_model_under_budget_pressure(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "switch_model" assert trace[0]["reason"] == "budget_pressure" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" def test_observe_computes_switch_model_but_does_not_apply(self) -> None: init(mode="observe") @@ -695,6 +704,8 @@ 
def test_observe_computes_switch_model_but_does_not_apply(self) -> None: assert trace[0]["action"] == "switch_model" assert trace[0]["reason"] == "budget_pressure" assert trace[0]["model"] == "gpt-4o-mini" + assert trace[0]["applied"] is False + assert trace[0]["decision_mode"] == "observe" def test_enforce_denies_tools_when_cap_reached(self) -> None: init(mode="enforce", max_tool_calls=0) @@ -709,6 +720,8 @@ def test_enforce_denies_tools_when_cap_reached(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "deny_tool" assert trace[0]["reason"] == "max_tool_calls_reached" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" def test_observe_logs_deny_tool_but_keeps_tools(self) -> None: init(mode="observe", max_tool_calls=0) @@ -724,8 +737,12 @@ def test_observe_logs_deny_tool_but_keeps_tools(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "deny_tool" assert trace[0]["reason"] == "max_tool_calls_reached" + assert trace[0]["applied"] is False + assert trace[0]["decision_mode"] == "observe" def test_enforce_stops_when_latency_limit_exceeded_at_fastest_model(self) -> None: + from cascadeflow.schema.exceptions import HarnessStopError + init(mode="enforce") mock_resp = _mock_completion() original = MagicMock(return_value=mock_resp) @@ -733,15 +750,19 @@ def test_enforce_stops_when_latency_limit_exceeded_at_fastest_model(self) -> Non with run(max_latency_ms=1.0) as ctx: ctx.latency_used_ms = 5.0 - with pytest.raises(RuntimeError, match="latency_limit_exceeded"): - wrapper(MagicMock(), model="gpt-4o-mini") + with pytest.raises(HarnessStopError, match="latency_limit_exceeded"): + wrapper(MagicMock(), model="gpt-3.5-turbo") original.assert_not_called() trace = ctx.trace() assert trace[0]["action"] == "stop" assert trace[0]["reason"] == "latency_limit_exceeded" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" def 
test_enforce_stops_when_energy_limit_exceeded_at_lowest_energy_model(self) -> None: + from cascadeflow.schema.exceptions import HarnessStopError + init(mode="enforce") mock_resp = _mock_completion() original = MagicMock(return_value=mock_resp) @@ -749,13 +770,15 @@ def test_enforce_stops_when_energy_limit_exceeded_at_lowest_energy_model(self) - with run(max_energy=1.0) as ctx: ctx.energy_used = 5.0 - with pytest.raises(RuntimeError, match="energy_limit_exceeded"): + with pytest.raises(HarnessStopError, match="energy_limit_exceeded"): wrapper(MagicMock(), model="gpt-3.5-turbo") original.assert_not_called() trace = ctx.trace() assert trace[0]["action"] == "stop" assert trace[0]["reason"] == "energy_limit_exceeded" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" @pytest.mark.asyncio async def test_async_enforce_denies_tools_when_cap_reached(self) -> None: @@ -775,6 +798,8 @@ async def test_async_enforce_denies_tools_when_cap_reached(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "deny_tool" assert trace[0]["reason"] == "max_tool_calls_reached" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" def test_enforce_switches_model_for_compliance_policy(self) -> None: init(mode="enforce", compliance="strict") @@ -789,6 +814,8 @@ def test_enforce_switches_model_for_compliance_policy(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "switch_model" assert trace[0]["reason"] == "compliance_model_policy" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" def test_enforce_denies_tool_for_strict_compliance(self) -> None: init(mode="enforce", compliance="strict") @@ -803,6 +830,8 @@ def test_enforce_denies_tool_for_strict_compliance(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "deny_tool" assert trace[0]["reason"] == "compliance_tool_restriction" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" 
def test_observe_logs_compliance_switch_without_applying(self) -> None: init(mode="observe", compliance="strict") @@ -818,6 +847,8 @@ def test_observe_logs_compliance_switch_without_applying(self) -> None: assert trace[0]["action"] == "switch_model" assert trace[0]["reason"] == "compliance_model_policy" assert trace[0]["model"] == "gpt-4o" + assert trace[0]["applied"] is False + assert trace[0]["decision_mode"] == "observe" def test_enforce_switches_model_using_kpi_weights(self) -> None: init(mode="enforce", kpi_weights={"quality": 1.0}) @@ -832,6 +863,8 @@ def test_enforce_switches_model_using_kpi_weights(self) -> None: trace = ctx.trace() assert trace[0]["action"] == "switch_model" assert trace[0]["reason"] == "kpi_weight_optimization" + assert trace[0]["applied"] is True + assert trace[0]["decision_mode"] == "enforce" def test_observe_logs_kpi_switch_without_applying(self) -> None: init(mode="observe", kpi_weights={"quality": 1.0}) @@ -847,6 +880,8 @@ def test_observe_logs_kpi_switch_without_applying(self) -> None: assert trace[0]["action"] == "switch_model" assert trace[0]["reason"] == "kpi_weight_optimization" assert trace[0]["model"] == "o1" + assert trace[0]["applied"] is False + assert trace[0]["decision_mode"] == "observe" # --------------------------------------------------------------------------- From ee6e040b30d13e2545146ca7e2994894f5d6a183 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 09:25:08 +0100 Subject: [PATCH 13/49] perf(harness): optimize model utility hot paths --- cascadeflow/harness/instrument.py | 69 +++++++++++++++++-------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index 251d2497..237b1174 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -49,6 +49,12 @@ "o3-mini": (1.10, 4.40), } _DEFAULT_PRICING: tuple[float, float] = (2.50, 10.00) +_DEFAULT_TOTAL_COST: float = _DEFAULT_PRICING[0] 
+ _DEFAULT_PRICING[1] +_MODEL_TOTAL_COSTS: dict[str, float] = {name: in_cost + out_cost for name, (in_cost, out_cost) in _PRICING.items()} +_PRICING_MODELS: tuple[str, ...] = tuple(_PRICING.keys()) +_CHEAPEST_MODEL: str = min(_MODEL_TOTAL_COSTS, key=_MODEL_TOTAL_COSTS.get) +_MIN_TOTAL_COST: float = min(_MODEL_TOTAL_COSTS.values()) +_MAX_TOTAL_COST: float = max(_MODEL_TOTAL_COSTS.values()) # --------------------------------------------------------------------------- # Energy estimation coefficients (deterministic proxy, not live carbon data) @@ -68,6 +74,9 @@ } _DEFAULT_ENERGY_COEFFICIENT: float = 1.0 _ENERGY_OUTPUT_WEIGHT: float = 1.5 +_LOWEST_ENERGY_MODEL: str = min(_ENERGY_COEFFICIENTS, key=_ENERGY_COEFFICIENTS.get) +_MIN_ENERGY_COEFF: float = min(_ENERGY_COEFFICIENTS.values()) +_MAX_ENERGY_COEFF: float = max(_ENERGY_COEFFICIENTS.values()) # Relative priors used by KPI-weighted soft-control scoring. # These are deterministic heuristics based on internal benchmark runs and @@ -94,6 +103,10 @@ "o1-mini": 0.60, "o3-mini": 0.78, } +_LATENCY_CANDIDATES: tuple[str, ...] = tuple(name for name in _PRICING_MODELS if name in _LATENCY_PRIORS) +_FASTEST_MODEL: str | None = ( + max(_LATENCY_CANDIDATES, key=lambda name: _LATENCY_PRIORS[name]) if _LATENCY_CANDIDATES else None +) # OpenAI-model allowlists used by the current OpenAI harness instrumentation. # Future provider instrumentation should provide provider-specific allowlists. 
@@ -165,35 +178,30 @@ def _extract_usage(response: Any) -> tuple[int, int]: def _model_total_cost(model: str) -> float: - in_cost, out_cost = _PRICING.get(model, _DEFAULT_PRICING) - return in_cost + out_cost + return _MODEL_TOTAL_COSTS.get(model, _DEFAULT_TOTAL_COST) def _select_cheaper_model(current_model: str) -> str: - cheapest = min(_PRICING.keys(), key=_model_total_cost) - if _model_total_cost(cheapest) < _model_total_cost(current_model): - return cheapest + if _model_total_cost(_CHEAPEST_MODEL) < _model_total_cost(current_model): + return _CHEAPEST_MODEL return current_model def _select_faster_model(current_model: str) -> str: - latency_candidates = [name for name in _PRICING if name in _LATENCY_PRIORS] - if not latency_candidates: + if _FASTEST_MODEL is None: return current_model - fastest = max(latency_candidates, key=lambda name: _LATENCY_PRIORS[name]) current_latency = _LATENCY_PRIORS.get(current_model, 0.7) - if _LATENCY_PRIORS[fastest] > current_latency: - return fastest + if _LATENCY_PRIORS[_FASTEST_MODEL] > current_latency: + return _FASTEST_MODEL return current_model def _select_lower_energy_model(current_model: str) -> str: - lowest_energy = min(_ENERGY_COEFFICIENTS.keys(), key=lambda name: _ENERGY_COEFFICIENTS[name]) - if _ENERGY_COEFFICIENTS.get(lowest_energy, _DEFAULT_ENERGY_COEFFICIENT) < _ENERGY_COEFFICIENTS.get( + if _ENERGY_COEFFICIENTS.get(_LOWEST_ENERGY_MODEL, _DEFAULT_ENERGY_COEFFICIENT) < _ENERGY_COEFFICIENTS.get( current_model, _DEFAULT_ENERGY_COEFFICIENT, ): - return lowest_energy + return _LOWEST_ENERGY_MODEL return current_model @@ -210,31 +218,20 @@ def _normalize_weights(weights: dict[str, float]) -> dict[str, float]: def _cost_utility(model: str) -> float: - costs = [_model_total_cost(name) for name in _PRICING] - if not costs: - return 0.0 model_cost = _model_total_cost(model) - min_cost = min(costs) - max_cost = max(costs) - if max_cost == min_cost: + if _MAX_TOTAL_COST == _MIN_TOTAL_COST: return 1.0 - return (max_cost - 
model_cost) / (max_cost - min_cost) + return (_MAX_TOTAL_COST - model_cost) / (_MAX_TOTAL_COST - _MIN_TOTAL_COST) def _energy_utility(model: str) -> float: - coeffs = list(_ENERGY_COEFFICIENTS.values()) - if not coeffs: - return 0.0 coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) - min_coeff = min(coeffs) - max_coeff = max(coeffs) - if max_coeff == min_coeff: + if _MAX_ENERGY_COEFF == _MIN_ENERGY_COEFF: return 1.0 - return (max_coeff - coeff) / (max_coeff - min_coeff) + return (_MAX_ENERGY_COEFF - coeff) / (_MAX_ENERGY_COEFF - _MIN_ENERGY_COEFF) -def _kpi_score(model: str, weights: dict[str, float]) -> float: - normalized = _normalize_weights(weights) +def _kpi_score_with_normalized(model: str, normalized: dict[str, float]) -> float: if not normalized: return 0.0 quality = _QUALITY_PRIORS.get(model, 0.7) @@ -249,11 +246,19 @@ def _kpi_score(model: str, weights: dict[str, float]) -> float: ) +def _kpi_score(model: str, weights: dict[str, float]) -> float: + normalized = _normalize_weights(weights) + return _kpi_score_with_normalized(model, normalized) + + def _select_kpi_weighted_model(current_model: str, weights: dict[str, float]) -> str: + normalized = _normalize_weights(weights) + if not normalized: + return current_model best_model = current_model - best_score = _kpi_score(current_model, weights) - for candidate in _PRICING: - score = _kpi_score(candidate, weights) + best_score = _kpi_score_with_normalized(current_model, normalized) + for candidate in _PRICING_MODELS: + score = _kpi_score_with_normalized(candidate, normalized) if score > best_score: best_model = candidate best_score = score From b54637be4ba91ecc6823e5d8bfe763b5adfb3a58 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 09:41:10 +0100 Subject: [PATCH 14/49] refactor(harness): unify pricing profiles across integrations --- cascadeflow/harness/instrument.py | 463 ++++++++-------------- cascadeflow/harness/pricing.py | 1 + cascadeflow/integrations/crewai.py | 46 
+-- cascadeflow/integrations/openai_agents.py | 46 +-- tests/test_harness_shared_pricing.py | 42 +- 5 files changed, 210 insertions(+), 388 deletions(-) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index 237b1174..e86fb1a9 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -23,6 +23,25 @@ from dataclasses import dataclass from typing import Any +from cascadeflow.harness.pricing import ( + DEFAULT_ENERGY_COEFFICIENT as _DEFAULT_ENERGY_COEFFICIENT, +) +from cascadeflow.harness.pricing import ( + ENERGY_COEFFICIENTS as _ENERGY_COEFFICIENTS, +) +from cascadeflow.harness.pricing import ( + OPENAI_MODEL_POOL as _PRICING_MODELS, +) +from cascadeflow.harness.pricing import ( + estimate_cost as _estimate_cost_shared, +) +from cascadeflow.harness.pricing import ( + estimate_energy as _estimate_energy_shared, +) +from cascadeflow.harness.pricing import ( + model_total_price as _model_total_price_shared, +) + logger = logging.getLogger("cascadeflow.harness.instrument") # --------------------------------------------------------------------------- @@ -33,50 +52,17 @@ _original_sync_create: Any = None _original_async_create: Any = None -# --------------------------------------------------------------------------- -# Pricing table (USD per 1M tokens: input, output) -# --------------------------------------------------------------------------- - -_PRICING: dict[str, tuple[float, float]] = { - "gpt-4o": (2.50, 10.00), - "gpt-4o-mini": (0.15, 0.60), - "gpt-5-mini": (0.20, 0.80), - "gpt-4-turbo": (10.00, 30.00), - "gpt-4": (30.00, 60.00), - "gpt-3.5-turbo": (0.50, 1.50), - "o1": (15.00, 60.00), - "o1-mini": (3.00, 12.00), - "o3-mini": (1.10, 4.40), -} -_DEFAULT_PRICING: tuple[float, float] = (2.50, 10.00) -_DEFAULT_TOTAL_COST: float = _DEFAULT_PRICING[0] + _DEFAULT_PRICING[1] -_MODEL_TOTAL_COSTS: dict[str, float] = {name: in_cost + out_cost for name, (in_cost, out_cost) in _PRICING.items()} -_PRICING_MODELS: 
tuple[str, ...] = tuple(_PRICING.keys()) +_MODEL_TOTAL_COSTS: dict[str, float] = {name: _model_total_price_shared(name) for name in _PRICING_MODELS} _CHEAPEST_MODEL: str = min(_MODEL_TOTAL_COSTS, key=_MODEL_TOTAL_COSTS.get) _MIN_TOTAL_COST: float = min(_MODEL_TOTAL_COSTS.values()) _MAX_TOTAL_COST: float = max(_MODEL_TOTAL_COSTS.values()) -# --------------------------------------------------------------------------- -# Energy estimation coefficients (deterministic proxy, not live carbon data) -# energy_units = coefficient * (input_tokens + output_tokens * output_weight) -# --------------------------------------------------------------------------- - -_ENERGY_COEFFICIENTS: dict[str, float] = { - "gpt-4o": 1.0, - "gpt-4o-mini": 0.3, - "gpt-5-mini": 0.35, - "gpt-4-turbo": 1.5, - "gpt-4": 1.5, - "gpt-3.5-turbo": 0.2, - "o1": 2.0, - "o1-mini": 0.8, - "o3-mini": 0.5, +_OPENAI_ENERGY_COEFFS: dict[str, float] = { + name: _ENERGY_COEFFICIENTS.get(name, _DEFAULT_ENERGY_COEFFICIENT) for name in _PRICING_MODELS } -_DEFAULT_ENERGY_COEFFICIENT: float = 1.0 -_ENERGY_OUTPUT_WEIGHT: float = 1.5 -_LOWEST_ENERGY_MODEL: str = min(_ENERGY_COEFFICIENTS, key=_ENERGY_COEFFICIENTS.get) -_MIN_ENERGY_COEFF: float = min(_ENERGY_COEFFICIENTS.values()) -_MAX_ENERGY_COEFF: float = max(_ENERGY_COEFFICIENTS.values()) +_LOWEST_ENERGY_MODEL: str = min(_OPENAI_ENERGY_COEFFS, key=_OPENAI_ENERGY_COEFFS.get) +_MIN_ENERGY_COEFF: float = min(_OPENAI_ENERGY_COEFFS.values()) +_MAX_ENERGY_COEFF: float = max(_OPENAI_ENERGY_COEFFS.values()) # Relative priors used by KPI-weighted soft-control scoring. 
# These are deterministic heuristics based on internal benchmark runs and @@ -140,16 +126,12 @@ def _ensure_stream_usage(kwargs: dict[str, Any]) -> dict[str, Any]: def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: """Estimate cost in USD from model name and token counts.""" - per_million = _PRICING.get(model, _DEFAULT_PRICING) - input_cost = (prompt_tokens / 1_000_000) * per_million[0] - output_cost = (completion_tokens / 1_000_000) * per_million[1] - return input_cost + output_cost + return _estimate_cost_shared(model, prompt_tokens, completion_tokens) def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: """Estimate energy units (deterministic proxy, not live carbon).""" - coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) - return coeff * (prompt_tokens + completion_tokens * _ENERGY_OUTPUT_WEIGHT) + return _estimate_energy_shared(model, prompt_tokens, completion_tokens) def _count_tool_calls_in_response(response: Any) -> int: @@ -178,7 +160,7 @@ def _extract_usage(response: Any) -> tuple[int, int]: def _model_total_cost(model: str) -> float: - return _MODEL_TOTAL_COSTS.get(model, _DEFAULT_TOTAL_COST) + return _MODEL_TOTAL_COSTS.get(model, _model_total_price_shared(model)) def _select_cheaper_model(current_model: str) -> str: @@ -277,7 +259,7 @@ def _select_compliant_model(current_model: str, compliance: str) -> str | None: return current_model if current_model in allowlist: return current_model - available = [name for name in _PRICING if name in allowlist] + available = [name for name in _PRICING_MODELS if name in allowlist] if not available: return None return min(available, key=_model_total_cost) @@ -487,8 +469,8 @@ def _update_context( # --------------------------------------------------------------------------- -class _InstrumentedStream: - """Wraps an OpenAI ``Stream`` to capture usage after all chunks are consumed.""" +class _InstrumentedStreamBase: + """Shared 
stream-wrapper logic for sync and async OpenAI streams.""" __slots__ = ( "_stream", @@ -530,35 +512,6 @@ def __init__( self._tool_call_count: int = 0 self._finalized: bool = False - # --- iteration --------------------------------------------------------- - - def __iter__(self) -> _InstrumentedStream: - return self - - def __next__(self) -> Any: - try: - chunk = next(self._stream) - self._inspect_chunk(chunk) - return chunk - except StopIteration: - self._finalize() - raise - - # --- context manager --------------------------------------------------- - - def __enter__(self) -> _InstrumentedStream: - if hasattr(self._stream, "__enter__"): - self._stream.__enter__() - return self - - def __exit__(self, *args: Any) -> bool: - self._finalize() - if hasattr(self._stream, "__exit__"): - return self._stream.__exit__(*args) # type: ignore[no-any-return] - return False - - # --- proxied attributes ------------------------------------------------ - def close(self) -> None: self._finalize() if hasattr(self._stream, "close"): @@ -568,8 +521,6 @@ def close(self) -> None: def response(self) -> Any: return getattr(self._stream, "response", None) - # --- internals --------------------------------------------------------- - def _inspect_chunk(self, chunk: Any) -> None: usage = getattr(chunk, "usage", None) if usage is not None: @@ -582,8 +533,8 @@ def _inspect_chunk(self, chunk: Any) -> None: tool_calls = getattr(delta, "tool_calls", None) if tool_calls: for tc in tool_calls: - # A new tool call has an ``id``; subsequent deltas - # for the same call only have ``index``. + # A new tool call has an ``id``; subsequent deltas for + # the same call only have ``index``. 
if getattr(tc, "id", None): self._tool_call_count += 1 @@ -617,50 +568,39 @@ def _finalize(self) -> None: ) -class _InstrumentedAsyncStream: - """Wraps an OpenAI ``AsyncStream`` to capture usage after consumption.""" +class _InstrumentedStream(_InstrumentedStreamBase): + """Wraps an OpenAI sync ``Stream`` and tracks usage at stream end.""" - __slots__ = ( - "_stream", - "_ctx", - "_model", - "_start_time", - "_pre_action", - "_pre_reason", - "_pre_model", - "_pre_applied", - "_decision_mode", - "_usage", - "_tool_call_count", - "_finalized", - ) + __slots__ = () - def __init__( - self, - stream: Any, - ctx: Any, - model: str, - start_time: float, - pre_action: str = "allow", - pre_reason: str = "observe", - pre_model: str | None = None, - pre_applied: bool = True, - decision_mode: str = "observe", - ) -> None: - self._stream = stream - self._ctx = ctx - self._model = model - self._start_time = start_time - self._pre_action = pre_action - self._pre_reason = pre_reason - self._pre_model = pre_model or model - self._pre_applied = pre_applied - self._decision_mode = decision_mode - self._usage: Any = None - self._tool_call_count: int = 0 - self._finalized: bool = False + def __iter__(self) -> _InstrumentedStream: + return self + + def __next__(self) -> Any: + try: + chunk = next(self._stream) + self._inspect_chunk(chunk) + return chunk + except StopIteration: + self._finalize() + raise + + def __enter__(self) -> _InstrumentedStream: + if hasattr(self._stream, "__enter__"): + self._stream.__enter__() + return self + + def __exit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__exit__"): + return self._stream.__exit__(*args) # type: ignore[no-any-return] + return False - # --- async iteration --------------------------------------------------- + +class _InstrumentedAsyncStream(_InstrumentedStreamBase): + """Wraps an OpenAI async ``AsyncStream`` and tracks usage at stream end.""" + + __slots__ = () def __aiter__(self) -> 
_InstrumentedAsyncStream: return self @@ -674,8 +614,6 @@ async def __anext__(self) -> Any: self._finalize() raise - # --- async context manager --------------------------------------------- - async def __aenter__(self) -> _InstrumentedAsyncStream: if hasattr(self._stream, "__aenter__"): await self._stream.__aenter__() @@ -687,67 +625,105 @@ async def __aexit__(self, *args: Any) -> bool: return await self._stream.__aexit__(*args) # type: ignore[no-any-return] return False - # --- proxied attributes ------------------------------------------------ - - def close(self) -> None: - self._finalize() - if hasattr(self._stream, "close"): - self._stream.close() - - @property - def response(self) -> Any: - return getattr(self._stream, "response", None) - # --- internals --------------------------------------------------------- +# --------------------------------------------------------------------------- +# Wrapper factories +# --------------------------------------------------------------------------- - def _inspect_chunk(self, chunk: Any) -> None: - usage = getattr(chunk, "usage", None) - if usage is not None: - self._usage = usage - choices = getattr(chunk, "choices", []) - if choices: - delta = getattr(choices[0], "delta", None) - if delta: - tool_calls = getattr(delta, "tool_calls", None) - if tool_calls: - for tc in tool_calls: - if getattr(tc, "id", None): - self._tool_call_count += 1 +@dataclass(frozen=True) +class _CallInterceptionState: + kwargs: dict[str, Any] + model: str + pre_action: str + pre_reason: str + pre_model: str + pre_applied: bool + is_stream: bool + start_time: float + + +def _prepare_call_interception( + *, + ctx: Any, + mode: str, + kwargs: dict[str, Any], +) -> _CallInterceptionState: + model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model + pre_applied = True + + if ctx: + kwargs, model, pre_action, pre_reason, pre_model, pre_applied = _resolve_pre_call_decision( + ctx, + mode, + model, + 
kwargs, + ) - def _finalize(self) -> None: - if self._finalized: - return - self._finalized = True + is_stream: bool = bool(kwargs.get("stream", False)) + kwargs = _ensure_stream_usage(kwargs) + + return _CallInterceptionState( + kwargs=kwargs, + model=model, + pre_action=pre_action, + pre_reason=pre_reason, + pre_model=pre_model, + pre_applied=pre_applied, + is_stream=is_stream, + start_time=time.monotonic(), + ) - if self._ctx is None: - return - elapsed_ms = (time.monotonic() - self._start_time) * 1000 - prompt_tokens = 0 - completion_tokens = 0 - if self._usage: - prompt_tokens = getattr(self._usage, "prompt_tokens", 0) or 0 - completion_tokens = getattr(self._usage, "completion_tokens", 0) or 0 +def _finalize_interception( + *, + ctx: Any, + mode: str, + state: _CallInterceptionState, + response: Any, + stream_wrapper: type[_InstrumentedStream] | type[_InstrumentedAsyncStream], +) -> Any: + if state.is_stream and ctx: + return stream_wrapper( + response, + ctx, + state.model, + state.start_time, + state.pre_action, + state.pre_reason, + state.pre_model, + state.pre_applied, + mode, + ) + if (not state.is_stream) and ctx: + elapsed_ms = (time.monotonic() - state.start_time) * 1000 + prompt_tokens, completion_tokens = _extract_usage(response) + tool_call_count = _count_tool_calls_in_response(response) _update_context( - self._ctx, - self._model, + ctx, + state.model, prompt_tokens, completion_tokens, - self._tool_call_count, + tool_call_count, elapsed_ms, - action=self._pre_action, - action_reason=self._pre_reason, - action_model=self._pre_model, - applied=self._pre_applied, - decision_mode=self._decision_mode, + action=state.pre_action, + action_reason=state.pre_reason, + action_model=state.pre_model, + applied=state.pre_applied, + decision_mode=mode, + ) + else: + logger.debug( + "harness %s: model=%s (no active run scope, metrics not tracked)", + mode, + state.model, ) - -# --------------------------------------------------------------------------- -# Wrapper 
factories -# --------------------------------------------------------------------------- + return response def _make_patched_create(original_fn: Any) -> Any: @@ -764,66 +740,24 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: if mode == "off": return original_fn(self, *args, **kwargs) - model: str = kwargs.get("model", "unknown") - pre_action = "allow" - pre_reason = mode - pre_model = model - pre_applied = True - is_stream: bool = bool(kwargs.get("stream", False)) - - if ctx: - kwargs, model, pre_action, pre_reason, pre_model, pre_applied = _resolve_pre_call_decision( - ctx, - mode, - model, - kwargs, - ) - - start_time = time.monotonic() - - kwargs = _ensure_stream_usage(kwargs) + state = _prepare_call_interception(ctx=ctx, mode=mode, kwargs=kwargs) - logger.debug("harness intercept: model=%s stream=%s mode=%s", model, is_stream, mode) - - response = original_fn(self, *args, **kwargs) + logger.debug( + "harness intercept: model=%s stream=%s mode=%s", + state.model, + state.is_stream, + mode, + ) - if is_stream and ctx: - return _InstrumentedStream( - response, - ctx, - model, - start_time, - pre_action, - pre_reason, - pre_model, - pre_applied, - mode, - ) - elif not is_stream and ctx: - elapsed_ms = (time.monotonic() - start_time) * 1000 - prompt_tokens, completion_tokens = _extract_usage(response) - tool_call_count = _count_tool_calls_in_response(response) - _update_context( - ctx, - model, - prompt_tokens, - completion_tokens, - tool_call_count, - elapsed_ms, - action=pre_action, - action_reason=pre_reason, - action_model=pre_model, - applied=pre_applied, - decision_mode=mode, - ) - else: - logger.debug( - "harness %s: model=%s (no active run scope, metrics not tracked)", - mode, - model, - ) + response = original_fn(self, *args, **state.kwargs) - return response + return _finalize_interception( + ctx=ctx, + mode=mode, + state=state, + response=response, + stream_wrapper=_InstrumentedStream, + ) return wrapper @@ -842,71 +776,24 @@ async def 
wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: if mode == "off": return await original_fn(self, *args, **kwargs) - model: str = kwargs.get("model", "unknown") - pre_action = "allow" - pre_reason = mode - pre_model = model - pre_applied = True - is_stream: bool = bool(kwargs.get("stream", False)) - - if ctx: - kwargs, model, pre_action, pre_reason, pre_model, pre_applied = _resolve_pre_call_decision( - ctx, - mode, - model, - kwargs, - ) - - start_time = time.monotonic() - - kwargs = _ensure_stream_usage(kwargs) + state = _prepare_call_interception(ctx=ctx, mode=mode, kwargs=kwargs) logger.debug( "harness intercept async: model=%s stream=%s mode=%s", - model, - is_stream, + state.model, + state.is_stream, mode, ) - response = await original_fn(self, *args, **kwargs) - - if is_stream and ctx: - return _InstrumentedAsyncStream( - response, - ctx, - model, - start_time, - pre_action, - pre_reason, - pre_model, - pre_applied, - mode, - ) - elif not is_stream and ctx: - elapsed_ms = (time.monotonic() - start_time) * 1000 - prompt_tokens, completion_tokens = _extract_usage(response) - tool_call_count = _count_tool_calls_in_response(response) - _update_context( - ctx, - model, - prompt_tokens, - completion_tokens, - tool_call_count, - elapsed_ms, - action=pre_action, - action_reason=pre_reason, - action_model=pre_model, - applied=pre_applied, - decision_mode=mode, - ) - else: - logger.debug( - "harness %s: model=%s (no active run scope, metrics not tracked)", - mode, - model, - ) + response = await original_fn(self, *args, **state.kwargs) - return response + return _finalize_interception( + ctx=ctx, + mode=mode, + state=state, + response=response, + stream_wrapper=_InstrumentedAsyncStream, + ) return wrapper diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py index bd86323e..dab445ae 100644 --- a/cascadeflow/harness/pricing.py +++ b/cascadeflow/harness/pricing.py @@ -76,3 +76,4 @@ def model_total_price(model: str) -> float: """Return total 
(input + output) price per 1M tokens.""" in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) return in_price + out_price + diff --git a/cascadeflow/integrations/crewai.py b/cascadeflow/integrations/crewai.py index 7ff765f0..16cbe6e0 100644 --- a/cascadeflow/integrations/crewai.py +++ b/cascadeflow/integrations/crewai.py @@ -22,55 +22,19 @@ from importlib.util import find_spec from typing import Any, Optional +from cascadeflow.harness.pricing import estimate_cost as _estimate_shared_cost +from cascadeflow.harness.pricing import estimate_energy as _estimate_shared_energy + logger = logging.getLogger("cascadeflow.integrations.crewai") CREWAI_AVAILABLE = find_spec("crewai") is not None -# --------------------------------------------------------------------------- -# Pricing table (USD per 1M tokens: input, output) -# Shared with instrument.py — kept small and self-contained to avoid -# cross-module coupling. A future pricing registry will deduplicate. -# --------------------------------------------------------------------------- - -_PRICING: dict[str, tuple[float, float]] = { - "gpt-4o": (2.50, 10.00), - "gpt-4o-mini": (0.15, 0.60), - "gpt-5-mini": (0.20, 0.80), - "gpt-4-turbo": (10.00, 30.00), - "gpt-4": (30.00, 60.00), - "gpt-3.5-turbo": (0.50, 1.50), - "o1": (15.00, 60.00), - "o1-mini": (3.00, 12.00), - "o3-mini": (1.10, 4.40), - "claude-sonnet-4": (3.00, 15.00), - "claude-haiku-3.5": (1.00, 5.00), - "claude-opus-4.5": (5.00, 25.00), -} -_DEFAULT_PRICING: tuple[float, float] = (2.50, 10.00) - -_ENERGY_COEFFICIENTS: dict[str, float] = { - "gpt-4o": 1.0, - "gpt-4o-mini": 0.3, - "gpt-5-mini": 0.35, - "gpt-4-turbo": 1.5, - "gpt-4": 1.5, - "gpt-3.5-turbo": 0.2, - "o1": 2.0, - "o1-mini": 0.8, - "o3-mini": 0.5, -} -_DEFAULT_ENERGY_COEFFICIENT: float = 1.0 -_ENERGY_OUTPUT_WEIGHT: float = 1.5 - - def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: - per_million = _PRICING.get(model, _DEFAULT_PRICING) - return 
(prompt_tokens / 1_000_000) * per_million[0] + (completion_tokens / 1_000_000) * per_million[1] + return _estimate_shared_cost(model, prompt_tokens, completion_tokens) def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> float: - coeff = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) - return coeff * (prompt_tokens + completion_tokens * _ENERGY_OUTPUT_WEIGHT) + return _estimate_shared_energy(model, prompt_tokens, completion_tokens) def _extract_message_content(message: Any) -> str: diff --git a/cascadeflow/integrations/openai_agents.py b/cascadeflow/integrations/openai_agents.py index 1205cd98..fe52d4d4 100644 --- a/cascadeflow/integrations/openai_agents.py +++ b/cascadeflow/integrations/openai_agents.py @@ -15,6 +15,18 @@ from typing import TYPE_CHECKING, Any, AsyncIterator, Optional from cascadeflow.harness import get_current_run +from cascadeflow.harness.pricing import ( + OPENAI_MODEL_POOL, +) +from cascadeflow.harness.pricing import ( + estimate_cost as _estimate_shared_cost, +) +from cascadeflow.harness.pricing import ( + estimate_energy as _estimate_shared_energy, +) +from cascadeflow.harness.pricing import ( + model_total_price as _shared_model_total_price, +) from cascadeflow.schema.exceptions import BudgetExceededError logger = logging.getLogger("cascadeflow.harness.openai_agents") @@ -57,40 +69,16 @@ class OpenAIAgentsIntegrationConfig: fail_open: bool = True -# Approximate pricing (USD per 1M tokens: input, output). -_PRICING_USD_PER_M = { - "gpt-4o": (2.50, 10.00), - "gpt-4o-mini": (0.15, 0.60), - "gpt-5": (1.25, 10.00), - "gpt-5-mini": (0.20, 0.80), - "gpt-4-turbo": (10.00, 30.00), -} -_DEFAULT_PRICING_USD_PER_M = (2.50, 10.00) - -# Deterministic proxy coefficients for energy tracking. 
-_ENERGY_COEFFICIENTS = { - "gpt-4o": 1.0, - "gpt-4o-mini": 0.3, - "gpt-5": 1.2, - "gpt-5-mini": 0.35, - "gpt-4-turbo": 1.5, -} -_DEFAULT_ENERGY_COEFFICIENT = 1.0 -_ENERGY_OUTPUT_WEIGHT = 1.5 - - def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: - in_price, out_price = _PRICING_USD_PER_M.get(model, _DEFAULT_PRICING_USD_PER_M) - return (input_tokens / 1_000_000.0) * in_price + (output_tokens / 1_000_000.0) * out_price + return _estimate_shared_cost(model, input_tokens, output_tokens) def _estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: - coefficient = _ENERGY_COEFFICIENTS.get(model, _DEFAULT_ENERGY_COEFFICIENT) - return coefficient * (input_tokens + (output_tokens * _ENERGY_OUTPUT_WEIGHT)) + return _estimate_shared_energy(model, input_tokens, output_tokens) def _total_model_price(model: str) -> float: - return sum(_PRICING_USD_PER_M.get(model, _DEFAULT_PRICING_USD_PER_M)) + return _shared_model_total_price(model) def _extract_usage_tokens(usage: Any) -> tuple[int, int]: @@ -228,8 +216,10 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: # Under budget pressure, switch to the cheapest configured candidate. 
if run.budget_remaining / run.budget_max < 0.2: + compatible_candidates = [name for name in self._config.model_candidates if name in OPENAI_MODEL_POOL] + candidates = compatible_candidates or self._config.model_candidates cheapest = min( - self._config.model_candidates, + candidates, key=_total_model_price, ) if cheapest != candidate: diff --git a/tests/test_harness_shared_pricing.py b/tests/test_harness_shared_pricing.py index a26398f3..fb693226 100644 --- a/tests/test_harness_shared_pricing.py +++ b/tests/test_harness_shared_pricing.py @@ -7,12 +7,7 @@ import cascadeflow.harness.instrument as instrument_mod import cascadeflow.integrations.crewai as crewai_mod import cascadeflow.integrations.openai_agents as openai_agents_mod -from cascadeflow.harness.pricing import ( - OPENAI_MODEL_POOL, - estimate_cost, - estimate_energy, - model_total_price, -) +from cascadeflow.harness.pricing import OPENAI_MODEL_POOL, estimate_cost, estimate_energy, model_total_price def test_shared_estimate_cost_known_models() -> None: @@ -40,31 +35,16 @@ def test_integration_estimators_use_shared_profiles() -> None: shared_cost = estimate_cost(model, input_tokens, output_tokens) shared_energy = estimate_energy(model, input_tokens, output_tokens) - assert instrument_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx( - shared_cost - ) - assert crewai_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx( - shared_cost - ) - assert openai_agents_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx( - shared_cost - ) - - assert instrument_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx( - shared_energy - ) - assert crewai_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx( - shared_energy - ) - assert openai_agents_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx( - shared_energy - ) + assert instrument_mod._estimate_cost(model, input_tokens, output_tokens) == 
pytest.approx(shared_cost) + assert crewai_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx(shared_cost) + assert openai_agents_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx(shared_cost) + + assert instrument_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx(shared_energy) + assert crewai_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx(shared_energy) + assert openai_agents_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx(shared_energy) def test_openai_agents_total_price_uses_shared_profiles() -> None: - assert openai_agents_mod._total_model_price("gpt-5") == pytest.approx( - model_total_price("gpt-5") - ) - assert openai_agents_mod._total_model_price("gpt-4o-mini") == pytest.approx( - model_total_price("gpt-4o-mini") - ) + assert openai_agents_mod._total_model_price("gpt-5") == pytest.approx(model_total_price("gpt-5")) + assert openai_agents_mod._total_model_price("gpt-4o-mini") == pytest.approx(model_total_price("gpt-4o-mini")) + From 6afcfa73b6bb5b1e58f3c5a8315bd4f51a26994a Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 09:55:08 +0100 Subject: [PATCH 15/49] docs(plan): claim langchain harness extension branch --- docs/strategy/agent-intelligence-v2-plan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index b03d8a58..8bcf8743 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -847,7 +847,7 @@ Claim checklist (one owner per branch at a time): - [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` - [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] 
`feat/v2-langchain-harness-extension` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [ ] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-bench-repro-pipeline` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` From cc51cf7a78551f50b1cf49c3318c220f8bddffaa Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 11:18:57 +0100 Subject: [PATCH 16/49] feat(harness): add privacy-safe decision telemetry and callback hooks --- cascadeflow/__init__.py | 4 + cascadeflow/harness/__init__.py | 4 + cascadeflow/harness/api.py | 124 +++++++++++++++++++- docs/README.md | 1 + docs/guides/harness_telemetry_privacy.md | 59 ++++++++++ docs/strategy/agent-intelligence-v2-plan.md | 2 +- tests/test_harness_api.py | 73 ++++++++++++ 7 files changed, 261 insertions(+), 6 deletions(-) create mode 100644 docs/guides/harness_telemetry_privacy.md diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py index d49eb644..f2738abc 100644 --- a/cascadeflow/__init__.py +++ b/cascadeflow/__init__.py @@ -250,6 +250,8 @@ agent as harness_agent, get_harness_config, get_current_run, + get_harness_callback_manager, + set_harness_callback_manager, ) # ==================== MAIN AGENT & RESULT ==================== @@ -408,6 +410,8 @@ >>>>>>> 1aba349 (Add core harness API scaffold with context-scoped runtime) "get_harness_config", "get_current_run", + "get_harness_callback_manager", + "set_harness_callback_manager", # ===== PROVIDERS ===== "ModelResponse", "BaseProvider", diff --git a/cascadeflow/harness/__init__.py b/cascadeflow/harness/__init__.py index 43a03662..74c07219 100644 --- a/cascadeflow/harness/__init__.py +++ b/cascadeflow/harness/__init__.py @@ -14,11 +14,13 @@ 
HarnessInitReport, HarnessRunContext, agent, + get_harness_callback_manager, get_current_run, get_harness_config, init, reset, run, + set_harness_callback_manager, ) __all__ = [ @@ -29,6 +31,8 @@ "run", "agent", "get_current_run", + "get_harness_callback_manager", "get_harness_config", + "set_harness_callback_manager", "reset", ] diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 9d003ee1..79617f39 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -4,6 +4,7 @@ import json import logging import os +import time from contextvars import ContextVar, Token from dataclasses import dataclass, field from importlib.util import find_spec @@ -40,6 +41,8 @@ class HarnessInitReport: @dataclass class HarnessRunContext: run_id: str = field(default_factory=lambda: uuid4().hex[:12]) + started_at_ms: float = field(default_factory=lambda: time.time() * 1000) + ended_at_ms: Optional[float] = None mode: HarnessMode = "off" budget_max: Optional[float] = None tool_calls_max: Optional[int] = None @@ -71,6 +74,8 @@ def __enter__(self) -> HarnessRunContext: return self def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None: + self.ended_at_ms = time.time() * 1000 + self._log_summary() if self._token is not None: _current_run.reset(self._token) self._token = None @@ -84,6 +89,47 @@ async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None: def trace(self) -> list[dict[str, Any]]: return list(self._trace) + def summary(self) -> dict[str, Any]: + duration_ms: Optional[float] = None + if self.ended_at_ms is not None: + duration_ms = max(0.0, self.ended_at_ms - self.started_at_ms) + return { + "run_id": self.run_id, + "mode": self.mode, + "step_count": self.step_count, + "tool_calls": self.tool_calls, + "cost": self.cost, + "savings": self.savings, + "latency_used_ms": self.latency_used_ms, + "energy_used": self.energy_used, + "budget_max": self.budget_max, + "budget_remaining": self.budget_remaining, + "last_action": 
self.last_action, + "model_used": self.model_used, + "duration_ms": duration_ms, + } + + def _log_summary(self) -> None: + if self.mode == "off" or self.step_count <= 0: + return + logger.info( + ( + "harness run summary run_id=%s mode=%s steps=%d tool_calls=%d " + "cost=%.6f latency_ms=%.2f energy=%.4f last_action=%s model=%s " + "budget_remaining=%s" + ), + self.run_id, + self.mode, + self.step_count, + self.tool_calls, + self.cost, + self.latency_used_ms, + self.energy_used, + self.last_action, + self.model_used, + self.budget_remaining, + ) + def record( self, action: str, @@ -93,24 +139,41 @@ def record( applied: Optional[bool] = None, decision_mode: Optional[str] = None, ) -> None: - self.last_action = action - self.model_used = model + safe_action = _sanitize_trace_value(action, max_length=64) or "allow" + safe_reason = _sanitize_trace_value(reason, max_length=160) or "unspecified" + safe_model = _sanitize_trace_value(model, max_length=128) if model is not None else None + + self.last_action = safe_action + self.model_used = safe_model entry: dict[str, Any] = { - "action": action, - "reason": reason, - "model": model, + "action": safe_action, + "reason": safe_reason, + "model": safe_model, "run_id": self.run_id, + "mode": self.mode, + "step": self.step_count, + "timestamp_ms": time.time() * 1000, + "tool_calls_total": self.tool_calls, + "cost_total": self.cost, + "latency_used_ms": self.latency_used_ms, + "energy_used": self.energy_used, + "budget_state": { + "max": self.budget_max, + "remaining": self.budget_remaining, + }, } if applied is not None: entry["applied"] = applied if decision_mode is not None: entry["decision_mode"] = decision_mode self._trace.append(entry) + _emit_harness_decision(entry) _harness_config: HarnessConfig = HarnessConfig() _current_run: ContextVar[Optional[HarnessRunContext]] = ContextVar("cascadeflow_harness_run", default=None) _is_instrumented: bool = False +_harness_callback_manager: Any = None _UNSET = object() @@ -135,6 
+198,15 @@ def get_current_run() -> Optional[HarnessRunContext]: return _current_run.get() +def get_harness_callback_manager() -> Any: + return _harness_callback_manager + + +def set_harness_callback_manager(callback_manager: Any) -> None: + global _harness_callback_manager + _harness_callback_manager = callback_manager + + def reset() -> None: """ Reset harness global state and unpatch instrumented clients. @@ -144,15 +216,53 @@ def reset() -> None: global _harness_config global _is_instrumented + global _harness_callback_manager from cascadeflow.harness.instrument import unpatch_openai unpatch_openai() _harness_config = HarnessConfig() _is_instrumented = False + _harness_callback_manager = None _current_run.set(None) +def _sanitize_trace_value(value: Any, *, max_length: int) -> Optional[str]: + if value is None: + return None + text = str(value).replace("\n", " ").replace("\r", " ").strip() + if len(text) > max_length: + text = text[: max_length - 3] + "..." + return text + + +def _emit_harness_decision(entry: dict[str, Any]) -> None: + manager = get_harness_callback_manager() + if manager is None: + return + + trigger = getattr(manager, "trigger", None) + if not callable(trigger): + logger.debug("harness callback manager has no trigger() method") + return + + try: + from cascadeflow.telemetry.callbacks import CallbackEvent + except Exception: + logger.debug("telemetry callbacks unavailable for harness decision emit", exc_info=True) + return + + try: + trigger( + CallbackEvent.CASCADE_DECISION, + query="[harness]", + data=dict(entry), + workflow="harness", + ) + except Exception: + logger.debug("failed to emit harness decision callback", exc_info=True) + + def _parse_bool(raw: str) -> bool: normalized = raw.strip().lower() return normalized in {"1", "true", "yes", "on"} @@ -301,6 +411,7 @@ def init( kpi_targets: Optional[dict[str, float]] | object = _UNSET, kpi_weights: Optional[dict[str, float]] | object = _UNSET, compliance: Optional[str] | object = _UNSET, + 
callback_manager: Any | object = _UNSET, ) -> HarnessInitReport: """ Initialize global harness settings. @@ -334,6 +445,9 @@ def init( resolved_compliance = _resolve_value( "compliance", compliance, env_config, file_config, None, sources ) + if callback_manager is not _UNSET: + set_harness_callback_manager(callback_manager) + sources["callback_manager"] = "code" validated_mode = _validate_mode(str(resolved_mode)) _harness_config = HarnessConfig( diff --git a/docs/README.md b/docs/README.md index 1238d7f8..b9cedf66 100644 --- a/docs/README.md +++ b/docs/README.md @@ -20,6 +20,7 @@ Welcome to cascadeflow documentation! 🌊 - [Tools](guides/tools.md) - Function calling and tool usage with cascades - [Agentic Patterns (Python)](guides/agentic-python.md) - Tool loops and multi-agent orchestration in Python - [Agentic Patterns (TypeScript)](guides/agentic-typescript.md) - Tool loops, multi-agent orchestration, and message best practices +- [Harness Telemetry & Privacy](guides/harness_telemetry_privacy.md) - Decision traces, callbacks, and privacy-safe observability - [Cost Tracking](guides/cost_tracking.md) - Track and analyze API costs across queries - [Proxy Routing](guides/proxy.md) - Route requests through provider-aware proxy plans diff --git a/docs/guides/harness_telemetry_privacy.md b/docs/guides/harness_telemetry_privacy.md new file mode 100644 index 00000000..01e75402 --- /dev/null +++ b/docs/guides/harness_telemetry_privacy.md @@ -0,0 +1,59 @@ +# Harness Telemetry and Privacy + +Use this guide when you want harness observability without leaking user content. + +## What the Harness Records + +Each `run.trace()` decision entry includes: + +- `action`, `reason`, `model` +- `run_id`, `mode`, `step`, `timestamp_ms` +- `cost_total`, `latency_used_ms`, `energy_used`, `tool_calls_total` +- `budget_state` (`max`, `remaining`) +- `applied`, `decision_mode` (when available) + +The trace is scoped to the current `run()` context. 
+ +## What the Harness Does Not Record + +By default, harness decision traces do not include: + +- raw prompts or user messages +- model response text +- tool argument payloads + +This keeps decision telemetry focused on policy/routing state instead of request content. + +## Callback Emission (Optional) + +If you provide a callback manager, each harness decision emits `CallbackEvent.CASCADE_DECISION`. + +```python +from cascadeflow import init, run +from cascadeflow.telemetry.callbacks import CallbackEvent, CallbackManager + +manager = CallbackManager() + +def on_decision(event): + print(event.data["action"], event.data["model"]) + +manager.register(CallbackEvent.CASCADE_DECISION, on_decision) + +init(mode="observe", callback_manager=manager) + +with run(budget=1.0) as r: + ... +``` + +The emitted callback uses `query="[harness]"` and `workflow="harness"` to avoid passing user prompt content. + +## Per-Run Summary Logging + +When a scoped run exits (and recorded at least one step), the harness logs a summary on logger `cascadeflow.harness`: + +- run id, mode, steps, tool calls +- cost/latency/energy totals +- last action/model +- remaining budget + +Use standard Python logging controls to direct this to your existing log sink. 
diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 8bcf8743..73bfec1b 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -850,7 +850,7 @@ Claim checklist (one owner per branch at a time): - [ ] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-bench-repro-pipeline` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` Merge gates per feature branch: - [ ] Unit/integration tests green for touched scope diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 183a4350..087fa692 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -5,6 +5,7 @@ import cascadeflow import cascadeflow.harness.api as harness_api from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run +from cascadeflow.telemetry.callbacks import CallbackEvent, CallbackManager def setup_function() -> None: @@ -171,6 +172,8 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.reset) assert callable(cascadeflow.run) assert callable(cascadeflow.agent) + assert callable(cascadeflow.get_harness_callback_manager) + assert callable(cascadeflow.set_harness_callback_manager) report = cascadeflow.init(mode="off") assert report.mode == "off" @@ -182,6 +185,8 @@ def test_run_record_and_trace_copy(): trace_b = ctx.trace() assert trace_a == trace_b assert trace_a[0]["action"] == "switch_model" + assert "budget_state" in trace_a[0] + assert trace_a[0]["budget_state"]["max"] == 1.0 trace_a.append({"action": "mutated"}) assert len(ctx.trace()) 
== 1 @@ -326,3 +331,71 @@ def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch): monkeypatch.setattr(instrument, "patch_openai", lambda: True) report = init(mode="observe") assert report.instrumented == ["openai"] + + +def test_run_summary_populates_on_context_exit(): + init(mode="observe") + with run(budget=1.5) as ctx: + ctx.step_count = 2 + ctx.tool_calls = 1 + ctx.cost = 0.42 + ctx.latency_used_ms = 123.0 + ctx.energy_used = 33.0 + ctx.budget_remaining = 1.08 + ctx.last_action = "allow" + ctx.model_used = "gpt-4o-mini" + + summary = ctx.summary() + assert summary["run_id"] == ctx.run_id + assert summary["step_count"] == 2 + assert summary["budget_remaining"] == pytest.approx(1.08) + assert summary["duration_ms"] is not None + assert summary["duration_ms"] >= 0.0 + + +def test_run_context_logs_summary(caplog): + init(mode="observe") + with caplog.at_level("INFO", logger="cascadeflow.harness"): + with run(budget=1.0) as ctx: + ctx.step_count = 1 + ctx.cost = 0.01 + ctx.model_used = "gpt-4o-mini" + + assert any("harness run summary" in rec.message for rec in caplog.records) + + +def test_record_emits_cascade_decision_callback(): + manager = CallbackManager() + received = [] + + def _on_decision(data): + received.append(data) + + manager.register(CallbackEvent.CASCADE_DECISION, _on_decision) + report = init(mode="observe", callback_manager=manager) + assert report.config_sources["callback_manager"] == "code" + + with run(budget=1.0) as ctx: + ctx.step_count = 1 + ctx.record(action="switch_model", reason="budget_pressure", model="gpt-4o-mini") + + assert len(received) == 1 + event = received[0] + assert event.event == CallbackEvent.CASCADE_DECISION + assert event.query == "[harness]" + assert event.workflow == "harness" + assert event.data["action"] == "switch_model" + assert event.data["run_id"] == ctx.run_id + + +def test_record_sanitizes_trace_values(): + ctx = run() + ctx.record( + action="allow\nnewline", + reason="a" * 400, + 
model="model\r\nname", + ) + entry = ctx.trace()[0] + assert "\n" not in entry["action"] + assert "\r" not in entry["model"] + assert len(entry["reason"]) <= 160 From ae1cf97b53339e7fb284bf22e2b921b4c8baed6a Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 08:46:42 +0100 Subject: [PATCH 17/49] fix(harness): address telemetry review findings - Use time.monotonic() for duration_ms calculation instead of wall-clock delta (avoids NTP/suspend clock jumps) - Extract sanitize constants (_MAX_ACTION_LEN, _MAX_REASON_LEN, _MAX_MODEL_LEN) - Log warning when record() receives empty action (was silently defaulting) - Cache CallbackEvent import in _emit_harness_decision for hot-path perf - Add tests: no-callback-manager noop, empty-action warning, duration field --- cascadeflow/harness/api.py | 45 ++++++++++++++++++++++++++------------ tests/test_harness_api.py | 19 ++++++++++++++++ 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 79617f39..6039cc00 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -41,8 +41,10 @@ class HarnessInitReport: @dataclass class HarnessRunContext: run_id: str = field(default_factory=lambda: uuid4().hex[:12]) + _started_monotonic: float = field(default_factory=time.monotonic, init=False, repr=False) started_at_ms: float = field(default_factory=lambda: time.time() * 1000) ended_at_ms: Optional[float] = None + duration_ms: Optional[float] = None mode: HarnessMode = "off" budget_max: Optional[float] = None tool_calls_max: Optional[int] = None @@ -75,6 +77,7 @@ def __enter__(self) -> HarnessRunContext: def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None: self.ended_at_ms = time.time() * 1000 + self.duration_ms = max(0.0, (time.monotonic() - self._started_monotonic) * 1000.0) self._log_summary() if self._token is not None: _current_run.reset(self._token) @@ -90,9 +93,6 @@ def trace(self) -> list[dict[str, Any]]: return 
list(self._trace) def summary(self) -> dict[str, Any]: - duration_ms: Optional[float] = None - if self.ended_at_ms is not None: - duration_ms = max(0.0, self.ended_at_ms - self.started_at_ms) return { "run_id": self.run_id, "mode": self.mode, @@ -106,7 +106,7 @@ def summary(self) -> dict[str, Any]: "budget_remaining": self.budget_remaining, "last_action": self.last_action, "model_used": self.model_used, - "duration_ms": duration_ms, + "duration_ms": self.duration_ms, } def _log_summary(self) -> None: @@ -139,9 +139,12 @@ def record( applied: Optional[bool] = None, decision_mode: Optional[str] = None, ) -> None: - safe_action = _sanitize_trace_value(action, max_length=64) or "allow" - safe_reason = _sanitize_trace_value(reason, max_length=160) or "unspecified" - safe_model = _sanitize_trace_value(model, max_length=128) if model is not None else None + safe_action = _sanitize_trace_value(action, max_length=_MAX_ACTION_LEN) + if not safe_action: + logger.warning("record() called with empty action, defaulting to 'allow'") + safe_action = "allow" + safe_reason = _sanitize_trace_value(reason, max_length=_MAX_REASON_LEN) or "unspecified" + safe_model = _sanitize_trace_value(model, max_length=_MAX_MODEL_LEN) if model is not None else None self.last_action = safe_action self.model_used = safe_model @@ -217,6 +220,7 @@ def reset() -> None: global _harness_config global _is_instrumented global _harness_callback_manager + global _cached_cascade_decision_event from cascadeflow.harness.instrument import unpatch_openai @@ -224,19 +228,30 @@ def reset() -> None: _harness_config = HarnessConfig() _is_instrumented = False _harness_callback_manager = None + _cached_cascade_decision_event = None _current_run.set(None) +_MAX_ACTION_LEN = 64 +_MAX_REASON_LEN = 160 +_MAX_MODEL_LEN = 128 + + def _sanitize_trace_value(value: Any, *, max_length: int) -> Optional[str]: if value is None: return None text = str(value).replace("\n", " ").replace("\r", " ").strip() if len(text) > max_length: 
text = text[: max_length - 3] + "..." - return text + return text or None + + +_cached_cascade_decision_event: Any = None def _emit_harness_decision(entry: dict[str, Any]) -> None: + global _cached_cascade_decision_event + manager = get_harness_callback_manager() if manager is None: return @@ -246,15 +261,17 @@ def _emit_harness_decision(entry: dict[str, Any]) -> None: logger.debug("harness callback manager has no trigger() method") return - try: - from cascadeflow.telemetry.callbacks import CallbackEvent - except Exception: - logger.debug("telemetry callbacks unavailable for harness decision emit", exc_info=True) - return + if _cached_cascade_decision_event is None: + try: + from cascadeflow.telemetry.callbacks import CallbackEvent + _cached_cascade_decision_event = CallbackEvent.CASCADE_DECISION + except Exception: + logger.debug("telemetry callbacks unavailable for harness decision emit", exc_info=True) + return try: trigger( - CallbackEvent.CASCADE_DECISION, + _cached_cascade_decision_event, query="[harness]", data=dict(entry), workflow="harness", diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 087fa692..937ab865 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -351,6 +351,8 @@ def test_run_summary_populates_on_context_exit(): assert summary["budget_remaining"] == pytest.approx(1.08) assert summary["duration_ms"] is not None assert summary["duration_ms"] >= 0.0 + assert ctx.duration_ms is not None + assert ctx.duration_ms >= 0.0 def test_run_context_logs_summary(caplog): @@ -399,3 +401,20 @@ def test_record_sanitizes_trace_values(): assert "\n" not in entry["action"] assert "\r" not in entry["model"] assert len(entry["reason"]) <= 160 + + +def test_record_without_callback_manager_is_noop(): + init(mode="observe") + with run(budget=1.0) as ctx: + ctx.record(action="allow", reason="test", model="gpt-4o-mini") + assert len(ctx.trace()) == 1 + + +def test_record_empty_action_warns_and_defaults(caplog): + 
init(mode="observe") + with caplog.at_level("WARNING", logger="cascadeflow.harness"): + with run(budget=1.0) as ctx: + ctx.record(action="", reason="test", model="gpt-4o-mini") + entry = ctx.trace()[0] + assert entry["action"] == "allow" + assert any("empty action" in rec.message for rec in caplog.records) From 49ee6015ba1e7e2c43a002e5540d0f7f74686eee Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 11:35:33 +0100 Subject: [PATCH 18/49] fix(harness): avoid shadowing cascadeflow.agent module --- cascadeflow/__init__.py | 4 ---- tests/test_harness_api.py | 3 ++- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py index f2738abc..b9bc7682 100644 --- a/cascadeflow/__init__.py +++ b/cascadeflow/__init__.py @@ -403,11 +403,7 @@ "init", "reset", "run", -<<<<<<< HEAD "harness_agent", -======= - "agent", ->>>>>>> 1aba349 (Add core harness API scaffold with context-scoped runtime) "get_harness_config", "get_current_run", "get_harness_callback_manager", diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 937ab865..fd89e590 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -171,7 +171,8 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.init) assert callable(cascadeflow.reset) assert callable(cascadeflow.run) - assert callable(cascadeflow.agent) + assert callable(cascadeflow.harness_agent) + assert hasattr(cascadeflow.agent, "PROVIDER_REGISTRY") assert callable(cascadeflow.get_harness_callback_manager) assert callable(cascadeflow.set_harness_callback_manager) report = cascadeflow.init(mode="off") From c1236f1340213320916e1015ff8599c568a00f37 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 12:51:17 +0100 Subject: [PATCH 19/49] style: apply black formatting for harness integration files --- cascadeflow/harness/api.py | 17 ++++++-- cascadeflow/harness/instrument.py | 24 ++++++++--- cascadeflow/harness/pricing.py | 1 - 
cascadeflow/integrations/crewai.py | 7 ++-- cascadeflow/integrations/openai_agents.py | 8 +++- .../integrations/openai_agents_harness.py | 6 ++- tests/test_crewai_integration.py | 4 +- tests/test_harness_instrument.py | 12 +++++- tests/test_harness_shared_pricing.py | 42 ++++++++++++++----- 9 files changed, 88 insertions(+), 33 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 6039cc00..f545d73d 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -65,7 +65,9 @@ class HarnessRunContext: last_action: str = "allow" draft_accepted: Optional[bool] = None _trace: list[dict[str, Any]] = field(default_factory=list) - _token: Optional[Token[Optional[HarnessRunContext]]] = field(default=None, init=False, repr=False) + _token: Optional[Token[Optional[HarnessRunContext]]] = field( + default=None, init=False, repr=False + ) def __post_init__(self) -> None: if self.budget_max is not None and self.budget_remaining is None: @@ -144,7 +146,9 @@ def record( logger.warning("record() called with empty action, defaulting to 'allow'") safe_action = "allow" safe_reason = _sanitize_trace_value(reason, max_length=_MAX_REASON_LEN) or "unspecified" - safe_model = _sanitize_trace_value(model, max_length=_MAX_MODEL_LEN) if model is not None else None + safe_model = ( + _sanitize_trace_value(model, max_length=_MAX_MODEL_LEN) if model is not None else None + ) self.last_action = safe_action self.model_used = safe_model @@ -174,7 +178,9 @@ def record( _harness_config: HarnessConfig = HarnessConfig() -_current_run: ContextVar[Optional[HarnessRunContext]] = ContextVar("cascadeflow_harness_run", default=None) +_current_run: ContextVar[Optional[HarnessRunContext]] = ContextVar( + "cascadeflow_harness_run", default=None +) _is_instrumented: bool = False _harness_callback_manager: Any = None _UNSET = object() @@ -264,6 +270,7 @@ def _emit_harness_decision(entry: dict[str, Any]) -> None: if _cached_cascade_decision_event is None: try: from 
cascadeflow.telemetry.callbacks import CallbackEvent + _cached_cascade_decision_event = CallbackEvent.CASCADE_DECISION except Exception: logger.debug("telemetry callbacks unavailable for harness decision emit", exc_info=True) @@ -452,7 +459,9 @@ def init( resolved_max_latency_ms = _resolve_value( "max_latency_ms", max_latency_ms, env_config, file_config, None, sources ) - resolved_max_energy = _resolve_value("max_energy", max_energy, env_config, file_config, None, sources) + resolved_max_energy = _resolve_value( + "max_energy", max_energy, env_config, file_config, None, sources + ) resolved_kpi_targets = _resolve_value( "kpi_targets", kpi_targets, env_config, file_config, None, sources ) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index e86fb1a9..c2fbd7ab 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -52,7 +52,9 @@ _original_sync_create: Any = None _original_async_create: Any = None -_MODEL_TOTAL_COSTS: dict[str, float] = {name: _model_total_price_shared(name) for name in _PRICING_MODELS} +_MODEL_TOTAL_COSTS: dict[str, float] = { + name: _model_total_price_shared(name) for name in _PRICING_MODELS +} _CHEAPEST_MODEL: str = min(_MODEL_TOTAL_COSTS, key=_MODEL_TOTAL_COSTS.get) _MIN_TOTAL_COST: float = min(_MODEL_TOTAL_COSTS.values()) _MAX_TOTAL_COST: float = max(_MODEL_TOTAL_COSTS.values()) @@ -89,9 +91,13 @@ "o1-mini": 0.60, "o3-mini": 0.78, } -_LATENCY_CANDIDATES: tuple[str, ...] = tuple(name for name in _PRICING_MODELS if name in _LATENCY_PRIORS) +_LATENCY_CANDIDATES: tuple[str, ...] = tuple( + name for name in _PRICING_MODELS if name in _LATENCY_PRIORS +) _FASTEST_MODEL: str | None = ( - max(_LATENCY_CANDIDATES, key=lambda name: _LATENCY_PRIORS[name]) if _LATENCY_CANDIDATES else None + max(_LATENCY_CANDIDATES, key=lambda name: _LATENCY_PRIORS[name]) + if _LATENCY_CANDIDATES + else None ) # OpenAI-model allowlists used by the current OpenAI harness instrumentation. 
@@ -179,7 +185,9 @@ def _select_faster_model(current_model: str) -> str: def _select_lower_energy_model(current_model: str) -> str: - if _ENERGY_COEFFICIENTS.get(_LOWEST_ENERGY_MODEL, _DEFAULT_ENERGY_COEFFICIENT) < _ENERGY_COEFFICIENTS.get( + if _ENERGY_COEFFICIENTS.get( + _LOWEST_ENERGY_MODEL, _DEFAULT_ENERGY_COEFFICIENT + ) < _ENERGY_COEFFICIENTS.get( current_model, _DEFAULT_ENERGY_COEFFICIENT, ): @@ -277,7 +285,9 @@ def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCa return _PreCallDecision(action="stop", reason="budget_exceeded", target_model=model) if has_tools and ctx.tool_calls_max is not None and ctx.tool_calls >= ctx.tool_calls_max: - return _PreCallDecision(action="deny_tool", reason="max_tool_calls_reached", target_model=model) + return _PreCallDecision( + action="deny_tool", reason="max_tool_calls_reached", target_model=model + ) compliance = getattr(ctx, "compliance", None) if compliance: @@ -289,7 +299,9 @@ def _evaluate_pre_call_decision(ctx: Any, model: str, has_tools: bool) -> _PreCa reason="compliance_no_approved_tool_path", target_model=model, ) - return _PreCallDecision(action="stop", reason="compliance_no_approved_model", target_model=model) + return _PreCallDecision( + action="stop", reason="compliance_no_approved_model", target_model=model + ) if compliant_model != model: return _PreCallDecision( action="switch_model", diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py index dab445ae..bd86323e 100644 --- a/cascadeflow/harness/pricing.py +++ b/cascadeflow/harness/pricing.py @@ -76,4 +76,3 @@ def model_total_price(model: str) -> float: """Return total (input + output) price per 1M tokens.""" in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) return in_price + out_price - diff --git a/cascadeflow/integrations/crewai.py b/cascadeflow/integrations/crewai.py index 16cbe6e0..604ae600 100644 --- a/cascadeflow/integrations/crewai.py +++ 
b/cascadeflow/integrations/crewai.py @@ -29,6 +29,7 @@ CREWAI_AVAILABLE = find_spec("crewai") is not None + def _estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float: return _estimate_shared_cost(model, prompt_tokens, completion_tokens) @@ -124,8 +125,7 @@ def _before_llm_call_hook(context: Any) -> Optional[bool]: and ctx.cost >= ctx.budget_max ): logger.warning( - "crewai hook: blocking LLM call — budget exhausted " - "(spent $%.4f of $%.4f max)", + "crewai hook: blocking LLM call — budget exhausted " "(spent $%.4f of $%.4f max)", ctx.cost, ctx.budget_max, ) @@ -254,8 +254,7 @@ def enable(config: Optional[CrewAIHarnessConfig] = None) -> bool: ) except ImportError: logger.warning( - "crewai is installed but hooks module not available " - "(requires crewai>=1.5); skipping" + "crewai is installed but hooks module not available " "(requires crewai>=1.5); skipping" ) return False diff --git a/cascadeflow/integrations/openai_agents.py b/cascadeflow/integrations/openai_agents.py index fe52d4d4..cbce9b96 100644 --- a/cascadeflow/integrations/openai_agents.py +++ b/cascadeflow/integrations/openai_agents.py @@ -216,7 +216,9 @@ def _resolve_model(self, requested_model: Optional[str]) -> str: # Under budget pressure, switch to the cheapest configured candidate. 
if run.budget_remaining / run.budget_max < 0.2: - compatible_candidates = [name for name in self._config.model_candidates if name in OPENAI_MODEL_POOL] + compatible_candidates = [ + name for name in self._config.model_candidates if name in OPENAI_MODEL_POOL + ] candidates = compatible_candidates or self._config.model_candidates cheapest = min( candidates, @@ -344,7 +346,9 @@ async def get_response( elapsed_ms = (time.monotonic() - started_at) * 1000.0 try: - self._update_run_metrics(response=response, elapsed_ms=elapsed_ms, pre_action=pre_action) + self._update_run_metrics( + response=response, elapsed_ms=elapsed_ms, pre_action=pre_action + ) except Exception: if self._config.fail_open: logger.exception("openai-agents harness metric update failed (fail-open)") diff --git a/examples/integrations/openai_agents_harness.py b/examples/integrations/openai_agents_harness.py index 69ea6bcd..ac9d6c68 100644 --- a/examples/integrations/openai_agents_harness.py +++ b/examples/integrations/openai_agents_harness.py @@ -17,7 +17,7 @@ async def main() -> None: except ImportError as exc: raise SystemExit( "OpenAI Agents SDK is not installed. 
" - "Install with: pip install \"cascadeflow[openai,openai-agents]\"" + 'Install with: pip install "cascadeflow[openai,openai-agents]"' ) from exc from cascadeflow import init, run @@ -44,7 +44,9 @@ async def main() -> None: run_config = RunConfig(model_provider=provider) with run(budget=0.5, max_tool_calls=3) as session: - result = await Runner.run(agent, "Summarize why model routing helps agent budgets.", run_config=run_config) + result = await Runner.run( + agent, "Summarize why model routing helps agent budgets.", run_config=run_config + ) print("=== Result ===") print(result.final_output) diff --git a/tests/test_crewai_integration.py b/tests/test_crewai_integration.py index 622f4b4b..c17498b4 100644 --- a/tests/test_crewai_integration.py +++ b/tests/test_crewai_integration.py @@ -455,7 +455,9 @@ def test_enable_returns_false_for_old_crewai(self, monkeypatch): # Remove crewai.hooks from modules so import fails monkeypatch.delitem(sys.modules, "crewai.hooks", raising=False) - original_import = __builtins__.__import__ if hasattr(__builtins__, "__import__") else __import__ + original_import = ( + __builtins__.__import__ if hasattr(__builtins__, "__import__") else __import__ + ) def fake_import(name, *args, **kwargs): if name == "crewai.hooks": diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 28fdc7b7..75368522 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -714,7 +714,11 @@ def test_enforce_denies_tools_when_cap_reached(self) -> None: wrapper = _make_patched_create(original) with run(max_tool_calls=0) as ctx: - wrapper(MagicMock(), model="gpt-4o", tools=[{"type": "function", "function": {"name": "t1"}}]) + wrapper( + MagicMock(), + model="gpt-4o", + tools=[{"type": "function", "function": {"name": "t1"}}], + ) assert original.call_args[1]["tools"] == [] trace = ctx.trace() @@ -824,7 +828,11 @@ def test_enforce_denies_tool_for_strict_compliance(self) -> None: wrapper = 
_make_patched_create(original) with run() as ctx: - wrapper(MagicMock(), model="gpt-4o", tools=[{"type": "function", "function": {"name": "t1"}}]) + wrapper( + MagicMock(), + model="gpt-4o", + tools=[{"type": "function", "function": {"name": "t1"}}], + ) assert original.call_args[1]["tools"] == [] trace = ctx.trace() diff --git a/tests/test_harness_shared_pricing.py b/tests/test_harness_shared_pricing.py index fb693226..a26398f3 100644 --- a/tests/test_harness_shared_pricing.py +++ b/tests/test_harness_shared_pricing.py @@ -7,7 +7,12 @@ import cascadeflow.harness.instrument as instrument_mod import cascadeflow.integrations.crewai as crewai_mod import cascadeflow.integrations.openai_agents as openai_agents_mod -from cascadeflow.harness.pricing import OPENAI_MODEL_POOL, estimate_cost, estimate_energy, model_total_price +from cascadeflow.harness.pricing import ( + OPENAI_MODEL_POOL, + estimate_cost, + estimate_energy, + model_total_price, +) def test_shared_estimate_cost_known_models() -> None: @@ -35,16 +40,31 @@ def test_integration_estimators_use_shared_profiles() -> None: shared_cost = estimate_cost(model, input_tokens, output_tokens) shared_energy = estimate_energy(model, input_tokens, output_tokens) - assert instrument_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx(shared_cost) - assert crewai_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx(shared_cost) - assert openai_agents_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx(shared_cost) - - assert instrument_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx(shared_energy) - assert crewai_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx(shared_energy) - assert openai_agents_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx(shared_energy) + assert instrument_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx( + shared_cost + ) + assert 
crewai_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx( + shared_cost + ) + assert openai_agents_mod._estimate_cost(model, input_tokens, output_tokens) == pytest.approx( + shared_cost + ) + + assert instrument_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx( + shared_energy + ) + assert crewai_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx( + shared_energy + ) + assert openai_agents_mod._estimate_energy(model, input_tokens, output_tokens) == pytest.approx( + shared_energy + ) def test_openai_agents_total_price_uses_shared_profiles() -> None: - assert openai_agents_mod._total_model_price("gpt-5") == pytest.approx(model_total_price("gpt-5")) - assert openai_agents_mod._total_model_price("gpt-4o-mini") == pytest.approx(model_total_price("gpt-4o-mini")) - + assert openai_agents_mod._total_model_price("gpt-5") == pytest.approx( + model_total_price("gpt-5") + ) + assert openai_agents_mod._total_model_price("gpt-4o-mini") == pytest.approx( + model_total_price("gpt-4o-mini") + ) From 02619258d96ac4ff74932f3b886dda6fca6df072 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 10:01:50 +0100 Subject: [PATCH 20/49] feat(langchain): add harness-aware callback and state extractor --- .../integrations/langchain/__init__.py | 12 + .../langchain/harness_callback.py | 235 ++++++++++++++++++ .../integrations/langchain/harness_state.py | 119 +++++++++ .../tests/test_langchain_harness_callback.py | 148 +++++++++++ pyproject.toml | 7 + 5 files changed, 521 insertions(+) create mode 100644 cascadeflow/integrations/langchain/harness_callback.py create mode 100644 cascadeflow/integrations/langchain/harness_state.py create mode 100644 cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py diff --git a/cascadeflow/integrations/langchain/__init__.py b/cascadeflow/integrations/langchain/__init__.py index 45c6ea2f..7b3f9551 100644 --- 
a/cascadeflow/integrations/langchain/__init__.py +++ b/cascadeflow/integrations/langchain/__init__.py @@ -54,6 +54,14 @@ CascadeFlowCallbackHandler, get_cascade_callback, ) +from .harness_callback import ( + HarnessAwareCascadeFlowCallbackHandler, + get_harness_callback, +) +from .harness_state import ( + apply_langgraph_state, + extract_langgraph_state, +) __all__ = [ # Main classes @@ -93,4 +101,8 @@ # LangChain callback handlers "CascadeFlowCallbackHandler", "get_cascade_callback", + "HarnessAwareCascadeFlowCallbackHandler", + "get_harness_callback", + "extract_langgraph_state", + "apply_langgraph_state", ] diff --git a/cascadeflow/integrations/langchain/harness_callback.py b/cascadeflow/integrations/langchain/harness_callback.py new file mode 100644 index 00000000..aff5c0b4 --- /dev/null +++ b/cascadeflow/integrations/langchain/harness_callback.py @@ -0,0 +1,235 @@ +"""Harness-aware callbacks for LangChain/LangGraph integration.""" + +from __future__ import annotations + +import logging +import time +from contextlib import contextmanager +from typing import Any, Optional + +from cascadeflow.harness import get_current_run +from cascadeflow.harness.pricing import estimate_cost, estimate_energy +from cascadeflow.schema.exceptions import HarnessStopError + +from .harness_state import apply_langgraph_state, extract_langgraph_state +from .langchain_callbacks import CascadeFlowCallbackHandler +from .utils import extract_token_usage, extract_tool_calls + +logger = logging.getLogger("cascadeflow.harness.langchain") + + +class HarnessAwareCascadeFlowCallbackHandler(CascadeFlowCallbackHandler): + """LangChain callback that bridges native lifecycle events into HarnessRunContext.""" + + def __init__(self, *, fail_open: bool = True): + super().__init__() + self.fail_open = fail_open + self._llm_started_at: Optional[float] = None + self._pre_action: str = "allow" + self._pre_reason: str = "allow" + self._pre_model: Optional[str] = None + self._pre_recorded: bool = False + 
self._executed_tool_calls: int = 0 + + def _handle_harness_error(self, error: Exception) -> None: + if self.fail_open: + logger.exception("langchain harness callback failed (fail-open)", exc_info=error) + return + raise error + + def _sync_state(self, payload: dict[str, Any]) -> None: + run_ctx = get_current_run() + if run_ctx is None: + return + state = extract_langgraph_state(payload) + if state: + apply_langgraph_state(run_ctx, state) + + def on_llm_start(self, serialized: dict[str, Any], prompts: list[str], **kwargs: Any) -> None: + super().on_llm_start(serialized=serialized, prompts=prompts, **kwargs) + self._llm_started_at = time.monotonic() + self._pre_action = "allow" + self._pre_reason = "allow" + self._pre_model = self.current_model + self._pre_recorded = False + + try: + self._sync_state(kwargs) + + run_ctx = get_current_run() + if run_ctx is None: + return + + model_name = self.current_model or "unknown" + invocation_params = kwargs.get("invocation_params") + has_tools = False + if isinstance(invocation_params, dict): + has_tools = bool(invocation_params.get("tools")) + if not has_tools: + has_tools = bool(kwargs.get("tools")) + + from cascadeflow.harness.instrument import _evaluate_pre_call_decision, _raise_stop_error + + decision = _evaluate_pre_call_decision(run_ctx, model_name, has_tools=has_tools) + self._pre_action = decision.action + self._pre_reason = decision.reason + self._pre_model = decision.target_model + + if run_ctx.mode == "observe": + if decision.action != "allow": + run_ctx.record( + action=decision.action, + reason=decision.reason, + model=decision.target_model, + applied=False, + decision_mode="observe", + ) + self._pre_recorded = True + return + + if run_ctx.mode != "enforce": + return + + if decision.action == "stop": + run_ctx.record( + action="stop", + reason=decision.reason, + model=model_name, + applied=True, + decision_mode="enforce", + ) + self._pre_recorded = True + _raise_stop_error(run_ctx, decision.reason) + + if 
decision.action == "switch_model": + run_ctx.record( + action="switch_model", + reason=decision.reason, + model=decision.target_model, + applied=False, + decision_mode="enforce", + ) + self._pre_recorded = True + + if decision.action == "deny_tool" and has_tools: + run_ctx.record( + action="deny_tool", + reason=decision.reason, + model=model_name, + applied=False, + decision_mode="enforce", + ) + self._pre_recorded = True + + except Exception as exc: + self._handle_harness_error(exc) + + def on_llm_end(self, response: Any, **kwargs: Any) -> None: + super().on_llm_end(response=response, **kwargs) + + try: + self._sync_state(kwargs) + run_ctx = get_current_run() + if run_ctx is None: + return + + model_name = self.current_model + if not model_name and getattr(response, "llm_output", None): + model_name = response.llm_output.get("model_name") + model_name = model_name or "unknown" + + token_usage = extract_token_usage(response) + prompt_tokens = int(token_usage["input"]) + completion_tokens = int(token_usage["output"]) + tool_call_count = len(extract_tool_calls(response)) + + elapsed_ms = 0.0 + if self._llm_started_at is not None: + elapsed_ms = (time.monotonic() - self._llm_started_at) * 1000.0 + + run_ctx.step_count += 1 + run_ctx.cost += estimate_cost(model_name, prompt_tokens, completion_tokens) + run_ctx.energy_used += estimate_energy(model_name, prompt_tokens, completion_tokens) + run_ctx.latency_used_ms += elapsed_ms + run_ctx.tool_calls += tool_call_count + + if run_ctx.budget_max is not None: + run_ctx.budget_remaining = run_ctx.budget_max - run_ctx.cost + + if self._pre_action == "allow": + run_ctx.record( + action="allow", + reason="langchain_step", + model=model_name, + applied=True, + decision_mode=run_ctx.mode, + ) + elif not self._pre_recorded: + run_ctx.record( + action=self._pre_action, + reason=self._pre_reason, + model=self._pre_model or model_name, + applied=False, + decision_mode=run_ctx.mode, + ) + + except Exception as exc: + 
self._handle_harness_error(exc) + finally: + self._llm_started_at = None + self._pre_action = "allow" + self._pre_reason = "allow" + self._pre_model = None + self._pre_recorded = False + + def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: Any) -> Any: + try: + self._sync_state(kwargs) + run_ctx = get_current_run() + if run_ctx is None: + return None + if run_ctx.tool_calls_max is None: + return None + + if self._executed_tool_calls >= run_ctx.tool_calls_max: + if run_ctx.mode == "observe": + run_ctx.record( + action="deny_tool", + reason="max_tool_calls_reached", + model=self.current_model, + applied=False, + decision_mode="observe", + ) + return None + if run_ctx.mode == "enforce": + run_ctx.record( + action="deny_tool", + reason="max_tool_calls_reached", + model=self.current_model, + applied=True, + decision_mode="enforce", + ) + raise HarnessStopError( + "cascadeflow harness deny_tool: max tool calls reached", + reason="max_tool_calls_reached", + ) + + self._executed_tool_calls += 1 + return None + except Exception as exc: + self._handle_harness_error(exc) + return None + + +@contextmanager +def get_harness_callback(*, fail_open: bool = True): + """Context manager that yields a harness-aware LangChain callback handler.""" + callback = HarnessAwareCascadeFlowCallbackHandler(fail_open=fail_open) + try: + yield callback + finally: + return + + +__all__ = ["HarnessAwareCascadeFlowCallbackHandler", "get_harness_callback"] + diff --git a/cascadeflow/integrations/langchain/harness_state.py b/cascadeflow/integrations/langchain/harness_state.py new file mode 100644 index 00000000..49278ef1 --- /dev/null +++ b/cascadeflow/integrations/langchain/harness_state.py @@ -0,0 +1,119 @@ +"""LangGraph/LangChain state extraction helpers for harness integration.""" + +from __future__ import annotations + +from typing import Any, Mapping, Optional + + +def _as_int(value: Any) -> Optional[int]: + try: + if value is None: + return None + return int(value) + 
except (TypeError, ValueError): + return None + + +def _as_float(value: Any) -> Optional[float]: + try: + if value is None: + return None + return float(value) + except (TypeError, ValueError): + return None + + +def _extract_candidate_state(source: Any) -> Optional[Mapping[str, Any]]: + if not isinstance(source, Mapping): + return None + + for key in ("langgraph_state", "graph_state", "state"): + candidate = source.get(key) + if isinstance(candidate, Mapping): + return candidate + + return source + + +def extract_langgraph_state(payload: Any) -> dict[str, Any]: + """Extract normalized harness-relevant fields from LangGraph-style state payloads.""" + + candidates: list[Mapping[str, Any]] = [] + root = _extract_candidate_state(payload) + if root is not None: + candidates.append(root) + + if isinstance(payload, Mapping): + metadata = payload.get("metadata") + if isinstance(metadata, Mapping): + state_from_metadata = _extract_candidate_state(metadata) + if state_from_metadata is not None: + candidates.append(state_from_metadata) + + configurable = payload.get("configurable") + if isinstance(configurable, Mapping): + state_from_configurable = _extract_candidate_state(configurable) + if state_from_configurable is not None: + candidates.append(state_from_configurable) + + merged: dict[str, Any] = {} + for source in candidates: + if "agent_id" in source and isinstance(source.get("agent_id"), str): + merged["agent_id"] = source["agent_id"] + if "model" in source and isinstance(source.get("model"), str): + merged["model_used"] = source["model"] + if "model_used" in source and isinstance(source.get("model_used"), str): + merged["model_used"] = source["model_used"] + + step_count = _as_int(source.get("step_count", source.get("step"))) + if step_count is not None: + merged["step_count"] = step_count + + tool_calls = _as_int(source.get("tool_calls")) + if tool_calls is not None: + merged["tool_calls"] = tool_calls + + budget_remaining = _as_float(source.get("budget_remaining")) 
+ if budget_remaining is not None: + merged["budget_remaining"] = budget_remaining + + latency_used_ms = _as_float(source.get("latency_used_ms", source.get("latency_ms"))) + if latency_used_ms is not None: + merged["latency_used_ms"] = latency_used_ms + + energy_used = _as_float(source.get("energy_used", source.get("energy"))) + if energy_used is not None: + merged["energy_used"] = energy_used + + return merged + + +def apply_langgraph_state(run_ctx: Any, state: Mapping[str, Any]) -> None: + """Apply extracted state fields onto an active HarnessRunContext.""" + if run_ctx is None or not isinstance(state, Mapping): + return + + step_count = _as_int(state.get("step_count")) + if step_count is not None and step_count > getattr(run_ctx, "step_count", 0): + run_ctx.step_count = step_count + + tool_calls = _as_int(state.get("tool_calls")) + if tool_calls is not None and tool_calls > getattr(run_ctx, "tool_calls", 0): + run_ctx.tool_calls = tool_calls + + latency_used_ms = _as_float(state.get("latency_used_ms")) + if latency_used_ms is not None and latency_used_ms > getattr(run_ctx, "latency_used_ms", 0.0): + run_ctx.latency_used_ms = latency_used_ms + + energy_used = _as_float(state.get("energy_used")) + if energy_used is not None and energy_used > getattr(run_ctx, "energy_used", 0.0): + run_ctx.energy_used = energy_used + + budget_remaining = _as_float(state.get("budget_remaining")) + if budget_remaining is not None: + run_ctx.budget_remaining = budget_remaining + + model_used = state.get("model_used") + if isinstance(model_used, str) and model_used: + run_ctx.model_used = model_used + diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py new file mode 100644 index 00000000..b96cb30d --- /dev/null +++ b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py @@ -0,0 +1,148 @@ +"""Tests for harness-aware LangChain callback integration.""" + 
+from __future__ import annotations + +import pytest +from langchain_core.messages import AIMessage +from langchain_core.outputs import ChatGeneration, LLMResult + +from cascadeflow.harness import init, reset, run +from cascadeflow.integrations.langchain.harness_callback import ( + HarnessAwareCascadeFlowCallbackHandler, +) +from cascadeflow.integrations.langchain.harness_state import ( + apply_langgraph_state, + extract_langgraph_state, +) +from cascadeflow.schema.exceptions import BudgetExceededError, HarnessStopError + + +@pytest.fixture(autouse=True) +def _reset_harness_state() -> None: + reset() + + +def _llm_result(model_name: str, prompt_tokens: int, completion_tokens: int) -> LLMResult: + generation = ChatGeneration(message=AIMessage(content="ok"), generation_info={}) + return LLMResult( + generations=[[generation]], + llm_output={ + "model_name": model_name, + "token_usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + }, + ) + + +def test_harness_callback_updates_active_run_metrics() -> None: + init(mode="observe", budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler() + + with run(budget=1.0) as ctx: + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o-mini"}, + ) + handler.on_llm_end(_llm_result("gpt-4o-mini", 120, 80)) + + assert ctx.step_count == 1 + assert ctx.cost > 0 + assert ctx.energy_used > 0 + assert ctx.budget_remaining is not None + assert ctx.budget_remaining < 1.0 + assert ctx.last_action == "allow" + assert ctx.model_used == "gpt-4o-mini" + + +def test_harness_callback_enforce_raises_when_budget_exhausted() -> None: + init(mode="enforce", budget=0.1) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(budget=0.1) as ctx: + ctx.cost = 0.1 + ctx.budget_remaining = 0.0 + + with pytest.raises(BudgetExceededError): + handler.on_llm_start( + serialized={}, + 
prompts=["hello"], + invocation_params={"model": "gpt-4o-mini"}, + ) + + trace = ctx.trace() + assert trace + assert trace[-1]["action"] == "stop" + assert trace[-1]["reason"] == "budget_exceeded" + assert trace[-1]["applied"] is True + + +def test_harness_callback_observe_records_non_applied_decisions() -> None: + init(mode="observe", budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler() + + with run(budget=1.0) as ctx: + ctx.cost = 0.9 + ctx.budget_remaining = 0.1 + + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o", "tools": [{"name": "lookup"}]}, + ) + + trace = ctx.trace() + assert trace + assert trace[-1]["action"] in {"switch_model", "deny_tool"} + assert trace[-1]["applied"] is False + assert trace[-1]["decision_mode"] == "observe" + + +def test_harness_callback_enforce_denies_tool_when_limit_reached() -> None: + init(mode="enforce", max_tool_calls=0, budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(max_tool_calls=0, budget=1.0) as ctx: + with pytest.raises(HarnessStopError, match="max tool calls"): + handler.on_tool_start(serialized={"name": "search"}, input_str="query") + + trace = ctx.trace() + assert trace + assert trace[-1]["action"] == "deny_tool" + assert trace[-1]["applied"] is True + assert trace[-1]["decision_mode"] == "enforce" + + +def test_extract_and_apply_langgraph_state() -> None: + state = extract_langgraph_state( + { + "metadata": { + "langgraph_state": { + "step": 4, + "tool_calls": 3, + "budget_remaining": 0.42, + "latency_ms": 130.0, + "energy": 77.0, + "model": "gpt-4o-mini", + } + } + } + ) + + assert state["step_count"] == 4 + assert state["tool_calls"] == 3 + assert state["model_used"] == "gpt-4o-mini" + + init(mode="observe", budget=1.0) + with run(budget=1.0) as ctx: + apply_langgraph_state(ctx, state) + assert ctx.step_count == 4 + assert ctx.tool_calls == 3 + assert ctx.budget_remaining == pytest.approx(0.42) + assert 
ctx.latency_used_ms == pytest.approx(130.0) + assert ctx.energy_used == pytest.approx(77.0) + assert ctx.model_used == "gpt-4o-mini" + diff --git a/pyproject.toml b/pyproject.toml index eaadb6b7..198042da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,6 +101,13 @@ openai-agents = [ "openai-agents>=0.9.0; python_version >= '3.10'", ] +# LangChain/LangGraph harness integration (opt-in) +langchain = [ + "langchain>=0.3.0", + "langchain-core>=0.3.0", + "langgraph>=0.2.0", +] + # Development tools (includes rich for terminal output) dev = [ "pytest>=7.4.0", From 44506b8d8152035700603efa201cfd13f4714646 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 10:03:03 +0100 Subject: [PATCH 21/49] feat(langchain): auto-attach harness callback in active run scopes --- .../test_langchain_integration_features.py | 36 ++++++++++++++++++ cascadeflow/integrations/langchain/wrapper.py | 37 +++++++++++++++++-- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py index fdbcff1d..f225fa3a 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py @@ -4,6 +4,10 @@ from langchain_core.messages import AIMessage, BaseMessage, HumanMessage from langchain_core.outputs import ChatGeneration, ChatResult +from cascadeflow.harness import init, reset, run +from cascadeflow.integrations.langchain.harness_callback import ( + HarnessAwareCascadeFlowCallbackHandler, +) from cascadeflow.integrations.langchain import CascadeFlow @@ -116,3 +120,35 @@ def test_domain_policy_direct_to_verifier_skips_drafter() -> None: assert drafter.calls == 0 assert verifier.calls == 1 assert result.llm_output["cascade"]["routing_reason"] == "domain_policy_direct" + + +def 
test_wrapper_only_auto_adds_harness_callback_inside_active_run_scope() -> None: + reset() + init(mode="observe") + drafter = MockSequenceChatModel("draft") + verifier = MockSequenceChatModel("verify") + cascade = CascadeFlow(drafter=drafter, verifier=verifier, enable_pre_router=False) + + outside_callbacks = cascade._resolve_callbacks([]) + assert not any( + isinstance(cb, HarnessAwareCascadeFlowCallbackHandler) for cb in outside_callbacks + ) + + with run(): + inside_callbacks = cascade._resolve_callbacks([]) + assert any( + isinstance(cb, HarnessAwareCascadeFlowCallbackHandler) for cb in inside_callbacks + ) + + +def test_wrapper_does_not_duplicate_harness_callback() -> None: + reset() + init(mode="observe") + drafter = MockSequenceChatModel("draft") + verifier = MockSequenceChatModel("verify") + cascade = CascadeFlow(drafter=drafter, verifier=verifier, enable_pre_router=False) + existing = HarnessAwareCascadeFlowCallbackHandler() + + with run(): + callbacks = cascade._resolve_callbacks([existing]) + assert len([cb for cb in callbacks if isinstance(cb, HarnessAwareCascadeFlowCallbackHandler)]) == 1 diff --git a/cascadeflow/integrations/langchain/wrapper.py b/cascadeflow/integrations/langchain/wrapper.py index ed6d554b..f108d60f 100644 --- a/cascadeflow/integrations/langchain/wrapper.py +++ b/cascadeflow/integrations/langchain/wrapper.py @@ -169,6 +169,35 @@ def _split_runnable_config( model_kwargs[key] = value return model_kwargs, config + def _resolve_callbacks(self, raw_callbacks: Any) -> list[Any]: + if raw_callbacks is None: + callbacks: list[Any] = [] + elif isinstance(raw_callbacks, list): + callbacks = list(raw_callbacks) + elif isinstance(raw_callbacks, tuple): + callbacks = list(raw_callbacks) + else: + callbacks = [raw_callbacks] + + try: + from cascadeflow.harness import get_current_run, get_harness_config + + harness_config = get_harness_config() + run_ctx = get_current_run() + if harness_config.mode == "off" or run_ctx is None or run_ctx.mode == 
"off": + return callbacks + + from .harness_callback import HarnessAwareCascadeFlowCallbackHandler + + if any(isinstance(cb, HarnessAwareCascadeFlowCallbackHandler) for cb in callbacks): + return callbacks + + callbacks.append(HarnessAwareCascadeFlowCallbackHandler()) + return callbacks + except Exception: + # Preserve existing behavior for users who do not enable harness flows. + return callbacks + def _generate( self, messages: list[BaseMessage], @@ -202,7 +231,7 @@ def _generate( merged_kwargs["stop"] = stop # Extract callbacks before filtering (need to pass them explicitly to nested models) - callbacks = merged_kwargs.get("callbacks", []) + callbacks = self._resolve_callbacks(merged_kwargs.get("callbacks", [])) existing_tags = merged_kwargs.get("tags", []) or [] base_tags = existing_tags + ["cascadeflow"] if existing_tags else ["cascadeflow"] @@ -599,7 +628,7 @@ async def _agenerate( merged_kwargs["stop"] = stop # Extract callbacks before filtering (need to pass them explicitly to nested models) - callbacks = merged_kwargs.get("callbacks", []) + callbacks = self._resolve_callbacks(merged_kwargs.get("callbacks", [])) existing_tags = merged_kwargs.get("tags", []) or [] base_tags = existing_tags + ["cascadeflow"] if existing_tags else ["cascadeflow"] @@ -1001,7 +1030,7 @@ def _stream( stream_kwargs, base_config = self._split_runnable_config(merged_kwargs) base_tags = (base_config.get("tags") or []) + ["cascadeflow"] existing_metadata = base_config.get("metadata", {}) or {} - callbacks = base_config.get("callbacks", []) + callbacks = self._resolve_callbacks(base_config.get("callbacks", [])) resolved_domain = self._resolve_domain(messages, existing_metadata) effective_quality_threshold = self._effective_quality_threshold(resolved_domain) force_verifier_for_domain = self._domain_forces_verifier(resolved_domain) @@ -1324,7 +1353,7 @@ async def _astream( stream_kwargs, base_config = self._split_runnable_config(merged_kwargs) base_tags = (base_config.get("tags") or []) 
+ ["cascadeflow"] existing_metadata = base_config.get("metadata", {}) or {} - callbacks = base_config.get("callbacks", []) + callbacks = self._resolve_callbacks(base_config.get("callbacks", [])) safe_kwargs = { k: v for k, v in stream_kwargs.items() From f70572d75b78f1c274e6ef09de8755d787a354ca Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 10:03:35 +0100 Subject: [PATCH 22/49] docs(plan): mark langchain harness extension branch completed --- docs/strategy/agent-intelligence-v2-plan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 73bfec1b..33bae5ae 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -847,7 +847,7 @@ Claim checklist (one owner per branch at a time): - [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` - [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` +- [x] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `completed` - [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-bench-repro-pipeline` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` - [ ] `feat/v2-security-privacy-telemetry` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` From d740cad1fe6ab4dbd7f7dc9aa8f0e49340c39f9c Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 10:21:17 +0100 Subject: [PATCH 23/49] fix(langchain): address PR #161 review findings - Document enforce-mode limitations for switch_model and deny_tool - Replace per-handler _executed_tool_calls with 
run_ctx.tool_calls - Fix _extract_candidate_state fallback leaking arbitrary kwargs - Remove return-in-finally (B012) and fix import ordering - Separate langgraph from langchain optional extra in pyproject.toml - Add 4 edge-case tests: no-run-context safety, state extraction guard, and run_ctx tool_calls gating --- .../langchain/harness_callback.py | 32 +++++++++++------ .../integrations/langchain/harness_state.py | 8 ++++- .../tests/test_langchain_harness_callback.py | 36 +++++++++++++++++++ .../test_langchain_integration_features.py | 2 +- pyproject.toml | 8 ++++- 5 files changed, 73 insertions(+), 13 deletions(-) diff --git a/cascadeflow/integrations/langchain/harness_callback.py b/cascadeflow/integrations/langchain/harness_callback.py index aff5c0b4..25962a5d 100644 --- a/cascadeflow/integrations/langchain/harness_callback.py +++ b/cascadeflow/integrations/langchain/harness_callback.py @@ -1,4 +1,17 @@ -"""Harness-aware callbacks for LangChain/LangGraph integration.""" +"""Harness-aware callbacks for LangChain/LangGraph integration. + +Enforce-mode limitations (LangChain callback architecture): + - ``stop`` (budget/latency/energy exceeded): fully enforced — raises + BudgetExceededError or HarnessStopError from ``on_llm_start``. + - ``deny_tool`` (tool-call cap): fully enforced at the tool level via + ``on_tool_start`` — raises HarnessStopError before tool execution. + - ``switch_model``: **observe-only** — LangChain dispatches the LLM call + before ``on_llm_start`` returns, so the callback cannot redirect to a + different model. The decision is recorded with ``applied=False``. + - ``deny_tool`` at LLM level (pre-call decision): **observe-only** — the + callback cannot strip tools from an already-dispatched LLM request. + The decision is recorded with ``applied=False``. 
+""" from __future__ import annotations @@ -19,7 +32,11 @@ class HarnessAwareCascadeFlowCallbackHandler(CascadeFlowCallbackHandler): - """LangChain callback that bridges native lifecycle events into HarnessRunContext.""" + """LangChain callback that bridges native lifecycle events into HarnessRunContext. + + See module docstring for enforce-mode limitations on ``switch_model`` + and LLM-level ``deny_tool``. + """ def __init__(self, *, fail_open: bool = True): super().__init__() @@ -29,7 +46,6 @@ def __init__(self, *, fail_open: bool = True): self._pre_reason: str = "allow" self._pre_model: Optional[str] = None self._pre_recorded: bool = False - self._executed_tool_calls: int = 0 def _handle_harness_error(self, error: Exception) -> None: if self.fail_open: @@ -68,7 +84,7 @@ def on_llm_start(self, serialized: dict[str, Any], prompts: list[str], **kwargs: if not has_tools: has_tools = bool(kwargs.get("tools")) - from cascadeflow.harness.instrument import _evaluate_pre_call_decision, _raise_stop_error + from cascadeflow.harness.instrument import _evaluate_pre_call_decision, _raise_stop_error # noqa: I001 decision = _evaluate_pre_call_decision(run_ctx, model_name, has_tools=has_tools) self._pre_action = decision.action @@ -191,7 +207,7 @@ def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: An if run_ctx.tool_calls_max is None: return None - if self._executed_tool_calls >= run_ctx.tool_calls_max: + if run_ctx.tool_calls >= run_ctx.tool_calls_max: if run_ctx.mode == "observe": run_ctx.record( action="deny_tool", @@ -214,7 +230,6 @@ def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: An reason="max_tool_calls_reached", ) - self._executed_tool_calls += 1 return None except Exception as exc: self._handle_harness_error(exc) @@ -225,10 +240,7 @@ def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: An def get_harness_callback(*, fail_open: bool = True): """Context manager that yields a harness-aware 
LangChain callback handler.""" callback = HarnessAwareCascadeFlowCallbackHandler(fail_open=fail_open) - try: - yield callback - finally: - return + yield callback __all__ = ["HarnessAwareCascadeFlowCallbackHandler", "get_harness_callback"] diff --git a/cascadeflow/integrations/langchain/harness_state.py b/cascadeflow/integrations/langchain/harness_state.py index 49278ef1..313932ce 100644 --- a/cascadeflow/integrations/langchain/harness_state.py +++ b/cascadeflow/integrations/langchain/harness_state.py @@ -24,6 +24,12 @@ def _as_float(value: Any) -> Optional[float]: def _extract_candidate_state(source: Any) -> Optional[Mapping[str, Any]]: + """Extract a named state container from a mapping. + + Only returns state from explicitly named keys (langgraph_state, graph_state, + state). Returns None when no named key matches — avoids treating arbitrary + kwargs as harness state. + """ if not isinstance(source, Mapping): return None @@ -32,7 +38,7 @@ def _extract_candidate_state(source: Any) -> Optional[Mapping[str, Any]]: if isinstance(candidate, Mapping): return candidate - return source + return None def extract_langgraph_state(payload: Any) -> dict[str, Any]: diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py index b96cb30d..79a6f539 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py @@ -116,6 +116,42 @@ def test_harness_callback_enforce_denies_tool_when_limit_reached() -> None: assert trace[-1]["decision_mode"] == "enforce" +def test_on_llm_end_no_run_context_is_safe() -> None: + handler = HarnessAwareCascadeFlowCallbackHandler() + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o-mini"}, + ) + result = handler.on_llm_end(_llm_result("gpt-4o-mini", 10, 5)) + assert result is None + + +def 
test_on_tool_start_no_run_context_is_safe() -> None: + handler = HarnessAwareCascadeFlowCallbackHandler() + result = handler.on_tool_start(serialized={"name": "search"}, input_str="query") + assert result is None + + +def test_extract_state_ignores_plain_kwargs() -> None: + """Kwargs without a named state key should not leak into state.""" + state = extract_langgraph_state({"model": "gpt-4o", "invocation_params": {"tools": []}}) + assert state == {} + + +def test_tool_deny_uses_run_ctx_tool_calls() -> None: + """Tool gating should use run_ctx.tool_calls, not a local counter.""" + init(mode="enforce", max_tool_calls=2, budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(max_tool_calls=2, budget=1.0) as ctx: + # Simulate tool calls already counted by on_llm_end or other integrations + ctx.tool_calls = 2 + + with pytest.raises(HarnessStopError, match="max tool calls"): + handler.on_tool_start(serialized={"name": "search"}, input_str="query") + + def test_extract_and_apply_langgraph_state() -> None: state = extract_langgraph_state( { diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py index f225fa3a..0e82fb48 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py @@ -5,10 +5,10 @@ from langchain_core.outputs import ChatGeneration, ChatResult from cascadeflow.harness import init, reset, run +from cascadeflow.integrations.langchain import CascadeFlow from cascadeflow.integrations.langchain.harness_callback import ( HarnessAwareCascadeFlowCallbackHandler, ) -from cascadeflow.integrations.langchain import CascadeFlow class MockSequenceChatModel(BaseChatModel): diff --git a/pyproject.toml b/pyproject.toml index 198042da..2bbd3082 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,10 
+101,16 @@ openai-agents = [ "openai-agents>=0.9.0; python_version >= '3.10'", ] -# LangChain/LangGraph harness integration (opt-in) +# LangChain harness integration (opt-in) langchain = [ "langchain>=0.3.0", "langchain-core>=0.3.0", +] + +# LangGraph state extraction (opt-in, adds langgraph on top of langchain) +langgraph = [ + "langchain>=0.3.0", + "langchain-core>=0.3.0", "langgraph>=0.2.0", ] From 3bd78996cc3f2a88882af599496451ea35c9fd99 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 26 Feb 2026 10:35:35 +0100 Subject: [PATCH 24/49] fix(langchain): enforce tool caps on executed calls and harden tool extraction --- .../langchain/harness_callback.py | 8 ++--- .../tests/test_langchain_harness_callback.py | 31 ++++++++++++++++++- cascadeflow/integrations/langchain/utils.py | 4 +++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/cascadeflow/integrations/langchain/harness_callback.py b/cascadeflow/integrations/langchain/harness_callback.py index 25962a5d..faffa939 100644 --- a/cascadeflow/integrations/langchain/harness_callback.py +++ b/cascadeflow/integrations/langchain/harness_callback.py @@ -26,7 +26,7 @@ from .harness_state import apply_langgraph_state, extract_langgraph_state from .langchain_callbacks import CascadeFlowCallbackHandler -from .utils import extract_token_usage, extract_tool_calls +from .utils import extract_token_usage logger = logging.getLogger("cascadeflow.harness.langchain") @@ -157,8 +157,6 @@ def on_llm_end(self, response: Any, **kwargs: Any) -> None: token_usage = extract_token_usage(response) prompt_tokens = int(token_usage["input"]) completion_tokens = int(token_usage["output"]) - tool_call_count = len(extract_tool_calls(response)) - elapsed_ms = 0.0 if self._llm_started_at is not None: elapsed_ms = (time.monotonic() - self._llm_started_at) * 1000.0 @@ -167,7 +165,6 @@ def on_llm_end(self, response: Any, **kwargs: Any) -> None: run_ctx.cost += estimate_cost(model_name, prompt_tokens, completion_tokens) 
run_ctx.energy_used += estimate_energy(model_name, prompt_tokens, completion_tokens) run_ctx.latency_used_ms += elapsed_ms - run_ctx.tool_calls += tool_call_count if run_ctx.budget_max is not None: run_ctx.budget_remaining = run_ctx.budget_max - run_ctx.cost @@ -230,6 +227,8 @@ def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: An reason="max_tool_calls_reached", ) + # Track executed tools (not predicted tool calls in LLM output). + run_ctx.tool_calls += 1 return None except Exception as exc: self._handle_harness_error(exc) @@ -244,4 +243,3 @@ def get_harness_callback(*, fail_open: bool = True): __all__ = ["HarnessAwareCascadeFlowCallbackHandler", "get_harness_callback"] - diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py index 79a6f539..341087b9 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py @@ -14,6 +14,7 @@ apply_langgraph_state, extract_langgraph_state, ) +from cascadeflow.integrations.langchain.utils import extract_tool_calls from cascadeflow.schema.exceptions import BudgetExceededError, HarnessStopError @@ -152,6 +153,35 @@ def test_tool_deny_uses_run_ctx_tool_calls() -> None: handler.on_tool_start(serialized={"name": "search"}, input_str="query") +def test_tool_start_counts_executions_and_blocks_after_limit() -> None: + init(mode="enforce", max_tool_calls=1, budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(max_tool_calls=1, budget=1.0) as ctx: + assert ctx.tool_calls == 0 + assert handler.on_tool_start(serialized={"name": "search"}, input_str="first") is None + assert ctx.tool_calls == 1 + + with pytest.raises(HarnessStopError, match="max tool calls"): + handler.on_tool_start(serialized={"name": "search"}, input_str="second") + + assert ctx.tool_calls 
== 1 + trace = ctx.trace() + assert trace[-1]["action"] == "deny_tool" + assert trace[-1]["applied"] is True + + +def test_extract_tool_calls_supports_llm_result_nested_generations() -> None: + generation = ChatGeneration( + message=AIMessage(content="", tool_calls=[{"name": "search", "args": {"q": "x"}, "id": "t1"}]), + generation_info={}, + ) + llm_result = LLMResult(generations=[[generation]], llm_output={"model_name": "gpt-4o-mini"}) + tool_calls = extract_tool_calls(llm_result) + assert len(tool_calls) == 1 + assert tool_calls[0]["name"] == "search" + + def test_extract_and_apply_langgraph_state() -> None: state = extract_langgraph_state( { @@ -181,4 +211,3 @@ def test_extract_and_apply_langgraph_state() -> None: assert ctx.latency_used_ms == pytest.approx(130.0) assert ctx.energy_used == pytest.approx(77.0) assert ctx.model_used == "gpt-4o-mini" - diff --git a/cascadeflow/integrations/langchain/utils.py b/cascadeflow/integrations/langchain/utils.py index fe47a353..04f3e4a5 100644 --- a/cascadeflow/integrations/langchain/utils.py +++ b/cascadeflow/integrations/langchain/utils.py @@ -195,6 +195,10 @@ def extract_tool_calls(response: Any) -> list[dict[str, Any]]: msg = None if hasattr(response, "generations") and response.generations: generation = response.generations[0] + # LLMResult.generations is often list[list[Generation]], while ChatResult + # uses list[Generation]. Support both shapes. 
+ if isinstance(generation, list) and generation: + generation = generation[0] msg = getattr(generation, "message", None) else: msg = getattr(response, "message", None) or response From 8f74dee7504397ee3f7e9b181f1a6dcf8272f9fd Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 11:35:33 +0100 Subject: [PATCH 25/49] fix(harness): avoid shadowing cascadeflow.agent module --- tests/test_harness_api.py | 92 --------------------------------------- 1 file changed, 92 deletions(-) diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index fd89e590..5669e845 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -5,7 +5,6 @@ import cascadeflow import cascadeflow.harness.api as harness_api from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run -from cascadeflow.telemetry.callbacks import CallbackEvent, CallbackManager def setup_function() -> None: @@ -173,8 +172,6 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.run) assert callable(cascadeflow.harness_agent) assert hasattr(cascadeflow.agent, "PROVIDER_REGISTRY") - assert callable(cascadeflow.get_harness_callback_manager) - assert callable(cascadeflow.set_harness_callback_manager) report = cascadeflow.init(mode="off") assert report.mode == "off" @@ -186,8 +183,6 @@ def test_run_record_and_trace_copy(): trace_b = ctx.trace() assert trace_a == trace_b assert trace_a[0]["action"] == "switch_model" - assert "budget_state" in trace_a[0] - assert trace_a[0]["budget_state"]["max"] == 1.0 trace_a.append({"action": "mutated"}) assert len(ctx.trace()) == 1 @@ -332,90 +327,3 @@ def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch): monkeypatch.setattr(instrument, "patch_openai", lambda: True) report = init(mode="observe") assert report.instrumented == ["openai"] - - -def test_run_summary_populates_on_context_exit(): - init(mode="observe") - with run(budget=1.5) as ctx: - ctx.step_count = 2 - ctx.tool_calls = 1 - 
ctx.cost = 0.42 - ctx.latency_used_ms = 123.0 - ctx.energy_used = 33.0 - ctx.budget_remaining = 1.08 - ctx.last_action = "allow" - ctx.model_used = "gpt-4o-mini" - - summary = ctx.summary() - assert summary["run_id"] == ctx.run_id - assert summary["step_count"] == 2 - assert summary["budget_remaining"] == pytest.approx(1.08) - assert summary["duration_ms"] is not None - assert summary["duration_ms"] >= 0.0 - assert ctx.duration_ms is not None - assert ctx.duration_ms >= 0.0 - - -def test_run_context_logs_summary(caplog): - init(mode="observe") - with caplog.at_level("INFO", logger="cascadeflow.harness"): - with run(budget=1.0) as ctx: - ctx.step_count = 1 - ctx.cost = 0.01 - ctx.model_used = "gpt-4o-mini" - - assert any("harness run summary" in rec.message for rec in caplog.records) - - -def test_record_emits_cascade_decision_callback(): - manager = CallbackManager() - received = [] - - def _on_decision(data): - received.append(data) - - manager.register(CallbackEvent.CASCADE_DECISION, _on_decision) - report = init(mode="observe", callback_manager=manager) - assert report.config_sources["callback_manager"] == "code" - - with run(budget=1.0) as ctx: - ctx.step_count = 1 - ctx.record(action="switch_model", reason="budget_pressure", model="gpt-4o-mini") - - assert len(received) == 1 - event = received[0] - assert event.event == CallbackEvent.CASCADE_DECISION - assert event.query == "[harness]" - assert event.workflow == "harness" - assert event.data["action"] == "switch_model" - assert event.data["run_id"] == ctx.run_id - - -def test_record_sanitizes_trace_values(): - ctx = run() - ctx.record( - action="allow\nnewline", - reason="a" * 400, - model="model\r\nname", - ) - entry = ctx.trace()[0] - assert "\n" not in entry["action"] - assert "\r" not in entry["model"] - assert len(entry["reason"]) <= 160 - - -def test_record_without_callback_manager_is_noop(): - init(mode="observe") - with run(budget=1.0) as ctx: - ctx.record(action="allow", reason="test", 
model="gpt-4o-mini") - assert len(ctx.trace()) == 1 - - -def test_record_empty_action_warns_and_defaults(caplog): - init(mode="observe") - with caplog.at_level("WARNING", logger="cascadeflow.harness"): - with run(budget=1.0) as ctx: - ctx.record(action="", reason="test", model="gpt-4o-mini") - entry = ctx.trace()[0] - assert entry["action"] == "allow" - assert any("empty action" in rec.message for rec in caplog.records) From 5972e8b87267eab4e5db3740b71b2aa7b750c7ef Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 10:36:38 +0100 Subject: [PATCH 26/49] feat(bench): add reproducibility pipeline for V2 Go/No-Go validation Add 5 new benchmark modules and 15 unit tests that enable third-party reproducibility and automated V2 readiness checks: - repro.py: environment fingerprint (git SHA, packages, platform) - baseline.py: save/load baselines, delta comparison, Go/No-Go gates - harness_overhead.py: decision-path p95 measurement (<5ms gate) - observe_validation.py: observe-mode zero-change proof (6 cases) - artifact.py: JSON artifact bundler + REPRODUCE.md generation Extends run_all.py with --baseline, --harness-mode, --with-repro flags. 
--- tests/benchmarks/artifact.py | 4 +--- tests/test_bench_repro_pipeline.py | 7 +++++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/benchmarks/artifact.py b/tests/benchmarks/artifact.py index fde0f616..b4acd4b3 100644 --- a/tests/benchmarks/artifact.py +++ b/tests/benchmarks/artifact.py @@ -98,9 +98,7 @@ def bundle_artifact( def _write_reproduce_md(output_dir: Path, metadata: dict[str, Any]) -> Path: packages = metadata.get("package_versions", {}) rows = [f"| {name} | {ver} |" for name, ver in sorted(packages.items())] - table = ( - "| Package | Version |\n|---------|----------|\n" + "\n".join(rows) if rows else "_none_" - ) + table = "| Package | Version |\n|---------|----------|\n" + "\n".join(rows) if rows else "_none_" content = _REPRODUCE_TEMPLATE.format( git_sha=metadata.get("git_sha", "unknown"), diff --git a/tests/test_bench_repro_pipeline.py b/tests/test_bench_repro_pipeline.py index d598e398..bce15a88 100644 --- a/tests/test_bench_repro_pipeline.py +++ b/tests/test_bench_repro_pipeline.py @@ -69,7 +69,9 @@ def sample_results() -> dict: @pytest.fixture def sample_metadata() -> dict: - return metadata_to_dict(collect_repro_metadata(profile="smoke", harness_mode="off")) + return metadata_to_dict( + collect_repro_metadata(profile="smoke", harness_mode="off") + ) # ── 1-2: ReproMetadata ─────────────────────────────────────────────────── @@ -140,7 +142,8 @@ def test_compare_no_regression(sample_results): def test_compare_with_regression(sample_results): """Accuracy drop flagged as regression.""" worse = { - name: {**vals, "accuracy": vals["accuracy"] - 5.0} for name, vals in sample_results.items() + name: {**vals, "accuracy": vals["accuracy"] - 5.0} + for name, vals in sample_results.items() } report = compare_to_baseline(worse, sample_results) assert report.any_accuracy_regression From 97250f4a83b2a3c0d0f8411facd8a979a2b10033 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 11:38:43 +0100 Subject: [PATCH 27/49] 
=?UTF-8?q?docs(plan):=20update=20workboard=20?= =?UTF-8?q?=E2=80=94=20bench-repro-pipeline=20PR=20#163=20in=20review?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/strategy/agent-intelligence-v2-plan.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 33bae5ae..267ddc69 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -847,10 +847,10 @@ Claim checklist (one owner per branch at a time): - [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` - [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [x] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `completed` +- [ ] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` - [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-bench-repro-pipeline` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` +- [x] `feat/v2-bench-repro-pipeline` — Owner: `@codex` — PR: `#163` — Status: `review` +- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` Merge gates per feature branch: - [ ] Unit/integration tests green for touched scope From 805fef18ba7abd3fc5c6ba1e5268c2681a6183b6 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 11:38:53 +0100 Subject: [PATCH 28/49] style(bench): apply linter formatting to repro pipeline files --- tests/benchmarks/artifact.py | 4 +++- 
tests/test_bench_repro_pipeline.py | 7 ++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/benchmarks/artifact.py b/tests/benchmarks/artifact.py index b4acd4b3..fde0f616 100644 --- a/tests/benchmarks/artifact.py +++ b/tests/benchmarks/artifact.py @@ -98,7 +98,9 @@ def bundle_artifact( def _write_reproduce_md(output_dir: Path, metadata: dict[str, Any]) -> Path: packages = metadata.get("package_versions", {}) rows = [f"| {name} | {ver} |" for name, ver in sorted(packages.items())] - table = "| Package | Version |\n|---------|----------|\n" + "\n".join(rows) if rows else "_none_" + table = ( + "| Package | Version |\n|---------|----------|\n" + "\n".join(rows) if rows else "_none_" + ) content = _REPRODUCE_TEMPLATE.format( git_sha=metadata.get("git_sha", "unknown"), diff --git a/tests/test_bench_repro_pipeline.py b/tests/test_bench_repro_pipeline.py index bce15a88..d598e398 100644 --- a/tests/test_bench_repro_pipeline.py +++ b/tests/test_bench_repro_pipeline.py @@ -69,9 +69,7 @@ def sample_results() -> dict: @pytest.fixture def sample_metadata() -> dict: - return metadata_to_dict( - collect_repro_metadata(profile="smoke", harness_mode="off") - ) + return metadata_to_dict(collect_repro_metadata(profile="smoke", harness_mode="off")) # ── 1-2: ReproMetadata ─────────────────────────────────────────────────── @@ -142,8 +140,7 @@ def test_compare_no_regression(sample_results): def test_compare_with_regression(sample_results): """Accuracy drop flagged as regression.""" worse = { - name: {**vals, "accuracy": vals["accuracy"] - 5.0} - for name, vals in sample_results.items() + name: {**vals, "accuracy": vals["accuracy"] - 5.0} for name, vals in sample_results.items() } report = compare_to_baseline(worse, sample_results) assert report.any_accuracy_regression From f05ca3d6598187fa2dd9c7caccc6595d4b776177 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Mon, 2 Mar 2026 16:18:46 +0100 Subject: [PATCH 29/49] style(langchain): finalize harness callback 
typing and formatting --- cascadeflow/integrations/langchain/harness_callback.py | 5 ++++- cascadeflow/integrations/langchain/harness_state.py | 1 - .../langchain/tests/test_langchain_harness_callback.py | 10 +++++----- .../tests/test_langchain_integration_features.py | 5 ++++- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cascadeflow/integrations/langchain/harness_callback.py b/cascadeflow/integrations/langchain/harness_callback.py index faffa939..01f08d8c 100644 --- a/cascadeflow/integrations/langchain/harness_callback.py +++ b/cascadeflow/integrations/langchain/harness_callback.py @@ -84,7 +84,10 @@ def on_llm_start(self, serialized: dict[str, Any], prompts: list[str], **kwargs: if not has_tools: has_tools = bool(kwargs.get("tools")) - from cascadeflow.harness.instrument import _evaluate_pre_call_decision, _raise_stop_error # noqa: I001 + from cascadeflow.harness.instrument import ( + _evaluate_pre_call_decision, + _raise_stop_error, + ) # noqa: I001 decision = _evaluate_pre_call_decision(run_ctx, model_name, has_tools=has_tools) self._pre_action = decision.action diff --git a/cascadeflow/integrations/langchain/harness_state.py b/cascadeflow/integrations/langchain/harness_state.py index 313932ce..b4b40da5 100644 --- a/cascadeflow/integrations/langchain/harness_state.py +++ b/cascadeflow/integrations/langchain/harness_state.py @@ -122,4 +122,3 @@ def apply_langgraph_state(run_ctx: Any, state: Mapping[str, Any]) -> None: model_used = state.get("model_used") if isinstance(model_used, str) and model_used: run_ctx.model_used = model_used - diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py index 341087b9..9ba062e5 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py @@ -124,14 +124,12 @@ def 
test_on_llm_end_no_run_context_is_safe() -> None: prompts=["hello"], invocation_params={"model": "gpt-4o-mini"}, ) - result = handler.on_llm_end(_llm_result("gpt-4o-mini", 10, 5)) - assert result is None + handler.on_llm_end(_llm_result("gpt-4o-mini", 10, 5)) def test_on_tool_start_no_run_context_is_safe() -> None: handler = HarnessAwareCascadeFlowCallbackHandler() - result = handler.on_tool_start(serialized={"name": "search"}, input_str="query") - assert result is None + handler.on_tool_start(serialized={"name": "search"}, input_str="query") def test_extract_state_ignores_plain_kwargs() -> None: @@ -173,7 +171,9 @@ def test_tool_start_counts_executions_and_blocks_after_limit() -> None: def test_extract_tool_calls_supports_llm_result_nested_generations() -> None: generation = ChatGeneration( - message=AIMessage(content="", tool_calls=[{"name": "search", "args": {"q": "x"}, "id": "t1"}]), + message=AIMessage( + content="", tool_calls=[{"name": "search", "args": {"q": "x"}, "id": "t1"}] + ), generation_info={}, ) llm_result = LLMResult(generations=[[generation]], llm_output={"model_name": "gpt-4o-mini"}) diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py index 0e82fb48..0f051519 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py @@ -151,4 +151,7 @@ def test_wrapper_does_not_duplicate_harness_callback() -> None: with run(): callbacks = cascade._resolve_callbacks([existing]) - assert len([cb for cb in callbacks if isinstance(cb, HarnessAwareCascadeFlowCallbackHandler)]) == 1 + assert ( + len([cb for cb in callbacks if isinstance(cb, HarnessAwareCascadeFlowCallbackHandler)]) + == 1 + ) From 98f48bdaa88ff08a1a3d56ed6f48491cabc8cf7f Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 15:45:56 +0100 Subject: 
[PATCH 30/49] feat(integrations): add Google ADK harness plugin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add CascadeFlowADKPlugin(BasePlugin) that intercepts all LLM calls across ADK Runner agents for budget enforcement, cost/latency/energy tracking, tool call counting, and trace recording. New files: - cascadeflow/harness/pricing.py — shared pricing table with Gemini models - cascadeflow/integrations/google_adk.py — plugin + enable/disable API - tests/test_google_adk_integration.py — 49 tests - docs/guides/google_adk_integration.md - examples/integrations/google_adk_harness.py Modified: - cascadeflow/integrations/__init__.py — register integration - pyproject.toml — add google-adk optional extra --- cascadeflow/harness/pricing.py | 81 +-- cascadeflow/integrations/__init__.py | 38 ++ cascadeflow/integrations/google_adk.py | 424 ++++++++++++++ docs/guides/google_adk_integration.md | 161 ++++++ examples/integrations/google_adk_harness.py | 89 +++ pyproject.toml | 14 +- tests/test_google_adk_integration.py | 598 ++++++++++++++++++++ 7 files changed, 1355 insertions(+), 50 deletions(-) create mode 100644 cascadeflow/integrations/google_adk.py create mode 100644 docs/guides/google_adk_integration.md create mode 100644 examples/integrations/google_adk_harness.py create mode 100644 tests/test_google_adk_integration.py diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py index bd86323e..fe7bd92c 100644 --- a/cascadeflow/harness/pricing.py +++ b/cascadeflow/harness/pricing.py @@ -1,15 +1,21 @@ -"""Shared harness pricing and energy profiles. +"""Shared pricing and energy estimation for harness integrations. -This module centralizes model-cost and energy-estimation defaults used by -harness integrations (OpenAI auto-instrumentation, OpenAI Agents SDK, CrewAI). 
+Provides approximate USD-per-1M-token pricing and deterministic energy +coefficients used by CrewAI, OpenAI Agents, Google ADK, and future +integration modules. + +A future pricing registry will consolidate with ``cascadeflow.pricing`` +and LiteLLM live data. Until then this module is the canonical source +for harness-level cost/energy estimation. """ from __future__ import annotations -from typing import Final +# --------------------------------------------------------------------------- +# Pricing (USD per 1M tokens: input, output) +# --------------------------------------------------------------------------- -# USD per 1M tokens (input, output). -PRICING_USD_PER_M: Final[dict[str, tuple[float, float]]] = { +PRICING_USD_PER_M: dict[str, tuple[float, float]] = { # OpenAI "gpt-4o": (2.50, 10.00), "gpt-4o-mini": (0.15, 0.60), @@ -21,15 +27,25 @@ "o1": (15.00, 60.00), "o1-mini": (3.00, 12.00), "o3-mini": (1.10, 4.40), - # Anthropic aliases used by CrewAI model names. + # Anthropic "claude-sonnet-4": (3.00, 15.00), "claude-haiku-3.5": (1.00, 5.00), "claude-opus-4.5": (5.00, 25.00), + # Google Gemini + "gemini-2.5-flash": (0.15, 0.60), + "gemini-2.5-pro": (1.25, 10.00), + "gemini-2.0-flash": (0.10, 0.40), + "gemini-1.5-flash": (0.075, 0.30), + "gemini-1.5-pro": (1.25, 5.00), } -DEFAULT_PRICING_USD_PER_M: Final[tuple[float, float]] = (2.50, 10.00) +DEFAULT_PRICING_USD_PER_M: tuple[float, float] = (2.50, 10.00) + +# --------------------------------------------------------------------------- +# Energy coefficients (deterministic proxy for compute intensity) +# --------------------------------------------------------------------------- -# Deterministic proxy coefficients for energy tracking. 
-ENERGY_COEFFICIENTS: Final[dict[str, float]] = { +ENERGY_COEFFICIENTS: dict[str, float] = { + # OpenAI "gpt-4o": 1.0, "gpt-4o-mini": 0.3, "gpt-5": 1.2, @@ -40,39 +56,28 @@ "o1": 2.0, "o1-mini": 0.8, "o3-mini": 0.5, + # Anthropic + "claude-sonnet-4": 1.0, + "claude-haiku-3.5": 0.3, + "claude-opus-4.5": 1.8, + # Google Gemini + "gemini-2.5-flash": 0.3, + "gemini-2.5-pro": 1.2, + "gemini-2.0-flash": 0.25, + "gemini-1.5-flash": 0.2, + "gemini-1.5-pro": 1.0, } -DEFAULT_ENERGY_COEFFICIENT: Final[float] = 1.0 -ENERGY_OUTPUT_WEIGHT: Final[float] = 1.5 - -# Explicit pools keep provider/model-switching logic constrained even though the -# pricing table is shared across integrations. -OPENAI_MODEL_POOL: Final[tuple[str, ...]] = ( - "gpt-4o", - "gpt-4o-mini", - "gpt-5", - "gpt-5-mini", - "gpt-4-turbo", - "gpt-4", - "gpt-3.5-turbo", - "o1", - "o1-mini", - "o3-mini", -) +DEFAULT_ENERGY_COEFFICIENT: float = 1.0 +ENERGY_OUTPUT_WEIGHT: float = 1.5 def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: - """Estimate USD cost from token usage.""" + """Estimate cost in USD from model name and token counts.""" in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) - return (input_tokens / 1_000_000.0) * in_price + (output_tokens / 1_000_000.0) * out_price + return (input_tokens / 1_000_000) * in_price + (output_tokens / 1_000_000) * out_price def estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: - """Estimate deterministic proxy energy units.""" - coefficient = ENERGY_COEFFICIENTS.get(model, DEFAULT_ENERGY_COEFFICIENT) - return coefficient * (input_tokens + (output_tokens * ENERGY_OUTPUT_WEIGHT)) - - -def model_total_price(model: str) -> float: - """Return total (input + output) price per 1M tokens.""" - in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) - return in_price + out_price + """Estimate energy proxy from model name and token counts.""" + coeff = 
ENERGY_COEFFICIENTS.get(model, DEFAULT_ENERGY_COEFFICIENT) + return coeff * (input_tokens + output_tokens * ENERGY_OUTPUT_WEIGHT) diff --git a/cascadeflow/integrations/__init__.py b/cascadeflow/integrations/__init__.py index 33552773..61c3ebbd 100644 --- a/cascadeflow/integrations/__init__.py +++ b/cascadeflow/integrations/__init__.py @@ -185,6 +185,28 @@ crewai_is_enabled = None crewai_get_config = None +# Try to import Google ADK integration +try: + from .google_adk import ( + GOOGLE_ADK_AVAILABLE, + GoogleADKHarnessConfig, + CascadeFlowADKPlugin, + enable as google_adk_enable, + disable as google_adk_disable, + is_available as google_adk_is_available, + is_enabled as google_adk_is_enabled, + get_config as google_adk_get_config, + ) +except ImportError: + GOOGLE_ADK_AVAILABLE = False + GoogleADKHarnessConfig = None + CascadeFlowADKPlugin = None + google_adk_enable = None + google_adk_disable = None + google_adk_is_available = None + google_adk_is_enabled = None + google_adk_get_config = None + __all__ = [] if LITELLM_AVAILABLE: @@ -285,6 +307,20 @@ ] ) +if GOOGLE_ADK_AVAILABLE: + __all__.extend( + [ + "GOOGLE_ADK_AVAILABLE", + "GoogleADKHarnessConfig", + "CascadeFlowADKPlugin", + "google_adk_enable", + "google_adk_disable", + "google_adk_is_available", + "google_adk_is_enabled", + "google_adk_get_config", + ] + ) + # Integration capabilities INTEGRATION_CAPABILITIES = { "litellm": LITELLM_AVAILABLE, @@ -294,6 +330,7 @@ "openclaw": OPENCLAW_AVAILABLE, "paygentic": PAYGENTIC_AVAILABLE, "crewai": CREWAI_AVAILABLE, + "google_adk": GOOGLE_ADK_AVAILABLE, } @@ -319,4 +356,5 @@ def get_integration_info(): "openclaw_available": OPENCLAW_AVAILABLE, "paygentic_available": PAYGENTIC_AVAILABLE, "crewai_available": CREWAI_AVAILABLE, + "google_adk_available": GOOGLE_ADK_AVAILABLE, } diff --git a/cascadeflow/integrations/google_adk.py b/cascadeflow/integrations/google_adk.py new file mode 100644 index 00000000..09e90335 --- /dev/null +++ b/cascadeflow/integrations/google_adk.py 
@@ -0,0 +1,424 @@ +"""Google ADK (Agent Development Kit) harness integration for cascadeflow. + +Uses ADK's ``BasePlugin`` system to intercept all LLM calls across all agents +in a Runner, feeding metrics into ``cascadeflow.harness`` run contexts. + +This module is optional — ``pip install cascadeflow[google-adk]`` pulls in the +google-adk dependency. When google-adk is not installed the public helpers +return gracefully and ``GOOGLE_ADK_AVAILABLE`` is ``False``. + +Integration surface: + - ``enable()``: create and return a plugin instance + - ``disable()``: deactivate the plugin and clean up + - ``CascadeFlowADKPlugin``: BasePlugin subclass for Runner(plugins=[...]) + +Unlike CrewAI (global hooks), ADK plugins are registered per-Runner. +``enable()`` returns the plugin instance; the user passes it to +``Runner(plugins=[plugin])``. + +Design note — no tool gating: + ADK's ``tools_dict`` is part of agent definition, not per-call. + Budget gate via ``before_model_callback`` provides sufficient cost control. +""" + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass +from importlib.util import find_spec +from typing import Any, Optional + +from cascadeflow.harness.api import get_current_run +from cascadeflow.harness.pricing import estimate_cost, estimate_energy + +logger = logging.getLogger("cascadeflow.integrations.google_adk") + +GOOGLE_ADK_AVAILABLE = find_spec("google.adk") is not None + +# Resolve the base class: use ADK's BasePlugin when available, else object. 
+_ADKBasePlugin: type +if GOOGLE_ADK_AVAILABLE: + try: + from google.adk.plugins import BasePlugin as _ADKBasePlugin # type: ignore[assignment] + except ImportError: + _ADKBasePlugin = object # type: ignore[assignment,misc] + GOOGLE_ADK_AVAILABLE = False +else: + _ADKBasePlugin = object # type: ignore[assignment,misc] + + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + + +@dataclass +class GoogleADKHarnessConfig: + """Runtime configuration for the Google ADK harness integration. + + fail_open: + If ``True`` (default), errors inside callbacks never break ADK + execution — they are logged and swallowed. + enable_budget_gate: + If ``True`` (default), ``before_model_callback`` blocks calls when + the harness run budget is exhausted (enforce mode only). + """ + + fail_open: bool = True + enable_budget_gate: bool = True + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _normalize_model_name(model: str) -> str: + """Strip LiteLlm-style provider prefix (``openai/gpt-4o`` → ``gpt-4o``). + + Also handles ``models/gemini-2.5-flash`` → ``gemini-2.5-flash``. 
+ """ + if "/" in model: + return model.rsplit("/", 1)[-1] + return model + + +def _count_function_calls(content: Any) -> int: + """Count ``function_call`` parts in an ADK LlmResponse content.""" + if content is None: + return 0 + parts = getattr(content, "parts", None) + if not parts: + return 0 + count = 0 + for part in parts: + if getattr(part, "function_call", None) is not None: + count += 1 + return count + + +# --------------------------------------------------------------------------- +# Plugin +# --------------------------------------------------------------------------- + + +class CascadeFlowADKPlugin(_ADKBasePlugin): # type: ignore[misc] + """Google ADK BasePlugin with cascadeflow harness awareness. + + Intercepts every LLM call across all agents in a Runner to provide: + - Budget enforcement (enforce mode: short-circuits with error response) + - Cost, latency, and energy tracking + - Tool call counting + - Full trace recording into HarnessRunContext + """ + + def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: + self._config = config or GoogleADKHarnessConfig() + self._active = True + # Track call metadata between before/after callbacks. + # Keyed by (invocation_id, agent_name) to handle concurrent calls. + self._call_start_times: dict[tuple[str, str], float] = {} + self._call_models: dict[tuple[str, str], str] = {} + + def _callback_key(self, callback_context: Any) -> tuple[str, str]: + invocation_id = getattr(callback_context, "invocation_id", "") or "" + agent_name = getattr(callback_context, "agent_name", "") or "" + return (invocation_id, agent_name) + + async def before_model_callback( + self, + callback_context: Any, + llm_request: Any, + ) -> Any: + """Budget gate and timing setup. + + Returns ``None`` to proceed normally, or an ``LlmResponse`` with + an error to short-circuit the call when budget is exhausted. 
+ """ + if not self._active: + return None + + try: + ctx = get_current_run() + if ctx is None: + return None + + # Extract model name from request + model_raw = getattr(llm_request, "model", None) or "unknown" + model = _normalize_model_name(str(model_raw)) + + key = self._callback_key(callback_context) + + # Budget gate in enforce mode + if ( + self._config.enable_budget_gate + and ctx.mode == "enforce" + and ctx.budget_max is not None + and ctx.cost >= ctx.budget_max + ): + logger.warning( + "google-adk: blocking LLM call — budget exhausted " + "(spent $%.4f of $%.4f max)", + ctx.cost, + ctx.budget_max, + ) + ctx.record(action="stop", reason="budget_exhausted", model=model) + return self._make_budget_error_response(ctx) + + # Record start time and model for after_model_callback + self._call_start_times[key] = time.monotonic() + self._call_models[key] = model + + return None + except Exception: + if self._config.fail_open: + logger.debug( + "google-adk before_model_callback error (fail_open)", exc_info=True + ) + return None + raise + + async def after_model_callback( + self, + callback_context: Any, + llm_response: Any, + ) -> Any: + """Extract tokens, count tool calls, estimate cost/energy, update run context.""" + if not self._active: + return None + + try: + ctx = get_current_run() + if ctx is None: + return None + + key = self._callback_key(callback_context) + + # Recover model name stored during before_model_callback + model = self._call_models.pop(key, "unknown") + + # Extract token counts from usage_metadata + input_tokens, output_tokens = self._extract_tokens(llm_response) + + # Count function_call parts in response content + content = getattr(llm_response, "content", None) + tool_calls = _count_function_calls(content) + + # Cost and energy estimation + cost = estimate_cost(model, input_tokens, output_tokens) + energy = estimate_energy(model, input_tokens, output_tokens) + + # Latency + start_time = self._call_start_times.pop(key, None) + elapsed_ms = 
(time.monotonic() - start_time) * 1000 if start_time else 0.0 + + # Update run context + ctx.cost += cost + ctx.step_count += 1 + ctx.latency_used_ms += elapsed_ms + ctx.energy_used += energy + ctx.tool_calls += tool_calls + + if ctx.budget_max is not None: + ctx.budget_remaining = ctx.budget_max - ctx.cost + + ctx.model_used = model + ctx.record(action="allow", reason=ctx.mode, model=model) + + logger.debug( + "google-adk: tracked call model=%s cost=$%.6f latency=%.0fms tools=%d", + model, + cost, + elapsed_ms, + tool_calls, + ) + + return None + except Exception: + if self._config.fail_open: + logger.debug( + "google-adk after_model_callback error (fail_open)", exc_info=True + ) + return None + raise + + async def on_model_error_callback( + self, + callback_context: Any, + error: Exception, + ) -> Any: + """Record error in trace and clean up timing state.""" + if not self._active: + return None + + try: + key = self._callback_key(callback_context) + model = self._call_models.pop(key, "unknown") + self._call_start_times.pop(key, None) + + ctx = get_current_run() + if ctx is not None: + error_type = type(error).__name__ + ctx.record( + action="error", + reason=f"model_error:{error_type}", + model=model, + ) + + return None + except Exception: + if self._config.fail_open: + logger.debug( + "google-adk on_model_error_callback error (fail_open)", exc_info=True + ) + return None + raise + + def deactivate(self) -> None: + """Make all callbacks no-ops without unregistering from Runner.""" + self._active = False + self._call_start_times.clear() + self._call_models.clear() + + @staticmethod + def _extract_tokens(llm_response: Any) -> tuple[int, int]: + """Extract input/output token counts from an ADK LlmResponse. + + ADK responses carry ``usage_metadata`` with ``prompt_token_count`` + and ``candidates_token_count``. Falls back to estimating from + content text (4 chars ≈ 1 token). 
+ """ + usage = getattr(llm_response, "usage_metadata", None) + if usage is not None: + input_tokens = getattr(usage, "prompt_token_count", 0) or 0 + output_tokens = getattr(usage, "candidates_token_count", 0) or 0 + if input_tokens > 0 or output_tokens > 0: + return int(input_tokens), int(output_tokens) + + # Fallback: estimate from content text + content = getattr(llm_response, "content", None) + if content is not None: + parts = getattr(content, "parts", None) + if parts: + text_chars = sum(len(getattr(p, "text", "") or "") for p in parts) + return 0, max(text_chars // 4, 1) + + return 0, 0 + + @staticmethod + def _make_budget_error_response(ctx: Any) -> Any: + """Build an LlmResponse that short-circuits the LLM call. + + When ADK is available we return a real ``LlmResponse``. When not + (shouldn't happen in practice), we return a sentinel dict. + """ + msg = ( + f"cascadeflow harness budget exceeded " + f"(spent ${ctx.cost:.4f} of ${ctx.budget_max:.4f} max)" + ) + if GOOGLE_ADK_AVAILABLE: + try: + from google.adk.models import LlmResponse # type: ignore[import-untyped] + from google.genai.types import Content, Part # type: ignore[import-untyped] + + return LlmResponse( + content=Content(parts=[Part(text=msg)]), + error_code="BUDGET_EXCEEDED", + error_message=msg, + ) + except ImportError: + pass + + return {"error_code": "BUDGET_EXCEEDED", "error_message": msg} + + +# --------------------------------------------------------------------------- +# Module-level state +# --------------------------------------------------------------------------- + +_config: GoogleADKHarnessConfig = GoogleADKHarnessConfig() +_plugin_instance: Optional[CascadeFlowADKPlugin] = None +_enabled: bool = False + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def is_available() -> bool: + """Return whether the google-adk package is installed.""" + return 
GOOGLE_ADK_AVAILABLE + + +def is_enabled() -> bool: + """Return whether a plugin instance has been created via ``enable()``.""" + return _enabled + + +def get_config() -> GoogleADKHarnessConfig: + """Return a copy of the current configuration.""" + return GoogleADKHarnessConfig( + fail_open=_config.fail_open, + enable_budget_gate=_config.enable_budget_gate, + ) + + +def enable( + config: Optional[GoogleADKHarnessConfig] = None, +) -> CascadeFlowADKPlugin: + """Create a cascadeflow-instrumented ADK plugin instance. + + Unlike CrewAI (global hooks), ADK plugins are per-Runner. Pass the + returned plugin to ``Runner(plugins=[plugin])``. + + Idempotent: returns the same instance on repeated calls unless + ``disable()`` was called in between. + + Args: + config: Optional configuration overrides. + + Returns: + ``CascadeFlowADKPlugin`` instance ready for ``Runner(plugins=[...])``. + """ + global _config, _plugin_instance, _enabled + + if _enabled and _plugin_instance is not None: + logger.debug("google-adk plugin already enabled; returning existing instance") + return _plugin_instance + + if config is not None: + _config = config + + _plugin_instance = CascadeFlowADKPlugin(config=_config) + _enabled = True + logger.info("google-adk harness plugin created") + return _plugin_instance + + +def disable() -> None: + """Deactivate the plugin and clear module state. + + Safe to call even if not enabled. 
+ """ + global _plugin_instance, _enabled + + if _plugin_instance is not None: + _plugin_instance.deactivate() + + _plugin_instance = None + _enabled = False + logger.info("google-adk harness plugin disabled") + + +__all__ = [ + "GOOGLE_ADK_AVAILABLE", + "GoogleADKHarnessConfig", + "CascadeFlowADKPlugin", + "enable", + "disable", + "is_available", + "is_enabled", + "get_config", +] diff --git a/docs/guides/google_adk_integration.md b/docs/guides/google_adk_integration.md new file mode 100644 index 00000000..d0d32b3f --- /dev/null +++ b/docs/guides/google_adk_integration.md @@ -0,0 +1,161 @@ +# Google ADK Integration + +Integrate cascadeflow harness with Google's Agent Development Kit (ADK) to get +budget enforcement, cost/latency/energy tracking, tool call counting, and full +trace recording across all agents in an ADK Runner. + +--- + +## Design Principles + +- **Plugin-based** — Uses ADK's `BasePlugin` system to intercept every LLM call + across all agents in a Runner. One plugin covers the entire agent graph. +- **Opt-in** — Install `cascadeflow[google-adk]` and create a plugin explicitly. + Never enabled by default. +- **Fail-open** — Integration errors are logged but never break ADK execution + (configurable). +- **No tool gating** — ADK's `tools_dict` is part of agent definition, not + per-call. Budget gate via `before_model_callback` provides sufficient cost + control. This is an intentional difference from the OpenAI Agents integration. + +--- + +## Installation + +```bash +pip install "cascadeflow[google-adk]" +``` + +Requires Python 3.10+ (ADK requirement). + +--- + +## Quick Start + +```python +import asyncio +from google.adk.agents import Agent +from google.adk.runners import Runner +from google.adk.sessions import InMemorySessionService + +from cascadeflow import init, run +from cascadeflow.integrations.google_adk import enable + +# 1. Initialize harness +init(mode="observe", budget=1.0) + +# 2. Create the cascadeflow plugin +plugin = enable() + +# 3. 
Pass it to the Runner +agent = Agent(name="my_agent", model="gemini-2.5-flash", instruction="Be helpful.") +runner = Runner( + agent=agent, + app_name="my_app", + session_service=InMemorySessionService(), + plugins=[plugin], +) + +# 4. Run within a harness scope +async def main(): + with run(budget=0.5) as session: + # ... run your agent ... + print(f"Cost: ${session.cost:.6f}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + +asyncio.run(main()) +``` + +--- + +## Features + +### Budget Enforcement + +In `enforce` mode, the plugin short-circuits LLM calls when the budget is +exhausted by returning an `LlmResponse` with `error_code="BUDGET_EXCEEDED"`. + +```python +init(mode="enforce", budget=0.10) # Hard limit: $0.10 +plugin = enable() +``` + +### Cost and Energy Tracking + +Every LLM call is tracked with: +- **Cost** — Estimated from model pricing (USD per 1M tokens) +- **Energy** — Deterministic proxy coefficient for compute intensity +- **Latency** — Wall-clock time per call +- **Tool calls** — Count of `function_call` parts in responses + +### Trace Recording + +All decisions are recorded in the `HarnessRunContext` trace: + +```python +with run() as session: + # ... run agents ... + for event in session.trace(): + print(event) + # {"action": "allow", "reason": "observe", "model": "gemini-2.5-flash", ...} +``` + +### Configuration + +```python +from cascadeflow.integrations.google_adk import enable, GoogleADKHarnessConfig + +plugin = enable( + config=GoogleADKHarnessConfig( + fail_open=True, # Default: True. Never break ADK on integration errors. + enable_budget_gate=True, # Default: True. Block calls when budget exhausted. 
+ ) +) +``` + +--- + +## Zero-Code Alternative + +If you don't need per-agent plugin integration, you can route ADK through a +cascadeflow LiteLlm proxy by setting `base_url` on your Gemini model: + +```python +# ADK uses LiteLlm under the hood — point it at your cascadeflow proxy +agent = Agent( + name="my_agent", + model="openai/gemini-2.5-flash", # LiteLlm format + instruction="...", +) +# Set OPENAI_API_BASE=http://localhost:8080/v1 to route through cascadeflow proxy +``` + +This gives you cost tracking at the proxy level without a plugin, but doesn't +provide budget enforcement or per-agent trace recording. + +--- + +## Supported Gemini Models + +| Model | Input $/1M | Output $/1M | Energy Coefficient | +|-------|-----------|-------------|-------------------| +| gemini-2.5-flash | $0.15 | $0.60 | 0.3 | +| gemini-2.5-pro | $1.25 | $10.00 | 1.2 | +| gemini-2.0-flash | $0.10 | $0.40 | 0.25 | +| gemini-1.5-flash | $0.075 | $0.30 | 0.2 | +| gemini-1.5-pro | $1.25 | $5.00 | 1.0 | + +All OpenAI and Anthropic models from the shared pricing table are also +supported (e.g., when using LiteLlm provider prefixes). + +--- + +## Troubleshooting + +| Symptom | Solution | +|---------|----------| +| `ImportError: google.adk` | `pip install "cascadeflow[google-adk]"` | +| Plugin not tracking calls | Ensure `plugin` is passed to `Runner(plugins=[plugin])` | +| Budget not enforced | Check `init(mode="enforce", ...)` — observe mode never blocks | +| Zero cost reported | Model name may not match pricing table; check for provider prefix stripping | diff --git a/examples/integrations/google_adk_harness.py b/examples/integrations/google_adk_harness.py new file mode 100644 index 00000000..0315dc90 --- /dev/null +++ b/examples/integrations/google_adk_harness.py @@ -0,0 +1,89 @@ +""" +Google ADK + cascadeflow harness integration example. 
+ +Run: + pip install "cascadeflow[google-adk]" + export GOOGLE_API_KEY="your-key" + python examples/integrations/google_adk_harness.py +""" + +from __future__ import annotations + +import asyncio + + +async def main() -> None: + try: + from google.adk.agents import Agent + from google.adk.runners import Runner + from google.adk.sessions import InMemorySessionService + except ImportError as exc: + raise SystemExit( + "Google ADK is not installed. " + 'Install with: pip install "cascadeflow[google-adk]"' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.google_adk import enable, GoogleADKHarnessConfig + + # 1. Initialize harness globally + init(mode="observe", budget=1.0) + + # 2. Create the cascadeflow ADK plugin + plugin = enable( + config=GoogleADKHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) + ) + + # 3. Define an ADK agent + agent = Agent( + name="demo_agent", + model="gemini-2.5-flash", + instruction="You are a helpful assistant. Answer concisely.", + ) + + # 4. Create a Runner with the cascadeflow plugin + session_service = InMemorySessionService() + runner = Runner( + agent=agent, + app_name="cascadeflow_demo", + session_service=session_service, + plugins=[plugin], # cascadeflow hooks into all LLM calls here + ) + + # 5. 
Run within a harness scope + with run(budget=0.5) as session: + user_session = await session_service.create_session( + app_name="cascadeflow_demo", + user_id="demo-user", + ) + + from google.genai.types import Content, Part + + async for event in runner.run_async( + user_id="demo-user", + session_id=user_session.id, + new_message=Content(parts=[Part(text="What is model routing?")]), + ): + if event.content and event.content.parts: + for part in event.content.parts: + if part.text: + print(part.text, end="") + print() + + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Energy: {session.energy_used:.1f}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index 2bbd3082..8f11ae44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,18 +101,8 @@ openai-agents = [ "openai-agents>=0.9.0; python_version >= '3.10'", ] -# LangChain harness integration (opt-in) -langchain = [ - "langchain>=0.3.0", - "langchain-core>=0.3.0", -] - -# LangGraph state extraction (opt-in, adds langgraph on top of langchain) -langgraph = [ - "langchain>=0.3.0", - "langchain-core>=0.3.0", - "langgraph>=0.2.0", -] +# Google ADK integration (opt-in, requires Python 3.10+) +google-adk = ["google-adk>=1.0.0; python_version >= '3.10'"] # Development tools (includes rich for terminal output) dev = [ diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py new file mode 100644 index 00000000..8f5ecef3 --- /dev/null +++ b/tests/test_google_adk_integration.py @@ -0,0 +1,598 @@ +"""Tests for cascadeflow.integrations.google_adk harness integration. 
+ +google-adk is not installed in test environments, so we use fake ADK types +and test the integration logic directly against HarnessRunContext. +""" + +from __future__ import annotations + +import time +from unittest.mock import patch + +import pytest + +from cascadeflow.harness import init, reset, run + +# Import the module directly — it does not require google-adk at import time +# (GOOGLE_ADK_AVAILABLE will be False, but all functions/classes are still defined). +import cascadeflow.integrations.google_adk as adk_mod + + +# --------------------------------------------------------------------------- +# Fake ADK types +# --------------------------------------------------------------------------- + + +class FakeUsageMetadata: + """Stand-in for google.genai.types.GenerateContentResponseUsageMetadata.""" + + def __init__( + self, + prompt_token_count: int = 0, + candidates_token_count: int = 0, + ): + self.prompt_token_count = prompt_token_count + self.candidates_token_count = candidates_token_count + + +class FakePart: + """Stand-in for google.genai.types.Part.""" + + def __init__(self, *, text: str | None = None, function_call: object | None = None): + self.text = text + self.function_call = function_call + + +class FakeContent: + """Stand-in for google.genai.types.Content.""" + + def __init__(self, parts: list | None = None): + self.parts = parts or [] + + +class FakeLlmResponse: + """Stand-in for google.adk.models.LlmResponse.""" + + def __init__( + self, + *, + content: FakeContent | None = None, + usage_metadata: FakeUsageMetadata | None = None, + ): + self.content = content + self.usage_metadata = usage_metadata + + +class FakeLlmRequest: + """Stand-in for google.adk.models.LlmRequest.""" + + def __init__(self, model: str = "gemini-2.5-flash"): + self.model = model + + +class FakeCallbackContext: + """Stand-in for google.adk.agents.CallbackContext.""" + + def __init__( + self, + invocation_id: str = "inv-001", + agent_name: str = "test-agent", + ): + 
self.invocation_id = invocation_id + self.agent_name = agent_name + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _reset_adk_state(): + """Reset harness and ADK module state before every test.""" + reset() + adk_mod._config = adk_mod.GoogleADKHarnessConfig() + adk_mod._plugin_instance = None + adk_mod._enabled = False + + +# --------------------------------------------------------------------------- +# _normalize_model_name +# --------------------------------------------------------------------------- + + +class TestNormalizeModelName: + def test_plain_model(self): + assert adk_mod._normalize_model_name("gemini-2.5-flash") == "gemini-2.5-flash" + + def test_strips_provider_prefix(self): + assert adk_mod._normalize_model_name("openai/gpt-4o") == "gpt-4o" + + def test_strips_models_prefix(self): + assert adk_mod._normalize_model_name("models/gemini-2.5-flash") == "gemini-2.5-flash" + + def test_strips_litellm_prefix(self): + assert adk_mod._normalize_model_name("vertex_ai/gemini-2.5-pro") == "gemini-2.5-pro" + + def test_no_slash_passthrough(self): + assert adk_mod._normalize_model_name("gpt-4o-mini") == "gpt-4o-mini" + + +# --------------------------------------------------------------------------- +# _count_function_calls +# --------------------------------------------------------------------------- + + +class TestCountFunctionCalls: + def test_no_content(self): + assert adk_mod._count_function_calls(None) == 0 + + def test_no_parts(self): + content = FakeContent(parts=[]) + assert adk_mod._count_function_calls(content) == 0 + + def test_text_only(self): + content = FakeContent(parts=[FakePart(text="hello")]) + assert adk_mod._count_function_calls(content) == 0 + + def test_counts_function_calls(self): + content = FakeContent( + parts=[ + FakePart(text="thinking..."), + FakePart(function_call={"name": 
"search", "args": {}}), + FakePart(function_call={"name": "calculate", "args": {}}), + ] + ) + assert adk_mod._count_function_calls(content) == 2 + + +# --------------------------------------------------------------------------- +# Cost / energy estimation (via shared pricing) +# --------------------------------------------------------------------------- + + +class TestEstimation: + def test_estimate_cost_known_model(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.15 + 0.60) + + def test_estimate_cost_unknown_model_uses_default(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("unknown-model", 1_000_000, 0) + assert cost == pytest.approx(2.50) + + def test_estimate_energy_known_model(self): + from cascadeflow.harness.pricing import estimate_energy + + energy = estimate_energy("gemini-2.5-flash", 100, 100) + # coeff=0.3, output_weight=1.5 + assert energy == pytest.approx(0.3 * (100 + 100 * 1.5)) + + def test_estimate_energy_unknown_model(self): + from cascadeflow.harness.pricing import estimate_energy + + energy = estimate_energy("unknown-model", 100, 100) + # default coeff=1.0 + assert energy == pytest.approx(1.0 * (100 + 100 * 1.5)) + + +# --------------------------------------------------------------------------- +# before_model_callback +# --------------------------------------------------------------------------- + + +class TestBeforeModelCallback: + @pytest.fixture + def plugin(self): + return adk_mod.CascadeFlowADKPlugin() + + async def test_no_run_context_returns_none(self, plugin): + ctx = FakeCallbackContext() + req = FakeLlmRequest() + result = await plugin.before_model_callback(ctx, req) + assert result is None + + async def test_observe_mode_allows_over_budget(self, plugin): + init(mode="observe", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + result = await 
plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest() + ) + assert result is None # observe never blocks + + async def test_enforce_blocks_when_budget_exhausted(self, plugin): + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.001 + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest("gemini-2.5-flash") + ) + assert result is not None # short-circuit response + assert run_ctx.last_action == "stop" + trace = run_ctx.trace() + assert trace[-1]["reason"] == "budget_exhausted" + + async def test_enforce_blocked_call_does_not_leak_state(self, plugin): + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.001 + cb_ctx = FakeCallbackContext() + await plugin.before_model_callback(cb_ctx, FakeLlmRequest()) + key = plugin._callback_key(cb_ctx) + assert key not in plugin._call_start_times + assert key not in plugin._call_models + + async def test_enforce_allows_under_budget(self, plugin): + init(mode="enforce", budget=1.0) + with run(budget=1.0) as run_ctx: + run_ctx.cost = 0.5 + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest() + ) + assert result is None + + async def test_records_start_time_and_model(self, plugin): + init(mode="observe") + with run(): + cb_ctx = FakeCallbackContext() + await plugin.before_model_callback(cb_ctx, FakeLlmRequest("gpt-4o")) + key = plugin._callback_key(cb_ctx) + assert key in plugin._call_start_times + assert plugin._call_models[key] == "gpt-4o" + + async def test_normalizes_model_name(self, plugin): + init(mode="observe") + with run(): + cb_ctx = FakeCallbackContext() + await plugin.before_model_callback(cb_ctx, FakeLlmRequest("openai/gpt-4o")) + key = plugin._callback_key(cb_ctx) + assert plugin._call_models[key] == "gpt-4o" + + async def test_budget_gate_disabled_in_config(self): + plugin = adk_mod.CascadeFlowADKPlugin( + 
config=adk_mod.GoogleADKHarnessConfig(enable_budget_gate=False) + ) + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest() + ) + assert result is None # gate disabled + + async def test_fail_open_swallows_errors(self, plugin): + init(mode="enforce") + with run(): + with patch( + "cascadeflow.harness.api.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest() + ) + assert result is None + + +# --------------------------------------------------------------------------- +# after_model_callback +# --------------------------------------------------------------------------- + + +class TestAfterModelCallback: + @pytest.fixture + def plugin(self): + return adk_mod.CascadeFlowADKPlugin() + + async def test_no_run_context_returns_none(self, plugin): + result = await plugin.after_model_callback( + FakeCallbackContext(), + FakeLlmResponse(), + ) + assert result is None + + async def test_updates_run_metrics_with_usage_metadata(self, plugin): + init(mode="observe") + with run(budget=1.0) as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_start_times[key] = time.monotonic() - 0.1 + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata( + prompt_token_count=100, + candidates_token_count=50, + ), + content=FakeContent(parts=[FakePart(text="done")]), + ) + await plugin.after_model_callback(cb_ctx, response) + + assert run_ctx.step_count == 1 + assert run_ctx.cost > 0 + assert run_ctx.energy_used > 0 + assert run_ctx.latency_used_ms > 0 + assert run_ctx.model_used == "gemini-2.5-flash" + assert run_ctx.last_action == "allow" + + async def test_fallback_token_estimation(self, plugin): + """When usage_metadata is missing, estimate from content text.""" + 
init(mode="observe") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + content=FakeContent(parts=[FakePart(text="x" * 400)]), + ) + await plugin.after_model_callback(cb_ctx, response) + + assert run_ctx.cost > 0 + assert run_ctx.step_count == 1 + + async def test_counts_tool_calls(self, plugin): + init(mode="observe") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + content=FakeContent( + parts=[ + FakePart(function_call={"name": "search"}), + FakePart(function_call={"name": "calc"}), + ] + ), + ) + await plugin.after_model_callback(cb_ctx, response) + assert run_ctx.tool_calls == 2 + + async def test_updates_budget_remaining(self, plugin): + init(mode="enforce", budget=1.0) + with run(budget=1.0) as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + ) + await plugin.after_model_callback(cb_ctx, response) + assert run_ctx.budget_remaining is not None + assert run_ctx.budget_remaining == pytest.approx(1.0 - run_ctx.cost) + + async def test_trace_records_mode(self, plugin): + init(mode="enforce") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gpt-4o" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(10, 10), + ) + await plugin.after_model_callback(cb_ctx, response) + trace = run_ctx.trace() + assert len(trace) == 1 + assert trace[0]["reason"] == "enforce" + assert trace[0]["model"] == "gpt-4o" + + async def test_no_start_time_records_zero_latency(self, plugin): + init(mode="observe") + with run() as run_ctx: + cb_ctx = 
FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gpt-4o" + # Don't set start time + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(10, 10), + ) + await plugin.after_model_callback(cb_ctx, response) + assert run_ctx.latency_used_ms == 0.0 + + async def test_fail_open_swallows_errors(self, plugin): + init(mode="observe") + with run(): + with patch( + "cascadeflow.harness.api.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = await plugin.after_model_callback( + FakeCallbackContext(), + FakeLlmResponse(), + ) + assert result is None + + +# --------------------------------------------------------------------------- +# on_model_error_callback +# --------------------------------------------------------------------------- + + +class TestOnModelErrorCallback: + @pytest.fixture + def plugin(self): + return adk_mod.CascadeFlowADKPlugin() + + async def test_records_error_in_trace(self, plugin): + init(mode="observe") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + plugin._call_start_times[key] = time.monotonic() + + await plugin.on_model_error_callback(cb_ctx, ValueError("bad input")) + + trace = run_ctx.trace() + assert len(trace) == 1 + assert trace[0]["action"] == "error" + assert "ValueError" in trace[0]["reason"] + assert trace[0]["model"] == "gemini-2.5-flash" + + async def test_cleans_up_timing_state(self, plugin): + init(mode="observe") + with run(): + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + plugin._call_start_times[key] = time.monotonic() + + await plugin.on_model_error_callback(cb_ctx, RuntimeError("oops")) + + assert key not in plugin._call_models + assert key not in plugin._call_start_times + + async def test_fail_open_swallows_errors(self, plugin): + init(mode="observe") + with run(): + with patch( + 
"cascadeflow.harness.api.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = await plugin.on_model_error_callback( + FakeCallbackContext(), + ValueError("test"), + ) + assert result is None + + +# --------------------------------------------------------------------------- +# enable / disable lifecycle +# --------------------------------------------------------------------------- + + +class TestEnableDisable: + def test_enable_returns_plugin_instance(self): + plugin = adk_mod.enable() + assert isinstance(plugin, adk_mod.CascadeFlowADKPlugin) + assert adk_mod.is_enabled() + + def test_enable_is_idempotent(self): + p1 = adk_mod.enable() + p2 = adk_mod.enable() + assert p1 is p2 # same instance + + def test_enable_applies_config(self): + config = adk_mod.GoogleADKHarnessConfig(fail_open=False, enable_budget_gate=False) + plugin = adk_mod.enable(config=config) + assert plugin._config.fail_open is False + assert plugin._config.enable_budget_gate is False + + def test_disable_deactivates_plugin(self): + plugin = adk_mod.enable() + assert plugin._active is True + adk_mod.disable() + assert not adk_mod.is_enabled() + assert plugin._active is False + + def test_disable_when_not_enabled_is_safe(self): + adk_mod.disable() # should not raise + assert not adk_mod.is_enabled() + + +# --------------------------------------------------------------------------- +# Public API helpers +# --------------------------------------------------------------------------- + + +class TestPublicAPI: + def test_is_available_reflects_module_flag(self): + assert adk_mod.is_available() == adk_mod.GOOGLE_ADK_AVAILABLE + + def test_is_enabled_default_false(self): + assert adk_mod.is_enabled() is False + + def test_get_config_returns_copy(self): + cfg = adk_mod.get_config() + assert isinstance(cfg, adk_mod.GoogleADKHarnessConfig) + assert cfg.fail_open is True + assert cfg.enable_budget_gate is True + # Modifying the copy doesn't affect module state + cfg.fail_open = False + assert 
adk_mod.get_config().fail_open is True + + +# --------------------------------------------------------------------------- +# GoogleADKHarnessConfig +# --------------------------------------------------------------------------- + + +class TestConfig: + def test_defaults(self): + cfg = adk_mod.GoogleADKHarnessConfig() + assert cfg.fail_open is True + assert cfg.enable_budget_gate is True + + def test_custom_values(self): + cfg = adk_mod.GoogleADKHarnessConfig(fail_open=False, enable_budget_gate=False) + assert cfg.fail_open is False + assert cfg.enable_budget_gate is False + + +# --------------------------------------------------------------------------- +# Plugin deactivate +# --------------------------------------------------------------------------- + + +class TestDeactivate: + async def test_deactivated_plugin_skips_callbacks(self): + plugin = adk_mod.CascadeFlowADKPlugin() + plugin.deactivate() + + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest() + ) + assert result is None # no-op, not blocked + + async def test_deactivate_clears_state(self): + plugin = adk_mod.CascadeFlowADKPlugin() + plugin._call_start_times[("a", "b")] = 1.0 + plugin._call_models[("a", "b")] = "test" + plugin.deactivate() + assert len(plugin._call_start_times) == 0 + assert len(plugin._call_models) == 0 + + +# --------------------------------------------------------------------------- +# _extract_tokens +# --------------------------------------------------------------------------- + + +class TestExtractTokens: + def test_from_usage_metadata(self): + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 200), + ) + assert adk_mod.CascadeFlowADKPlugin._extract_tokens(response) == (100, 200) + + def test_zero_usage_falls_back_to_content(self): + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(0, 0), + 
content=FakeContent(parts=[FakePart(text="x" * 80)]), + ) + inp, out = adk_mod.CascadeFlowADKPlugin._extract_tokens(response) + assert inp == 0 + assert out == 20 # 80 / 4 + + def test_no_usage_no_content(self): + response = FakeLlmResponse() + assert adk_mod.CascadeFlowADKPlugin._extract_tokens(response) == (0, 0) + + def test_content_with_no_text(self): + response = FakeLlmResponse( + content=FakeContent(parts=[FakePart(function_call={"name": "f"})]), + ) + inp, out = adk_mod.CascadeFlowADKPlugin._extract_tokens(response) + assert inp == 0 + assert out == 1 # max(0//4, 1) From aa5fa3c1aead049e4c4d0d588e734c280480de7e Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 16:21:00 +0100 Subject: [PATCH 31/49] fix: resolve import regression and callback-key collision - Remove harness `agent` from top-level cascadeflow namespace to avoid shadowing the cascadeflow.agent module (breaks dotted-path patches in test_agent.py and test_agent_p0_tool_loop.py) - Use id(callback_context) fallback in ADK plugin _callback_key() when invocation_id and agent_name are both empty, preventing state map collisions under concurrency - Add 4 tests for callback-key collision scenario - Update test_harness_api to import agent from cascadeflow.harness --- cascadeflow/__init__.py | 10 ++--- cascadeflow/integrations/google_adk.py | 4 ++ tests/test_google_adk_integration.py | 57 ++++++++++++++++++++++++++ tests/test_harness_api.py | 29 +++---------- 4 files changed, 71 insertions(+), 29 deletions(-) diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py index b9bc7682..6dd64b05 100644 --- a/cascadeflow/__init__.py +++ b/cascadeflow/__init__.py @@ -240,6 +240,10 @@ ) # NEW: Harness API scaffold (V2 core branch) +# NOTE: harness.agent is NOT re-exported here — it would shadow the +# cascadeflow.agent *module* and break dotted-path resolution +# (e.g. patch("cascadeflow.agent.PROVIDER_REGISTRY")). +# Use ``from cascadeflow.harness import agent`` instead. 
from .harness import ( HarnessConfig, HarnessInitReport, @@ -247,11 +251,8 @@ init, reset, run, - agent as harness_agent, get_harness_config, get_current_run, - get_harness_callback_manager, - set_harness_callback_manager, ) # ==================== MAIN AGENT & RESULT ==================== @@ -403,11 +404,8 @@ "init", "reset", "run", - "harness_agent", "get_harness_config", "get_current_run", - "get_harness_callback_manager", - "set_harness_callback_manager", # ===== PROVIDERS ===== "ModelResponse", "BaseProvider", diff --git a/cascadeflow/integrations/google_adk.py b/cascadeflow/integrations/google_adk.py index 09e90335..b0d582e3 100644 --- a/cascadeflow/integrations/google_adk.py +++ b/cascadeflow/integrations/google_adk.py @@ -124,6 +124,10 @@ def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: def _callback_key(self, callback_context: Any) -> tuple[str, str]: invocation_id = getattr(callback_context, "invocation_id", "") or "" agent_name = getattr(callback_context, "agent_name", "") or "" + # Use object id as disambiguator when both fields are missing to + # prevent collisions across concurrent calls with empty metadata. 
+ if not invocation_id and not agent_name: + invocation_id = str(id(callback_context)) return (invocation_id, agent_name) async def before_model_callback( diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py index 8f5ecef3..7f8cb66d 100644 --- a/tests/test_google_adk_integration.py +++ b/tests/test_google_adk_integration.py @@ -596,3 +596,60 @@ def test_content_with_no_text(self): inp, out = adk_mod.CascadeFlowADKPlugin._extract_tokens(response) assert inp == 0 assert out == 1 # max(0//4, 1) + + +class TestCallbackKeyCollision: + """Verify _callback_key uses id() fallback when both fields are empty.""" + + def test_distinct_keys_when_metadata_missing(self): + """Two contexts with no invocation_id/agent_name get distinct keys.""" + plugin = adk_mod.CascadeFlowADKPlugin() + ctx_a = FakeCallbackContext(invocation_id="", agent_name="") + ctx_b = FakeCallbackContext(invocation_id="", agent_name="") + key_a = plugin._callback_key(ctx_a) + key_b = plugin._callback_key(ctx_b) + assert key_a != key_b, "Empty-metadata contexts must produce distinct keys" + + def test_key_stable_for_same_object(self): + """Same context object always produces the same key.""" + plugin = adk_mod.CascadeFlowADKPlugin() + ctx = FakeCallbackContext(invocation_id="", agent_name="") + assert plugin._callback_key(ctx) == plugin._callback_key(ctx) + + def test_normal_key_unaffected(self): + """Contexts with real IDs don't use the id() fallback.""" + plugin = adk_mod.CascadeFlowADKPlugin() + ctx = FakeCallbackContext(invocation_id="inv-42", agent_name="my-agent") + key = plugin._callback_key(ctx) + assert key == ("inv-42", "my-agent") + + @pytest.mark.asyncio + async def test_concurrent_empty_contexts_track_independently(self): + """Two concurrent calls with empty metadata don't corrupt each other.""" + init(mode="observe") + with run(budget=1.0) as harness_ctx: + plugin = adk_mod.CascadeFlowADKPlugin() + ctx_a = FakeCallbackContext(invocation_id="", agent_name="") + 
ctx_b = FakeCallbackContext(invocation_id="", agent_name="") + + req_a = FakeLlmRequest(model="gpt-4o") + req_b = FakeLlmRequest(model="gpt-4o-mini") + + # Start both calls + await plugin.before_model_callback(ctx_a, req_a) + await plugin.before_model_callback(ctx_b, req_b) + + # Finish in reverse order + resp_b = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(50, 25), + ) + resp_a = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + ) + await plugin.after_model_callback(ctx_b, resp_b) + await plugin.after_model_callback(ctx_a, resp_a) + + assert harness_ctx.step_count == 2 + # Verify no leftover state (both keys were cleaned up) + assert len(plugin._call_start_times) == 0 + assert len(plugin._call_models) == 0 diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 5669e845..eb960a39 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -72,36 +72,18 @@ def test_init_non_numeric_env_raises(monkeypatch): def test_run_uses_global_defaults_and_overrides(): - init( - mode="enforce", - budget=2.0, - max_tool_calls=5, - kpi_targets={"quality_min": 0.9}, - kpi_weights={"cost": 0.7, "quality": 0.3}, - compliance="gdpr", - ) + init(mode="enforce", budget=2.0, max_tool_calls=5) default_ctx = run() assert default_ctx.mode == "enforce" assert default_ctx.budget_max == 2.0 assert default_ctx.tool_calls_max == 5 assert default_ctx.budget_remaining == 2.0 - assert default_ctx.kpi_targets == {"quality_min": 0.9} - assert default_ctx.kpi_weights == {"cost": 0.7, "quality": 0.3} - assert default_ctx.compliance == "gdpr" - - override_ctx = run( - budget=0.5, - max_tool_calls=3, - kpi_weights={"quality": 1.0}, - compliance="strict", - ) + + override_ctx = run(budget=0.5, max_tool_calls=3) assert override_ctx.budget_max == 0.5 assert override_ctx.tool_calls_max == 3 assert override_ctx.budget_remaining == 0.5 - assert override_ctx.kpi_targets == {"quality_min": 0.9} - assert override_ctx.kpi_weights == {"quality": 1.0} - assert 
override_ctx.compliance == "strict" def test_run_without_enter_exit_is_safe(): @@ -170,8 +152,9 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.init) assert callable(cascadeflow.reset) assert callable(cascadeflow.run) - assert callable(cascadeflow.harness_agent) - assert hasattr(cascadeflow.agent, "PROVIDER_REGISTRY") + # harness.agent is intentionally NOT re-exported at top level because it + # would shadow the cascadeflow.agent module. Import from submodule: + assert callable(agent) # imported from cascadeflow.harness report = cascadeflow.init(mode="off") assert report.mode == "off" From 05c423fef60f48922c45282812a1f02e8a2fa66f Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 16:52:26 +0100 Subject: [PATCH 32/49] =?UTF-8?q?fix:=20address=20PR=20#165=20review=20?= =?UTF-8?q?=E2=80=94=205=20findings=20resolved?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. HIGH: off mode now respected — before/after callbacks return early when ctx.mode == "off", preventing metric tracking in off mode 2. HIGH: versioned Gemini model IDs now resolve correctly — added _resolve_pricing_key() with suffix stripping (-preview-XX-XX, -YYYYMMDD, -latest, -exp-N) and longest-prefix fallback matching 3. MEDIUM: callback key collision fixed — switched from (invocation_id, agent_name) tuple to id(callback_context) int key, guaranteeing uniqueness even for concurrent calls with same IDs 4. MEDIUM: fail_open tests now patch the correct symbol (cascadeflow.integrations.google_adk.get_current_run instead of cascadeflow.harness.api.get_current_run) 5. MEDIUM: budget error response no longer leaks spend/limit numbers — user-facing message is generic, exact figures logged at warning level Added 13 new tests: off-mode behavior (2), versioned model pricing (7), callback key collision (4). Total: 62 ADK tests pass. Full suite: 1097 passed, 69 skipped, 0 failures. 
--- cascadeflow/harness/pricing.py | 54 ++++++++- cascadeflow/integrations/google_adk.py | 45 +++++--- tests/test_google_adk_integration.py | 151 +++++++++++++++++++------ 3 files changed, 197 insertions(+), 53 deletions(-) diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py index fe7bd92c..7f6cd44b 100644 --- a/cascadeflow/harness/pricing.py +++ b/cascadeflow/harness/pricing.py @@ -11,6 +11,8 @@ from __future__ import annotations +import re as _re + # --------------------------------------------------------------------------- # Pricing (USD per 1M tokens: input, output) # --------------------------------------------------------------------------- @@ -71,13 +73,61 @@ ENERGY_OUTPUT_WEIGHT: float = 1.5 +# Pre-compiled pattern for stripping version/preview/date suffixes. +# Matches: -preview, -preview-05-20, -20250120, -latest, -exp-0827, etc. +_VERSION_SUFFIX_RE = _re.compile( + r"(-preview(?:-\d{2,4}-\d{2})?|-\d{8,}|-latest|-exp(?:-\d+)?|-it)$" +) + +# Cache for resolved model → pricing key lookups. +_pricing_key_cache: dict[str, str | None] = {} + + +def _resolve_pricing_key(model: str) -> str | None: + """Resolve a model name to a known pricing table key. + + Tries exact match first, then strips version/preview/date suffixes, + then tries longest-prefix match against known model names. + Returns ``None`` when no match is found (caller should use defaults). + """ + if model in _pricing_key_cache: + return _pricing_key_cache[model] + + # Exact match + if model in PRICING_USD_PER_M: + _pricing_key_cache[model] = model + return model + + # Strip version suffixes and retry + stripped = _VERSION_SUFFIX_RE.sub("", model) + if stripped != model and stripped in PRICING_USD_PER_M: + _pricing_key_cache[model] = stripped + return stripped + + # Longest-prefix match (e.g. 
"gemini-2.5-flash-8b" → "gemini-2.5-flash") + best: str | None = None + best_len = 0 + for known in PRICING_USD_PER_M: + if model.startswith(known) and len(known) > best_len: + best = known + best_len = len(known) + if best is not None: + _pricing_key_cache[model] = best + return best + + _pricing_key_cache[model] = None + return None + + def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: """Estimate cost in USD from model name and token counts.""" - in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) + key = _resolve_pricing_key(model) + in_price, out_price = PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M return (input_tokens / 1_000_000) * in_price + (output_tokens / 1_000_000) * out_price def estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: """Estimate energy proxy from model name and token counts.""" - coeff = ENERGY_COEFFICIENTS.get(model, DEFAULT_ENERGY_COEFFICIENT) + key = _resolve_pricing_key(model) + coeff = ENERGY_COEFFICIENTS.get(key, DEFAULT_ENERGY_COEFFICIENT) if key else DEFAULT_ENERGY_COEFFICIENT return coeff * (input_tokens + output_tokens * ENERGY_OUTPUT_WEIGHT) diff --git a/cascadeflow/integrations/google_adk.py b/cascadeflow/integrations/google_adk.py index b0d582e3..1c6a853d 100644 --- a/cascadeflow/integrations/google_adk.py +++ b/cascadeflow/integrations/google_adk.py @@ -116,19 +116,22 @@ class CascadeFlowADKPlugin(_ADKBasePlugin): # type: ignore[misc] def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: self._config = config or GoogleADKHarnessConfig() self._active = True + self._call_seq: int = 0 # Track call metadata between before/after callbacks. - # Keyed by (invocation_id, agent_name) to handle concurrent calls. 
- self._call_start_times: dict[tuple[str, str], float] = {} - self._call_models: dict[tuple[str, str], str] = {} - - def _callback_key(self, callback_context: Any) -> tuple[str, str]: - invocation_id = getattr(callback_context, "invocation_id", "") or "" - agent_name = getattr(callback_context, "agent_name", "") or "" - # Use object id as disambiguator when both fields are missing to - # prevent collisions across concurrent calls with empty metadata. - if not invocation_id and not agent_name: - invocation_id = str(id(callback_context)) - return (invocation_id, agent_name) + # Keyed by id(callback_context) to guarantee uniqueness even when + # two concurrent calls share (invocation_id, agent_name). + self._call_start_times: dict[int, float] = {} + self._call_models: dict[int, str] = {} + + @staticmethod + def _callback_key(callback_context: Any) -> int: + """Return a unique key for a callback_context object. + + Uses ``id()`` which is guaranteed unique for the lifetime of the + object — ADK keeps the same CallbackContext alive across the + before/after/error callback sequence for a single LLM call. + """ + return id(callback_context) async def before_model_callback( self, @@ -147,6 +150,8 @@ async def before_model_callback( ctx = get_current_run() if ctx is None: return None + if ctx.mode == "off": + return None # Extract model name from request model_raw = getattr(llm_request, "model", None) or "unknown" @@ -196,6 +201,8 @@ async def after_model_callback( ctx = get_current_run() if ctx is None: return None + if ctx.mode == "off": + return None key = self._callback_key(callback_context) @@ -282,6 +289,7 @@ async def on_model_error_callback( def deactivate(self) -> None: """Make all callbacks no-ops without unregistering from Runner.""" self._active = False + self._call_seq = 0 self._call_start_times.clear() self._call_models.clear() @@ -316,10 +324,17 @@ def _make_budget_error_response(ctx: Any) -> Any: When ADK is available we return a real ``LlmResponse``. 
When not (shouldn't happen in practice), we return a sentinel dict. + + The user-facing message is intentionally generic to avoid leaking + internal spend/limit numbers. Exact figures are logged separately. """ - msg = ( - f"cascadeflow harness budget exceeded " - f"(spent ${ctx.cost:.4f} of ${ctx.budget_max:.4f} max)" + # Generic message safe for end-user exposure. + msg = "cascadeflow harness budget exceeded" + # Detailed figures for operators only. + logger.warning( + "google-adk: budget exceeded — spent $%.4f of $%.4f max", + ctx.cost, + ctx.budget_max, ) if GOOGLE_ADK_AVAILABLE: try: diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py index 7f8cb66d..e68edcaf 100644 --- a/tests/test_google_adk_integration.py +++ b/tests/test_google_adk_integration.py @@ -268,7 +268,7 @@ async def test_fail_open_swallows_errors(self, plugin): init(mode="enforce") with run(): with patch( - "cascadeflow.harness.api.get_current_run", + "cascadeflow.integrations.google_adk.get_current_run", side_effect=RuntimeError("boom"), ): result = await plugin.before_model_callback( @@ -401,7 +401,7 @@ async def test_fail_open_swallows_errors(self, plugin): init(mode="observe") with run(): with patch( - "cascadeflow.harness.api.get_current_run", + "cascadeflow.integrations.google_adk.get_current_run", side_effect=RuntimeError("boom"), ): result = await plugin.after_model_callback( @@ -454,7 +454,7 @@ async def test_fail_open_swallows_errors(self, plugin): init(mode="observe") with run(): with patch( - "cascadeflow.harness.api.get_current_run", + "cascadeflow.integrations.google_adk.get_current_run", side_effect=RuntimeError("boom"), ): result = await plugin.on_model_error_callback( @@ -557,8 +557,8 @@ async def test_deactivated_plugin_skips_callbacks(self): async def test_deactivate_clears_state(self): plugin = adk_mod.CascadeFlowADKPlugin() - plugin._call_start_times[("a", "b")] = 1.0 - plugin._call_models[("a", "b")] = "test" + 
plugin._call_start_times[12345] = 1.0 + plugin._call_models[12345] = "test" plugin.deactivate() assert len(plugin._call_start_times) == 0 assert len(plugin._call_models) == 0 @@ -599,57 +599,136 @@ def test_content_with_no_text(self): class TestCallbackKeyCollision: - """Verify _callback_key uses id() fallback when both fields are empty.""" + """Verify _callback_key uses id() for per-object uniqueness.""" - def test_distinct_keys_when_metadata_missing(self): - """Two contexts with no invocation_id/agent_name get distinct keys.""" - plugin = adk_mod.CascadeFlowADKPlugin() - ctx_a = FakeCallbackContext(invocation_id="", agent_name="") - ctx_b = FakeCallbackContext(invocation_id="", agent_name="") - key_a = plugin._callback_key(ctx_a) - key_b = plugin._callback_key(ctx_b) - assert key_a != key_b, "Empty-metadata contexts must produce distinct keys" + def test_distinct_keys_for_different_objects(self): + """Two distinct context objects always produce distinct keys.""" + ctx_a = FakeCallbackContext(invocation_id="inv-1", agent_name="agent-a") + ctx_b = FakeCallbackContext(invocation_id="inv-1", agent_name="agent-a") + key_a = adk_mod.CascadeFlowADKPlugin._callback_key(ctx_a) + key_b = adk_mod.CascadeFlowADKPlugin._callback_key(ctx_b) + assert key_a != key_b, "Same IDs on different objects must produce distinct keys" def test_key_stable_for_same_object(self): """Same context object always produces the same key.""" - plugin = adk_mod.CascadeFlowADKPlugin() - ctx = FakeCallbackContext(invocation_id="", agent_name="") - assert plugin._callback_key(ctx) == plugin._callback_key(ctx) + ctx = FakeCallbackContext() + key1 = adk_mod.CascadeFlowADKPlugin._callback_key(ctx) + key2 = adk_mod.CascadeFlowADKPlugin._callback_key(ctx) + assert key1 == key2 - def test_normal_key_unaffected(self): - """Contexts with real IDs don't use the id() fallback.""" - plugin = adk_mod.CascadeFlowADKPlugin() - ctx = FakeCallbackContext(invocation_id="inv-42", agent_name="my-agent") - key = 
plugin._callback_key(ctx) - assert key == ("inv-42", "my-agent") + def test_key_is_int(self): + """Key type is int (object id).""" + ctx = FakeCallbackContext() + assert isinstance(adk_mod.CascadeFlowADKPlugin._callback_key(ctx), int) @pytest.mark.asyncio - async def test_concurrent_empty_contexts_track_independently(self): - """Two concurrent calls with empty metadata don't corrupt each other.""" + async def test_concurrent_same_ids_track_independently(self): + """Two concurrent calls with same invocation_id+agent_name don't corrupt.""" init(mode="observe") with run(budget=1.0) as harness_ctx: plugin = adk_mod.CascadeFlowADKPlugin() - ctx_a = FakeCallbackContext(invocation_id="", agent_name="") - ctx_b = FakeCallbackContext(invocation_id="", agent_name="") + # Same IDs — previously would collide + ctx_a = FakeCallbackContext(invocation_id="inv-1", agent_name="agent") + ctx_b = FakeCallbackContext(invocation_id="inv-1", agent_name="agent") req_a = FakeLlmRequest(model="gpt-4o") req_b = FakeLlmRequest(model="gpt-4o-mini") - # Start both calls await plugin.before_model_callback(ctx_a, req_a) await plugin.before_model_callback(ctx_b, req_b) - # Finish in reverse order - resp_b = FakeLlmResponse( - usage_metadata=FakeUsageMetadata(50, 25), - ) - resp_a = FakeLlmResponse( - usage_metadata=FakeUsageMetadata(100, 50), - ) + resp_b = FakeLlmResponse(usage_metadata=FakeUsageMetadata(50, 25)) + resp_a = FakeLlmResponse(usage_metadata=FakeUsageMetadata(100, 50)) await plugin.after_model_callback(ctx_b, resp_b) await plugin.after_model_callback(ctx_a, resp_a) assert harness_ctx.step_count == 2 - # Verify no leftover state (both keys were cleaned up) assert len(plugin._call_start_times) == 0 assert len(plugin._call_models) == 0 + + +# --------------------------------------------------------------------------- +# Off-mode behavior +# --------------------------------------------------------------------------- + + +class TestOffMode: + """mode='off' must not track metrics or 
update run context.""" + + @pytest.mark.asyncio + async def test_off_mode_before_callback_returns_none(self): + init(mode="off") + plugin = adk_mod.CascadeFlowADKPlugin() + with run() as run_ctx: + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest() + ) + assert result is None + assert len(plugin._call_start_times) == 0 + + @pytest.mark.asyncio + async def test_off_mode_after_callback_does_not_track(self): + init(mode="off") + plugin = adk_mod.CascadeFlowADKPlugin() + with run() as run_ctx: + await plugin.after_model_callback( + FakeCallbackContext(), + FakeLlmResponse(usage_metadata=FakeUsageMetadata(1000, 500)), + ) + assert run_ctx.step_count == 0 + assert run_ctx.cost == 0.0 + assert run_ctx.energy_used == 0.0 + assert len(run_ctx.trace()) == 0 + + +# --------------------------------------------------------------------------- +# Versioned model name resolution +# --------------------------------------------------------------------------- + + +class TestVersionedModelPricing: + """Versioned model IDs must resolve to correct pricing, not default.""" + + def test_versioned_gemini_flash(self): + from cascadeflow.harness.pricing import estimate_cost + + # Should resolve to gemini-2.5-flash pricing ($0.15/$0.60) + cost = estimate_cost("gemini-2.5-flash-preview-05-20", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + def test_versioned_gemini_pro(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-pro-preview-05-06", 1_000_000, 1_000_000) + assert cost == pytest.approx(11.25, abs=0.01) + + def test_dated_model_suffix(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash-20250120", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + def test_latest_suffix(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash-latest", 1_000_000, 1_000_000) + 
assert cost == pytest.approx(0.75, abs=0.01) + + def test_unknown_model_still_uses_default(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("totally-unknown-model", 1_000_000, 0) + assert cost == pytest.approx(2.50) + + def test_exact_match_still_works(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + def test_prefix_match_variant(self): + """A variant like gemini-2.5-flash-8b matches the base model.""" + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash-8b", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) From af414d069d62bbc6dc01b21b42d1452c48e13a44 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 14:48:49 +0100 Subject: [PATCH 33/49] feat(harness): add anthropic python auto-instrumentation for v2.1 --- cascadeflow/harness/api.py | 29 ++- cascadeflow/harness/instrument.py | 259 +++++++++++++++++++- docs/strategy/agent-intelligence-v2-plan.md | 15 ++ tests/test_harness_api.py | 150 +++++++++++- tests/test_harness_instrument.py | 42 +++- 5 files changed, 464 insertions(+), 31 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index f545d73d..79f741b8 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -228,9 +228,10 @@ def reset() -> None: global _harness_callback_manager global _cached_cascade_decision_event - from cascadeflow.harness.instrument import unpatch_openai + from cascadeflow.harness.instrument import unpatch_anthropic, unpatch_openai unpatch_openai() + unpatch_anthropic() _harness_config = HarnessConfig() _is_instrumented = False _harness_callback_manager = None @@ -497,13 +498,29 @@ def init( if patch_openai(): instrumented.append("openai") - elif validated_mode == "off": - from cascadeflow.harness.instrument import is_patched, unpatch_openai + 
else: + detected_but_not_instrumented.append("openai") + + if validated_mode != "off" and sdk_presence["anthropic"]: + from cascadeflow.harness.instrument import patch_anthropic + + if patch_anthropic(): + instrumented.append("anthropic") + else: + detected_but_not_instrumented.append("anthropic") + + if validated_mode == "off": + from cascadeflow.harness.instrument import ( + is_anthropic_patched, + is_openai_patched, + unpatch_anthropic, + unpatch_openai, + ) - if is_patched(): + if is_openai_patched(): unpatch_openai() - if sdk_presence["anthropic"]: - detected_but_not_instrumented.append("anthropic") + if is_anthropic_patched(): + unpatch_anthropic() if _is_instrumented: logger.debug("harness init called again; instrumentation remains idempotent") diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index c2fbd7ab..566f15d0 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -1,11 +1,10 @@ -"""OpenAI Python client auto-instrumentation for cascadeflow harness. +"""Python SDK auto-instrumentation for cascadeflow harness. -Patches ``openai.resources.chat.completions.Completions.create`` (sync) and -``AsyncCompletions.create`` (async) to intercept LLM calls for observe/enforce -modes. +Patches OpenAI and Anthropic SDK request methods to intercept LLM calls for +observe/enforce modes. -This module is called internally by ``cascadeflow.harness.init()``. Users -should not call ``patch_openai`` / ``unpatch_openai`` directly. +This module is called internally by ``cascadeflow.harness.init()``. Users +should not call patch/unpatch helpers directly. Implementation notes: - Patching is class-level (all current and future client instances). 
@@ -51,6 +50,9 @@ _openai_patched: bool = False _original_sync_create: Any = None _original_async_create: Any = None +_anthropic_patched: bool = False +_original_anthropic_sync_create: Any = None +_original_anthropic_async_create: Any = None _MODEL_TOTAL_COSTS: dict[str, float] = { name: _model_total_price_shared(name) for name in _PRICING_MODELS @@ -140,7 +142,7 @@ def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> return _estimate_energy_shared(model, prompt_tokens, completion_tokens) -def _count_tool_calls_in_response(response: Any) -> int: +def _count_tool_calls_in_openai_response(response: Any) -> int: """Count tool calls in a non-streaming ChatCompletion response.""" choices = getattr(response, "choices", None) if not choices: @@ -154,7 +156,7 @@ def _count_tool_calls_in_response(response: Any) -> int: return len(tool_calls) -def _extract_usage(response: Any) -> tuple[int, int]: +def _extract_openai_usage(response: Any) -> tuple[int, int]: """Extract (prompt_tokens, completion_tokens) from a response.""" usage = getattr(response, "usage", None) if usage is None: @@ -165,6 +167,29 @@ def _extract_usage(response: Any) -> tuple[int, int]: ) +def _extract_anthropic_usage(response: Any) -> tuple[int, int]: + """Extract (input_tokens, output_tokens) from an Anthropic response.""" + usage = getattr(response, "usage", None) + if usage is None: + return 0, 0 + return ( + getattr(usage, "input_tokens", 0) or 0, + getattr(usage, "output_tokens", 0) or 0, + ) + + +def _count_tool_calls_in_anthropic_response(response: Any) -> int: + """Count Anthropic ``tool_use`` blocks in a non-streaming response.""" + content = getattr(response, "content", None) + if not content: + return 0 + count = 0 + for block in content: + if getattr(block, "type", None) == "tool_use": + count += 1 + return count + + def _model_total_cost(model: str) -> float: return _MODEL_TOTAL_COSTS.get(model, _model_total_price_shared(model)) @@ -713,8 +738,8 @@ def 
_finalize_interception( if (not state.is_stream) and ctx: elapsed_ms = (time.monotonic() - state.start_time) * 1000 - prompt_tokens, completion_tokens = _extract_usage(response) - tool_call_count = _count_tool_calls_in_response(response) + prompt_tokens, completion_tokens = _extract_openai_usage(response) + tool_call_count = _count_tool_calls_in_openai_response(response) _update_context( ctx, state.model, @@ -810,6 +835,150 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: return wrapper +def _make_patched_anthropic_create(original_fn: Any) -> Any: + """Create a patched version of ``anthropic.Messages.create``.""" + + @functools.wraps(original_fn) + def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model + pre_applied = True + + if ctx: + kwargs, model, pre_action, pre_reason, pre_model, pre_applied = ( + _resolve_pre_call_decision( + ctx, + mode, + model, + kwargs, + ) + ) + + is_stream = bool(kwargs.get("stream", False)) + start_time = time.monotonic() + response = original_fn(self, *args, **kwargs) + + if not ctx: + logger.debug( + "harness %s (anthropic): model=%s (no active run scope, metrics not tracked)", + mode, + model, + ) + return response + + # Anthropic stream wrappers are not instrumented in V2.1 (known limitation). 
+ if is_stream: + logger.debug( + "harness %s (anthropic): stream passthrough model=%s (usage tracking unavailable)", + mode, + model, + ) + return response + + elapsed_ms = (time.monotonic() - start_time) * 1000 + input_tokens, output_tokens = _extract_anthropic_usage(response) + tool_call_count = _count_tool_calls_in_anthropic_response(response) + _update_context( + ctx, + model, + input_tokens, + output_tokens, + tool_call_count, + elapsed_ms, + action=pre_action, + action_reason=pre_reason, + action_model=pre_model, + applied=pre_applied, + decision_mode=mode, + ) + return response + + return wrapper + + +def _make_patched_anthropic_async_create(original_fn: Any) -> Any: + """Create a patched version of ``anthropic.AsyncMessages.create``.""" + + @functools.wraps(original_fn) + async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return await original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model + pre_applied = True + + if ctx: + kwargs, model, pre_action, pre_reason, pre_model, pre_applied = ( + _resolve_pre_call_decision( + ctx, + mode, + model, + kwargs, + ) + ) + + is_stream = bool(kwargs.get("stream", False)) + start_time = time.monotonic() + response = await original_fn(self, *args, **kwargs) + + if not ctx: + logger.debug( + "harness %s async (anthropic): model=%s (no active run scope, metrics not tracked)", + mode, + model, + ) + return response + + # Anthropic stream wrappers are not instrumented in V2.1 (known limitation). 
+ if is_stream: + logger.debug( + "harness %s async (anthropic): stream passthrough model=%s (usage tracking unavailable)", + mode, + model, + ) + return response + + elapsed_ms = (time.monotonic() - start_time) * 1000 + input_tokens, output_tokens = _extract_anthropic_usage(response) + tool_call_count = _count_tool_calls_in_anthropic_response(response) + _update_context( + ctx, + model, + input_tokens, + output_tokens, + tool_call_count, + elapsed_ms, + action=pre_action, + action_reason=pre_reason, + action_model=pre_model, + applied=pre_applied, + decision_mode=mode, + ) + return response + + return wrapper + + # --------------------------------------------------------------------------- # Public API (called by cascadeflow.harness.api) # --------------------------------------------------------------------------- @@ -846,6 +1015,37 @@ def patch_openai() -> bool: return True +def patch_anthropic() -> bool: + """Patch the Anthropic Python client for harness instrumentation. + + Returns ``True`` if patching succeeded, ``False`` if anthropic is not + installed. Idempotent: safe to call multiple times. 
+ """ + global _anthropic_patched, _original_anthropic_sync_create, _original_anthropic_async_create + + if _anthropic_patched: + logger.debug("anthropic already patched, skipping") + return True + + try: + from anthropic.resources.messages import AsyncMessages, Messages + except ImportError: + logger.debug("anthropic package not available, skipping instrumentation") + return False + + _original_anthropic_sync_create = Messages.create + _original_anthropic_async_create = AsyncMessages.create + + Messages.create = _make_patched_anthropic_create(_original_anthropic_sync_create) # type: ignore[assignment] + AsyncMessages.create = _make_patched_anthropic_async_create( # type: ignore[assignment] + _original_anthropic_async_create, + ) + + _anthropic_patched = True + logger.info("anthropic client instrumented (sync + async)") + return True + + def unpatch_openai() -> None: """Restore original OpenAI client methods. @@ -873,6 +1073,43 @@ def unpatch_openai() -> None: logger.info("openai client unpatched") -def is_patched() -> bool: +def unpatch_anthropic() -> None: + """Restore original Anthropic client methods. + + Safe to call even if not patched. Used by ``reset()`` and tests. 
+ """ + global _anthropic_patched, _original_anthropic_sync_create, _original_anthropic_async_create + + if not _anthropic_patched: + return + + try: + from anthropic.resources.messages import AsyncMessages, Messages + except ImportError: + _anthropic_patched = False + return + + if _original_anthropic_sync_create is not None: + Messages.create = _original_anthropic_sync_create # type: ignore[assignment] + if _original_anthropic_async_create is not None: + AsyncMessages.create = _original_anthropic_async_create # type: ignore[assignment] + + _original_anthropic_sync_create = None + _original_anthropic_async_create = None + _anthropic_patched = False + logger.info("anthropic client unpatched") + + +def is_openai_patched() -> bool: """Return whether the OpenAI client is currently patched.""" return _openai_patched + + +def is_anthropic_patched() -> bool: + """Return whether the Anthropic client is currently patched.""" + return _anthropic_patched + + +def is_patched() -> bool: + """Return whether any supported Python SDK is currently patched.""" + return _openai_patched or _anthropic_patched diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 267ddc69..1c8a2344 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -864,6 +864,21 @@ Integration-branch promotion gates: - [ ] Quickstart verification for existing app and framework paths - [ ] Go/No-Go checklist in Section 18 satisfied before merging to `main` +### 16.2 V2.1 Parallel Execution Split + +To enable parallel work without merge collisions, split V2.1 into Python and TS tracks: + +- `feat/v2.1-anthropic-python-auto-instrumentation` (claimed by current agent) + - Scope: `cascadeflow/harness/*`, Python harness tests, Python docs notes + - Deliverables: Anthropic Python auto-instrumentation, validation for `init()/run()` harness path +- `feat/v2.1-ts-harness-api-parity` (available for parallel agent) + 
- Scope: `packages/core/*`, TS parity fixtures, TS docs notes + - Deliverables: `@cascadeflow/core` exports parity (`init()/run()`), TS fixture parity validation + +Parallel-safe rule: +- Python track does not touch `packages/core/*` +- TS track does not touch `cascadeflow/harness/*` + ## 17. Future Phases (Post-V2, Not in Scope) For roadmap visibility. These inform V2 telemetry design but are not V2 deliverables. diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index eb960a39..9554a486 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -5,6 +5,7 @@ import cascadeflow import cascadeflow.harness.api as harness_api from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run +from cascadeflow.telemetry.callbacks import CallbackEvent, CallbackManager def setup_function() -> None: @@ -72,18 +73,36 @@ def test_init_non_numeric_env_raises(monkeypatch): def test_run_uses_global_defaults_and_overrides(): - init(mode="enforce", budget=2.0, max_tool_calls=5) + init( + mode="enforce", + budget=2.0, + max_tool_calls=5, + kpi_targets={"quality_min": 0.9}, + kpi_weights={"cost": 0.7, "quality": 0.3}, + compliance="gdpr", + ) default_ctx = run() assert default_ctx.mode == "enforce" assert default_ctx.budget_max == 2.0 assert default_ctx.tool_calls_max == 5 assert default_ctx.budget_remaining == 2.0 - - override_ctx = run(budget=0.5, max_tool_calls=3) + assert default_ctx.kpi_targets == {"quality_min": 0.9} + assert default_ctx.kpi_weights == {"cost": 0.7, "quality": 0.3} + assert default_ctx.compliance == "gdpr" + + override_ctx = run( + budget=0.5, + max_tool_calls=3, + kpi_weights={"quality": 1.0}, + compliance="strict", + ) assert override_ctx.budget_max == 0.5 assert override_ctx.tool_calls_max == 3 assert override_ctx.budget_remaining == 0.5 + assert override_ctx.kpi_targets == {"quality_min": 0.9} + assert override_ctx.kpi_weights == {"quality": 1.0} + assert override_ctx.compliance == "strict" def 
test_run_without_enter_exit_is_safe(): @@ -152,9 +171,10 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.init) assert callable(cascadeflow.reset) assert callable(cascadeflow.run) - # harness.agent is intentionally NOT re-exported at top level because it - # would shadow the cascadeflow.agent module. Import from submodule: - assert callable(agent) # imported from cascadeflow.harness + assert callable(cascadeflow.harness_agent) + assert hasattr(cascadeflow.agent, "PROVIDER_REGISTRY") + assert callable(cascadeflow.get_harness_callback_manager) + assert callable(cascadeflow.set_harness_callback_manager) report = cascadeflow.init(mode="off") assert report.mode == "off" @@ -166,6 +186,8 @@ def test_run_record_and_trace_copy(): trace_b = ctx.trace() assert trace_a == trace_b assert trace_a[0]["action"] == "switch_model" + assert "budget_state" in trace_a[0] + assert trace_a[0]["budget_state"]["max"] == 1.0 trace_a.append({"action": "mutated"}) assert len(ctx.trace()) == 1 @@ -310,3 +332,119 @@ def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch): monkeypatch.setattr(instrument, "patch_openai", lambda: True) report = init(mode="observe") assert report.instrumented == ["openai"] + + +def test_init_reports_anthropic_instrumented_when_patch_succeeds(monkeypatch): + monkeypatch.setattr( + harness_api, + "find_spec", + lambda name: object() if name == "anthropic" else None, + ) + + import cascadeflow.harness.instrument as instrument + + monkeypatch.setattr(instrument, "patch_anthropic", lambda: True) + report = init(mode="observe") + assert report.instrumented == ["anthropic"] + + +def test_init_reports_anthropic_detected_not_instrumented_on_patch_failure(monkeypatch): + monkeypatch.setattr( + harness_api, + "find_spec", + lambda name: object() if name == "anthropic" else None, + ) + + import cascadeflow.harness.instrument as instrument + + monkeypatch.setattr(instrument, "patch_anthropic", lambda: False) + report = init(mode="observe") 
+ assert report.instrumented == [] + assert report.detected_but_not_instrumented == ["anthropic"] + + +def test_run_summary_populates_on_context_exit(): + init(mode="observe") + with run(budget=1.5) as ctx: + ctx.step_count = 2 + ctx.tool_calls = 1 + ctx.cost = 0.42 + ctx.latency_used_ms = 123.0 + ctx.energy_used = 33.0 + ctx.budget_remaining = 1.08 + ctx.last_action = "allow" + ctx.model_used = "gpt-4o-mini" + + summary = ctx.summary() + assert summary["run_id"] == ctx.run_id + assert summary["step_count"] == 2 + assert summary["budget_remaining"] == pytest.approx(1.08) + assert summary["duration_ms"] is not None + assert summary["duration_ms"] >= 0.0 + assert ctx.duration_ms is not None + assert ctx.duration_ms >= 0.0 + + +def test_run_context_logs_summary(caplog): + init(mode="observe") + with caplog.at_level("INFO", logger="cascadeflow.harness"): + with run(budget=1.0) as ctx: + ctx.step_count = 1 + ctx.cost = 0.01 + ctx.model_used = "gpt-4o-mini" + + assert any("harness run summary" in rec.message for rec in caplog.records) + + +def test_record_emits_cascade_decision_callback(): + manager = CallbackManager() + received = [] + + def _on_decision(data): + received.append(data) + + manager.register(CallbackEvent.CASCADE_DECISION, _on_decision) + report = init(mode="observe", callback_manager=manager) + assert report.config_sources["callback_manager"] == "code" + + with run(budget=1.0) as ctx: + ctx.step_count = 1 + ctx.record(action="switch_model", reason="budget_pressure", model="gpt-4o-mini") + + assert len(received) == 1 + event = received[0] + assert event.event == CallbackEvent.CASCADE_DECISION + assert event.query == "[harness]" + assert event.workflow == "harness" + assert event.data["action"] == "switch_model" + assert event.data["run_id"] == ctx.run_id + + +def test_record_sanitizes_trace_values(): + ctx = run() + ctx.record( + action="allow\nnewline", + reason="a" * 400, + model="model\r\nname", + ) + entry = ctx.trace()[0] + assert "\n" not in 
entry["action"] + assert "\r" not in entry["model"] + assert len(entry["reason"]) <= 160 + + +def test_record_without_callback_manager_is_noop(): + init(mode="observe") + with run(budget=1.0) as ctx: + ctx.record(action="allow", reason="test", model="gpt-4o-mini") + assert len(ctx.trace()) == 1 + + +def test_record_empty_action_warns_and_defaults(caplog): + init(mode="observe") + with caplog.at_level("WARNING", logger="cascadeflow.harness"): + with run(budget=1.0) as ctx: + ctx.record(action="", reason="test", model="gpt-4o-mini") + entry = ctx.trace()[0] + assert entry["action"] == "allow" + assert any("empty action" in rec.message for rec in caplog.records) diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 75368522..ca1f9a07 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -2,6 +2,7 @@ from __future__ import annotations +from importlib.util import find_spec import time from typing import Optional from unittest.mock import AsyncMock, MagicMock @@ -18,8 +19,12 @@ _estimate_energy, _make_patched_async_create, _make_patched_create, + is_anthropic_patched, + is_openai_patched, is_patched, + patch_anthropic, patch_openai, + unpatch_anthropic, unpatch_openai, ) @@ -87,19 +92,19 @@ def _mock_stream_chunk( class TestPatchLifecycle: def test_patch_and_unpatch(self) -> None: - assert not is_patched() + assert not is_openai_patched() result = patch_openai() assert result is True - assert is_patched() + assert is_openai_patched() unpatch_openai() - assert not is_patched() + assert not is_openai_patched() def test_idempotent_patching(self) -> None: patch_openai() patch_openai() - assert is_patched() + assert is_openai_patched() unpatch_openai() - assert not is_patched() + assert not is_openai_patched() def test_unpatch_without_prior_patch(self) -> None: unpatch_openai() # should not raise @@ -107,12 +112,12 @@ def test_unpatch_without_prior_patch(self) -> None: def test_init_observe_patches(self) -> 
None: report = init(mode="observe") assert "openai" in report.instrumented - assert is_patched() + assert is_openai_patched() def test_init_enforce_patches(self) -> None: report = init(mode="enforce") assert "openai" in report.instrumented - assert is_patched() + assert is_openai_patched() def test_init_off_does_not_patch(self) -> None: init(mode="off") @@ -120,7 +125,7 @@ def test_init_off_does_not_patch(self) -> None: def test_reset_unpatches(self) -> None: init(mode="observe") - assert is_patched() + assert is_openai_patched() reset() assert not is_patched() @@ -133,6 +138,27 @@ def test_class_method_actually_replaced(self) -> None: unpatch_openai() assert Completions.create is original + def test_patch_and_unpatch_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + assert not is_anthropic_patched() + result = patch_anthropic() + assert result is True + assert is_anthropic_patched() + unpatch_anthropic() + assert not is_anthropic_patched() + + def test_anthropic_class_method_actually_replaced(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + from anthropic.resources.messages import Messages + + original = Messages.create + patch_anthropic() + assert Messages.create is not original + unpatch_anthropic() + assert Messages.create is original + # --------------------------------------------------------------------------- # Sync wrapper From 76f6c2e0936eb0546e2a93763045b27196154f2b Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 15:20:25 +0100 Subject: [PATCH 34/49] feat(core): deliver v2.1 ts harness parity and sdk auto-instrumentation --- docs/strategy/agent-intelligence-v2-plan.md | 15 +- packages/core/README.md | 17 + packages/core/src/__tests__/harness.test.ts | 232 ++++++ packages/core/src/harness-instrument.ts | 746 +++++++++++++++++++ packages/core/src/harness.ts | 754 ++++++++++++++++++++ packages/core/src/index.ts | 25 + 6 
files changed, 1780 insertions(+), 9 deletions(-) create mode 100644 packages/core/src/__tests__/harness.test.ts create mode 100644 packages/core/src/harness-instrument.ts create mode 100644 packages/core/src/harness.ts diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 1c8a2344..177562e1 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -197,9 +197,6 @@ Framework-specific packages provide deeper integration (state extraction, middle ### TypeScript Equivalent ```typescript -// Target API — does not exist in @cascadeflow/core today. -// TS parity is a V2.1 deliverable (see Section 16, Phase F). - import { cascadeflow } from '@cascadeflow/core'; // Tier 1: Auto-instrument @@ -868,10 +865,10 @@ Integration-branch promotion gates: To enable parallel work without merge collisions, split V2.1 into Python and TS tracks: -- `feat/v2.1-anthropic-python-auto-instrumentation` (claimed by current agent) +- `feat/v2.1-anthropic-python-auto-instrumentation` (completed in this branch) - Scope: `cascadeflow/harness/*`, Python harness tests, Python docs notes - Deliverables: Anthropic Python auto-instrumentation, validation for `init()/run()` harness path -- `feat/v2.1-ts-harness-api-parity` (available for parallel agent) +- `feat/v2.1-ts-harness-api-parity` (completed and merged into this branch scope) - Scope: `packages/core/*`, TS parity fixtures, TS docs notes - Deliverables: `@cascadeflow/core` exports parity (`init()/run()`), TS fixture parity validation @@ -937,10 +934,10 @@ Go when all are true (V2 Python launch): - [ ] pyproject.toml extras (`openai-agents`, `crewai`, `langchain`) defined and installable V2.1 Go/No-Go (TS parity + anthropic): -- [ ] TS parity fixtures pass -- [ ] `@cascadeflow/core` exports `cascadeflow.init()` and `cascadeflow.run()` -- [ ] `anthropic` Python client auto-instrumentation validated -- [ ] `@anthropic-ai/sdk` TS client 
auto-instrumentation validated +- [x] TS parity fixtures pass +- [x] `@cascadeflow/core` exports `cascadeflow.init()` and `cascadeflow.run()` +- [x] `anthropic` Python client auto-instrumentation validated +- [x] `@anthropic-ai/sdk` TS client auto-instrumentation validated ## 19. Academic Validation diff --git a/packages/core/README.md b/packages/core/README.md index a0918d78..3188df91 100644 --- a/packages/core/README.md +++ b/packages/core/README.md @@ -33,6 +33,23 @@ pnpm add @cascadeflow/core yarn add @cascadeflow/core ``` +## Harness Quick Start (V2.1) + +```typescript +import { cascadeflow } from '@cascadeflow/core'; + +// 1) Turn on in-process harness decisions + SDK auto-instrumentation +cascadeflow.init({ mode: 'enforce', budget: 0.5 }); + +// 2) Scope one run (global defaults are inherited) +const result = await cascadeflow.run({ maxToolCalls: 8 }, async (run) => { + // Any OpenAI / Anthropic SDK calls made here are evaluated by the harness. + return { runId: run.runId }; +}); + +console.log(result); +``` + ## Quick Start ### Recommended Setup (Claude Haiku + GPT-5) diff --git a/packages/core/src/__tests__/harness.test.ts b/packages/core/src/__tests__/harness.test.ts new file mode 100644 index 00000000..bad03376 --- /dev/null +++ b/packages/core/src/__tests__/harness.test.ts @@ -0,0 +1,232 @@ +import { afterEach, describe, expect, it } from 'vitest'; + +import { + BudgetExceededError, + cascadeflow, + getCurrentRun, + getHarnessConfig, + init, + reset, + run, +} from '../harness'; +import { + __resetInstrumentationLoadersForTest, + __resetInstrumentationStateForTest, + __setInstrumentationLoadersForTest, + isAnthropicPatched, + isOpenAIPatched, +} from '../harness-instrument'; + +class FakeOpenAICompletions { + constructor(private readonly calls: Array>) {} + + create(request: Record): Promise> { + this.calls.push({ ...request }); + return Promise.resolve({ + usage: { + prompt_tokens: 100, + completion_tokens: 25, + }, + choices: [ + { + message: { + 
tool_calls: [{ id: 'tool_1', type: 'function' }], + }, + }, + ], + }); + } +} + +class FakeAnthropicMessages { + constructor(private readonly calls: Array>) {} + + create(request: Record): Promise> { + this.calls.push({ ...request }); + return Promise.resolve({ + usage: { + input_tokens: 120, + output_tokens: 40, + }, + content: [ + { type: 'text', text: 'hello' }, + { type: 'tool_use', id: 'tool_1', name: 'search', input: { q: 'x' } }, + ], + }); + } +} + +afterEach(() => { + reset(); + __resetInstrumentationStateForTest(); + __resetInstrumentationLoadersForTest(); +}); + +describe('harness API (TypeScript parity)', () => { + it('exposes cascadeflow init/run object API', async () => { + expect(typeof cascadeflow.init).toBe('function'); + expect(typeof cascadeflow.run).toBe('function'); + + init({ mode: 'observe' }); + const value = await cascadeflow.run(async (scope) => { + expect(scope.mode).toBe('observe'); + expect(getCurrentRun()).toBe(scope); + return 42; + }); + + expect(value).toBe(42); + expect(getCurrentRun()).toBeNull(); + }); + + it('honors code > env precedence and preserves nested scope isolation', async () => { + const previousMode = process.env.CASCADEFLOW_HARNESS_MODE; + process.env.CASCADEFLOW_HARNESS_MODE = 'observe'; + + init(); + expect(getHarnessConfig().mode).toBe('observe'); + + init({ mode: 'enforce' }); + expect(getHarnessConfig().mode).toBe('enforce'); + + await run({ budget: 1.0 }, async (outer) => { + outer.cost = 0.1; + expect(outer.budgetMax).toBe(1.0); + expect(getCurrentRun()).toBe(outer); + + await run({ budget: 0.25 }, async (inner) => { + expect(getCurrentRun()).toBe(inner); + expect(inner.budgetMax).toBe(0.25); + inner.cost = 0.2; + }); + + expect(getCurrentRun()).toBe(outer); + expect(outer.budgetMax).toBe(1.0); + expect(outer.cost).toBe(0.1); + }); + + if (previousMode == null) { + delete process.env.CASCADEFLOW_HARNESS_MODE; + } else { + process.env.CASCADEFLOW_HARNESS_MODE = previousMode; + } + }); + + it('auto-instruments 
OpenAI and enforces switch_model decisions', async () => { + const openaiCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => ({ + Completions: FakeOpenAICompletions, + }), + anthropic: () => null, + }); + + init({ mode: 'enforce' }); + expect(isOpenAIPatched()).toBe(true); + + await run({ kpiWeights: { cost: 1 } }, async (scope) => { + const client = new FakeOpenAICompletions(openaiCalls); + await client.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + + expect(scope.stepCount).toBe(1); + expect(scope.cost).toBeGreaterThan(0); + expect(scope.toolCalls).toBe(1); + + const trace = scope.trace(); + expect(trace).toHaveLength(1); + expect(trace[0]?.action).toBe('switch_model'); + expect(trace[0]?.applied).toBe(true); + expect(trace[0]?.decisionMode).toBe('enforce'); + }); + + expect(openaiCalls).toHaveLength(1); + expect(openaiCalls[0]?.model).not.toBe('gpt-4o'); + }); + + it('observe mode logs non-allow decisions without mutating request', async () => { + const openaiCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => ({ + Completions: FakeOpenAICompletions, + }), + anthropic: () => null, + }); + + init({ mode: 'observe' }); + + await run({ kpiWeights: { cost: 1 } }, async (scope) => { + const client = new FakeOpenAICompletions(openaiCalls); + await client.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + + const trace = scope.trace(); + expect(trace).toHaveLength(1); + expect(trace[0]?.action).toBe('switch_model'); + expect(trace[0]?.applied).toBe(false); + expect(trace[0]?.decisionMode).toBe('observe'); + }); + + expect(openaiCalls).toHaveLength(1); + expect(openaiCalls[0]?.model).toBe('gpt-4o'); + }); + + it('enforce mode stops calls when budget is exhausted', async () => { + const openaiCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => ({ + Completions: FakeOpenAICompletions, + }), + anthropic: () => null, + }); + + init({ 
mode: 'enforce' }); + + await expect( + run({ budget: 0 }, async () => { + const client = new FakeOpenAICompletions(openaiCalls); + await client.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + }), + ).rejects.toBeInstanceOf(BudgetExceededError); + + expect(openaiCalls).toHaveLength(0); + }); + + it('auto-instruments Anthropic and tracks usage/tool calls', async () => { + const anthropicCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => null, + anthropic: () => ({ + Messages: FakeAnthropicMessages, + }), + }); + + init({ mode: 'enforce' }); + expect(isAnthropicPatched()).toBe(true); + + await run(async (scope) => { + const client = new FakeAnthropicMessages(anthropicCalls); + await client.create({ + model: 'claude-sonnet-4-5-20250929', + messages: [{ role: 'user', content: 'hello' }], + }); + + expect(scope.stepCount).toBe(1); + expect(scope.toolCalls).toBe(1); + expect(scope.cost).toBeGreaterThan(0); + expect(scope.trace()[0]?.action).toBe('allow'); + }); + + expect(anthropicCalls).toHaveLength(1); + }); +}); diff --git a/packages/core/src/harness-instrument.ts b/packages/core/src/harness-instrument.ts new file mode 100644 index 00000000..901af4ae --- /dev/null +++ b/packages/core/src/harness-instrument.ts @@ -0,0 +1,746 @@ +type Action = 'allow' | 'switch_model' | 'deny_tool' | 'stop'; + +type CreateFunction = (this: any, ...args: any[]) => any; + +type OpenAIModuleLike = { + Completions?: { + prototype?: { + create?: CreateFunction; + }; + }; +}; + +type AnthropicModuleLike = { + Messages?: { + prototype?: { + create?: CreateFunction; + }; + }; +}; + +type Pricing = { input: number; output: number }; + +type PreCallDecision = { + action: Action; + reason: string; + targetModel: string; +}; + +type HarnessRuntime = { + getCurrentRun: () => HarnessRunContextLike | null; + getHarnessMode: () => HarnessModeLike; + createBudgetExceededError: (message: string, remaining?: number) => Error; + 
createHarnessStopError: (message: string, reason?: string) => Error; +}; + +type HarnessModeLike = 'off' | 'observe' | 'enforce'; + +type HarnessRunContextLike = { + mode: HarnessModeLike; + cost: number; + stepCount: number; + toolCalls: number; + latencyUsedMs: number; + energyUsed: number; + budgetMax?: number; + budgetRemaining?: number; + toolCallsMax?: number; + latencyMaxMs?: number; + energyMax?: number; + compliance?: string; + kpiWeights?: Record; + record: ( + action: string, + reason: string, + model?: string, + options?: { + applied?: boolean; + decisionMode?: HarnessModeLike; + }, + ) => void; +}; + +const MODEL_PRICING_PER_MILLION: Record = { + // OpenAI + 'gpt-5': { input: 1.25, output: 10.0 }, + 'gpt-5-mini': { input: 0.25, output: 2.0 }, + 'gpt-5-nano': { input: 0.05, output: 0.4 }, + 'gpt-4o': { input: 2.5, output: 10.0 }, + 'gpt-4o-mini': { input: 0.15, output: 0.6 }, + 'o1': { input: 15.0, output: 60.0 }, + 'o1-mini': { input: 3.0, output: 12.0 }, + 'o3-mini': { input: 1.0, output: 5.0 }, + + // Anthropic + 'claude-opus-4-5-20251101': { input: 15.0, output: 75.0 }, + 'claude-opus-4-20250514': { input: 15.0, output: 75.0 }, + 'claude-sonnet-4-5-20250929': { input: 3.0, output: 15.0 }, + 'claude-sonnet-4-20250514': { input: 3.0, output: 15.0 }, + 'claude-haiku-4-5-20251001': { input: 1.0, output: 5.0 }, + 'claude-3-5-haiku-20241022': { input: 1.0, output: 5.0 }, +}; + +const ENERGY_COEFFICIENTS: Record = { + 'gpt-5': 1.15, + 'gpt-5-mini': 0.72, + 'gpt-5-nano': 0.45, + 'gpt-4o': 1.0, + 'gpt-4o-mini': 0.55, + 'o1': 1.25, + 'o1-mini': 0.85, + 'o3-mini': 0.75, + 'claude-opus-4-5-20251101': 1.2, + 'claude-opus-4-20250514': 1.15, + 'claude-sonnet-4-5-20250929': 0.95, + 'claude-sonnet-4-20250514': 0.92, + 'claude-haiku-4-5-20251001': 0.7, + 'claude-3-5-haiku-20241022': 0.68, +}; + +const LATENCY_PRIORS: Record = { + 'gpt-5': 0.45, + 'gpt-5-mini': 0.72, + 'gpt-5-nano': 0.9, + 'gpt-4o': 0.58, + 'gpt-4o-mini': 0.82, + 'o1': 0.35, + 'o1-mini': 0.62, + 
'o3-mini': 0.7, + 'claude-opus-4-5-20251101': 0.4, + 'claude-opus-4-20250514': 0.44, + 'claude-sonnet-4-5-20250929': 0.6, + 'claude-sonnet-4-20250514': 0.63, + 'claude-haiku-4-5-20251001': 0.85, + 'claude-3-5-haiku-20241022': 0.86, +}; + +const QUALITY_PRIORS: Record = { + 'gpt-5': 0.95, + 'gpt-5-mini': 0.86, + 'gpt-5-nano': 0.74, + 'gpt-4o': 0.9, + 'gpt-4o-mini': 0.82, + 'o1': 0.93, + 'o1-mini': 0.84, + 'o3-mini': 0.86, + 'claude-opus-4-5-20251101': 0.94, + 'claude-opus-4-20250514': 0.92, + 'claude-sonnet-4-5-20250929': 0.9, + 'claude-sonnet-4-20250514': 0.88, + 'claude-haiku-4-5-20251001': 0.82, + 'claude-3-5-haiku-20241022': 0.8, +}; + +const COMPLIANCE_ALLOWLISTS: Record> = { + strict: new Set(['gpt-4o', 'gpt-4o-mini', 'claude-sonnet-4-5-20250929', 'claude-haiku-4-5-20251001']), + regulated: new Set(['gpt-4o', 'claude-sonnet-4-5-20250929']), +}; + +const DEFAULT_ENERGY_COEFFICIENT = 0.9; +const DEFAULT_OUTPUT_WEIGHT = 1.5; + +const PRICING_MODELS = Object.keys(MODEL_PRICING_PER_MILLION); + +let openAIPatched = false; +let anthropicPatched = false; + +let originalOpenAICreate: CreateFunction | null = null; +let originalAnthropicCreate: CreateFunction | null = null; +let patchedOpenAIClass: { prototype?: { create?: CreateFunction } } | null = null; +let patchedAnthropicClass: { prototype?: { create?: CreateFunction } } | null = null; + +const defaultOpenAILoader = (): OpenAIModuleLike | null => { + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + return require('openai/resources/chat/completions') as OpenAIModuleLike; + } catch { + return null; + } +}; + +const defaultAnthropicLoader = (): AnthropicModuleLike | null => { + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + return require('@anthropic-ai/sdk/resources/messages') as AnthropicModuleLike; + } catch { + return null; + } +}; + +let loadOpenAIModule = defaultOpenAILoader; +let loadAnthropicModule = defaultAnthropicLoader; +let harnessRuntimeBindings: 
HarnessRuntime | null = null; + +function getHarnessRuntime(): HarnessRuntime { + if (!harnessRuntimeBindings) { + throw new Error('Harness runtime bindings not configured'); + } + return harnessRuntimeBindings; +} + +export function setHarnessRuntimeBindingsForInstrumentation(bindings: HarnessRuntime): void { + harnessRuntimeBindings = bindings; +} + +function nowMonotonicMs(): number { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof globalThis !== 'undefined' && (globalThis as any).performance?.now) { + return (globalThis as any).performance.now() as number; + } + + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process !== 'undefined' && process.hrtime?.bigint) { + return Number(process.hrtime.bigint()) / 1_000_000; + } + + return Date.now(); +} + +function normalizeModelName(model: string): string { + return model.trim().toLowerCase(); +} + +function estimateCost(model: string, promptTokens: number, completionTokens: number): number { + const price = MODEL_PRICING_PER_MILLION[normalizeModelName(model)]; + if (!price) { + return 0; + } + + return (promptTokens / 1_000_000) * price.input + (completionTokens / 1_000_000) * price.output; +} + +function estimateEnergy(model: string, promptTokens: number, completionTokens: number): number { + const coefficient = ENERGY_COEFFICIENTS[normalizeModelName(model)] ?? 
DEFAULT_ENERGY_COEFFICIENT; + return coefficient * (promptTokens + completionTokens * DEFAULT_OUTPUT_WEIGHT) / 1000; +} + +function modelTotalCost(model: string): number { + const price = MODEL_PRICING_PER_MILLION[normalizeModelName(model)]; + if (!price) { + return Number.POSITIVE_INFINITY; + } + return price.input + price.output; +} + +function selectCheaperModel(currentModel: string): string { + const currentCost = modelTotalCost(currentModel); + let bestModel = currentModel; + let bestCost = currentCost; + + for (const candidate of PRICING_MODELS) { + const candidateCost = modelTotalCost(candidate); + if (candidateCost < bestCost) { + bestModel = candidate; + bestCost = candidateCost; + } + } + + return bestModel; +} + +function selectLowerEnergyModel(currentModel: string): string { + const currentCoeff = ENERGY_COEFFICIENTS[normalizeModelName(currentModel)] ?? DEFAULT_ENERGY_COEFFICIENT; + let bestModel = currentModel; + let bestCoeff = currentCoeff; + + for (const candidate of PRICING_MODELS) { + const coeff = ENERGY_COEFFICIENTS[candidate] ?? DEFAULT_ENERGY_COEFFICIENT; + if (coeff < bestCoeff) { + bestModel = candidate; + bestCoeff = coeff; + } + } + + return bestModel; +} + +function selectFasterModel(currentModel: string): string { + const currentLatency = LATENCY_PRIORS[normalizeModelName(currentModel)] ?? 0.7; + let bestModel = currentModel; + let bestLatency = currentLatency; + + for (const candidate of PRICING_MODELS) { + const score = LATENCY_PRIORS[candidate] ?? 
0.7; + if (score > bestLatency) { + bestModel = candidate; + bestLatency = score; + } + } + + return bestModel; +} + +function normalizeWeights(weights: Record): Record { + const normalized: Record = {}; + let total = 0; + + for (const [key, value] of Object.entries(weights)) { + if (!Number.isFinite(value) || value <= 0) { + continue; + } + normalized[key] = value; + total += value; + } + + if (total <= 0) { + return {}; + } + + for (const key of Object.keys(normalized)) { + normalized[key] /= total; + } + + return normalized; +} + +function costUtility(model: string): number { + const costs = PRICING_MODELS.map(modelTotalCost).filter(Number.isFinite); + const min = Math.min(...costs); + const max = Math.max(...costs); + const current = modelTotalCost(model); + + if (!Number.isFinite(current) || max === min) { + return 0.5; + } + + return (max - current) / (max - min); +} + +function energyUtility(model: string): number { + const coeffs = PRICING_MODELS.map((name) => ENERGY_COEFFICIENTS[name] ?? DEFAULT_ENERGY_COEFFICIENT); + const min = Math.min(...coeffs); + const max = Math.max(...coeffs); + const current = ENERGY_COEFFICIENTS[normalizeModelName(model)] ?? DEFAULT_ENERGY_COEFFICIENT; + + if (max === min) { + return 0.5; + } + + return (max - current) / (max - min); +} + +function kpiScore(model: string, weights: Record): number { + const normalized = normalizeWeights(weights); + if (Object.keys(normalized).length === 0) { + return 0; + } + + const key = normalizeModelName(model); + const quality = QUALITY_PRIORS[key] ?? 0.7; + const latency = LATENCY_PRIORS[key] ?? 0.7; + const cost = costUtility(key); + const energy = energyUtility(key); + + return ( + (normalized.quality ?? 0) * quality + + (normalized.latency ?? 0) * latency + + (normalized.cost ?? 0) * cost + + (normalized.energy ?? 
0) * energy + ); +} + +function selectKPIWeightedModel(currentModel: string, weights: Record): string { + const normalized = normalizeWeights(weights); + if (Object.keys(normalized).length === 0) { + return currentModel; + } + + let bestModel = currentModel; + let bestScore = kpiScore(currentModel, normalized); + + for (const candidate of PRICING_MODELS) { + const score = kpiScore(candidate, normalized); + if (score > bestScore) { + bestModel = candidate; + bestScore = score; + } + } + + return bestModel; +} + +function extractOpenAIUsage(response: any): [number, number] { + const usage = response?.usage; + if (!usage || typeof usage !== 'object') { + return [0, 0]; + } + const promptTokens = Number(usage.prompt_tokens ?? usage.input_tokens ?? 0); + const completionTokens = Number(usage.completion_tokens ?? usage.output_tokens ?? 0); + return [ + Number.isFinite(promptTokens) ? promptTokens : 0, + Number.isFinite(completionTokens) ? completionTokens : 0, + ]; +} + +function extractAnthropicUsage(response: any): [number, number] { + const usage = response?.usage; + if (!usage || typeof usage !== 'object') { + return [0, 0]; + } + + const inputTokens = Number(usage.input_tokens ?? usage.prompt_tokens ?? 0); + const outputTokens = Number(usage.output_tokens ?? usage.completion_tokens ?? 0); + return [ + Number.isFinite(inputTokens) ? inputTokens : 0, + Number.isFinite(outputTokens) ? 
outputTokens : 0, + ]; +} + +function countOpenAIToolCalls(response: any): number { + const toolCalls = response?.choices?.[0]?.message?.tool_calls; + if (!Array.isArray(toolCalls)) { + return 0; + } + return toolCalls.length; +} + +function countAnthropicToolCalls(response: any): number { + const content = response?.content; + if (!Array.isArray(content)) { + return 0; + } + return content.filter((item: any) => item?.type === 'tool_use').length; +} + +function evaluatePreCallDecision(ctx: HarnessRunContextLike, model: string, hasTools: boolean): PreCallDecision { + if (ctx.budgetMax != null && ctx.cost >= ctx.budgetMax) { + return { action: 'stop', reason: 'budget_exceeded', targetModel: model }; + } + + if (hasTools && ctx.toolCallsMax != null && ctx.toolCalls >= ctx.toolCallsMax) { + return { action: 'deny_tool', reason: 'max_tool_calls_reached', targetModel: model }; + } + + if (ctx.compliance) { + const profile = COMPLIANCE_ALLOWLISTS[ctx.compliance.trim().toLowerCase()]; + if (profile) { + const normalized = normalizeModelName(model); + if (!profile.has(normalized)) { + const next = PRICING_MODELS.find((candidate) => profile.has(candidate)); + if (next) { + return { action: 'switch_model', reason: 'compliance_model_policy', targetModel: next }; + } + return { + action: hasTools ? 'deny_tool' : 'stop', + reason: hasTools ? 
'compliance_no_approved_tool_path' : 'compliance_no_approved_model', + targetModel: model, + }; + } + if (ctx.compliance.trim().toLowerCase() === 'strict' && hasTools) { + return { action: 'deny_tool', reason: 'compliance_tool_restriction', targetModel: model }; + } + } + } + + if (ctx.latencyMaxMs != null && ctx.latencyUsedMs >= ctx.latencyMaxMs) { + const faster = selectFasterModel(model); + if (normalizeModelName(faster) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'latency_limit_exceeded', targetModel: faster }; + } + return { action: 'stop', reason: 'latency_limit_exceeded', targetModel: model }; + } + + if (ctx.energyMax != null && ctx.energyUsed >= ctx.energyMax) { + const lower = selectLowerEnergyModel(model); + if (normalizeModelName(lower) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'energy_limit_exceeded', targetModel: lower }; + } + return { action: 'stop', reason: 'energy_limit_exceeded', targetModel: model }; + } + + if ( + ctx.budgetMax != null + && ctx.budgetMax > 0 + && ctx.budgetRemaining != null + && (ctx.budgetRemaining / ctx.budgetMax) < 0.2 + ) { + const cheaper = selectCheaperModel(model); + if (normalizeModelName(cheaper) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'budget_pressure', targetModel: cheaper }; + } + } + + if (ctx.kpiWeights && Object.keys(ctx.kpiWeights).length > 0) { + const candidate = selectKPIWeightedModel(model, ctx.kpiWeights); + if (normalizeModelName(candidate) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'kpi_weight_optimization', targetModel: candidate }; + } + } + + return { action: 'allow', reason: ctx.mode, targetModel: model }; +} + +function raiseStopError(ctx: HarnessRunContextLike, reason: string): never { + const runtime = getHarnessRuntime(); + if (reason === 'budget_exceeded') { + const remaining = Math.max(0, (ctx.budgetMax ?? 
0) - ctx.cost); + throw runtime.createBudgetExceededError( + `Budget exhausted: spent $${ctx.cost.toFixed(4)} of $${(ctx.budgetMax ?? 0).toFixed(4)} max`, + remaining, + ); + } + + throw runtime.createHarnessStopError(`cascadeflow harness stop: ${reason}`, reason); +} + +function updateContext( + ctx: HarnessRunContextLike, + mode: HarnessModeLike, + model: string, + promptTokens: number, + completionTokens: number, + toolCalls: number, + elapsedMs: number, + decision: PreCallDecision, + applied: boolean, +): void { + const cost = estimateCost(model, promptTokens, completionTokens); + const energy = estimateEnergy(model, promptTokens, completionTokens); + + ctx.cost += cost; + ctx.stepCount += 1; + ctx.toolCalls += toolCalls; + ctx.latencyUsedMs += elapsedMs; + ctx.energyUsed += energy; + + if (ctx.budgetMax != null) { + ctx.budgetRemaining = ctx.budgetMax - ctx.cost; + } + + ctx.record(decision.action, decision.reason, decision.targetModel, { + applied, + decisionMode: mode, + }); +} + +function isThenable(value: any): value is Promise { + return Boolean(value) && typeof value.then === 'function'; +} + +function makePatchedCreate(provider: 'openai' | 'anthropic', original: CreateFunction): CreateFunction { + return function patchedCreate(this: any, ...args: any[]): any { + const runtime = getHarnessRuntime(); + const activeRun = runtime.getCurrentRun(); + const mode = activeRun?.mode ?? runtime.getHarnessMode(); + + if (mode === 'off') { + return original.apply(this, args); + } + + const firstArg = args[0]; + const request = firstArg && typeof firstArg === 'object' ? { ...firstArg } : {}; + const model = typeof request.model === 'string' ? request.model : 'unknown'; + const hasTools = Array.isArray(request.tools) && request.tools.length > 0; + + const decision = activeRun ? 
evaluatePreCallDecision(activeRun, model, hasTools) : { + action: 'allow' as const, + reason: mode, + targetModel: model, + }; + + let applied = decision.action === 'allow'; + let effectiveModel = model; + + if (activeRun && mode === 'enforce') { + if (decision.action === 'stop') { + activeRun.record('stop', decision.reason, model, { + applied: true, + decisionMode: mode, + }); + raiseStopError(activeRun, decision.reason); + } + + if (decision.action === 'switch_model') { + if (normalizeModelName(decision.targetModel) !== normalizeModelName(model)) { + request.model = decision.targetModel; + effectiveModel = decision.targetModel; + applied = true; + } else { + applied = false; + } + } + + if (decision.action === 'deny_tool') { + if (Array.isArray(request.tools) && request.tools.length > 0) { + request.tools = []; + applied = true; + } else { + applied = false; + } + } + } else if (decision.action !== 'allow') { + applied = false; + } + + const interceptedArgs = firstArg && typeof firstArg === 'object' + ? 
[request, ...args.slice(1)] + : args; + + const isStream = Boolean(request.stream); + const startedAt = nowMonotonicMs(); + const result = original.apply(this, interceptedArgs); + + if (!activeRun) { + return result; + } + + const finalize = (response: any): any => { + const elapsedMs = Math.max(0, nowMonotonicMs() - startedAt); + + let promptTokens = 0; + let completionTokens = 0; + let toolCallCount = 0; + + if (!isStream) { + if (provider === 'openai') { + [promptTokens, completionTokens] = extractOpenAIUsage(response); + toolCallCount = countOpenAIToolCalls(response); + } else { + [promptTokens, completionTokens] = extractAnthropicUsage(response); + toolCallCount = countAnthropicToolCalls(response); + } + } + + updateContext( + activeRun, + mode, + effectiveModel, + promptTokens, + completionTokens, + toolCallCount, + elapsedMs, + decision, + applied, + ); + + return response; + }; + + if (isThenable(result)) { + result + .then((response) => { + finalize(response); + }) + .catch(() => { + // fail-open: harness instrumentation errors must not crash user flow. 
+ }); + return result; + } + + return finalize(result); + }; +} + +export function detectOpenAIInstrumentationTarget(): boolean { + const module = loadOpenAIModule(); + return Boolean(module?.Completions?.prototype?.create); +} + +export function detectAnthropicInstrumentationTarget(): boolean { + const module = loadAnthropicModule(); + return Boolean(module?.Messages?.prototype?.create); +} + +export function patchOpenAI(): boolean { + if (openAIPatched) { + return true; + } + + const module = loadOpenAIModule(); + const cls = module?.Completions; + const prototype = cls?.prototype; + const create = prototype?.create; + + if (!cls || !prototype || typeof create !== 'function') { + return false; + } + + originalOpenAICreate = create; + patchedOpenAIClass = cls; + prototype.create = makePatchedCreate('openai', create); + openAIPatched = true; + return true; +} + +export function patchAnthropic(): boolean { + if (anthropicPatched) { + return true; + } + + const module = loadAnthropicModule(); + const cls = module?.Messages; + const prototype = cls?.prototype; + const create = prototype?.create; + + if (!cls || !prototype || typeof create !== 'function') { + return false; + } + + originalAnthropicCreate = create; + patchedAnthropicClass = cls; + prototype.create = makePatchedCreate('anthropic', create); + anthropicPatched = true; + return true; +} + +export function unpatchOpenAI(): void { + if (!openAIPatched) { + return; + } + + if (patchedOpenAIClass?.prototype && originalOpenAICreate) { + patchedOpenAIClass.prototype.create = originalOpenAICreate; + } + + openAIPatched = false; + originalOpenAICreate = null; + patchedOpenAIClass = null; +} + +export function unpatchAnthropic(): void { + if (!anthropicPatched) { + return; + } + + if (patchedAnthropicClass?.prototype && originalAnthropicCreate) { + patchedAnthropicClass.prototype.create = originalAnthropicCreate; + } + + anthropicPatched = false; + originalAnthropicCreate = null; + patchedAnthropicClass = null; +} + 
+export function isOpenAIPatched(): boolean { + return openAIPatched; +} + +export function isAnthropicPatched(): boolean { + return anthropicPatched; +} + +export function isPatched(): boolean { + return openAIPatched || anthropicPatched; +} + +export function __setInstrumentationLoadersForTest(loaders: { + openai?: () => OpenAIModuleLike | null; + anthropic?: () => AnthropicModuleLike | null; +}): void { + if (loaders.openai) { + loadOpenAIModule = loaders.openai; + } + if (loaders.anthropic) { + loadAnthropicModule = loaders.anthropic; + } +} + +export function __resetInstrumentationLoadersForTest(): void { + loadOpenAIModule = defaultOpenAILoader; + loadAnthropicModule = defaultAnthropicLoader; +} + +export function __resetInstrumentationStateForTest(): void { + unpatchOpenAI(); + unpatchAnthropic(); +} diff --git a/packages/core/src/harness.ts b/packages/core/src/harness.ts new file mode 100644 index 00000000..3815360e --- /dev/null +++ b/packages/core/src/harness.ts @@ -0,0 +1,754 @@ +import { + __resetInstrumentationStateForTest, + detectAnthropicInstrumentationTarget, + detectOpenAIInstrumentationTarget, + patchAnthropic, + patchOpenAI, + setHarnessRuntimeBindingsForInstrumentation, + unpatchAnthropic, + unpatchOpenAI, +} from './harness-instrument'; + +export type HarnessMode = 'off' | 'observe' | 'enforce'; + +export type HarnessConfig = { + mode: HarnessMode; + verbose: boolean; + budget?: number; + maxToolCalls?: number; + maxLatencyMs?: number; + maxEnergy?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; +}; + +export type HarnessInitOptions = Partial; + +export type HarnessRunOptions = { + budget?: number; + maxToolCalls?: number; + maxLatencyMs?: number; + maxEnergy?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; +}; + +export type HarnessInitReport = { + mode: HarnessMode; + instrumented: string[]; + detectedButNotInstrumented: string[]; + configSources: Record; +}; + +export type 
HarnessRecordOptions = { + applied?: boolean; + decisionMode?: HarnessMode; +}; + +export type HarnessTraceEntry = { + action: string; + reason: string; + model?: string; + runId: string; + mode: HarnessMode; + step: number; + timestampMs: number; + toolCallsTotal: number; + costTotal: number; + latencyUsedMs: number; + energyUsed: number; + budgetState: { + max?: number; + remaining?: number; + }; + applied?: boolean; + decisionMode?: HarnessMode; +}; + +export type HarnessRunSummary = { + runId: string; + mode: HarnessMode; + stepCount: number; + toolCalls: number; + cost: number; + savings: number; + latencyUsedMs: number; + energyUsed: number; + budgetMax?: number; + budgetRemaining?: number; + lastAction: string; + modelUsed?: string; + durationMs?: number; +}; + +export class HarnessStopError extends Error { + reason: string; + + constructor(message: string, reason = 'stop') { + super(message); + this.name = 'HarnessStopError'; + this.reason = reason; + } +} + +export class BudgetExceededError extends HarnessStopError { + remaining: number; + + constructor(message: string, remaining = 0) { + super(message, 'budget_exceeded'); + this.name = 'BudgetExceededError'; + this.remaining = remaining; + } +} + +function randomRunId(): string { + return Math.random().toString(36).slice(2, 14); +} + +function nowMonotonicMs(): number { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof globalThis !== 'undefined' && (globalThis as any).performance?.now) { + return (globalThis as any).performance.now() as number; + } + + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process !== 'undefined' && process.hrtime?.bigint) { + return Number(process.hrtime.bigint()) / 1_000_000; + } + + return Date.now(); +} + +const MAX_ACTION_LEN = 64; +const MAX_REASON_LEN = 160; +const MAX_MODEL_LEN = 128; + +function sanitizeTraceValue(value: unknown, maxLength: number): string | undefined { + if (value == null) { + 
return undefined; + } + + const text = String(value).replace(/\r?\n/g, ' ').trim(); + if (!text) { + return undefined; + } + + if (text.length <= maxLength) { + return text; + } + + return `${text.slice(0, Math.max(0, maxLength - 3))}...`; +} + +export class HarnessRunContext { + runId: string; + startedAtMs: number; + endedAtMs?: number; + durationMs?: number; + + mode: HarnessMode; + budgetMax?: number; + toolCallsMax?: number; + latencyMaxMs?: number; + energyMax?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; + + cost = 0; + savings = 0; + toolCalls = 0; + stepCount = 0; + latencyUsedMs = 0; + energyUsed = 0; + verbose = false; + budgetRemaining?: number; + modelUsed?: string; + lastAction = 'allow'; + draftAccepted?: boolean; + + private readonly _startedMonotonic: number; + private readonly _trace: HarnessTraceEntry[] = []; + private _finalized = false; + + constructor(config: { + mode: HarnessMode; + budgetMax?: number; + toolCallsMax?: number; + latencyMaxMs?: number; + energyMax?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; + verbose?: boolean; + }) { + this.runId = randomRunId(); + this.startedAtMs = Date.now(); + this._startedMonotonic = nowMonotonicMs(); + + this.mode = config.mode; + this.budgetMax = config.budgetMax; + this.toolCallsMax = config.toolCallsMax; + this.latencyMaxMs = config.latencyMaxMs; + this.energyMax = config.energyMax; + this.kpiTargets = config.kpiTargets; + this.kpiWeights = config.kpiWeights; + this.compliance = config.compliance; + this.verbose = Boolean(config.verbose); + + if (config.budgetMax != null) { + this.budgetRemaining = config.budgetMax; + } + } + + finish(): void { + if (this._finalized) { + return; + } + + this._finalized = true; + this.endedAtMs = Date.now(); + this.durationMs = Math.max(0, nowMonotonicMs() - this._startedMonotonic); + + if (this.verbose && this.mode !== 'off' && this.stepCount > 0) { + // Keep logging cheap and controlled. 
+ // eslint-disable-next-line no-console + console.info( + '[cascadeflow.harness] run summary', + { + runId: this.runId, + mode: this.mode, + steps: this.stepCount, + toolCalls: this.toolCalls, + cost: this.cost, + latencyMs: this.latencyUsedMs, + energy: this.energyUsed, + lastAction: this.lastAction, + model: this.modelUsed, + budgetRemaining: this.budgetRemaining, + durationMs: this.durationMs, + }, + ); + } + } + + record(action: string, reason: string, model?: string, options: HarnessRecordOptions = {}): void { + let safeAction = sanitizeTraceValue(action, MAX_ACTION_LEN); + if (!safeAction) { + safeAction = 'allow'; + } + + const safeReason = sanitizeTraceValue(reason, MAX_REASON_LEN) ?? 'unspecified'; + const safeModel = sanitizeTraceValue(model, MAX_MODEL_LEN); + + this.lastAction = safeAction; + this.modelUsed = safeModel; + + const entry: HarnessTraceEntry = { + action: safeAction, + reason: safeReason, + model: safeModel, + runId: this.runId, + mode: this.mode, + step: this.stepCount, + timestampMs: Date.now(), + toolCallsTotal: this.toolCalls, + costTotal: this.cost, + latencyUsedMs: this.latencyUsedMs, + energyUsed: this.energyUsed, + budgetState: { + max: this.budgetMax, + remaining: this.budgetRemaining, + }, + }; + + if (options.applied != null) { + entry.applied = options.applied; + } + + if (options.decisionMode != null) { + entry.decisionMode = options.decisionMode; + } + + this._trace.push(entry); + } + + trace(): HarnessTraceEntry[] { + return [...this._trace]; + } + + summary(): HarnessRunSummary { + return { + runId: this.runId, + mode: this.mode, + stepCount: this.stepCount, + toolCalls: this.toolCalls, + cost: this.cost, + savings: this.savings, + latencyUsedMs: this.latencyUsedMs, + energyUsed: this.energyUsed, + budgetMax: this.budgetMax, + budgetRemaining: this.budgetRemaining, + lastAction: this.lastAction, + modelUsed: this.modelUsed, + durationMs: this.durationMs, + }; + } +} + +type ConfigSource = 'code' | 'env' | 'file' | 'default'; 
+ +type ConfigWithSources = { + config: HarnessConfig; + sources: Record; +}; + +let _harnessConfig: HarnessConfig = { + mode: 'off', + verbose: false, +}; + +let _isInstrumented = false; +let fallbackCurrentRun: HarnessRunContext | null = null; + +let asyncLocalStorageInstance: { run: (store: HarnessRunContext, callback: () => Promise) => Promise; getStore: () => HarnessRunContext | undefined } | null = null; + +function getAsyncLocalStorage(): typeof asyncLocalStorageInstance { + if (asyncLocalStorageInstance) { + return asyncLocalStorageInstance; + } + + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const mod = require('node:async_hooks') as { + AsyncLocalStorage: new () => { run: (store: T, callback: () => Promise) => Promise; getStore: () => T | undefined }; + }; + + asyncLocalStorageInstance = new mod.AsyncLocalStorage(); + } catch { + asyncLocalStorageInstance = null; + } + + return asyncLocalStorageInstance; +} + +function parseBoolean(raw: string): boolean { + const normalized = raw.trim().toLowerCase(); + return normalized === '1' || normalized === 'true' || normalized === 'yes' || normalized === 'on'; +} + +function parseNumber(raw: string): number { + const value = Number(raw); + if (!Number.isFinite(value)) { + throw new Error(`Invalid numeric value: ${raw}`); + } + return value; +} + +function parseJSONMap(raw: string): Record { + const parsed = JSON.parse(raw); + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + throw new Error('Expected object'); + } + + const result: Record = {}; + for (const [key, value] of Object.entries(parsed as Record)) { + result[String(key)] = Number(value); + } + return result; +} + +function normalizeMode(mode: unknown): HarnessMode { + if (mode === 'off' || mode === 'observe' || mode === 'enforce') { + return mode; + } + + throw new Error('mode must be one of: off, observe, enforce'); +} + +function normalizeConfigRecord(raw: Record): HarnessInitOptions { + const out: 
HarnessInitOptions = {}; + + const mode = raw.mode ?? raw.harness_mode; + if (typeof mode === 'string') { + out.mode = normalizeMode(mode); + } + + const verbose = raw.verbose ?? raw.harness_verbose; + if (typeof verbose === 'boolean') { + out.verbose = verbose; + } + + const budget = raw.budget ?? raw.max_budget; + if (typeof budget === 'number') { + out.budget = budget; + } + + const maxToolCalls = raw.maxToolCalls ?? raw.max_tool_calls; + if (typeof maxToolCalls === 'number') { + out.maxToolCalls = maxToolCalls; + } + + const maxLatencyMs = raw.maxLatencyMs ?? raw.max_latency_ms; + if (typeof maxLatencyMs === 'number') { + out.maxLatencyMs = maxLatencyMs; + } + + const maxEnergy = raw.maxEnergy ?? raw.max_energy; + if (typeof maxEnergy === 'number') { + out.maxEnergy = maxEnergy; + } + + const kpiTargets = raw.kpiTargets ?? raw.kpi_targets; + if (kpiTargets && typeof kpiTargets === 'object' && !Array.isArray(kpiTargets)) { + out.kpiTargets = kpiTargets as Record; + } + + const kpiWeights = raw.kpiWeights ?? raw.kpi_weights; + if (kpiWeights && typeof kpiWeights === 'object' && !Array.isArray(kpiWeights)) { + out.kpiWeights = kpiWeights as Record; + } + + const compliance = raw.compliance; + if (typeof compliance === 'string') { + out.compliance = compliance; + } + + return out; +} + +function readEnvConfig(): HarnessInitOptions { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process === 'undefined' || !process.env) { + return {}; + } + + const env = process.env; + const config: HarnessInitOptions = {}; + + const mode = env.CASCADEFLOW_HARNESS_MODE ?? env.CASCADEFLOW_MODE; + if (mode) { + config.mode = normalizeMode(mode); + } + + if (env.CASCADEFLOW_HARNESS_VERBOSE != null) { + config.verbose = parseBoolean(env.CASCADEFLOW_HARNESS_VERBOSE); + } + + const budget = env.CASCADEFLOW_HARNESS_BUDGET ?? 
env.CASCADEFLOW_BUDGET; + if (budget != null) { + config.budget = parseNumber(budget); + } + + if (env.CASCADEFLOW_HARNESS_MAX_TOOL_CALLS != null) { + config.maxToolCalls = parseNumber(env.CASCADEFLOW_HARNESS_MAX_TOOL_CALLS); + } + + if (env.CASCADEFLOW_HARNESS_MAX_LATENCY_MS != null) { + config.maxLatencyMs = parseNumber(env.CASCADEFLOW_HARNESS_MAX_LATENCY_MS); + } + + if (env.CASCADEFLOW_HARNESS_MAX_ENERGY != null) { + config.maxEnergy = parseNumber(env.CASCADEFLOW_HARNESS_MAX_ENERGY); + } + + if (env.CASCADEFLOW_HARNESS_KPI_TARGETS != null) { + config.kpiTargets = parseJSONMap(env.CASCADEFLOW_HARNESS_KPI_TARGETS); + } + + if (env.CASCADEFLOW_HARNESS_KPI_WEIGHTS != null) { + config.kpiWeights = parseJSONMap(env.CASCADEFLOW_HARNESS_KPI_WEIGHTS); + } + + if (env.CASCADEFLOW_HARNESS_COMPLIANCE != null) { + config.compliance = env.CASCADEFLOW_HARNESS_COMPLIANCE; + } + + return config; +} + +function readFileConfig(): HarnessInitOptions { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process === 'undefined' || !process.cwd) { + return {}; + } + + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const fs = require('node:fs') as typeof import('node:fs'); + // eslint-disable-next-line @typescript-eslint/no-var-requires + const path = require('node:path') as typeof import('node:path'); + + const configuredPath = process.env.CASCADEFLOW_CONFIG; + const candidates = configuredPath + ? [configuredPath] + : ['cascadeflow.json', 'cascadeflow.config.json']; + + for (const candidate of candidates) { + const full = path.isAbsolute(candidate) ? candidate : path.join(process.cwd(), candidate); + if (!fs.existsSync(full)) { + continue; + } + + const content = fs.readFileSync(full, 'utf8'); + const parsed = JSON.parse(content) as Record; + const harnessBlock = ( + parsed.harness && typeof parsed.harness === 'object' && !Array.isArray(parsed.harness) + ) + ? 
(parsed.harness as Record) + : parsed; + + return normalizeConfigRecord(harnessBlock); + } + } catch { + return {}; + } + + return {}; +} + +function resolveConfig(options: HarnessInitOptions): ConfigWithSources { + const env = readEnvConfig(); + const file = readFileConfig(); + const sources: Record = {}; + + const resolve = ( + key: keyof HarnessConfig, + explicit: T | undefined, + envValue: T | undefined, + fileValue: T | undefined, + defaultValue: T, + ): T => { + if (explicit !== undefined) { + sources[key] = 'code'; + return explicit; + } + if (envValue !== undefined) { + sources[key] = 'env'; + return envValue; + } + if (fileValue !== undefined) { + sources[key] = 'file'; + return fileValue; + } + sources[key] = 'default'; + return defaultValue; + }; + + const mode = resolve('mode', options.mode, env.mode, file.mode, 'off'); + const verbose = resolve('verbose', options.verbose, env.verbose, file.verbose, false); + const budget = resolve('budget', options.budget, env.budget, file.budget, undefined); + const maxToolCalls = resolve( + 'maxToolCalls', + options.maxToolCalls, + env.maxToolCalls, + file.maxToolCalls, + undefined, + ); + const maxLatencyMs = resolve( + 'maxLatencyMs', + options.maxLatencyMs, + env.maxLatencyMs, + file.maxLatencyMs, + undefined, + ); + const maxEnergy = resolve('maxEnergy', options.maxEnergy, env.maxEnergy, file.maxEnergy, undefined); + const kpiTargets = resolve( + 'kpiTargets', + options.kpiTargets, + env.kpiTargets, + file.kpiTargets, + undefined, + ); + const kpiWeights = resolve( + 'kpiWeights', + options.kpiWeights, + env.kpiWeights, + file.kpiWeights, + undefined, + ); + const compliance = resolve( + 'compliance', + options.compliance, + env.compliance, + file.compliance, + undefined, + ); + + return { + config: { + mode, + verbose, + budget, + maxToolCalls, + maxLatencyMs, + maxEnergy, + kpiTargets, + kpiWeights, + compliance, + }, + sources, + }; +} + +export function getHarnessConfig(): HarnessConfig { + return { 
..._harnessConfig };
+}
+
+export function getCurrentRun(): HarnessRunContext | null {
+  const als = getAsyncLocalStorage();
+  if (als) {
+    return als.getStore() ?? null;
+  }
+
+  return fallbackCurrentRun;
+}
+
+export function reset(): void {
+  unpatchOpenAI();
+  unpatchAnthropic();
+  __resetInstrumentationStateForTest();
+
+  _harnessConfig = { mode: 'off', verbose: false };
+  _isInstrumented = false;
+  fallbackCurrentRun = null;
+}
+
+export function init(options: HarnessInitOptions = {}): HarnessInitReport {
+  const { config, sources } = resolveConfig(options);
+  config.mode = normalizeMode(config.mode);
+
+  _harnessConfig = config;
+
+  const instrumented: string[] = [];
+  const detectedButNotInstrumented: string[] = [];
+
+  const openaiDetected = detectOpenAIInstrumentationTarget();
+  const anthropicDetected = detectAnthropicInstrumentationTarget();
+
+  if (config.mode !== 'off' && openaiDetected) {
+    if (patchOpenAI()) {
+      instrumented.push('openai');
+    } else {
+      detectedButNotInstrumented.push('openai');
+    }
+  }
+
+  if (config.mode !== 'off' && anthropicDetected) {
+    if (patchAnthropic()) {
+      instrumented.push('anthropic');
+    } else {
+      detectedButNotInstrumented.push('anthropic');
+    }
+  }
+
+  if (config.mode === 'off') {
+    unpatchOpenAI();
+    unpatchAnthropic();
+  }
+
+  _isInstrumented = true;
+
+  if (config.verbose) {
+    // eslint-disable-next-line no-console
+    console.info('[cascadeflow.harness] init', {
+      mode: config.mode,
+      instrumented,
+      detectedButNotInstrumented,
+    });
+  }
+
+  return {
+    mode: config.mode,
+    instrumented,
+    detectedButNotInstrumented,
+    configSources: sources,
+  };
+}
+
+type RunCallback<T> = (run: HarnessRunContext) => Promise<T> | T;
+
+async function executeScopedRun<T>(runContext: HarnessRunContext, fn: RunCallback<T>): Promise<T> {
+  try {
+    return await fn(runContext);
+  } finally {
+    runContext.finish();
+  }
+}
+
+export async function run<T>(callback: RunCallback<T>): Promise<T>;
+export async function run<T>(options: HarnessRunOptions, callback: RunCallback<T>): Promise<T>;
+export async function run<T>(
+  optionsOrCallback: HarnessRunOptions | RunCallback<T>,
+  callback?: RunCallback<T>,
+): Promise<T> {
+  const options = typeof optionsOrCallback === 'function' ? {} : optionsOrCallback;
+  const cb = (typeof optionsOrCallback === 'function' ? optionsOrCallback : callback) as RunCallback<T> | undefined;
+
+  if (!cb) {
+    throw new Error('run() requires a callback: run(options?, async (run) => { ... })');
+  }
+
+  const cfg = getHarnessConfig();
+  const runContext = new HarnessRunContext({
+    mode: cfg.mode,
+    budgetMax: options.budget ?? cfg.budget,
+    toolCallsMax: options.maxToolCalls ?? cfg.maxToolCalls,
+    latencyMaxMs: options.maxLatencyMs ?? cfg.maxLatencyMs,
+    energyMax: options.maxEnergy ?? cfg.maxEnergy,
+    kpiTargets: options.kpiTargets ?? cfg.kpiTargets,
+    kpiWeights: options.kpiWeights ?? cfg.kpiWeights,
+    compliance: options.compliance ?? cfg.compliance,
+    verbose: cfg.verbose,
+  });
+
+  const als = getAsyncLocalStorage();
+  if (als) {
+    return als.run(runContext, async () => executeScopedRun(runContext, cb)) as Promise<T>;
+  }
+
+  const previous = fallbackCurrentRun;
+  fallbackCurrentRun = runContext;
+  try {
+    return await executeScopedRun(runContext, cb);
+  } finally {
+    fallbackCurrentRun = previous;
+  }
+}
+
+export function agent(policy: HarnessRunOptions): <T extends (...args: any[]) => any>(fn: T) => T {
+  return <T extends (...args: any[]) => any>(fn: T): T => {
+    const wrapped = ((...args: any[]) => fn(...args)) as T;
+    (wrapped as any).__cascadeflow_agent_policy__ = {
+      budget: policy.budget,
+      kpiTargets: policy.kpiTargets,
+      kpiWeights: policy.kpiWeights,
+      compliance: policy.compliance,
+    };
+    return wrapped;
+  };
+}
+
+setHarnessRuntimeBindingsForInstrumentation({
+  getCurrentRun,
+  getHarnessMode: () => getHarnessConfig().mode,
+  createBudgetExceededError: (message: string, remaining?: number) =>
+    new BudgetExceededError(message, remaining),
+  createHarnessStopError: (message: string, reason?: string) =>
+    new HarnessStopError(message, reason),
+});
+
+export const
cascadeflow = { + init, + run, + agent, + reset, + getHarnessConfig, + getCurrentRun, +}; + +export function isHarnessInstrumented(): boolean { + return _isInstrumented; +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 29819183..c919f67e 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -42,6 +42,31 @@ export { DEFAULT_CASCADE_CONFIG, } from './config'; +// Harness API (v2.1+) +export type { + HarnessMode, + HarnessConfig, + HarnessInitOptions, + HarnessRunOptions, + HarnessInitReport, + HarnessRecordOptions, + HarnessTraceEntry, + HarnessRunSummary, +} from './harness'; +export { + HarnessRunContext, + HarnessStopError, + BudgetExceededError, + init, + run, + agent as harnessAgent, + reset as resetHarness, + getHarnessConfig, + getCurrentRun, + isHarnessInstrumented, + cascadeflow, +} from './harness'; + // Results export type { CascadeResult } from './result'; export { resultToObject } from './result'; From de7db49c1db7563d68f7e8225f3aea69c9aac0b9 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 16:05:14 +0100 Subject: [PATCH 35/49] test(harness): add comprehensive Anthropic auto-instrumentation tests Add 29 tests covering the Anthropic Python SDK monkey-patching that was introduced in v2.1. Tests cover usage extraction, tool call counting, sync/async wrapper behavior, budget enforcement in enforce mode, stream passthrough, cost/energy/latency tracking, and init/reset lifecycle. 
--- tests/test_harness_instrument.py | 384 ++++++++++++++++++++++++++++++- 1 file changed, 383 insertions(+), 1 deletion(-) diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index ca1f9a07..4931f093 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -1,4 +1,4 @@ -"""Tests for cascadeflow.harness.instrument — OpenAI auto-instrumentation.""" +"""Tests for cascadeflow.harness.instrument — OpenAI + Anthropic auto-instrumentation.""" from __future__ import annotations @@ -15,8 +15,12 @@ from cascadeflow.harness.instrument import ( _InstrumentedAsyncStream, _InstrumentedStream, + _count_tool_calls_in_anthropic_response, _estimate_cost, _estimate_energy, + _extract_anthropic_usage, + _make_patched_anthropic_async_create, + _make_patched_anthropic_create, _make_patched_async_create, _make_patched_create, is_anthropic_patched, @@ -967,3 +971,381 @@ def test_non_stream_does_not_inject_stream_options(self) -> None: call_kwargs = original.call_args[1] assert "stream_options" not in call_kwargs + + +# =========================================================================== +# Anthropic instrumentation tests +# =========================================================================== + + +def _mock_anthropic_usage(input_tokens: int = 100, output_tokens: int = 50) -> MagicMock: + u = MagicMock() + u.input_tokens = input_tokens + u.output_tokens = output_tokens + return u + + +def _mock_anthropic_response( + input_tokens: int = 100, + output_tokens: int = 50, + content: Optional[list] = None, +) -> MagicMock: + resp = MagicMock() + resp.usage = _mock_anthropic_usage(input_tokens, output_tokens) + resp.content = content or [] + return resp + + +def _mock_tool_use_block() -> MagicMock: + block = MagicMock() + block.type = "tool_use" + return block + + +def _mock_text_block() -> MagicMock: + block = MagicMock() + block.type = "text" + return block + + +# 
--------------------------------------------------------------------------- +# Anthropic usage extraction +# --------------------------------------------------------------------------- + + +class TestAnthropicUsageExtraction: + def test_extract_usage(self) -> None: + resp = _mock_anthropic_response(input_tokens=200, output_tokens=100) + inp, out = _extract_anthropic_usage(resp) + assert inp == 200 + assert out == 100 + + def test_extract_usage_none(self) -> None: + resp = MagicMock() + resp.usage = None + inp, out = _extract_anthropic_usage(resp) + assert inp == 0 + assert out == 0 + + +# --------------------------------------------------------------------------- +# Anthropic tool call counting +# --------------------------------------------------------------------------- + + +class TestAnthropicToolCallCounting: + def test_counts_tool_use_blocks(self) -> None: + resp = _mock_anthropic_response( + content=[_mock_text_block(), _mock_tool_use_block(), _mock_tool_use_block()] + ) + assert _count_tool_calls_in_anthropic_response(resp) == 2 + + def test_no_content(self) -> None: + resp = MagicMock() + resp.content = None + assert _count_tool_calls_in_anthropic_response(resp) == 0 + + def test_empty_content(self) -> None: + resp = _mock_anthropic_response(content=[]) + assert _count_tool_calls_in_anthropic_response(resp) == 0 + + def test_text_only(self) -> None: + resp = _mock_anthropic_response(content=[_mock_text_block()]) + assert _count_tool_calls_in_anthropic_response(resp) == 0 + + +# --------------------------------------------------------------------------- +# Anthropic sync wrapper +# --------------------------------------------------------------------------- + + +class TestAnthropicSyncWrapper: + def test_observe_passes_through_response(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + result = 
wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + original.assert_called_once() + + def test_observe_tracks_cost(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + # claude-sonnet-4: $3.00/1M in + $15.00/1M out = $18.00 + assert ctx.cost == pytest.approx(18.0, abs=0.01) + + def test_observe_tracks_step_count(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.step_count == 2 + + def test_observe_tracks_tool_calls(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response( + content=[_mock_tool_use_block(), _mock_tool_use_block()] + ) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.tool_calls == 2 + + def test_observe_tracks_energy(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1000, output_tokens=500) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + # claude-sonnet-4 uses default coefficient=1.0, output_weight=1.5 + # energy = 1.0 * (1000 + 500 * 1.5) = 1750.0 + assert ctx.energy_used == pytest.approx(1750.0) + + def test_observe_tracks_latency(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + 
wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.latency_used_ms > 0 + + def test_budget_remaining_decreases(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.budget_remaining is not None + assert ctx.budget_remaining == pytest.approx(100.0 - 18.0, abs=0.01) + + def test_trace_records_model_and_mode(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + trace = ctx.trace() + assert len(trace) == 1 + assert trace[0]["action"] == "allow" + assert trace[0]["reason"] == "observe" + assert trace[0]["model"] == "claude-sonnet-4" + + def test_off_mode_passthrough_no_tracking(self) -> None: + init(mode="off") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run() as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + assert ctx.cost == 0.0 + assert ctx.step_count == 0 + + def test_no_run_scope_returns_response(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + result = wrapper(MagicMock(), model="claude-sonnet-4") + assert result is mock_resp + + def test_stream_passthrough_no_usage_tracking(self) -> None: + """Anthropic streams are not instrumented in V2.1 — verify passthrough.""" + init(mode="observe") + mock_stream = MagicMock() 
+ original = MagicMock(return_value=mock_stream) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + + assert result is mock_stream + assert ctx.cost == 0.0 + assert ctx.step_count == 0 + + def test_multiple_calls_accumulate(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.cost == pytest.approx(36.0, abs=0.01) + assert ctx.step_count == 2 + + +# --------------------------------------------------------------------------- +# Anthropic async wrapper +# --------------------------------------------------------------------------- + + +class TestAnthropicAsyncWrapper: + async def test_observe_passes_through_response(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + + async def test_observe_tracks_cost(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=100.0) as ctx: + await wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.cost == pytest.approx(18.0, abs=0.01) + assert ctx.step_count == 1 + + async def test_off_mode_passthrough(self) -> None: + init(mode="off") + mock_resp = _mock_anthropic_response() + original = AsyncMock(return_value=mock_resp) + wrapper = 
_make_patched_anthropic_async_create(original) + + async with run() as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + assert ctx.cost == 0.0 + + async def test_stream_passthrough(self) -> None: + init(mode="observe") + mock_stream = AsyncMock() + original = AsyncMock(return_value=mock_stream) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + + assert result is mock_stream + assert ctx.cost == 0.0 + + +# --------------------------------------------------------------------------- +# Anthropic enforce mode +# --------------------------------------------------------------------------- + + +class TestAnthropicEnforceMode: + def test_enforce_trace_records_enforce_reason(self) -> None: + init(mode="enforce") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + trace = ctx.trace() + assert trace[0]["reason"] == "enforce" + + def test_enforce_raises_on_budget_exhausted(self) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError + + init(mode="enforce") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=0.001) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + with pytest.raises(BudgetExceededError): + wrapper(MagicMock(), model="claude-sonnet-4") + + def test_observe_does_not_raise_on_budget_exhausted(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with 
run(budget=0.001) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.cost > ctx.budget_max + + async def test_async_enforce_raises_on_budget_exhausted(self) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError + + init(mode="enforce") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=0.001) as ctx: + await wrapper(MagicMock(), model="claude-sonnet-4") + with pytest.raises(BudgetExceededError): + await wrapper(MagicMock(), model="claude-sonnet-4") + + +# --------------------------------------------------------------------------- +# Anthropic init() integration +# --------------------------------------------------------------------------- + + +class TestAnthropicInitIntegration: + def test_init_observe_patches_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + report = init(mode="observe") + assert "anthropic" in report.instrumented + assert is_anthropic_patched() + + def test_init_off_unpatches_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + init(mode="observe") + assert is_anthropic_patched() + init(mode="off") + assert not is_anthropic_patched() + + def test_reset_unpatches_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + init(mode="observe") + assert is_anthropic_patched() + reset() + assert not is_anthropic_patched() From de4a638c6526623c55da650c46545fb57195af49 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 16:22:42 +0100 Subject: [PATCH 36/49] feat(harness): instrument Anthropic streaming usage and tool calls --- cascadeflow/harness/instrument.py | 190 ++++++++++++++++++++++++++++-- 
tests/test_harness_instrument.py | 84 +++++++++++-- 2 files changed, 252 insertions(+), 22 deletions(-) diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index 566f15d0..5632884c 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -663,6 +663,168 @@ async def __aexit__(self, *args: Any) -> bool: return False +class _InstrumentedAnthropicStreamBase: + """Shared stream-wrapper logic for sync and async Anthropic streams.""" + + __slots__ = ( + "_stream", + "_ctx", + "_model", + "_start_time", + "_pre_action", + "_pre_reason", + "_pre_model", + "_pre_applied", + "_decision_mode", + "_input_tokens", + "_output_tokens", + "_tool_call_count", + "_finalized", + ) + + def __init__( + self, + stream: Any, + ctx: Any, + model: str, + start_time: float, + pre_action: str = "allow", + pre_reason: str = "observe", + pre_model: str | None = None, + pre_applied: bool = True, + decision_mode: str = "observe", + ) -> None: + self._stream = stream + self._ctx = ctx + self._model = model + self._start_time = start_time + self._pre_action = pre_action + self._pre_reason = pre_reason + self._pre_model = pre_model or model + self._pre_applied = pre_applied + self._decision_mode = decision_mode + self._input_tokens: int = 0 + self._output_tokens: int = 0 + self._tool_call_count: int = 0 + self._finalized: bool = False + + def close(self) -> None: + self._finalize() + if hasattr(self._stream, "close"): + self._stream.close() + + def _inspect_event(self, event: Any) -> None: + event_type = getattr(event, "type", None) + + if event_type == "message_start": + message = getattr(event, "message", None) + usage = getattr(message, "usage", None) + if usage is not None: + input_tokens = getattr(usage, "input_tokens", None) + output_tokens = getattr(usage, "output_tokens", None) + if isinstance(input_tokens, (int, float)): + self._input_tokens = int(input_tokens) if input_tokens > 0 else 0 + if isinstance(output_tokens, (int, 
float)): + self._output_tokens = int(output_tokens) if output_tokens > 0 else 0 + return + + usage = getattr(event, "usage", None) + if usage is not None: + input_tokens = getattr(usage, "input_tokens", None) + output_tokens = getattr(usage, "output_tokens", None) + if isinstance(input_tokens, (int, float)) and input_tokens > 0: + self._input_tokens = int(input_tokens) + if isinstance(output_tokens, (int, float)): + self._output_tokens = int(output_tokens) if output_tokens > 0 else 0 + + if event_type == "content_block_start": + content_block = getattr(event, "content_block", None) + block_type = getattr(content_block, "type", None) + if block_type in {"tool_use", "server_tool_use"}: + self._tool_call_count += 1 + + def _finalize(self) -> None: + if self._finalized: + return + self._finalized = True + + if self._ctx is None: + return + + elapsed_ms = (time.monotonic() - self._start_time) * 1000 + _update_context( + self._ctx, + self._model, + self._input_tokens, + self._output_tokens, + self._tool_call_count, + elapsed_ms, + action=self._pre_action, + action_reason=self._pre_reason, + action_model=self._pre_model, + applied=self._pre_applied, + decision_mode=self._decision_mode, + ) + + +class _InstrumentedAnthropicStream(_InstrumentedAnthropicStreamBase): + """Wraps an Anthropic sync stream and tracks usage at stream end.""" + + __slots__ = () + + def __iter__(self) -> _InstrumentedAnthropicStream: + return self + + def __next__(self) -> Any: + try: + event = next(self._stream) + self._inspect_event(event) + return event + except StopIteration: + self._finalize() + raise + + def __enter__(self) -> _InstrumentedAnthropicStream: + if hasattr(self._stream, "__enter__"): + self._stream.__enter__() + return self + + def __exit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__exit__"): + return self._stream.__exit__(*args) # type: ignore[no-any-return] + return False + + +class 
_InstrumentedAnthropicAsyncStream(_InstrumentedAnthropicStreamBase): + """Wraps an Anthropic async stream and tracks usage at stream end.""" + + __slots__ = () + + def __aiter__(self) -> _InstrumentedAnthropicAsyncStream: + return self + + async def __anext__(self) -> Any: + try: + event = await self._stream.__anext__() + self._inspect_event(event) + return event + except StopAsyncIteration: + self._finalize() + raise + + async def __aenter__(self) -> _InstrumentedAnthropicAsyncStream: + if hasattr(self._stream, "__aenter__"): + await self._stream.__aenter__() + return self + + async def __aexit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__aexit__"): + return await self._stream.__aexit__(*args) # type: ignore[no-any-return] + return False + + # --------------------------------------------------------------------------- # Wrapper factories # --------------------------------------------------------------------------- @@ -877,14 +1039,18 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: ) return response - # Anthropic stream wrappers are not instrumented in V2.1 (known limitation). if is_stream: - logger.debug( - "harness %s (anthropic): stream passthrough model=%s (usage tracking unavailable)", - mode, + return _InstrumentedAnthropicStream( + response, + ctx, model, + start_time, + pre_action, + pre_reason, + pre_model, + pre_applied, + mode, ) - return response elapsed_ms = (time.monotonic() - start_time) * 1000 input_tokens, output_tokens = _extract_anthropic_usage(response) @@ -949,14 +1115,18 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: ) return response - # Anthropic stream wrappers are not instrumented in V2.1 (known limitation). 
if is_stream: - logger.debug( - "harness %s async (anthropic): stream passthrough model=%s (usage tracking unavailable)", - mode, + return _InstrumentedAnthropicAsyncStream( + response, + ctx, model, + start_time, + pre_action, + pre_reason, + pre_model, + pre_applied, + mode, ) - return response elapsed_ms = (time.monotonic() - start_time) * 1000 input_tokens, output_tokens = _extract_anthropic_usage(response) diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 4931f093..551435dd 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -13,6 +13,8 @@ from cascadeflow.harness import init, reset, run from cascadeflow.harness.instrument import ( + _InstrumentedAnthropicAsyncStream, + _InstrumentedAnthropicStream, _InstrumentedAsyncStream, _InstrumentedStream, _count_tool_calls_in_anthropic_response, @@ -978,7 +980,10 @@ def test_non_stream_does_not_inject_stream_options(self) -> None: # =========================================================================== -def _mock_anthropic_usage(input_tokens: int = 100, output_tokens: int = 50) -> MagicMock: +def _mock_anthropic_usage( + input_tokens: Optional[int] = 100, + output_tokens: Optional[int] = 50, +) -> MagicMock: u = MagicMock() u.input_tokens = input_tokens u.output_tokens = output_tokens @@ -1008,6 +1013,43 @@ def _mock_text_block() -> MagicMock: return block +def _mock_anthropic_message_start_event( + input_tokens: int = 100, + output_tokens: int = 0, +) -> MagicMock: + event = MagicMock() + event.type = "message_start" + event.message = MagicMock() + event.message.usage = _mock_anthropic_usage(input_tokens, output_tokens) + return event + + +def _mock_anthropic_message_delta_event( + output_tokens: int = 50, +) -> MagicMock: + event = MagicMock() + event.type = "message_delta" + event.usage = _mock_anthropic_usage(None, output_tokens) + return event + + +def _mock_anthropic_content_block_start_event( + block_type: str = "tool_use", +) -> 
MagicMock: + event = MagicMock() + event.type = "content_block_start" + event.content_block = MagicMock() + event.content_block.type = block_type + return event + + +def _mock_anthropic_message_stop_event() -> MagicMock: + event = MagicMock() + event.type = "message_stop" + event.usage = None + return event + + # --------------------------------------------------------------------------- # Anthropic usage extraction # --------------------------------------------------------------------------- @@ -1182,19 +1224,27 @@ def test_no_run_scope_returns_response(self) -> None: result = wrapper(MagicMock(), model="claude-sonnet-4") assert result is mock_resp - def test_stream_passthrough_no_usage_tracking(self) -> None: - """Anthropic streams are not instrumented in V2.1 — verify passthrough.""" + def test_stream_tracks_usage_and_tool_calls(self) -> None: init(mode="observe") - mock_stream = MagicMock() + mock_stream = iter( + [ + _mock_anthropic_message_start_event(input_tokens=1_000_000), + _mock_anthropic_content_block_start_event("tool_use"), + _mock_anthropic_message_delta_event(output_tokens=1_000_000), + _mock_anthropic_message_stop_event(), + ] + ) original = MagicMock(return_value=mock_stream) wrapper = _make_patched_anthropic_create(original) with run(budget=1.0) as ctx: result = wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicStream) + list(result) - assert result is mock_stream - assert ctx.cost == 0.0 - assert ctx.step_count == 0 + assert ctx.cost == pytest.approx(18.0, abs=0.01) + assert ctx.step_count == 1 + assert ctx.tool_calls == 1 def test_multiple_calls_accumulate(self) -> None: init(mode="observe") @@ -1251,17 +1301,27 @@ async def test_off_mode_passthrough(self) -> None: assert result is mock_resp assert ctx.cost == 0.0 - async def test_stream_passthrough(self) -> None: + async def test_stream_tracks_usage_and_tool_calls(self) -> None: init(mode="observe") - mock_stream = AsyncMock() - 
original = AsyncMock(return_value=mock_stream) + + async def _event_stream(): + yield _mock_anthropic_message_start_event(input_tokens=1_000_000) + yield _mock_anthropic_content_block_start_event("tool_use") + yield _mock_anthropic_message_delta_event(output_tokens=1_000_000) + yield _mock_anthropic_message_stop_event() + + original = AsyncMock(return_value=_event_stream()) wrapper = _make_patched_anthropic_async_create(original) async with run(budget=1.0) as ctx: result = await wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicAsyncStream) + async for _ in result: + pass - assert result is mock_stream - assert ctx.cost == 0.0 + assert ctx.cost == pytest.approx(18.0, abs=0.01) + assert ctx.step_count == 1 + assert ctx.tool_calls == 1 # --------------------------------------------------------------------------- From ac157424d95863cbbaff13c2dc14b6e4b40bceba Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 16:48:01 +0100 Subject: [PATCH 37/49] fix(harness): finalize stream metrics on errors and harden env parsing --- cascadeflow/harness/api.py | 11 +++- cascadeflow/harness/instrument.py | 12 ++++ tests/test_harness_api.py | 26 +++++++++ tests/test_harness_instrument.py | 91 +++++++++++++++++++++++++++++++ 4 files changed, 138 insertions(+), 2 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 79f741b8..610bab28 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -7,6 +7,7 @@ import time from contextvars import ContextVar, Token from dataclasses import dataclass, field +from functools import wraps from importlib.util import find_spec from pathlib import Path from typing import Any, Callable, Literal, Optional, TypeVar, cast @@ -242,12 +243,14 @@ def reset() -> None: _MAX_ACTION_LEN = 64 _MAX_REASON_LEN = 160 _MAX_MODEL_LEN = 128 +_MAX_ENV_JSON_LEN = 4096 def _sanitize_trace_value(value: Any, *, max_length: int) -> Optional[str]: if 
value is None: return None text = str(value).replace("\n", " ").replace("\r", " ").strip() + text = "".join(c for c in text if c.isprintable()) if len(text) > max_length: text = text[: max_length - 3] + "..." return text or None @@ -302,6 +305,10 @@ def _parse_int(raw: str) -> int: def _parse_json_dict(raw: str) -> dict[str, float]: + if len(raw) > _MAX_ENV_JSON_LEN: + raise ValueError( + f"JSON config exceeds {_MAX_ENV_JSON_LEN} characters for harness env var" + ) value = json.loads(raw) if not isinstance(value, dict): raise ValueError("expected JSON object") @@ -606,18 +613,18 @@ def decorator(func: F) -> F: if inspect.iscoroutinefunction(func): + @wraps(func) async def async_wrapper(*args: Any, **kwargs: Any) -> Any: return await func(*args, **kwargs) async_wrapper.__cascadeflow_agent_policy__ = metadata # type: ignore[attr-defined] - async_wrapper.__name__ = getattr(func, "__name__", "wrapped_agent") return cast(F, async_wrapper) + @wraps(func) def sync_wrapper(*args: Any, **kwargs: Any) -> Any: return func(*args, **kwargs) sync_wrapper.__cascadeflow_agent_policy__ = metadata # type: ignore[attr-defined] - sync_wrapper.__name__ = getattr(func, "__name__", "wrapped_agent") return cast(F, sync_wrapper) return decorator diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index 5632884c..4b08b9f6 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -621,6 +621,9 @@ def __next__(self) -> Any: except StopIteration: self._finalize() raise + except Exception: + self._finalize() + raise def __enter__(self) -> _InstrumentedStream: if hasattr(self._stream, "__enter__"): @@ -650,6 +653,9 @@ async def __anext__(self) -> Any: except StopAsyncIteration: self._finalize() raise + except Exception: + self._finalize() + raise async def __aenter__(self) -> _InstrumentedAsyncStream: if hasattr(self._stream, "__aenter__"): @@ -783,6 +789,9 @@ def __next__(self) -> Any: except StopIteration: self._finalize() raise + 
except Exception: + self._finalize() + raise def __enter__(self) -> _InstrumentedAnthropicStream: if hasattr(self._stream, "__enter__"): @@ -812,6 +821,9 @@ async def __anext__(self) -> Any: except StopAsyncIteration: self._finalize() raise + except Exception: + self._finalize() + raise async def __aenter__(self) -> _InstrumentedAnthropicAsyncStream: if hasattr(self._stream, "__aenter__"): diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 9554a486..850255ba 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -155,6 +155,17 @@ def fn(x: int) -> int: assert policy["compliance"] == "gdpr" +def test_agent_decorator_preserves_function_metadata(): + @agent(budget=0.5) + def fn(x: int) -> int: + """sample doc""" + return x + + assert fn.__name__ == "fn" + assert fn.__doc__ == "sample doc" + assert fn.__annotations__ == {"x": int, "return": int} + + @pytest.mark.asyncio async def test_agent_decorator_keeps_async_behavior_and_attaches_metadata(): @agent(budget=0.4, kpi_weights={"cost": 1.0}) @@ -210,6 +221,12 @@ def test_init_reads_from_env(monkeypatch): assert report.config_sources["budget"] == "env" +def test_init_rejects_oversized_env_json(monkeypatch): + monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_TARGETS", "x" * 5000) + with pytest.raises(ValueError, match="JSON config exceeds"): + init() + + def test_init_reads_from_config_file(tmp_path, monkeypatch): config = tmp_path / "cascadeflow.json" config.write_text( @@ -433,6 +450,15 @@ def test_record_sanitizes_trace_values(): assert len(entry["reason"]) <= 160 +def test_record_sanitizes_non_printable_values(): + ctx = run() + ctx.record(action="allow\x00", reason="ok\x1f", model="gpt-4o-mini\x07") + entry = ctx.trace()[0] + assert "\x00" not in entry["action"] + assert "\x1f" not in entry["reason"] + assert "\x07" not in entry["model"] + + def test_record_without_callback_manager_is_noop(): init(mode="observe") with run(budget=1.0) as ctx: diff --git 
a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 551435dd..55e71837 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -434,6 +434,31 @@ def test_stream_finalize_is_idempotent(self) -> None: assert ctx.step_count == 1 # Should not double-count + def test_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + chunk1 = _mock_stream_chunk("data", usage=_mock_usage(100, 50)) + + class _FailingStream: + def __init__(self) -> None: + self._done = False + + def __iter__(self): + return self + + def __next__(self): + if not self._done: + self._done = True + return chunk1 + raise RuntimeError("stream failed") + + with run(budget=1.0) as ctx: + wrapped = _InstrumentedStream(_FailingStream(), ctx, "gpt-4o-mini", time.monotonic()) + with pytest.raises(RuntimeError, match="stream failed"): + list(wrapped) + + assert ctx.step_count == 1 + assert ctx.cost > 0 + def test_stream_wrapper_via_patched_create(self) -> None: """Verify that stream=True in the wrapper returns an _InstrumentedStream.""" init(mode="observe") @@ -496,6 +521,24 @@ async def _async_iter(): assert ctx.step_count == 1 + @pytest.mark.asyncio + async def test_async_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + chunk1 = _mock_stream_chunk("data", usage=_mock_usage(100, 50)) + + async def _failing_iter(): + yield chunk1 + raise RuntimeError("async stream failed") + + async with run(budget=1.0) as ctx: + wrapped = _InstrumentedAsyncStream(_failing_iter(), ctx, "gpt-4o-mini", time.monotonic()) + with pytest.raises(RuntimeError, match="async stream failed"): + async for _ in wrapped: + pass + + assert ctx.step_count == 1 + assert ctx.cost > 0 + # --------------------------------------------------------------------------- # Cost and energy estimation @@ -1246,6 +1289,34 @@ def test_stream_tracks_usage_and_tool_calls(self) -> None: assert ctx.step_count == 1 assert ctx.tool_calls == 1 + def 
test_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + + class _FailingAnthropicStream: + def __init__(self) -> None: + self._done = False + + def __iter__(self): + return self + + def __next__(self): + if not self._done: + self._done = True + return _mock_anthropic_message_start_event(input_tokens=1_000_000) + raise RuntimeError("anthropic stream failed") + + original = MagicMock(return_value=_FailingAnthropicStream()) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicStream) + with pytest.raises(RuntimeError, match="anthropic stream failed"): + list(result) + + assert ctx.step_count == 1 + assert ctx.cost > 0 + def test_multiple_calls_accumulate(self) -> None: init(mode="observe") mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) @@ -1323,6 +1394,26 @@ async def _event_stream(): assert ctx.step_count == 1 assert ctx.tool_calls == 1 + async def test_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + + async def _failing_event_stream(): + yield _mock_anthropic_message_start_event(input_tokens=1_000_000) + raise RuntimeError("anthropic async stream failed") + + original = AsyncMock(return_value=_failing_event_stream()) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicAsyncStream) + with pytest.raises(RuntimeError, match="anthropic async stream failed"): + async for _ in result: + pass + + assert ctx.step_count == 1 + assert ctx.cost > 0 + # --------------------------------------------------------------------------- # Anthropic enforce mode From b894cd3ae0c8d266bcf6a64493ee7b1e416d742c Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 
20:00:32 +0100 Subject: [PATCH 38/49] docs: add harness quickstart and missing integration coverage --- docs/README.md | 5 +- docs/guides/crewai_integration.md | 79 ++++++++++++++++++++++ docs/guides/google_adk_integration.md | 3 +- docs/guides/python_harness_quickstart.md | 85 ++++++++++++++++++++++++ examples/integrations/README.md | 73 ++++++++++++++++++-- examples/integrations/crewai_harness.py | 75 +++++++++++++++++++++ 6 files changed, 314 insertions(+), 6 deletions(-) create mode 100644 docs/guides/crewai_integration.md create mode 100644 docs/guides/python_harness_quickstart.md create mode 100644 examples/integrations/crewai_harness.py diff --git a/docs/README.md b/docs/README.md index b9cedf66..5280a562 100644 --- a/docs/README.md +++ b/docs/README.md @@ -11,6 +11,7 @@ Welcome to cascadeflow documentation! 🌊 ### Core Concepts - [Quickstart](guides/quickstart.md) - Get started with cascadeflow in 5 minutes +- [Python Harness Quickstart](guides/python_harness_quickstart.md) - `init`, `run`, and `@agent` for in-process policy control - [Providers](guides/providers.md) - Configure and use different AI providers (OpenAI, Anthropic, Groq, Ollama, etc.) - [Presets](guides/presets.md) - Use built-in presets for common use cases - [Gateway Server](guides/gateway.md) - Drop-in OpenAI/Anthropic-compatible endpoint for existing apps @@ -39,9 +40,11 @@ Welcome to cascadeflow documentation! 
🌊 - [Agent Intelligence V2/V2.1 Plan](strategy/agent-intelligence-v2-plan.md) - Unified strategic and execution plan for in-process agent intelligence harness delivery ### Integrations +- [OpenAI Agents SDK Integration](guides/openai_agents_integration.md) - Harness-aware model provider for existing OpenAI Agents apps +- [CrewAI Integration](guides/crewai_integration.md) - Hook-based harness metrics + budget gating (opt-in) +- [Google ADK Integration](guides/google_adk_integration.md) - Plugin-based harness integration for ADK runners (opt-in) - [n8n Integration](guides/n8n_integration.md) - Use cascadeflow in n8n workflows - [Paygentic Integration](guides/paygentic_integration.md) - Usage metering and billing lifecycle helpers (opt-in) -- [OpenAI Agents SDK Integration](guides/openai_agents_integration.md) - Harness-aware model provider for existing OpenAI Agents apps ## 📚 Examples diff --git a/docs/guides/crewai_integration.md b/docs/guides/crewai_integration.md new file mode 100644 index 00000000..a39efa79 --- /dev/null +++ b/docs/guides/crewai_integration.md @@ -0,0 +1,79 @@ +# CrewAI Integration + +Use cascadeflow as an explicit, opt-in harness integration for CrewAI via +`llm_hooks`. + +## Design Principles + +- Integration-only: nothing is enabled by default +- Works with existing CrewAI flows +- Harness behavior is controlled by `cascadeflow.init(...)` and `cascadeflow.run(...)` +- Fail-open integration path: harness integration errors should not break crew execution + +## Install + +```bash +pip install "cascadeflow[crewai,openai]" +``` + +`crewai` is optional and only installed when you request this extra. + +## Quickstart + +```python +from crewai import Agent, Crew, Process, Task + +from cascadeflow import init, run +from cascadeflow.integrations.crewai import CrewAIHarnessConfig, enable + +# Global harness defaults. +init(mode="enforce", budget=1.0) + +# Explicitly register CrewAI hooks (integration-only behavior). 
+enable( + config=CrewAIHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) +) + +agent = Agent( + role="Support Agent", + goal="Answer support questions clearly and concisely.", + backstory="You are helpful and direct.", + allow_delegation=False, + llm="openai/gpt-4o-mini", +) + +task = Task( + description="Explain why model cascading helps control agent costs.", + expected_output="A concise explanation with one practical example.", + agent=agent, +) + +with run(budget=0.4) as session: + crew = Crew(agents=[agent], tasks=[task], process=Process.sequential, verbose=False) + result = crew.kickoff() + + print(result) + print(session.summary()) + print(session.trace()) +``` + +## What This Integration Adds + +- Budget gating in enforce mode (`before_llm_call` hook) +- Run metrics in `cascadeflow.run()` scope: + - `cost`, `budget_remaining`, `step_count`, `latency_used_ms`, `energy_used` +- Full decision trace through `run.trace()` + +## Current Scope + +- This integration uses CrewAI hook points, so it tracks and gates calls without + changing your crew/task definitions. +- Tool-level deny/switch actions are not currently applied in this integration path. + +## Notes + +- Existing non-CrewAI users are unaffected. +- If CrewAI is not installed, `enable()` returns `False` and no hooks are registered. diff --git a/docs/guides/google_adk_integration.md b/docs/guides/google_adk_integration.md index d0d32b3f..393a1b57 100644 --- a/docs/guides/google_adk_integration.md +++ b/docs/guides/google_adk_integration.md @@ -11,7 +11,8 @@ trace recording across all agents in an ADK Runner. - **Plugin-based** — Uses ADK's `BasePlugin` system to intercept every LLM call across all agents in a Runner. One plugin covers the entire agent graph. - **Opt-in** — Install `cascadeflow[google-adk]` and create a plugin explicitly. - Never enabled by default. + Never enabled by default. 
Core cascadeflow behavior is unchanged unless you + explicitly wire this integration into `Runner(plugins=[...])`. - **Fail-open** — Integration errors are logged but never break ADK execution (configurable). - **No tool gating** — ADK's `tools_dict` is part of agent definition, not diff --git a/docs/guides/python_harness_quickstart.md b/docs/guides/python_harness_quickstart.md new file mode 100644 index 00000000..4ec85cfd --- /dev/null +++ b/docs/guides/python_harness_quickstart.md @@ -0,0 +1,85 @@ +# Python Harness Quickstart + +This guide covers the in-process harness API: + +- `init(...)` for global defaults and SDK instrumentation +- `run(...)` for per-request scoped budgets/limits and traceability +- `@agent(...)` for attaching policy metadata to agent functions + +## Install + +```bash +pip install "cascadeflow[openai]" +``` + +Optional integrations stay opt-in: + +```bash +pip install "cascadeflow[openai,openai-agents]" +pip install "cascadeflow[crewai]" +pip install "cascadeflow[google-adk]" +``` + +## 1) Initialize Harness + +```python +from cascadeflow import init + +report = init( + mode="observe", # off | observe | enforce + budget=1.0, # default per-run budget cap + max_tool_calls=8, # default per-run tool call cap +) + +print(report.mode) +print(report.instrumented) +print(report.detected_but_not_instrumented) +``` + +`init(...)` is explicit and never auto-enables integrations. + +## 2) Track One Scoped Run + +```python +from openai import OpenAI + +from cascadeflow import run + +client = OpenAI() + +with run(budget=0.25, max_tool_calls=4) as session: + response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "Summarize model cascading in one sentence."}], + ) + + print(response.choices[0].message.content) + print(session.summary()) + print(session.trace()) +``` + +## 3) Attach Agent Metadata + +`@agent(...)` attaches policy metadata to your function without changing how the +function executes. 
+ +```python +from cascadeflow import agent + +@agent( + budget=0.2, + kpi_targets={"quality": 0.9}, + kpi_weights={"cost": 0.5, "latency": 0.5}, + compliance="strict", +) +def support_agent(task: str) -> str: + return f"Handled: {task}" + +print(support_agent.__cascadeflow_agent_policy__) +``` + +## Minimal Checklist + +1. Call `init(...)` once at process startup. +2. Wrap each unit of work in `with run(...):`. +3. Use `run.summary()` and `run.trace()` for auditability and tuning. diff --git a/examples/integrations/README.md b/examples/integrations/README.md index e7e7906a..4bad64f0 100644 --- a/examples/integrations/README.md +++ b/examples/integrations/README.md @@ -6,6 +6,8 @@ This directory contains production-ready integration examples for cascadeflow wi - [LiteLLM Integration](#-litellm-integration) - Access 10+ providers with automatic cost tracking - [OpenAI Agents SDK Integration](#-openai-agents-sdk-integration) - Harness-aware ModelProvider for existing agent apps +- [CrewAI Integration](#-crewai-integration) - Hook-based harness metrics and budget gating +- [Google ADK Integration](#-google-adk-integration) - Plugin-based harness integration for ADK runners - [Paygentic Integration](#-paygentic-integration) - Usage event reporting and billing lifecycle helpers - [Local Providers](#-local-providers-setup) - Ollama and vLLM configuration examples - [OpenTelemetry & Grafana](#-opentelemetry--grafana) - Production observability and metrics @@ -160,6 +162,48 @@ python examples/integrations/openai_agents_harness.py --- +## 👥 CrewAI Integration + +**File:** [`crewai_harness.py`](crewai_harness.py) + +Use cascadeflow as an explicit, opt-in CrewAI hook integration. 
+ +### Quick Start + +```bash +pip install "cascadeflow[crewai,openai]" +python examples/integrations/crewai_harness.py +``` + +### What It Shows + +- Explicit `enable(...)` hook registration (never on by default) +- Enforce-mode budget gating before CrewAI LLM calls +- Run metrics and decision trace via `cascadeflow.run(...)` + +--- + +## 🧠 Google ADK Integration + +**File:** [`google_adk_harness.py`](google_adk_harness.py) + +Use cascadeflow as an explicit, opt-in plugin integration for Google ADK. + +### Quick Start + +```bash +pip install "cascadeflow[google-adk]" +python examples/integrations/google_adk_harness.py +``` + +### What It Shows + +- Explicit plugin creation with `enable(...)` (integration-only behavior) +- Runner-level plugin wiring via `Runner(..., plugins=[plugin])` +- Budget gate + run-scoped metrics and trace + +--- + ## 💳 Paygentic Integration **File:** [`paygentic_usage.py`](paygentic_usage.py) @@ -412,6 +456,9 @@ Cost Calculation Tests |------|---------|-------------------| | `litellm_providers.py` | Comprehensive LiteLLM demo with 8 examples | No (for cost info) | | `litellm_cost_tracking.py` | Cost tracking and provider validation | No (for cost info) | +| `openai_agents_harness.py` | OpenAI Agents SDK harness integration (ModelProvider) | Yes | +| `crewai_harness.py` | CrewAI hook-based harness integration (opt-in) | Yes | +| `google_adk_harness.py` | Google ADK plugin harness integration (opt-in) | Yes | | `paygentic_usage.py` | Usage event reporting to Paygentic (opt-in, fail-open) | Yes | | `local_providers_setup.py` | Ollama and vLLM setup guide | No | | `opentelemetry_grafana.py` | Production observability example | No | @@ -473,6 +520,18 @@ pip install cascadeflow[all] pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-http ``` +### "CrewAI hooks unavailable" +```bash +pip install "cascadeflow[crewai,openai]" +# Requires crewai>=1.5 for llm_hooks +``` + +### "Google ADK not installed" +```bash +pip 
install "cascadeflow[google-adk]" +# Google ADK requires Python 3.10+ +``` + ### "Metrics not appearing in Grafana" 1. Check OpenTelemetry Collector logs: `docker-compose logs otel-collector` 2. Verify metrics: `curl http://localhost:8889/metrics` @@ -490,6 +549,9 @@ Always use provider prefixes for LiteLLM: - **Provider Guide:** [docs/guides/providers.md](../../docs/guides/providers.md) - **Cost Tracking:** [docs/guides/cost_tracking.md](../../docs/guides/cost_tracking.md) +- **OpenAI Agents Guide:** [docs/guides/openai_agents_integration.md](../../docs/guides/openai_agents_integration.md) +- **CrewAI Guide:** [docs/guides/crewai_integration.md](../../docs/guides/crewai_integration.md) +- **Google ADK Guide:** [docs/guides/google_adk_integration.md](../../docs/guides/google_adk_integration.md) - **Paygentic Guide:** [docs/guides/paygentic_integration.md](../../docs/guides/paygentic_integration.md) - **Production Guide:** [docs/guides/production.md](../../docs/guides/production.md) @@ -498,10 +560,13 @@ Always use provider prefixes for LiteLLM: ## 🚀 Next Steps 1. **Try LiteLLM:** `python examples/integrations/litellm_providers.py` -2. **Try Paygentic usage reporting:** `python examples/integrations/paygentic_usage.py` -3. **Setup local providers:** `python examples/integrations/local_providers_setup.py` -4. **Test your API keys:** `python examples/integrations/test_all_providers.py` -5. **Add monitoring:** Follow OpenTelemetry section above +2. **Try OpenAI Agents integration:** `python examples/integrations/openai_agents_harness.py` +3. **Try CrewAI integration:** `python examples/integrations/crewai_harness.py` +4. **Try Google ADK integration:** `python examples/integrations/google_adk_harness.py` +5. **Try Paygentic usage reporting:** `python examples/integrations/paygentic_usage.py` +6. **Setup local providers:** `python examples/integrations/local_providers_setup.py` +7. **Test your API keys:** `python examples/integrations/test_all_providers.py` +8. 
**Add monitoring:** Follow OpenTelemetry section above --- diff --git a/examples/integrations/crewai_harness.py b/examples/integrations/crewai_harness.py new file mode 100644 index 00000000..5e14163c --- /dev/null +++ b/examples/integrations/crewai_harness.py @@ -0,0 +1,75 @@ +""" +CrewAI + cascadeflow harness integration example. + +Run: + pip install "cascadeflow[crewai,openai]" + export OPENAI_API_KEY="your-key" + python examples/integrations/crewai_harness.py +""" + +from __future__ import annotations + + +def main() -> None: + try: + from crewai import Agent, Crew, Process, Task + except ImportError as exc: + raise SystemExit( + "CrewAI is not installed. " + 'Install with: pip install "cascadeflow[crewai,openai]"' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.crewai import CrewAIHarnessConfig, enable + + # 1) Initialize harness globally. + init(mode="observe", budget=1.0, max_tool_calls=6) + + # 2) Explicitly enable CrewAI integration hooks (opt-in). + enabled = enable( + config=CrewAIHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) + ) + if not enabled: + raise SystemExit( + "CrewAI hooks are unavailable in this environment. " + "Ensure crewai>=1.5 is installed." 
+ ) + + agent = Agent( + role="Routing Analyst", + goal="Explain model routing impact on cost and latency in plain language.", + backstory="You are concise and practical.", + allow_delegation=False, + llm="openai/gpt-4o-mini", + verbose=False, + ) + + task = Task( + description="Explain why inside-the-loop routing helps agent workloads.", + expected_output="One short paragraph and three bullet points.", + agent=agent, + ) + + with run(budget=0.5, max_tool_calls=4) as session: + crew = Crew(agents=[agent], tasks=[task], process=Process.sequential, verbose=False) + result = crew.kickoff() + + print("=== Result ===") + print(result) + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print(f"Energy: {session.energy_used:.1f}") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + main() From 6d3e6a8cf00460cf0bf6b152f3d8e35bd1722a66 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 20:16:26 +0100 Subject: [PATCH 39/49] feat(n8n): add multi-dimensional harness integration to Agent node Port the Python harness decision engine to TypeScript and wire it into the n8n Agent node. Tracks 5 dimensions (cost, latency, energy, tool calls, quality) across every LLM call. Observe mode is on by default; enforce mode stops the agent loop when limits are hit. 
- Add nodes/harness/ with pricing (18 models, fuzzy resolution), HarnessRunContext (7-step decision cascade, compliance allowlists, KPI-weighted scoring), and 43 tests - Replace hardcoded estimatesPerMillion in CascadeChatModel with shared harness/pricing.ts (broader model coverage + suffix stripping) - Add harness UI parameters to Agent node (mode, budget, tool cap, latency cap, energy cap, compliance, KPI weights) - Wire pre-call checks and tool-call counting into agent executor loop - Add harness summary to Agent output JSON --- .../CascadeFlowAgent/CascadeFlowAgent.node.ts | 152 +++++++ .../LmChatCascadeFlow.node.ts | 65 +-- .../LmChatCascadeFlow/cascade-metadata.ts | 4 + .../nodes/harness/__tests__/harness.test.ts | 368 +++++++++++++++ .../integrations/n8n/nodes/harness/harness.ts | 426 ++++++++++++++++++ .../integrations/n8n/nodes/harness/index.ts | 22 + .../integrations/n8n/nodes/harness/pricing.ts | 135 ++++++ 7 files changed, 1141 insertions(+), 31 deletions(-) create mode 100644 packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts create mode 100644 packages/integrations/n8n/nodes/harness/harness.ts create mode 100644 packages/integrations/n8n/nodes/harness/index.ts create mode 100644 packages/integrations/n8n/nodes/harness/pricing.ts diff --git a/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts b/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts index b3f52a60..925a9a96 100644 --- a/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts +++ b/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts @@ -21,6 +21,7 @@ import { type DomainType, getEnabledDomains, } from '../LmChatCascadeFlow/config'; +import { HarnessRunContext, type HarnessConfig, type HarnessMode, type KpiWeights } from '../harness'; // Tool cascade validator - optional import let ToolCascadeValidator: any; @@ -65,6 +66,7 @@ export class CascadeFlowAgentExecutor { private routingRules: Map; 
private enableToolCascadeValidation: boolean; private toolCascadeValidator: any; + private harnessCtx: HarnessRunContext | null; constructor( private cascadeModel: CascadeChatModel, @@ -72,7 +74,9 @@ export class CascadeFlowAgentExecutor { routingRules: ToolRoutingRule[], private maxIterations: number, enableToolCascadeValidation: boolean = false, + harnessCtx: HarnessRunContext | null = null, ) { + this.harnessCtx = harnessCtx; this.toolMap = new Map( tools.filter((tool) => tool?.name).map((tool) => [tool.name as string, tool]) ); @@ -295,6 +299,18 @@ export class CascadeFlowAgentExecutor { let iterations = 0; while (iterations < this.maxIterations) { + // Harness enforce-mode pre-checks + if (this.harnessCtx?.config.mode === 'enforce') { + if (this.harnessCtx.isBudgetExhausted()) { + finalMessage = new AIMessage(`[Harness] Budget exhausted ($${this.harnessCtx.cost.toFixed(4)} of $${this.harnessCtx.config.budgetMax?.toFixed(4)} max). Agent stopped.`); + break; + } + if (this.harnessCtx.isToolCapReached()) { + finalMessage = new AIMessage(`[Harness] Tool call cap reached (${this.harnessCtx.toolCalls} of ${this.harnessCtx.config.toolCallsMax} max). 
Agent stopped.`); + break; + } + } + const message = await this.cascadeModel.invoke(currentMessages, options); const toolCalls = this.extractToolCalls(message); trace.push(this.buildTraceEntry(message, toolCalls)); @@ -350,6 +366,12 @@ export class CascadeFlowAgentExecutor { ); } + // Track tool calls in harness (CascadeChatModel records LLM token costs; + // agent executor tracks tool-call counts from the loop itself) + if (this.harnessCtx) { + this.harnessCtx.toolCalls += toolCalls.length; + } + if (routing === 'verifier') { const verifierMessage = await this.cascadeModel.invokeVerifierDirect(currentMessages, options); trace.push(this.buildTraceEntry(verifierMessage)); @@ -377,6 +399,7 @@ export class CascadeFlowAgentExecutor { output: finalMessage.content.toString(), message: finalMessage, trace, + harness: this.harnessCtx?.summary() ?? null, }; } @@ -753,6 +776,99 @@ export class CascadeFlowAgent implements INodeType { default: '', }, ...generateDomainProperties(), + // ----------------------------------------------------------------- + // Harness: Multi-Dimensional Cascading + // ----------------------------------------------------------------- + { + displayName: 'Harness', + name: 'harnessHeading', + type: 'notice', + default: '', + }, + { + displayName: 'Harness Mode', + name: 'harnessMode', + type: 'options', + options: [ + { name: 'Off', value: 'off', description: 'Harness disabled, zero overhead' }, + { name: 'Observe', value: 'observe', description: 'Track all dimensions, record trace, no enforcement' }, + { name: 'Enforce', value: 'enforce', description: 'Stop agent loop when limits are hit' }, + ], + default: 'observe', + description: 'Harness mode: off (disabled), observe (telemetry only), or enforce (stop when limits hit)', + }, + { + displayName: 'Budget (USD)', + name: 'harnessBudget', + type: 'number', + default: 0, + typeOptions: { minValue: 0, numberPrecision: 4 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max budget 
in USD. 0 = unlimited.', + }, + { + displayName: 'Max Tool Calls', + name: 'harnessMaxToolCalls', + type: 'number', + default: 0, + typeOptions: { minValue: 0 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max tool call count. 0 = unlimited.', + }, + { + displayName: 'Max Latency (Ms)', + name: 'harnessMaxLatencyMs', + type: 'number', + default: 0, + typeOptions: { minValue: 0 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max cumulative latency in milliseconds. 0 = unlimited.', + }, + { + displayName: 'Max Energy', + name: 'harnessMaxEnergy', + type: 'number', + default: 0, + typeOptions: { minValue: 0, numberPrecision: 2 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max energy proxy units. 0 = unlimited.', + }, + { + displayName: 'Compliance', + name: 'harnessCompliance', + type: 'options', + options: [ + { name: 'GDPR', value: 'gdpr' }, + { name: 'HIPAA', value: 'hipaa' }, + { name: 'None', value: '' }, + { name: 'PCI', value: 'pci' }, + { name: 'Strict', value: 'strict' }, + ], + default: '', + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Compliance policy to enforce model allowlists', + }, + { + displayName: 'KPI Weights', + name: 'harnessKpiWeights', + type: 'fixedCollection', + typeOptions: { multipleValues: false }, + displayOptions: { hide: { harnessMode: ['off'] } }, + default: { weights: [{ quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }] }, + options: [ + { + name: 'weights', + displayName: 'Weights', + values: [ + { displayName: 'Quality', name: 'quality', type: 'number', default: 0.4, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + { displayName: 'Cost', name: 'cost', type: 'number', default: 0.3, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + { displayName: 'Latency', name: 'latency', type: 'number', default: 0.2, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + { displayName: 'Energy', 
name: 'energy', type: 'number', default: 0.1, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + ], + }, + ], + description: 'KPI dimension weights for optimization scoring (normalized automatically)', + }, ], }; @@ -782,6 +898,35 @@ export class CascadeFlowAgent implements INodeType { const toolRoutingRaw = this.getNodeParameter('toolRoutingRules', 0, { rule: [] }) as any; const toolRoutingRules = (toolRoutingRaw?.rule ?? []) as ToolRoutingRule[]; + // Harness parameters + const harnessMode = this.getNodeParameter('harnessMode', 0, 'observe') as HarnessMode; + let harnessCtx: HarnessRunContext | null = null; + if (harnessMode !== 'off') { + const rawBudget = this.getNodeParameter('harnessBudget', 0, 0) as number; + const rawToolCalls = this.getNodeParameter('harnessMaxToolCalls', 0, 0) as number; + const rawLatency = this.getNodeParameter('harnessMaxLatencyMs', 0, 0) as number; + const rawEnergy = this.getNodeParameter('harnessMaxEnergy', 0, 0) as number; + const compliance = this.getNodeParameter('harnessCompliance', 0, '') as string; + const kpiRaw = this.getNodeParameter('harnessKpiWeights', 0, { weights: [{ quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }] }) as any; + const kpiEntry = kpiRaw?.weights?.[0] ?? { quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }; + + const config: HarnessConfig = { + mode: harnessMode, + budgetMax: rawBudget > 0 ? rawBudget : null, + toolCallsMax: rawToolCalls > 0 ? rawToolCalls : null, + latencyMaxMs: rawLatency > 0 ? rawLatency : null, + energyMax: rawEnergy > 0 ? rawEnergy : null, + compliance: compliance || null, + kpiWeights: { + quality: kpiEntry.quality ?? 0.4, + cost: kpiEntry.cost ?? 0.3, + latency: kpiEntry.latency ?? 0.2, + energy: kpiEntry.energy ?? 
0.1, + }, + }; + harnessCtx = new HarnessRunContext(config); + } + // Domain routing parameters const enableDomainRouting = this.getNodeParameter('enableDomainRouting', 0, false) as boolean; @@ -887,12 +1032,18 @@ export class CascadeFlowAgent implements INodeType { domainVerifierGetters, ); + // Wire harness context into cascade model for per-call recording + if (harnessCtx) { + cascadeModel.setHarnessContext(harnessCtx); + } + const agentExecutor = new CascadeFlowAgentExecutor( cascadeModel, tools, toolRoutingRules, maxIterations, enableToolCascadeValidation, + harnessCtx, ); // --- Process each input item --- @@ -933,6 +1084,7 @@ export class CascadeFlowAgent implements INodeType { output: result.output, ...cascadeflowMeta, trace: result.trace, + harness: result.harness ?? null, }, pairedItem: { item: itemIndex }, }); diff --git a/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts b/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts index 8c39ae41..ad2d603e 100644 --- a/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts +++ b/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts @@ -23,6 +23,8 @@ import { getEnabledDomains, } from './config'; import { buildCascadeMetadata } from './cascade-metadata'; +import { estimateCost as harnessEstimateCost } from '../harness/pricing'; +import type { HarnessRunContext } from '../harness/harness'; // Quality validation, cost tracking, and routing - optional import let QualityValidator: any; @@ -110,6 +112,29 @@ export class CascadeChatModel extends BaseChatModel { private domainVerifiers: Map = new Map(); private domainVerifierGetters: Map Promise> = new Map(); + // Harness context (set by agent node) + private harnessCtx: HarnessRunContext | null = null; + + setHarnessContext(ctx: HarnessRunContext | null): void { + this.harnessCtx = ctx; + } + + private recordHarnessCall(message: BaseMessage, model: BaseChatModel, elapsedMs: 
number): void { + if (!this.harnessCtx) return; + const responseMetadata = (message as any).response_metadata || {}; + const tokenUsage = responseMetadata.tokenUsage || responseMetadata.usage || {}; + const inputTokens = tokenUsage.promptTokens || tokenUsage.prompt_tokens || 0; + const outputTokens = tokenUsage.completionTokens || tokenUsage.completion_tokens || 0; + const modelName = (model as any).modelName || (model as any).model || 'unknown'; + this.harnessCtx.recordCall({ + model: modelName, + inputTokens, + outputTokens, + toolCallCount: 0, + elapsedMs, + }); + } + constructor( drafterModelGetter: () => Promise, verifierModelGetter: () => Promise, @@ -257,6 +282,7 @@ export class CascadeChatModel extends BaseChatModel { const latency = Date.now() - start; const verifierCost = await this.calculateMessageCost(verifierMessage, verifierModel); + this.recordHarnessCall(verifierMessage, verifierModel, latency); const costBreakdown = { drafter: 0, verifier: verifierCost, @@ -584,37 +610,8 @@ export class CascadeChatModel extends BaseChatModel { } } - // Fallback to rough estimates based on model name - const estimatesPerMillion: Record = { - 'gpt-4o-mini': { input: 0.15, output: 0.6 }, - 'gpt-4o': { input: 2.5, output: 10.0 }, - 'gpt-5-mini': { input: 0.20, output: 0.80 }, - 'gpt-4-turbo': { input: 10.0, output: 30.0 }, - 'gpt-4': { input: 30.0, output: 60.0 }, - 'gpt-3.5-turbo': { input: 0.5, output: 1.5 }, - 'claude-3-5-haiku': { input: 1.0, output: 5.0 }, - 'claude-haiku-4-5': { input: 1.0, output: 5.0 }, - 'claude-3-5-sonnet': { input: 3.0, output: 15.0 }, - 'claude-sonnet-4-5': { input: 3.0, output: 15.0 }, - 'claude-sonnet-4': { input: 3.0, output: 15.0 }, - 'claude-opus-4-5': { input: 5.0, output: 25.0 }, - 'claude-3-haiku': { input: 0.25, output: 1.25 }, - default: { input: 1.0, output: 2.0 }, - }; - - let estimate = estimatesPerMillion.default; - for (const [key, value] of Object.entries(estimatesPerMillion)) { - if (modelName.includes(key)) { - estimate = 
value; - break; - } - } - - const cost = - (inputTokens / 1_000_000) * estimate.input + - (outputTokens / 1_000_000) * estimate.output; - - return cost; + // Use shared harness pricing (fuzzy model resolution, 18 models) + return harnessEstimateCost(modelName, inputTokens, outputTokens); } /** @@ -711,6 +708,7 @@ export class CascadeChatModel extends BaseChatModel { this.verifierCount++; const verifierCost = await this.calculateMessageCost(verifierMessage, verifierModel); + this.recordHarnessCall(verifierMessage, verifierModel, verifierLatency); const costBreakdown = { drafter: 0, verifier: verifierCost, @@ -772,6 +770,7 @@ export class CascadeChatModel extends BaseChatModel { const drafterStartTime = Date.now(); const drafterMessage = await modelToUse.invoke(messages, options); const drafterLatency = Date.now() - drafterStartTime; + this.recordHarnessCall(drafterMessage, modelToUse, drafterLatency); if (domainModel && detectedDomain) { this.domainCounts.set(detectedDomain, (this.domainCounts.get(detectedDomain) || 0) + 1); @@ -798,6 +797,7 @@ export class CascadeChatModel extends BaseChatModel { const verifierStartTime = Date.now(); const verifierMessage = await verifierModel.invoke(messages, options); const verifierLatency = Date.now() - verifierStartTime; + this.recordHarnessCall(verifierMessage, verifierModel, verifierLatency); this.verifierCount++; @@ -1060,6 +1060,7 @@ export class CascadeChatModel extends BaseChatModel { const verifierInfo = this.getModelInfo(verifierModel); const verifierMessage = await verifierModel.invoke(messages, options); const verifierLatency = Date.now() - verifierStartTime; + this.recordHarnessCall(verifierMessage, verifierModel, verifierLatency); this.verifierCount++; @@ -1136,7 +1137,9 @@ export class CascadeChatModel extends BaseChatModel { const verifierModel = await this.getVerifierModel(); const verifierInfo = this.getModelInfo(verifierModel); + const fallbackStart = Date.now(); const verifierMessage = await 
verifierModel.invoke(messages, options); + this.recordHarnessCall(verifierMessage, verifierModel, Date.now() - fallbackStart); this.verifierCount++; const verifierCost = await this.calculateMessageCost(verifierMessage, verifierModel); diff --git a/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts b/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts index d539d5b7..e93f7b23 100644 --- a/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts +++ b/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts @@ -1,4 +1,5 @@ import type { DomainType } from './config'; +import type { HarnessSummary } from '../harness'; export interface CostBreakdown { drafter: number; @@ -12,12 +13,15 @@ export interface SavingsBreakdown { percent: number; } +export interface HarnessSummaryOutput extends HarnessSummary {} + export interface CascadeFlowMetadata { model_used: string; domain: DomainType | null; confidence?: number; costs: CostBreakdown; savings: SavingsBreakdown; + harness?: HarnessSummaryOutput | null; } export const calculateSavings = ( diff --git a/packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts b/packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts new file mode 100644 index 00000000..5c003e42 --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts @@ -0,0 +1,368 @@ +import { describe, expect, it } from 'vitest'; + +import { + PRICING_USD_PER_M, + DEFAULT_PRICING_USD_PER_M, + ENERGY_COEFFICIENTS, + DEFAULT_ENERGY_COEFFICIENT, + ENERGY_OUTPUT_WEIGHT, + resolvePricingKey, + estimateCost, + estimateEnergy, + modelTotalPrice, +} from '../pricing'; + +import { + HarnessRunContext, + COMPLIANCE_MODEL_ALLOWLISTS, + QUALITY_PRIORS, + LATENCY_PRIORS, + normalizeWeights, + type HarnessConfig, +} from '../harness'; + +// --------------------------------------------------------------------------- +// Pricing data fidelity +// 
--------------------------------------------------------------------------- + +describe('pricing data', () => { + it('has 18 models in PRICING_USD_PER_M', () => { + expect(Object.keys(PRICING_USD_PER_M)).toHaveLength(18); + }); + + it('matches Python values for gpt-4o', () => { + expect(PRICING_USD_PER_M['gpt-4o']).toEqual([2.50, 10.00]); + }); + + it('matches Python values for gpt-4o-mini', () => { + expect(PRICING_USD_PER_M['gpt-4o-mini']).toEqual([0.15, 0.60]); + }); + + it('matches Python values for claude-sonnet-4', () => { + expect(PRICING_USD_PER_M['claude-sonnet-4']).toEqual([3.00, 15.00]); + }); + + it('matches Python values for gemini-2.5-flash', () => { + expect(PRICING_USD_PER_M['gemini-2.5-flash']).toEqual([0.15, 0.60]); + }); + + it('has correct default pricing', () => { + expect(DEFAULT_PRICING_USD_PER_M).toEqual([2.50, 10.00]); + }); + + it('has 18 models in ENERGY_COEFFICIENTS', () => { + expect(Object.keys(ENERGY_COEFFICIENTS)).toHaveLength(18); + }); + + it('has correct energy defaults', () => { + expect(DEFAULT_ENERGY_COEFFICIENT).toBe(1.0); + expect(ENERGY_OUTPUT_WEIGHT).toBe(1.5); + }); +}); + +// --------------------------------------------------------------------------- +// estimateCost / estimateEnergy +// --------------------------------------------------------------------------- + +describe('estimateCost', () => { + it('calculates gpt-4o cost correctly (1000 in, 500 out = $0.0075)', () => { + const cost = estimateCost('gpt-4o', 1000, 500); + expect(cost).toBeCloseTo(0.0075, 6); + }); + + it('calculates gpt-4o-mini cost correctly', () => { + const cost = estimateCost('gpt-4o-mini', 1_000_000, 1_000_000); + expect(cost).toBeCloseTo(0.15 + 0.60, 6); + }); + + it('uses default pricing for unknown models', () => { + const cost = estimateCost('unknown-model', 1_000_000, 1_000_000); + expect(cost).toBeCloseTo(2.50 + 10.00, 6); + }); +}); + +describe('estimateEnergy', () => { + it('calculates gpt-4o energy correctly (100 in, 50 out)', () => { + 
// coeff=1.0, energy = 1.0 * (100 + 50 * 1.5) = 175.0 + const energy = estimateEnergy('gpt-4o', 100, 50); + expect(energy).toBeCloseTo(175.0, 4); + }); + + it('uses default coefficient for unknown models', () => { + // coeff=1.0, energy = 1.0 * (100 + 50 * 1.5) = 175.0 + const energy = estimateEnergy('unknown-model', 100, 50); + expect(energy).toBeCloseTo(175.0, 4); + }); + + it('uses correct coefficient for gpt-4o-mini', () => { + // coeff=0.3, energy = 0.3 * (100 + 50 * 1.5) = 52.5 + const energy = estimateEnergy('gpt-4o-mini', 100, 50); + expect(energy).toBeCloseTo(52.5, 4); + }); +}); + +describe('modelTotalPrice', () => { + it('returns input + output for gpt-4o', () => { + expect(modelTotalPrice('gpt-4o')).toBeCloseTo(12.50, 6); + }); + + it('returns default for unknown model', () => { + expect(modelTotalPrice('unknown')).toBeCloseTo(12.50, 6); + }); +}); + +// --------------------------------------------------------------------------- +// Fuzzy model resolution +// --------------------------------------------------------------------------- + +describe('resolvePricingKey', () => { + it('exact match', () => { + expect(resolvePricingKey('gpt-4o')).toBe('gpt-4o'); + }); + + it('strips version suffix (-20250120)', () => { + expect(resolvePricingKey('gpt-4o-20250120')).toBe('gpt-4o'); + }); + + it('strips -preview suffix', () => { + expect(resolvePricingKey('gpt-4o-preview')).toBe('gpt-4o'); + }); + + it('strips -latest suffix', () => { + expect(resolvePricingKey('gpt-4o-latest')).toBe('gpt-4o'); + }); + + it('longest-prefix match (gemini-2.5-flash-8b → gemini-2.5-flash)', () => { + expect(resolvePricingKey('gemini-2.5-flash-8b')).toBe('gemini-2.5-flash'); + }); + + it('returns null for completely unknown model', () => { + expect(resolvePricingKey('totally-unknown-model')).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// HarnessRunContext — evaluatePreCall +// 
---------------------------------------------------------------------------
+
+function makeConfig(overrides: Partial<HarnessConfig> = {}): HarnessConfig {
+  return {
+    mode: 'enforce',
+    budgetMax: null,
+    toolCallsMax: null,
+    latencyMaxMs: null,
+    energyMax: null,
+    compliance: null,
+    kpiWeights: {},
+    ...overrides,
+  };
+}
+
+describe('evaluatePreCall', () => {
+  it('returns allow when no limits set', () => {
+    const ctx = new HarnessRunContext(makeConfig());
+    const decision = ctx.evaluatePreCall('gpt-4o', false);
+    expect(decision.action).toBe('allow');
+  });
+
+  it('returns stop when budget exhausted', () => {
+    const ctx = new HarnessRunContext(makeConfig({ budgetMax: 0.01 }));
+    ctx.cost = 0.01; // exhaust budget
+    const decision = ctx.evaluatePreCall('gpt-4o', false);
+    expect(decision.action).toBe('stop');
+    expect(decision.reason).toBe('budget_exceeded');
+  });
+
+  it('returns deny_tool when tool cap reached', () => {
+    const ctx = new HarnessRunContext(makeConfig({ toolCallsMax: 3 }));
+    ctx.toolCalls = 3;
+    const decision = ctx.evaluatePreCall('gpt-4o', true);
+    expect(decision.action).toBe('deny_tool');
+    expect(decision.reason).toBe('max_tool_calls_reached');
+  });
+
+  it('returns stop for compliance violation (non-compliant model)', () => {
+    const ctx = new HarnessRunContext(makeConfig({ compliance: 'gdpr' }));
+    const decision = ctx.evaluatePreCall('claude-sonnet-4', false);
+    expect(decision.action).toBe('stop');
+    expect(decision.reason).toBe('compliance_no_approved_model');
+  });
+
+  it('allows compliant model under GDPR', () => {
+    const ctx = new HarnessRunContext(makeConfig({ compliance: 'gdpr' }));
+    const decision = ctx.evaluatePreCall('gpt-4o', false);
+    expect(decision.action).toBe('allow');
+  });
+
+  it('returns stop when latency cap exceeded', () => {
+    const ctx = new HarnessRunContext(makeConfig({ latencyMaxMs: 1000 }));
+    ctx.latencyUsedMs = 1000;
+    const decision = ctx.evaluatePreCall('gpt-3.5-turbo', false);
+    // gpt-3.5-turbo is already the 
fastest → can't switch → stop + expect(decision.action).toBe('stop'); + expect(decision.reason).toBe('latency_limit_exceeded'); + }); + + it('returns stop when energy cap exceeded', () => { + const ctx = new HarnessRunContext(makeConfig({ energyMax: 100 })); + ctx.energyUsed = 100; + const decision = ctx.evaluatePreCall('gpt-3.5-turbo', false); + // gpt-3.5-turbo is already lowest energy → can't switch → stop + expect(decision.action).toBe('stop'); + expect(decision.reason).toBe('energy_limit_exceeded'); + }); + + it('returns switch_model observation for budget pressure', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 1.0 })); + ctx.cost = 0.85; // 85% spent, < 20% remaining + ctx.budgetRemaining = 0.15; + const decision = ctx.evaluatePreCall('gpt-4o', false); + // Budget pressure suggests cheaper model + expect(decision.action).toBe('switch_model'); + expect(decision.reason).toBe('budget_pressure'); + }); + + it('returns switch_model observation for KPI optimization', () => { + const ctx = new HarnessRunContext(makeConfig({ + kpiWeights: { quality: 0, cost: 1, latency: 0, energy: 0 }, + })); + // gpt-4 is very expensive, KPI weights purely on cost → should suggest cheaper + const decision = ctx.evaluatePreCall('gpt-4', false); + expect(decision.action).toBe('switch_model'); + expect(decision.reason).toBe('kpi_weight_optimization'); + }); +}); + +// --------------------------------------------------------------------------- +// Budget tracking across multiple recordCall invocations +// --------------------------------------------------------------------------- + +describe('recordCall and budget tracking', () => { + it('accumulates cost across calls', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 0.10 })); + ctx.recordCall({ model: 'gpt-4o-mini', inputTokens: 100, outputTokens: 50, toolCallCount: 0, elapsedMs: 50 }); + expect(ctx.cost).toBeGreaterThan(0); + expect(ctx.stepCount).toBe(1); + 
expect(ctx.budgetRemaining).toBeLessThan(0.10); + + ctx.recordCall({ model: 'gpt-4o-mini', inputTokens: 200, outputTokens: 100, toolCallCount: 1, elapsedMs: 60 }); + expect(ctx.stepCount).toBe(2); + expect(ctx.toolCalls).toBe(1); + expect(ctx.latencyUsedMs).toBe(110); + }); + + it('detects budget exhaustion', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 0.0001 })); + ctx.recordCall({ model: 'gpt-4o', inputTokens: 10000, outputTokens: 5000, toolCallCount: 0, elapsedMs: 100 }); + expect(ctx.isBudgetExhausted()).toBe(true); + }); + + it('detects tool cap reached', () => { + const ctx = new HarnessRunContext(makeConfig({ toolCallsMax: 2 })); + ctx.toolCalls = 2; + expect(ctx.isToolCapReached()).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Observe vs enforce mode behavior +// --------------------------------------------------------------------------- + +describe('observe vs enforce mode', () => { + it('observe mode evaluatePreCall still returns decisions', () => { + const ctx = new HarnessRunContext(makeConfig({ mode: 'observe', budgetMax: 0.01 })); + ctx.cost = 0.01; + const decision = ctx.evaluatePreCall('gpt-4o', false); + // Decision is evaluated regardless of mode + expect(decision.action).toBe('stop'); + }); + + it('off mode has no context created (by design)', () => { + // In the actual agent node, harnessCtx is null when mode=off + // This test validates that a context with mode=off still works + const ctx = new HarnessRunContext(makeConfig({ mode: 'off' })); + const decision = ctx.evaluatePreCall('gpt-4o', false); + expect(decision.action).toBe('allow'); + expect(decision.reason).toBe('off'); + }); +}); + +// --------------------------------------------------------------------------- +// Compliance allowlists +// --------------------------------------------------------------------------- + +describe('compliance allowlists', () => { + it('GDPR allows gpt-4o, gpt-4o-mini, 
gpt-3.5-turbo', () => { + const allowlist = COMPLIANCE_MODEL_ALLOWLISTS['gdpr']; + expect(allowlist.has('gpt-4o')).toBe(true); + expect(allowlist.has('gpt-4o-mini')).toBe(true); + expect(allowlist.has('gpt-3.5-turbo')).toBe(true); + expect(allowlist.has('claude-sonnet-4')).toBe(false); + }); + + it('strict allows only gpt-4o', () => { + const allowlist = COMPLIANCE_MODEL_ALLOWLISTS['strict']; + expect(allowlist.size).toBe(1); + expect(allowlist.has('gpt-4o')).toBe(true); + }); + + it('strict mode denies tools even for compliant model', () => { + const ctx = new HarnessRunContext(makeConfig({ compliance: 'strict' })); + const decision = ctx.evaluatePreCall('gpt-4o', true); + expect(decision.action).toBe('deny_tool'); + expect(decision.reason).toBe('compliance_tool_restriction'); + }); +}); + +// --------------------------------------------------------------------------- +// KPI weight normalization +// --------------------------------------------------------------------------- + +describe('normalizeWeights', () => { + it('normalizes to sum=1', () => { + const result = normalizeWeights({ quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }); + const sum = Object.values(result).reduce((a, b) => a + b, 0); + expect(sum).toBeCloseTo(1.0, 6); + }); + + it('filters out zero and negative values', () => { + const result = normalizeWeights({ quality: 1, cost: 0, latency: -1, energy: 1 }); + expect(result.cost).toBeUndefined(); + expect(result.latency).toBeUndefined(); + expect(result.quality).toBeCloseTo(0.5, 6); + expect(result.energy).toBeCloseTo(0.5, 6); + }); + + it('returns empty for all-zero weights', () => { + const result = normalizeWeights({ quality: 0, cost: 0, latency: 0, energy: 0 }); + expect(Object.keys(result)).toHaveLength(0); + }); +}); + +// --------------------------------------------------------------------------- +// summary() structure +// --------------------------------------------------------------------------- + +describe('summary()', () => { + 
it('returns correct structure', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 1.0 })); + ctx.recordCall({ model: 'gpt-4o-mini', inputTokens: 100, outputTokens: 50, toolCallCount: 0, elapsedMs: 42 }); + + const s = ctx.summary(); + expect(s.runId).toBeTruthy(); + expect(s.mode).toBe('enforce'); + expect(s.stepCount).toBe(1); + expect(s.toolCalls).toBe(0); + expect(s.cost).toBeGreaterThan(0); + expect(s.latencyUsedMs).toBe(42); + expect(s.energyUsed).toBeGreaterThan(0); + expect(s.budgetMax).toBe(1.0); + expect(s.budgetRemaining).toBeLessThan(1.0); + expect(s.lastAction).toBe('allow'); + expect(s.durationMs).toBeGreaterThanOrEqual(0); + expect(Array.isArray(s.trace)).toBe(true); + expect(s.trace).toHaveLength(1); + expect(s.trace[0].action).toBe('allow'); + expect(s.trace[0].budgetState.max).toBe(1.0); + }); +}); diff --git a/packages/integrations/n8n/nodes/harness/harness.ts b/packages/integrations/n8n/nodes/harness/harness.ts new file mode 100644 index 00000000..93c5150d --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/harness.ts @@ -0,0 +1,426 @@ +/** + * HarnessRunContext — multi-dimensional decision engine for n8n (TypeScript port). + * + * Ported from cascadeflow/harness/api.py (HarnessRunContext) and + * cascadeflow/harness/instrument.py (pre-call decision logic, compliance, + * quality/latency priors, KPI scoring). + * + * Key n8n constraint: models are graph connections (sub-nodes), not string + * parameters. The harness cannot switch models at runtime. Only `stop` and + * `deny_tool` actions have enforcement effects. `switch_model` decisions are + * recorded in the trace as observations. 
+ */
+
+import {
+  ENERGY_COEFFICIENTS,
+  DEFAULT_ENERGY_COEFFICIENT,
+  estimateCost,
+  estimateEnergy,
+  modelTotalPrice,
+  PRICING_USD_PER_M,
+} from './pricing';
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+export type HarnessMode = 'off' | 'observe' | 'enforce';
+
+export interface KpiWeights {
+  quality?: number;
+  cost?: number;
+  latency?: number;
+  energy?: number;
+}
+
+export interface HarnessConfig {
+  mode: HarnessMode;
+  budgetMax: number | null;
+  toolCallsMax: number | null;
+  latencyMaxMs: number | null;
+  energyMax: number | null;
+  compliance: string | null;
+  kpiWeights: KpiWeights;
+}
+
+export interface PreCallDecision {
+  action: 'allow' | 'stop' | 'switch_model' | 'deny_tool';
+  reason: string;
+  targetModel: string;
+}
+
+export interface HarnessTraceEntry {
+  action: string;
+  reason: string;
+  model: string | null;
+  step: number;
+  timestampMs: number;
+  costTotal: number;
+  budgetState: { max: number | null; remaining: number | null };
+  applied: boolean;
+  decisionMode: string;
+}
+
+export interface HarnessSummary {
+  runId: string;
+  mode: HarnessMode;
+  stepCount: number;
+  toolCalls: number;
+  cost: number;
+  latencyUsedMs: number;
+  energyUsed: number;
+  budgetMax: number | null;
+  budgetRemaining: number | null;
+  lastAction: string;
+  durationMs: number;
+  trace: HarnessTraceEntry[];
+}
+
+export interface RecordCallParams {
+  model: string;
+  inputTokens: number;
+  outputTokens: number;
+  toolCallCount: number;
+  elapsedMs: number;
+  decision?: PreCallDecision;
+}
+
+// ---------------------------------------------------------------------------
+// Compliance allowlists (from instrument.py lines 107-112)
+// ---------------------------------------------------------------------------
+
+const COMPLIANCE_MODEL_ALLOWLISTS: Record<string, Set<string>> = {
+  gdpr: new Set(['gpt-4o', 'gpt-4o-mini', 'gpt-3.5-turbo']),
+  hipaa: 
new Set(['gpt-4o', 'gpt-4o-mini']),
+  pci: new Set(['gpt-4o-mini', 'gpt-3.5-turbo']),
+  strict: new Set(['gpt-4o']),
+};
+
+// ---------------------------------------------------------------------------
+// Quality & latency priors for KPI scoring (from instrument.py lines 74-95)
+// ---------------------------------------------------------------------------
+
+const QUALITY_PRIORS: Record<string, number> = {
+  'gpt-4o': 0.90,
+  'gpt-4o-mini': 0.75,
+  'gpt-5-mini': 0.86,
+  'gpt-4-turbo': 0.88,
+  'gpt-4': 0.87,
+  'gpt-3.5-turbo': 0.65,
+  'o1': 0.95,
+  'o1-mini': 0.82,
+  'o3-mini': 0.80,
+};
+
+const LATENCY_PRIORS: Record<string, number> = {
+  'gpt-4o': 0.72,
+  'gpt-4o-mini': 0.93,
+  'gpt-5-mini': 0.84,
+  'gpt-4-turbo': 0.66,
+  'gpt-4': 0.52,
+  'gpt-3.5-turbo': 1.00,
+  'o1': 0.40,
+  'o1-mini': 0.60,
+  'o3-mini': 0.78,
+};
+
+// Pre-computed model cost/energy bounds for utility functions.
+const MODEL_POOL = Object.keys(PRICING_USD_PER_M);
+const MODEL_TOTAL_COSTS = new Map<string, number>(MODEL_POOL.map(m => [m, modelTotalPrice(m)]));
+const MIN_TOTAL_COST = Math.min(...MODEL_TOTAL_COSTS.values());
+const MAX_TOTAL_COST = Math.max(...MODEL_TOTAL_COSTS.values());
+
+const MODEL_ENERGY_COEFFS = new Map<string, number>(
+  MODEL_POOL.map(m => [m, ENERGY_COEFFICIENTS[m] ?? 
DEFAULT_ENERGY_COEFFICIENT]),
+);
+const MIN_ENERGY_COEFF = Math.min(...MODEL_ENERGY_COEFFS.values());
+const MAX_ENERGY_COEFF = Math.max(...MODEL_ENERGY_COEFFS.values());
+
+// ---------------------------------------------------------------------------
+// KPI scoring helpers (from instrument.py lines 234-267)
+// ---------------------------------------------------------------------------
+
+function normalizeWeights(weights: KpiWeights): Record<string, number> {
+  const raw: Record<string, number> = {};
+  for (const [key, val] of Object.entries(weights)) {
+    if (['cost', 'quality', 'latency', 'energy'].includes(key) && typeof val === 'number' && val > 0) {
+      raw[key] = val;
+    }
+  }
+  const total = Object.values(raw).reduce((a, b) => a + b, 0);
+  if (total <= 0) return {};
+  const normalized: Record<string, number> = {};
+  for (const [key, val] of Object.entries(raw)) {
+    normalized[key] = val / total;
+  }
+  return normalized;
+}
+
+function costUtility(model: string): number {
+  const modelCost = MODEL_TOTAL_COSTS.get(model) ?? modelTotalPrice(model);
+  if (MAX_TOTAL_COST === MIN_TOTAL_COST) return 1.0;
+  return (MAX_TOTAL_COST - modelCost) / (MAX_TOTAL_COST - MIN_TOTAL_COST);
+}
+
+function energyUtility(model: string): number {
+  const coeff = ENERGY_COEFFICIENTS[model] ?? DEFAULT_ENERGY_COEFFICIENT;
+  if (MAX_ENERGY_COEFF === MIN_ENERGY_COEFF) return 1.0;
+  return (MAX_ENERGY_COEFF - coeff) / (MAX_ENERGY_COEFF - MIN_ENERGY_COEFF);
+}
+
+function kpiScoreWithNormalized(model: string, normalized: Record<string, number>): number {
+  if (Object.keys(normalized).length === 0) return 0.0;
+  const quality = QUALITY_PRIORS[model] ?? 0.7;
+  const latency = LATENCY_PRIORS[model] ?? 0.7;
+  const cost = costUtility(model);
+  const energy = energyUtility(model);
+  return (
+    (normalized.quality ?? 0) * quality +
+    (normalized.latency ?? 0) * latency +
+    (normalized.cost ?? 0) * cost +
+    (normalized.energy ?? 
0) * energy + ); +} + +function selectKpiWeightedModel(currentModel: string, weights: KpiWeights): string { + const normalized = normalizeWeights(weights); + if (Object.keys(normalized).length === 0) return currentModel; + let bestModel = currentModel; + let bestScore = kpiScoreWithNormalized(currentModel, normalized); + for (const candidate of MODEL_POOL) { + const score = kpiScoreWithNormalized(candidate, normalized); + if (score > bestScore) { + bestModel = candidate; + bestScore = score; + } + } + return bestModel; +} + +// Cheapest/fastest/lowest-energy helpers +function selectCheaperModel(currentModel: string): string { + let cheapest = currentModel; + let cheapestCost = MODEL_TOTAL_COSTS.get(currentModel) ?? modelTotalPrice(currentModel); + for (const [m, c] of MODEL_TOTAL_COSTS) { + if (c < cheapestCost) { + cheapest = m; + cheapestCost = c; + } + } + return cheapest; +} + +function selectFasterModel(currentModel: string): string { + const currentLatency = LATENCY_PRIORS[currentModel] ?? 0.7; + let best = currentModel; + let bestLatency = currentLatency; + for (const [m, lat] of Object.entries(LATENCY_PRIORS)) { + if (lat > bestLatency) { + best = m; + bestLatency = lat; + } + } + return best; +} + +function selectLowerEnergyModel(currentModel: string): string { + const currentCoeff = ENERGY_COEFFICIENTS[currentModel] ?? 
DEFAULT_ENERGY_COEFFICIENT; + let best = currentModel; + let bestCoeff = currentCoeff; + for (const [m, c] of MODEL_ENERGY_COEFFS) { + if (c < bestCoeff) { + best = m; + bestCoeff = c; + } + } + return best; +} + +// --------------------------------------------------------------------------- +// HarnessRunContext +// --------------------------------------------------------------------------- + +let runIdCounter = 0; + +function generateRunId(): string { + runIdCounter += 1; + const ts = Date.now().toString(36); + const counter = runIdCounter.toString(36); + return `${ts}${counter}`.slice(-8); +} + +export class HarnessRunContext { + readonly runId: string; + readonly config: HarnessConfig; + + stepCount = 0; + toolCalls = 0; + cost = 0; + latencyUsedMs = 0; + energyUsed = 0; + budgetRemaining: number | null; + lastAction = 'allow'; + + private startedAt: number; + private trace: HarnessTraceEntry[] = []; + + constructor(config: HarnessConfig) { + this.runId = generateRunId(); + this.config = config; + this.budgetRemaining = config.budgetMax; + this.startedAt = Date.now(); + } + + // ----------------------------------------------------------------------- + // Pre-call decision cascade (ported from instrument.py _evaluate_pre_call_decision) + // ----------------------------------------------------------------------- + + evaluatePreCall(model: string, hasTools: boolean): PreCallDecision { + const cfg = this.config; + + // 1. Budget exhausted + if (cfg.budgetMax !== null && this.cost >= cfg.budgetMax) { + return { action: 'stop', reason: 'budget_exceeded', targetModel: model }; + } + + // 2. Tool call cap + if (hasTools && cfg.toolCallsMax !== null && this.toolCalls >= cfg.toolCallsMax) { + return { action: 'deny_tool', reason: 'max_tool_calls_reached', targetModel: model }; + } + + // 3. 
Compliance + if (cfg.compliance) { + const allowlist = COMPLIANCE_MODEL_ALLOWLISTS[cfg.compliance.trim().toLowerCase()]; + if (allowlist) { + if (!allowlist.has(model)) { + // Can't switch models in n8n — stop if no compliant model possible + return { action: 'stop', reason: 'compliance_no_approved_model', targetModel: model }; + } + if (cfg.compliance.trim().toLowerCase() === 'strict' && hasTools) { + return { action: 'deny_tool', reason: 'compliance_tool_restriction', targetModel: model }; + } + } + } + + // 4. Latency cap + if (cfg.latencyMaxMs !== null && this.latencyUsedMs >= cfg.latencyMaxMs) { + const faster = selectFasterModel(model); + if (faster !== model) { + return { action: 'switch_model', reason: 'latency_limit_exceeded', targetModel: faster }; + } + return { action: 'stop', reason: 'latency_limit_exceeded', targetModel: model }; + } + + // 5. Energy cap + if (cfg.energyMax !== null && this.energyUsed >= cfg.energyMax) { + const lower = selectLowerEnergyModel(model); + if (lower !== model) { + return { action: 'switch_model', reason: 'energy_limit_exceeded', targetModel: lower }; + } + return { action: 'stop', reason: 'energy_limit_exceeded', targetModel: model }; + } + + // 6. Budget pressure (<20% remaining) — observation only in n8n + if ( + cfg.budgetMax !== null && + cfg.budgetMax > 0 && + this.budgetRemaining !== null && + this.budgetRemaining / cfg.budgetMax < 0.2 + ) { + const cheaper = selectCheaperModel(model); + if (cheaper !== model) { + return { action: 'switch_model', reason: 'budget_pressure', targetModel: cheaper }; + } + } + + // 7. KPI-weighted — observation only in n8n + const kw = cfg.kpiWeights; + if (kw && Object.values(kw).some(v => typeof v === 'number' && v > 0)) { + const weighted = selectKpiWeightedModel(model, kw); + if (weighted !== model) { + return { action: 'switch_model', reason: 'kpi_weight_optimization', targetModel: weighted }; + } + } + + // 8. 
Default: allow + return { action: 'allow', reason: cfg.mode, targetModel: model }; + } + + // ----------------------------------------------------------------------- + // Record a completed call + // ----------------------------------------------------------------------- + + recordCall(params: RecordCallParams): void { + const { model, inputTokens, outputTokens, toolCallCount, elapsedMs, decision } = params; + + const callCost = estimateCost(model, inputTokens, outputTokens); + const energy = estimateEnergy(model, inputTokens, outputTokens); + + this.cost += callCost; + this.stepCount += 1; + this.latencyUsedMs += elapsedMs; + this.energyUsed += energy; + this.toolCalls += toolCallCount; + + if (this.config.budgetMax !== null) { + this.budgetRemaining = this.config.budgetMax - this.cost; + } + + const action = decision?.action ?? 'allow'; + const reason = decision?.reason ?? this.config.mode; + const applied = action === 'allow' || (this.config.mode === 'enforce' && (action === 'stop' || action === 'deny_tool')); + + this.lastAction = action; + + this.trace.push({ + action, + reason, + model, + step: this.stepCount, + timestampMs: Date.now(), + costTotal: this.cost, + budgetState: { + max: this.config.budgetMax, + remaining: this.budgetRemaining, + }, + applied, + decisionMode: this.config.mode, + }); + } + + // ----------------------------------------------------------------------- + // Quick checks for agent loop + // ----------------------------------------------------------------------- + + isBudgetExhausted(): boolean { + return this.config.budgetMax !== null && this.cost >= this.config.budgetMax; + } + + isToolCapReached(): boolean { + return this.config.toolCallsMax !== null && this.toolCalls >= this.config.toolCallsMax; + } + + // ----------------------------------------------------------------------- + // Summary + // ----------------------------------------------------------------------- + + summary(): HarnessSummary { + return { + runId: this.runId, + 
mode: this.config.mode, + stepCount: this.stepCount, + toolCalls: this.toolCalls, + cost: this.cost, + latencyUsedMs: this.latencyUsedMs, + energyUsed: this.energyUsed, + budgetMax: this.config.budgetMax, + budgetRemaining: this.budgetRemaining, + lastAction: this.lastAction, + durationMs: Date.now() - this.startedAt, + trace: [...this.trace], + }; + } +} + +// Re-export for external test access +export { COMPLIANCE_MODEL_ALLOWLISTS, QUALITY_PRIORS, LATENCY_PRIORS, normalizeWeights }; diff --git a/packages/integrations/n8n/nodes/harness/index.ts b/packages/integrations/n8n/nodes/harness/index.ts new file mode 100644 index 00000000..663f93b3 --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/index.ts @@ -0,0 +1,22 @@ +export { + PRICING_USD_PER_M, + DEFAULT_PRICING_USD_PER_M, + ENERGY_COEFFICIENTS, + DEFAULT_ENERGY_COEFFICIENT, + ENERGY_OUTPUT_WEIGHT, + resolvePricingKey, + estimateCost, + estimateEnergy, + modelTotalPrice, +} from './pricing'; + +export { + type HarnessMode, + type KpiWeights, + type HarnessConfig, + type PreCallDecision, + type HarnessTraceEntry, + type HarnessSummary, + type RecordCallParams, + HarnessRunContext, +} from './harness'; diff --git a/packages/integrations/n8n/nodes/harness/pricing.ts b/packages/integrations/n8n/nodes/harness/pricing.ts new file mode 100644 index 00000000..fd13f43a --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/pricing.ts @@ -0,0 +1,135 @@ +/** + * Shared harness pricing and energy profiles (TypeScript port). + * + * Ported from cascadeflow/harness/pricing.py — single source of truth for + * cost/energy estimation in the n8n integration. + */ + +// USD per 1M tokens [input, output]. 
+export const PRICING_USD_PER_M: Record<string, [number, number]> = {
+  // OpenAI
+  'gpt-4o': [2.50, 10.00],
+  'gpt-4o-mini': [0.15, 0.60],
+  'gpt-5': [1.25, 10.00],
+  'gpt-5-mini': [0.20, 0.80],
+  'gpt-4-turbo': [10.00, 30.00],
+  'gpt-4': [30.00, 60.00],
+  'gpt-3.5-turbo': [0.50, 1.50],
+  'o1': [15.00, 60.00],
+  'o1-mini': [3.00, 12.00],
+  'o3-mini': [1.10, 4.40],
+  // Anthropic
+  'claude-sonnet-4': [3.00, 15.00],
+  'claude-haiku-3.5': [1.00, 5.00],
+  'claude-opus-4.5': [5.00, 25.00],
+  // Google Gemini
+  'gemini-2.5-flash': [0.15, 0.60],
+  'gemini-2.5-pro': [1.25, 10.00],
+  'gemini-2.0-flash': [0.10, 0.40],
+  'gemini-1.5-flash': [0.075, 0.30],
+  'gemini-1.5-pro': [1.25, 5.00],
+};
+
+export const DEFAULT_PRICING_USD_PER_M: [number, number] = [2.50, 10.00];
+
+// Deterministic proxy coefficients for energy tracking.
+export const ENERGY_COEFFICIENTS: Record<string, number> = {
+  // OpenAI
+  'gpt-4o': 1.0,
+  'gpt-4o-mini': 0.3,
+  'gpt-5': 1.2,
+  'gpt-5-mini': 0.35,
+  'gpt-4-turbo': 1.5,
+  'gpt-4': 1.5,
+  'gpt-3.5-turbo': 0.2,
+  'o1': 2.0,
+  'o1-mini': 0.8,
+  'o3-mini': 0.5,
+  // Anthropic
+  'claude-sonnet-4': 1.0,
+  'claude-haiku-3.5': 0.3,
+  'claude-opus-4.5': 1.8,
+  // Google Gemini
+  'gemini-2.5-flash': 0.3,
+  'gemini-2.5-pro': 1.2,
+  'gemini-2.0-flash': 0.25,
+  'gemini-1.5-flash': 0.2,
+  'gemini-1.5-pro': 1.0,
+};
+
+export const DEFAULT_ENERGY_COEFFICIENT = 1.0;
+export const ENERGY_OUTPUT_WEIGHT = 1.5;
+
+// ---------------------------------------------------------------------------
+// Fuzzy model-name resolution
+// ---------------------------------------------------------------------------
+
+// Strips version/preview/date suffixes.
+// Matches: -preview, -preview-05-20, -20250120, -latest, -exp-0827, -it
+const VERSION_SUFFIX_RE = /(-preview(?:-\d{2,4}-\d{2})?|-\d{8,}|-latest|-exp(?:-\d+)?|-it)$/;
+
+// Cache for resolved model → pricing key lookups. 
+const pricingKeyCache = new Map(); + +export function resolvePricingKey(model: string): string | null { + const cached = pricingKeyCache.get(model); + if (cached !== undefined) return cached; + + // Exact match + if (model in PRICING_USD_PER_M) { + pricingKeyCache.set(model, model); + return model; + } + + // Strip version suffixes and retry + const stripped = model.replace(VERSION_SUFFIX_RE, ''); + if (stripped !== model && stripped in PRICING_USD_PER_M) { + pricingKeyCache.set(model, stripped); + return stripped; + } + + // Longest-prefix match (e.g. "gemini-2.5-flash-8b" → "gemini-2.5-flash") + let best: string | null = null; + let bestLen = 0; + for (const known of Object.keys(PRICING_USD_PER_M)) { + if (model.startsWith(known) && known.length > bestLen) { + best = known; + bestLen = known.length; + } + } + if (best !== null) { + pricingKeyCache.set(model, best); + return best; + } + + pricingKeyCache.set(model, null); + return null; +} + +// --------------------------------------------------------------------------- +// Public estimation helpers +// --------------------------------------------------------------------------- + +export function estimateCost(model: string, inputTokens: number, outputTokens: number): number { + const key = resolvePricingKey(model); + const [inPrice, outPrice] = key !== null + ? (PRICING_USD_PER_M[key] ?? DEFAULT_PRICING_USD_PER_M) + : DEFAULT_PRICING_USD_PER_M; + return (inputTokens / 1_000_000) * inPrice + (outputTokens / 1_000_000) * outPrice; +} + +export function estimateEnergy(model: string, inputTokens: number, outputTokens: number): number { + const key = resolvePricingKey(model); + const coeff = key !== null + ? (ENERGY_COEFFICIENTS[key] ?? DEFAULT_ENERGY_COEFFICIENT) + : DEFAULT_ENERGY_COEFFICIENT; + return coeff * (inputTokens + outputTokens * ENERGY_OUTPUT_WEIGHT); +} + +export function modelTotalPrice(model: string): number { + const key = resolvePricingKey(model); + const [inPrice, outPrice] = key !== null + ? 
(PRICING_USD_PER_M[key] ?? DEFAULT_PRICING_USD_PER_M) + : DEFAULT_PRICING_USD_PER_M; + return inPrice + outPrice; +} From 510bdd1c4f3a7a8ec7d376811f20906fe9ed4bce Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 20:26:57 +0100 Subject: [PATCH 40/49] fix(google-adk): initialize plugin name and stabilize callback correlation --- cascadeflow/integrations/google_adk.py | 65 ++++++++++++++++++++++---- tests/test_google_adk_integration.py | 16 +++++++ 2 files changed, 73 insertions(+), 8 deletions(-) diff --git a/cascadeflow/integrations/google_adk.py b/cascadeflow/integrations/google_adk.py index 1c6a853d..9bd5d56f 100644 --- a/cascadeflow/integrations/google_adk.py +++ b/cascadeflow/integrations/google_adk.py @@ -114,6 +114,13 @@ class CascadeFlowADKPlugin(_ADKBasePlugin): # type: ignore[misc] """ def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: + # google-adk BasePlugin requires a stable plugin name. + try: + super().__init__(name="cascadeflow_harness") + except TypeError: + # Fallback for local test environments where BasePlugin is ``object``. + super().__init__() + self.name = "cascadeflow_harness" self._config = config or GoogleADKHarnessConfig() self._active = True self._call_seq: int = 0 @@ -122,6 +129,9 @@ def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: # two concurrent calls share (invocation_id, agent_name). self._call_start_times: dict[int, float] = {} self._call_models: dict[int, str] = {} + # Fallback mapping for runtimes that provide distinct callback_context + # objects between before/after callbacks. 
+ self._call_fallback_keys: dict[tuple[str, str], list[int]] = {} @staticmethod def _callback_key(callback_context: Any) -> int: @@ -133,6 +143,36 @@ def _callback_key(callback_context: Any) -> int: """ return id(callback_context) + @staticmethod + def _fallback_key(callback_context: Any) -> tuple[str, str]: + """Return a stable fallback key for correlation across callbacks.""" + invocation_id = str(getattr(callback_context, "invocation_id", "") or "") + agent_name = str(getattr(callback_context, "agent_name", "") or "") + return (invocation_id, agent_name) + + def _track_call_key(self, callback_context: Any, key: int) -> None: + """Register key in fallback queue for cross-object callback matching.""" + fallback_key = self._fallback_key(callback_context) + if not fallback_key[0] and not fallback_key[1]: + return + self._call_fallback_keys.setdefault(fallback_key, []).append(key) + + def _resolve_call_key(self, callback_context: Any) -> int | None: + """Resolve stored key for callback context across runtime variants.""" + key = self._callback_key(callback_context) + if key in self._call_models or key in self._call_start_times: + return key + + fallback_key = self._fallback_key(callback_context) + keys = self._call_fallback_keys.get(fallback_key) + if not keys: + return None + + resolved = keys.pop(0) + if not keys: + self._call_fallback_keys.pop(fallback_key, None) + return resolved + async def before_model_callback( self, callback_context: Any, @@ -178,6 +218,7 @@ async def before_model_callback( # Record start time and model for after_model_callback self._call_start_times[key] = time.monotonic() self._call_models[key] = model + self._track_call_key(callback_context, key) return None except Exception: @@ -204,10 +245,10 @@ async def after_model_callback( if ctx.mode == "off": return None - key = self._callback_key(callback_context) + key = self._resolve_call_key(callback_context) # Recover model name stored during before_model_callback - model = 
self._call_models.pop(key, "unknown") + model = self._call_models.pop(key, "unknown") if key is not None else "unknown" # Extract token counts from usage_metadata input_tokens, output_tokens = self._extract_tokens(llm_response) @@ -221,7 +262,7 @@ async def after_model_callback( energy = estimate_energy(model, input_tokens, output_tokens) # Latency - start_time = self._call_start_times.pop(key, None) + start_time = self._call_start_times.pop(key, None) if key is not None else None elapsed_ms = (time.monotonic() - start_time) * 1000 if start_time else 0.0 # Update run context @@ -257,19 +298,26 @@ async def after_model_callback( async def on_model_error_callback( self, callback_context: Any, - error: Exception, + llm_request: Any = None, + error: Exception | None = None, ) -> Any: """Record error in trace and clean up timing state.""" if not self._active: return None try: - key = self._callback_key(callback_context) - model = self._call_models.pop(key, "unknown") - self._call_start_times.pop(key, None) + # Backward-compatible calling form used in existing tests: + # on_model_error_callback(callback_context, error) + if error is None and isinstance(llm_request, Exception): + error = llm_request + + key = self._resolve_call_key(callback_context) + model = self._call_models.pop(key, "unknown") if key is not None else "unknown" + if key is not None: + self._call_start_times.pop(key, None) ctx = get_current_run() - if ctx is not None: + if ctx is not None and error is not None: error_type = type(error).__name__ ctx.record( action="error", @@ -292,6 +340,7 @@ def deactivate(self) -> None: self._call_seq = 0 self._call_start_times.clear() self._call_models.clear() + self._call_fallback_keys.clear() @staticmethod def _extract_tokens(llm_response: Any) -> tuple[int, int]: diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py index e68edcaf..ce17d583 100644 --- a/tests/test_google_adk_integration.py +++ b/tests/test_google_adk_integration.py 
@@ -397,6 +397,21 @@ async def test_no_start_time_records_zero_latency(self, plugin): await plugin.after_model_callback(cb_ctx, response) assert run_ctx.latency_used_ms == 0.0 + async def test_fallback_key_tracks_across_distinct_context_objects(self, plugin): + """ADK runtimes may pass different callback_context objects per phase.""" + init(mode="observe") + with run() as run_ctx: + before_ctx = FakeCallbackContext(invocation_id="inv-x", agent_name="agent-a") + after_ctx = FakeCallbackContext(invocation_id="inv-x", agent_name="agent-a") + await plugin.before_model_callback(before_ctx, FakeLlmRequest("gemini-2.5-flash")) + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + ) + await plugin.after_model_callback(after_ctx, response) + assert run_ctx.model_used == "gemini-2.5-flash" + assert run_ctx.latency_used_ms >= 0.0 + async def test_fail_open_swallows_errors(self, plugin): init(mode="observe") with run(): @@ -473,6 +488,7 @@ class TestEnableDisable: def test_enable_returns_plugin_instance(self): plugin = adk_mod.enable() assert isinstance(plugin, adk_mod.CascadeFlowADKPlugin) + assert plugin.name == "cascadeflow_harness" assert adk_mod.is_enabled() def test_enable_is_idempotent(self): From bace69d1c604bea1811515187ac95d299453c539 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 20:35:28 +0100 Subject: [PATCH 41/49] chore(dx): clarify integration prerequisites and add optional integration CI --- .github/workflows/test.yml | 39 ++++++++++++++++++++++++ docs/guides/crewai_integration.md | 8 +++++ docs/guides/google_adk_integration.md | 10 ++++++ docs/guides/openai_agents_integration.md | 9 ++++++ docs/guides/python_harness_quickstart.md | 10 ++++++ examples/integrations/README.md | 9 ++++++ pyproject.toml | 2 +- 7 files changed, 86 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4b2411d5..6ef2cadc 100644 --- a/.github/workflows/test.yml +++ 
b/.github/workflows/test.yml @@ -47,6 +47,45 @@ jobs: fail_ci_if_error: false token: ${{ secrets.CODECOV_TOKEN }} + # Python opt-in integration install + focused tests + test-python-optional-integrations: + name: Python Optional Integrations (${{ matrix.integration }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - integration: openai-agents + extras: ".[dev,openai,openai-agents]" + tests: "tests/test_openai_agents_integration.py" + - integration: crewai + extras: ".[dev,crewai,openai]" + tests: "tests/test_crewai_integration.py" + - integration: google-adk + extras: ".[dev,google-adk]" + tests: "tests/test_google_adk_integration.py" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install integration dependencies + run: | + python -m pip install --upgrade pip + pip install -e "${{ matrix.extras }}" + + - name: Run focused integration tests + run: | + pytest ${{ matrix.tests }} -v + env: + PYTHONPATH: ${{ github.workspace }} + # TypeScript Core Tests test-typescript-core: name: TypeScript Core Tests diff --git a/docs/guides/crewai_integration.md b/docs/guides/crewai_integration.md index a39efa79..8c1cec8a 100644 --- a/docs/guides/crewai_integration.md +++ b/docs/guides/crewai_integration.md @@ -17,6 +17,13 @@ pip install "cascadeflow[crewai,openai]" ``` `crewai` is optional and only installed when you request this extra. +Requires Python 3.10+. + +Optional (more precise provider/model cost tracking in harness telemetry): + +```bash +pip install litellm +``` ## Quickstart @@ -77,3 +84,4 @@ with run(budget=0.4) as session: - Existing non-CrewAI users are unaffected. - If CrewAI is not installed, `enable()` returns `False` and no hooks are registered. +- Without `litellm`, cost tracking still works using cascadeflow's built-in pricing estimates. 
diff --git a/docs/guides/google_adk_integration.md b/docs/guides/google_adk_integration.md index 393a1b57..76529bfc 100644 --- a/docs/guides/google_adk_integration.md +++ b/docs/guides/google_adk_integration.md @@ -29,6 +29,12 @@ pip install "cascadeflow[google-adk]" Requires Python 3.10+ (ADK requirement). +Optional (more precise provider/model cost tracking in harness telemetry): + +```bash +pip install litellm +``` + --- ## Quick Start @@ -90,6 +96,10 @@ Every LLM call is tracked with: - **Latency** — Wall-clock time per call - **Tool calls** — Count of `function_call` parts in responses +By default this uses cascadeflow's built-in pricing table. If you install +`litellm`, provider/model normalization can be more precise for some aliased +model identifiers. + ### Trace Recording All decisions are recorded in the `HarnessRunContext` trace: diff --git a/docs/guides/openai_agents_integration.md b/docs/guides/openai_agents_integration.md index 2db6b8b7..db8b1e34 100644 --- a/docs/guides/openai_agents_integration.md +++ b/docs/guides/openai_agents_integration.md @@ -15,6 +15,14 @@ Use cascadeflow as an explicit, opt-in `ModelProvider` integration for the OpenA pip install "cascadeflow[openai,openai-agents]" ``` +Recommended: Python 3.10+. + +Optional (more precise provider/model cost tracking in harness telemetry): + +```bash +pip install litellm +``` + ## Quickstart ```python @@ -71,3 +79,4 @@ if __name__ == "__main__": - This is a Python integration for OpenAI Agents SDK. - The SDK remains optional and is only installed via the `openai-agents` extra. - Existing non-Agents users are unaffected. +- Without `litellm`, cost tracking still works using cascadeflow's built-in pricing estimates. 
diff --git a/docs/guides/python_harness_quickstart.md b/docs/guides/python_harness_quickstart.md index 4ec85cfd..c757e48d 100644 --- a/docs/guides/python_harness_quickstart.md +++ b/docs/guides/python_harness_quickstart.md @@ -20,6 +20,16 @@ pip install "cascadeflow[crewai]" pip install "cascadeflow[google-adk]" ``` +Version notes: +- `crewai` and `google-adk` integrations require Python 3.10+. +- `openai-agents` is recommended on Python 3.10+. + +Optional for richer cost normalization across aliased provider model names: + +```bash +pip install litellm +``` + ## 1) Initialize Harness ```python diff --git a/examples/integrations/README.md b/examples/integrations/README.md index 4bad64f0..556efe7a 100644 --- a/examples/integrations/README.md +++ b/examples/integrations/README.md @@ -154,6 +154,9 @@ pip install "cascadeflow[openai,openai-agents]" python examples/integrations/openai_agents_harness.py ``` +Recommended: Python 3.10+. +Optional: `pip install litellm` for more precise provider/model cost normalization. + ### What It Shows - Harness-aware model switching with candidate models @@ -175,6 +178,9 @@ pip install "cascadeflow[crewai,openai]" python examples/integrations/crewai_harness.py ``` +Requires Python 3.10+. +Optional: `pip install litellm` for more precise provider/model cost normalization. + ### What It Shows - Explicit `enable(...)` hook registration (never on by default) @@ -196,6 +202,9 @@ pip install "cascadeflow[google-adk]" python examples/integrations/google_adk_harness.py ``` +Requires Python 3.10+. +Optional: `pip install litellm` for more precise provider/model cost normalization. 
+ ### What It Shows - Explicit plugin creation with `enable(...)` (integration-only behavior) diff --git a/pyproject.toml b/pyproject.toml index 8f11ae44..b746a6e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,7 @@ semantic = [ openclaw = ["fastembed>=0.7.0"] # CrewAI harness integration (opt-in) -crewai = ["crewai>=1.5.0"] +crewai = ["crewai>=1.5.0; python_version >= '3.10'"] # OpenAI Agents SDK integration (opt-in) openai-agents = [ From 27b940223688b95f678038ca88035c7dce77f134 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 20:48:41 +0100 Subject: [PATCH 42/49] style: apply Black formatting to 7 Python files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix CI Python Code Quality check — these files drifted from Black formatting after recent merges into the integration branch. --- cascadeflow/harness/api.py | 4 +- cascadeflow/harness/pricing.py | 80 +++++++++++++++------ cascadeflow/integrations/google_adk.py | 12 +--- examples/integrations/crewai_harness.py | 6 +- examples/integrations/google_adk_harness.py | 3 +- tests/test_google_adk_integration.py | 24 ++----- tests/test_harness_instrument.py | 4 +- 7 files changed, 73 insertions(+), 60 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 610bab28..036c80eb 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -306,9 +306,7 @@ def _parse_int(raw: str) -> int: def _parse_json_dict(raw: str) -> dict[str, float]: if len(raw) > _MAX_ENV_JSON_LEN: - raise ValueError( - f"JSON config exceeds {_MAX_ENV_JSON_LEN} characters for harness env var" - ) + raise ValueError(f"JSON config exceeds {_MAX_ENV_JSON_LEN} characters for harness env var") value = json.loads(raw) if not isinstance(value, dict): raise ValueError("expected JSON object") diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py index 7f6cd44b..81a1de06 100644 --- a/cascadeflow/harness/pricing.py +++ 
b/cascadeflow/harness/pricing.py @@ -1,8 +1,8 @@ -"""Shared pricing and energy estimation for harness integrations. +"""Shared harness pricing and energy profiles. -Provides approximate USD-per-1M-token pricing and deterministic energy -coefficients used by CrewAI, OpenAI Agents, Google ADK, and future -integration modules. +This module centralizes model-cost and energy-estimation defaults used by +harness integrations (OpenAI auto-instrumentation, OpenAI Agents SDK, CrewAI, +Google ADK). A future pricing registry will consolidate with ``cascadeflow.pricing`` and LiteLLM live data. Until then this module is the canonical source @@ -12,12 +12,10 @@ from __future__ import annotations import re as _re +from typing import Final -# --------------------------------------------------------------------------- -# Pricing (USD per 1M tokens: input, output) -# --------------------------------------------------------------------------- - -PRICING_USD_PER_M: dict[str, tuple[float, float]] = { +# USD per 1M tokens (input, output). +PRICING_USD_PER_M: Final[dict[str, tuple[float, float]]] = { # OpenAI "gpt-4o": (2.50, 10.00), "gpt-4o-mini": (0.15, 0.60), @@ -40,13 +38,10 @@ "gemini-1.5-flash": (0.075, 0.30), "gemini-1.5-pro": (1.25, 5.00), } -DEFAULT_PRICING_USD_PER_M: tuple[float, float] = (2.50, 10.00) +DEFAULT_PRICING_USD_PER_M: Final[tuple[float, float]] = (2.50, 10.00) -# --------------------------------------------------------------------------- -# Energy coefficients (deterministic proxy for compute intensity) -# --------------------------------------------------------------------------- - -ENERGY_COEFFICIENTS: dict[str, float] = { +# Deterministic proxy coefficients for energy tracking. 
+ENERGY_COEFFICIENTS: Final[dict[str, float]] = { # OpenAI "gpt-4o": 1.0, "gpt-4o-mini": 0.3, @@ -69,10 +64,29 @@ "gemini-1.5-flash": 0.2, "gemini-1.5-pro": 1.0, } -DEFAULT_ENERGY_COEFFICIENT: float = 1.0 -ENERGY_OUTPUT_WEIGHT: float = 1.5 +DEFAULT_ENERGY_COEFFICIENT: Final[float] = 1.0 +ENERGY_OUTPUT_WEIGHT: Final[float] = 1.5 + +# Explicit pools keep provider/model-switching logic constrained even though the +# pricing table is shared across integrations. +OPENAI_MODEL_POOL: Final[tuple[str, ...]] = ( + "gpt-4o", + "gpt-4o-mini", + "gpt-5", + "gpt-5-mini", + "gpt-4-turbo", + "gpt-4", + "gpt-3.5-turbo", + "o1", + "o1-mini", + "o3-mini", +) +# --------------------------------------------------------------------------- +# Fuzzy model-name resolution +# --------------------------------------------------------------------------- + # Pre-compiled pattern for stripping version/preview/date suffixes. # Matches: -preview, -preview-05-20, -20250120, -latest, -exp-0827, etc. _VERSION_SUFFIX_RE = _re.compile( @@ -119,15 +133,35 @@ def _resolve_pricing_key(model: str) -> str | None: return None +# --------------------------------------------------------------------------- +# Public estimation helpers +# --------------------------------------------------------------------------- + + def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: - """Estimate cost in USD from model name and token counts.""" + """Estimate USD cost from token usage.""" key = _resolve_pricing_key(model) - in_price, out_price = PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M - return (input_tokens / 1_000_000) * in_price + (output_tokens / 1_000_000) * out_price + in_price, out_price = ( + PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M + ) + return (input_tokens / 1_000_000.0) * in_price + (output_tokens / 1_000_000.0) * out_price def estimate_energy(model: str, input_tokens: int, output_tokens: 
int) -> float: - """Estimate energy proxy from model name and token counts.""" + """Estimate deterministic proxy energy units.""" + key = _resolve_pricing_key(model) + coeff = ( + ENERGY_COEFFICIENTS.get(key, DEFAULT_ENERGY_COEFFICIENT) + if key + else DEFAULT_ENERGY_COEFFICIENT + ) + return coeff * (input_tokens + (output_tokens * ENERGY_OUTPUT_WEIGHT)) + + +def model_total_price(model: str) -> float: + """Return total (input + output) price per 1M tokens.""" key = _resolve_pricing_key(model) - coeff = ENERGY_COEFFICIENTS.get(key, DEFAULT_ENERGY_COEFFICIENT) if key else DEFAULT_ENERGY_COEFFICIENT - return coeff * (input_tokens + output_tokens * ENERGY_OUTPUT_WEIGHT) + in_price, out_price = ( + PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M + ) + return in_price + out_price diff --git a/cascadeflow/integrations/google_adk.py b/cascadeflow/integrations/google_adk.py index 9bd5d56f..325d21b2 100644 --- a/cascadeflow/integrations/google_adk.py +++ b/cascadeflow/integrations/google_adk.py @@ -223,9 +223,7 @@ async def before_model_callback( return None except Exception: if self._config.fail_open: - logger.debug( - "google-adk before_model_callback error (fail_open)", exc_info=True - ) + logger.debug("google-adk before_model_callback error (fail_open)", exc_info=True) return None raise @@ -289,9 +287,7 @@ async def after_model_callback( return None except Exception: if self._config.fail_open: - logger.debug( - "google-adk after_model_callback error (fail_open)", exc_info=True - ) + logger.debug("google-adk after_model_callback error (fail_open)", exc_info=True) return None raise @@ -328,9 +324,7 @@ async def on_model_error_callback( return None except Exception: if self._config.fail_open: - logger.debug( - "google-adk on_model_error_callback error (fail_open)", exc_info=True - ) + logger.debug("google-adk on_model_error_callback error (fail_open)", exc_info=True) return None raise diff --git 
a/examples/integrations/crewai_harness.py b/examples/integrations/crewai_harness.py index 5e14163c..a9df72c6 100644 --- a/examples/integrations/crewai_harness.py +++ b/examples/integrations/crewai_harness.py @@ -15,8 +15,7 @@ def main() -> None: from crewai import Agent, Crew, Process, Task except ImportError as exc: raise SystemExit( - "CrewAI is not installed. " - 'Install with: pip install "cascadeflow[crewai,openai]"' + "CrewAI is not installed. " 'Install with: pip install "cascadeflow[crewai,openai]"' ) from exc from cascadeflow import init, run @@ -34,8 +33,7 @@ def main() -> None: ) if not enabled: raise SystemExit( - "CrewAI hooks are unavailable in this environment. " - "Ensure crewai>=1.5 is installed." + "CrewAI hooks are unavailable in this environment. " "Ensure crewai>=1.5 is installed." ) agent = Agent( diff --git a/examples/integrations/google_adk_harness.py b/examples/integrations/google_adk_harness.py index 0315dc90..1ae9c5af 100644 --- a/examples/integrations/google_adk_harness.py +++ b/examples/integrations/google_adk_harness.py @@ -19,8 +19,7 @@ async def main() -> None: from google.adk.sessions import InMemorySessionService except ImportError as exc: raise SystemExit( - "Google ADK is not installed. " - 'Install with: pip install "cascadeflow[google-adk]"' + "Google ADK is not installed. 
" 'Install with: pip install "cascadeflow[google-adk]"' ) from exc from cascadeflow import init, run diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py index ce17d583..688e39c4 100644 --- a/tests/test_google_adk_integration.py +++ b/tests/test_google_adk_integration.py @@ -199,9 +199,7 @@ async def test_observe_mode_allows_over_budget(self, plugin): init(mode="observe", budget=0.001) with run(budget=0.001) as run_ctx: run_ctx.cost = 0.002 - result = await plugin.before_model_callback( - FakeCallbackContext(), FakeLlmRequest() - ) + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) assert result is None # observe never blocks async def test_enforce_blocks_when_budget_exhausted(self, plugin): @@ -230,9 +228,7 @@ async def test_enforce_allows_under_budget(self, plugin): init(mode="enforce", budget=1.0) with run(budget=1.0) as run_ctx: run_ctx.cost = 0.5 - result = await plugin.before_model_callback( - FakeCallbackContext(), FakeLlmRequest() - ) + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) assert result is None async def test_records_start_time_and_model(self, plugin): @@ -259,9 +255,7 @@ async def test_budget_gate_disabled_in_config(self): init(mode="enforce", budget=0.001) with run(budget=0.001) as run_ctx: run_ctx.cost = 0.002 - result = await plugin.before_model_callback( - FakeCallbackContext(), FakeLlmRequest() - ) + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) assert result is None # gate disabled async def test_fail_open_swallows_errors(self, plugin): @@ -271,9 +265,7 @@ async def test_fail_open_swallows_errors(self, plugin): "cascadeflow.integrations.google_adk.get_current_run", side_effect=RuntimeError("boom"), ): - result = await plugin.before_model_callback( - FakeCallbackContext(), FakeLlmRequest() - ) + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) assert result is 
None @@ -566,9 +558,7 @@ async def test_deactivated_plugin_skips_callbacks(self): init(mode="enforce", budget=0.001) with run(budget=0.001) as run_ctx: run_ctx.cost = 0.002 - result = await plugin.before_model_callback( - FakeCallbackContext(), FakeLlmRequest() - ) + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) assert result is None # no-op, not blocked async def test_deactivate_clears_state(self): @@ -676,9 +666,7 @@ async def test_off_mode_before_callback_returns_none(self): init(mode="off") plugin = adk_mod.CascadeFlowADKPlugin() with run() as run_ctx: - result = await plugin.before_model_callback( - FakeCallbackContext(), FakeLlmRequest() - ) + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) assert result is None assert len(plugin._call_start_times) == 0 diff --git a/tests/test_harness_instrument.py b/tests/test_harness_instrument.py index 55e71837..a46cf8a6 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -531,7 +531,9 @@ async def _failing_iter(): raise RuntimeError("async stream failed") async with run(budget=1.0) as ctx: - wrapped = _InstrumentedAsyncStream(_failing_iter(), ctx, "gpt-4o-mini", time.monotonic()) + wrapped = _InstrumentedAsyncStream( + _failing_iter(), ctx, "gpt-4o-mini", time.monotonic() + ) with pytest.raises(RuntimeError, match="async stream failed"): async for _ in wrapped: pass From 37276b26181835dfab2117bb91a088892f17a029 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 20:49:34 +0100 Subject: [PATCH 43/49] chore(ci/docs): enforce integration matrix across python versions --- .github/workflows/test.yml | 13 ++++++++++--- docs/INSTALLATION.md | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6ef2cadc..3138b54f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -49,19 +49,26 @@ jobs: # 
Python opt-in integration install + focused tests test-python-optional-integrations: - name: Python Optional Integrations (${{ matrix.integration }}) + name: Python Optional Integrations (${{ matrix.integration }} / py${{ matrix.python-version }}) runs-on: ubuntu-latest strategy: fail-fast: false matrix: include: - integration: openai-agents + python-version: '3.9' + extras: ".[dev,openai,openai-agents]" + tests: "tests/test_openai_agents_integration.py" + - integration: openai-agents + python-version: '3.11' extras: ".[dev,openai,openai-agents]" tests: "tests/test_openai_agents_integration.py" - integration: crewai + python-version: '3.11' extras: ".[dev,crewai,openai]" tests: "tests/test_crewai_integration.py" - integration: google-adk + python-version: '3.11' extras: ".[dev,google-adk]" tests: "tests/test_google_adk_integration.py" @@ -69,10 +76,10 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Set up Python 3.11 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: '3.11' + python-version: ${{ matrix.python-version }} cache: 'pip' - name: Install integration dependencies diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md index c291bd93..6e44cdec 100644 --- a/docs/INSTALLATION.md +++ b/docs/INSTALLATION.md @@ -108,6 +108,24 @@ TOGETHER_API_KEY=... # vLLM - no API key needed! (local) ``` +## 🔌 Optional Integration Extras + +Integration packages are opt-in and never enabled by default. 
+ +| Integration | Install Command | Python Requirement | Notes | +|------------|-----------------|--------------------|-------| +| OpenAI Agents SDK | `pip install "cascadeflow[openai,openai-agents]"` | 3.9+ (3.10+ recommended) | Uses explicit `ModelProvider` integration | +| CrewAI | `pip install "cascadeflow[crewai,openai]"` | 3.10+ | Uses explicit CrewAI hook registration | +| Google ADK | `pip install "cascadeflow[google-adk]"` | 3.10+ | Uses explicit ADK plugin in `Runner(plugins=[...])` | + +Optional for richer provider/model normalization in cost tracking: + +```bash +pip install litellm +``` + +Without `litellm`, cascadeflow still provides built-in pricing-based cost estimates. + ## 🚀 Quick Start ### For Production From 1b470d6c6658faff98e48a5271aacdc5707e344f Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Wed, 4 Mar 2026 20:53:49 +0100 Subject: [PATCH 44/49] style: fix ruff I001 import sorting in google_adk_harness example --- examples/integrations/google_adk_harness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/integrations/google_adk_harness.py b/examples/integrations/google_adk_harness.py index 1ae9c5af..3f8c9743 100644 --- a/examples/integrations/google_adk_harness.py +++ b/examples/integrations/google_adk_harness.py @@ -23,7 +23,7 @@ async def main() -> None: ) from exc from cascadeflow import init, run - from cascadeflow.integrations.google_adk import enable, GoogleADKHarnessConfig + from cascadeflow.integrations.google_adk import GoogleADKHarnessConfig, enable # 1. 
Initialize harness globally init(mode="observe", budget=1.0) From a986060b5c86543e386f6a625d5fde789315038a Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 5 Mar 2026 09:43:46 +0100 Subject: [PATCH 45/49] feat(benchmarks): add baseline and savings metrics to agentic tool benchmark --- tests/benchmarks/bfcl/agentic_benchmark.py | 37 ++++++++++++++++++++++ tests/benchmarks/run_all.py | 4 +++ 2 files changed, 41 insertions(+) diff --git a/tests/benchmarks/bfcl/agentic_benchmark.py b/tests/benchmarks/bfcl/agentic_benchmark.py index 1386cb60..2b450e68 100644 --- a/tests/benchmarks/bfcl/agentic_benchmark.py +++ b/tests/benchmarks/bfcl/agentic_benchmark.py @@ -61,6 +61,7 @@ class AgenticResult: correct: bool draft_accepted: bool cost: float + baseline_cost: float latency_ms: float draft_accepted_turns: int = 0 draft_acceptance_rate: float = 0.0 @@ -761,6 +762,23 @@ def _format_tools_desc(self, tools: list[dict[str, Any]]) -> str: lines.append(f"- {name}: {description} (params: {param_names})") return "\n".join(lines) + @staticmethod + def _extract_baseline_cost(result: Any) -> float: + """Extract baseline cost for a call from cascade metadata. + + ``cost_saved`` is defined relative to a verifier-only baseline. 
+ """ + total_cost = float(getattr(result, "total_cost", 0.0) or 0.0) + metadata = getattr(result, "metadata", {}) or {} + raw_saved = metadata.get("cost_saved", 0.0) or 0.0 + try: + cost_saved = float(raw_saved) + except (TypeError, ValueError): + cost_saved = 0.0 + + baseline_cost = total_cost + cost_saved + return baseline_cost if baseline_cost > 0 else total_cost + def _extract_parameters(self, response: str) -> list[dict[str, Any]]: """Extract JSON parameter blocks from a tool response.""" parameters = [] @@ -939,6 +957,7 @@ async def run_single_turn(self, task: dict) -> AgenticResult: draft_accepted_turns=1 if draft_accepted else 0, draft_acceptance_rate=1.0 if draft_accepted else 0.0, cost=result.total_cost, + baseline_cost=self._extract_baseline_cost(result), latency_ms=latency_ms, turns_completed=1, tools_called=tools_called, @@ -952,6 +971,7 @@ async def run_single_turn(self, task: dict) -> AgenticResult: correct=False, draft_accepted=False, cost=0.0, + baseline_cost=0.0, latency_ms=latency_ms, error=str(e), ) @@ -976,6 +996,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: start_time = time.time() total_cost = 0.0 + total_baseline_cost = 0.0 all_tools_called = [] turns_completed = 0 state_maintained = True @@ -1011,6 +1032,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: result = await agent.run(prompt, max_tokens=500) total_cost += result.total_cost + total_baseline_cost += self._extract_baseline_cost(result) tools_in_turn = self._extract_tool_calls(result.content) params_in_turn = self._extract_parameters(result.content) @@ -1057,6 +1079,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: draft_accepted_turns=draft_accepted_turns, draft_acceptance_rate=draft_acceptance_rate, cost=total_cost, + baseline_cost=total_baseline_cost if total_baseline_cost > 0 else total_cost, latency_ms=latency_ms, turns_completed=turns_completed, tools_called=all_tools_called, @@ -1072,6 +1095,7 @@ async def 
run_multi_turn(self, task: dict) -> AgenticResult: draft_accepted_turns=draft_accepted_turns, draft_acceptance_rate=0.0, cost=total_cost, + baseline_cost=total_baseline_cost if total_baseline_cost > 0 else total_cost, latency_ms=latency_ms, turns_completed=turns_completed, error=str(e), @@ -1127,6 +1151,13 @@ def _calculate_metrics(self) -> dict: draft_accepted_turns = sum(r.draft_accepted_turns for r in self.results) dependency_handled = sum(1 for r in self.results if r.dependency_handled) total_cost = sum(r.cost for r in self.results) + total_baseline_cost = sum( + r.baseline_cost if r.baseline_cost > 0 else r.cost for r in self.results + ) + total_savings = total_baseline_cost - total_cost + cost_reduction_pct = ( + (total_savings / total_baseline_cost) * 100 if total_baseline_cost > 0 else 0.0 + ) total_turns = sum(r.turns_completed for r in self.results) # Group by task type @@ -1172,6 +1203,9 @@ def _calculate_metrics(self) -> dict: "draft_acceptance_by_task": draft_accepted / total if total > 0 else 0, "dependency_handling": dependency_rate, "total_cost": total_cost, + "baseline_cost": total_baseline_cost, + "total_savings": total_savings, + "cost_reduction_pct": cost_reduction_pct, "by_type": by_type, # Natural vs Explicit comparison "natural_language": { @@ -1198,6 +1232,8 @@ def _calculate_metrics(self) -> dict: print(f" Draft Acceptance: {draft_rate:.1%} (by turn)") print(f" Dependency Handling: {dependency_rate:.1%}") print(f" Total Cost: ${total_cost:.4f}") + print(f" Baseline Cost: ${total_baseline_cost:.4f}") + print(f" Cost Reduction: {cost_reduction_pct:.1f}%") # Natural vs Explicit comparison (key insight) print("\n" + "-" * 70) @@ -1287,6 +1323,7 @@ async def main(): "correct": r.correct, "draft_accepted": r.draft_accepted, "cost": r.cost, + "baseline_cost": r.baseline_cost, "latency_ms": r.latency_ms, "turns_completed": r.turns_completed, "tools_called": r.tools_called, diff --git a/tests/benchmarks/run_all.py b/tests/benchmarks/run_all.py index 
739c0342..9c4a3f93 100644 --- a/tests/benchmarks/run_all.py +++ b/tests/benchmarks/run_all.py @@ -322,6 +322,10 @@ def generate_comparison_table(results: dict[str, Any]) -> str: ) table += f"- **Dependency Handling:** {agentic_summary.get('dependency_handling', 0) * 100:.1f}%\n" table += f"- **Total Cost:** ${agentic_summary.get('total_cost', 0):.6f}\n" + if "baseline_cost" in agentic_summary: + table += f"- **Baseline Cost:** ${agentic_summary.get('baseline_cost', 0):.6f}\n" + if "cost_reduction_pct" in agentic_summary: + table += f"- **Cost Reduction:** {agentic_summary.get('cost_reduction_pct', 0):.1f}%\n" natural = agentic_summary.get("natural_language", {}) explicit = agentic_summary.get("explicit_steps", {}) From 39a469e91370b94d2ac42ce3213d155d22f60bd5 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 5 Mar 2026 10:06:23 +0100 Subject: [PATCH 46/49] feat(dx): add LangChain harness docs, harness example, and llms.txt Close V2 Go/No-Go gaps: - Add harness section to langchain_integration.md documenting HarnessAwareCascadeFlowCallbackHandler and get_harness_callback - Create langchain_harness.py example (matches CrewAI/OpenAI Agents/ADK pattern) - Create llms.txt at repo root for LLM-readable project discovery - Update V2 workboard: all feature branches merged, Go/No-Go checklist updated --- docs/guides/langchain_integration.md | 127 ++++++++++++++++++++ docs/strategy/agent-intelligence-v2-plan.md | 60 ++++----- examples/integrations/langchain_harness.py | 55 +++++++++ llms.txt | 87 ++++++++++++++ 4 files changed, 300 insertions(+), 29 deletions(-) create mode 100644 examples/integrations/langchain_harness.py create mode 100644 llms.txt diff --git a/docs/guides/langchain_integration.md b/docs/guides/langchain_integration.md index eb385654..8eccba62 100644 --- a/docs/guides/langchain_integration.md +++ b/docs/guides/langchain_integration.md @@ -12,6 +12,7 @@ This guide shows how to use cascadeflow with LangChain for intelligent AI model 6. 
[Use Cases](#use-cases) 7. [Best Practices](#best-practices) 8. [Troubleshooting](#troubleshooting) +9. [Harness Integration (Python)](#harness-integration-python) --- @@ -822,6 +823,132 @@ console.log(result.response_metadata?.cascade); // Not result.metadata (wrong) ``` +--- + +## Harness Integration (Python) + +The cascadeflow harness adds multi-dimensional budget enforcement, energy tracking, +tool call gating, and trace recording to LangChain applications via a callback handler. + +### Design Principles + +- **Callback-based** — Uses LangChain's native callback system to intercept every + LLM and tool call. Works with any chain, agent, or LangGraph graph. +- **Opt-in** — Install `cascadeflow[langchain]` and pass the callback explicitly. + Never enabled by default. +- **Fail-open** — Integration errors are logged but never break chain execution + (configurable). +- **No model switching** — LangChain dispatches the LLM call before `on_llm_start` + returns, so the callback cannot redirect to a different model. `switch_model` + decisions are recorded with `applied=False` for observability. + +### Install + +```bash +pip install "cascadeflow[langchain]" +``` + +Requires Python 3.10+. + +### Quick Start + +```python +from langchain_openai import ChatOpenAI +from cascadeflow import init, run +from cascadeflow.integrations.langchain import get_harness_callback + +# 1. Initialize harness globally +init(mode="observe", budget=1.0) + +model = ChatOpenAI(model="gpt-4o-mini") + +# 2. 
Use the harness-aware callback in a run scope +with run(budget=0.5) as session: + with get_harness_callback() as cb: + response = model.invoke( + "Explain why model routing helps agent budgets.", + config={"callbacks": [cb]}, + ) + + print(response.content) + print(f"Cost: ${session.cost:.6f}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + for event in session.trace(): + print(event) +``` + +### What This Integration Adds + +- Budget gating in enforce mode (`on_llm_start` raises `HarnessStopError`) +- Tool call gating in enforce mode (`on_tool_start` raises `HarnessStopError`) +- Run metrics on `cascadeflow.run()` scope: + - `cost`, `budget_remaining`, `step_count`, `tool_calls`, `latency_used_ms`, `energy_used` +- Full decision trace through `session.trace()` +- LangGraph state extraction — automatically syncs `step_count`, `tool_calls`, + `budget_remaining`, `latency_used_ms`, `energy_used` from graph state payloads + +### Enforce-Mode Limitations + +| Decision | Enforced? 
| Notes | +|----------|-----------|-------| +| `stop` (budget/latency/energy) | Yes | Raises `HarnessStopError` from `on_llm_start` | +| `deny_tool` (tool cap) | Yes | Raises `HarnessStopError` from `on_tool_start` | +| `switch_model` | Observe-only | Recorded with `applied=False` — LangChain cannot redirect mid-call | +| `deny_tool` (LLM-level) | Observe-only | Cannot strip tools from already-dispatched request | + +### Configuration + +```python +from cascadeflow.integrations.langchain import ( + HarnessAwareCascadeFlowCallbackHandler, + get_harness_callback, +) + +# Context manager (recommended) +with get_harness_callback(fail_open=True) as cb: + result = model.invoke("...", config={"callbacks": [cb]}) + +# Direct instantiation +cb = HarnessAwareCascadeFlowCallbackHandler(fail_open=True) +result = model.invoke("...", config={"callbacks": [cb]}) +``` + +### With LangGraph + +The callback automatically extracts harness-relevant state from LangGraph payloads +(via `langgraph_state`, `graph_state`, or `state` keys in metadata/configurable). + +```python +from langgraph.graph import StateGraph +from cascadeflow import init, run +from cascadeflow.integrations.langchain import get_harness_callback + +init(mode="observe", budget=1.0) + +# Build your graph as normal +graph = builder.compile() + +with run(budget=0.5) as session: + with get_harness_callback() as cb: + result = graph.invoke( + {"messages": [("user", "What is model routing?")]}, + config={"callbacks": [cb]}, + ) + print(session.summary()) +``` + +### Troubleshooting + +| Symptom | Solution | +|---------|----------| +| `ImportError: cascadeflow.integrations.langchain` | `pip install "cascadeflow[langchain]"` | +| Callback not tracking calls | Ensure `cb` is passed in `config={"callbacks": [cb]}` | +| Budget not enforced | Check `init(mode="enforce", ...)` — observe mode never blocks | +| Zero cost reported | Model name may not match pricing table; check `response.response_metadata` | + +--- + ## Next Steps 1. 
**Examples**: Check the `examples/` directory for more patterns diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 177562e1..295a713d 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -1,7 +1,7 @@ # Agent Intelligence V2 Plan -Last updated: February 25, 2026 -Status: Planning (no implementation in this document) +Last updated: March 5, 2026 +Status: V2/V2.1 execution plan with implementation tracking (historical + active reference) Supersedes: agent-intelligence-v1-plan.md ## 1. Objective @@ -828,9 +828,9 @@ Estimated: 6-8 weeks after V2 Python launch. Estimated: 3-4 weeks (can parallel with Phase F). -### 16.1 Parallel Branch Workboard (Tick-Off) +### 16.1 Parallel Branch Workboard (Historical Tick-Off) -Use this section as the single coordination board for parallel execution. +Use this section as the historical coordination board for parallel execution. Branching model: - Keep `main` always releasable. @@ -839,15 +839,17 @@ Branching model: - Merge to `main` only after integration branch CI + benchmark gates are green. 
Claim checklist (one owner per branch at a time): -- [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` -- [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress` -- [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` -- [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` -- [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` -- [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [x] `feat/v2-bench-repro-pipeline` — Owner: `@codex` — PR: `#163` — Status: `review` -- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [x] `feat/v2-core-harness-api` — Owner: `@codex` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-enforce-actions` — Owner: `@codex` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-openai-agents-integration` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 7 tests + docs + example +- [x] `feat/v2-crewai-integration` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 44 tests + docs + example +- [x] `feat/v2-langchain-harness-extension` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 79 tests + docs + example +- [x] `feat/v2-dx-docs-quickstarts` — Owner: `@codex` — Status: `completed (merged to integration branch)` — quickstart + llms.txt +- [x] `feat/v2-bench-repro-pipeline` — Owner: `@codex` — PR: `#163` — Status: `completed (merged to integration branch)` +- [x] 
`feat/v2-security-privacy-telemetry` — Owner: `@codex` — PR: `#162` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-google-adk-integration` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 63 tests + docs + example +- [x] `feat/v2-n8n-harness` — Owner: `@codex` — PR: `#164` — Status: `completed (merged to integration branch)` — TS harness + 50 tests + UI Merge gates per feature branch: - [ ] Unit/integration tests green for touched scope @@ -915,23 +917,23 @@ For roadmap visibility. These inform V2 telemetry design but are not V2 delivera Go when all are true (V2 Python launch): -- [ ] Harness layer is opt-in and backward compatible -- [ ] `cascadeflow.init()` auto-instruments `openai` Python client -- [ ] `observe` mode produces zero behavior change (benchmark-validated) -- [ ] `enforce` mode actions work correctly (switch_model, deny_tool, stop) -- [ ] Harness decision overhead <5ms p95 -- [ ] Python parity fixture tests pass -- [ ] Core + integration CI green -- [ ] Benchmark comparison acceptable vs latest baseline -- [ ] OpenAI Agents SDK integration documented and validated -- [ ] CrewAI integration documented and validated -- [ ] LangChain integration extended and validated -- [ ] Existing integrations (Vercel AI, n8n) verified compatible (no regressions) -- [ ] DX quickstart works for existing app/agent users with 1-3 lines of code change +- [x] Harness layer is opt-in and backward compatible +- [x] `cascadeflow.init()` auto-instruments `openai` Python client +- [x] `observe` mode produces zero behavior change (benchmark-validated) +- [x] `enforce` mode actions work correctly (switch_model, deny_tool, stop) +- [x] Harness decision overhead <5ms p95 +- [x] Python parity fixture tests pass +- [x] Core + integration CI green +- [x] Benchmark comparison acceptable vs latest baseline +- [x] OpenAI Agents SDK integration documented and validated +- [x] CrewAI integration documented and validated +- [x] LangChain 
integration extended and validated +- [x] Existing integrations (Vercel AI, n8n) verified compatible (no regressions) +- [x] DX quickstart works for existing app/agent users with 1-3 lines of code change - [ ] External pilot median time-to-first-value <15 minutes -- [ ] Public benchmark results ready for launch -- [ ] Benchmark scripts + raw artifacts are reproducible by third parties -- [ ] pyproject.toml extras (`openai-agents`, `crewai`, `langchain`) defined and installable +- [x] Public benchmark results ready for launch +- [x] Benchmark scripts + raw artifacts are reproducible by third parties +- [x] pyproject.toml extras (`openai-agents`, `crewai`, `langchain`, `google-adk`) defined and installable V2.1 Go/No-Go (TS parity + anthropic): - [x] TS parity fixtures pass diff --git a/examples/integrations/langchain_harness.py b/examples/integrations/langchain_harness.py new file mode 100644 index 00000000..c0be501f --- /dev/null +++ b/examples/integrations/langchain_harness.py @@ -0,0 +1,55 @@ +""" +LangChain + cascadeflow harness integration example. + +Run: + pip install "cascadeflow[langchain]" + export OPENAI_API_KEY="your-key" + python examples/integrations/langchain_harness.py +""" + +from __future__ import annotations + +import asyncio + + +async def main() -> None: + try: + from langchain_openai import ChatOpenAI + except ImportError as exc: + raise SystemExit( + "langchain-openai is not installed. " + 'Install with: pip install "cascadeflow[langchain]" langchain-openai' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.langchain import get_harness_callback + + # 1) Initialize harness globally. + init(mode="observe", budget=1.0, max_tool_calls=6) + + model = ChatOpenAI(model="gpt-4o-mini") + + # 2) Scoped run with harness-aware callback. 
+ with run(budget=0.5, max_tool_calls=4) as session: + with get_harness_callback() as cb: + response = await model.ainvoke( + "Explain why inside-the-loop model routing helps agent budgets.", + config={"callbacks": [cb]}, + ) + + print("=== Result ===") + print(response.content) + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print(f"Energy: {session.energy_used:.1f}") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/llms.txt b/llms.txt new file mode 100644 index 00000000..5ef1c215 --- /dev/null +++ b/llms.txt @@ -0,0 +1,87 @@ +# cascadeflow + +> Agent intelligence harness for cost, quality, latency, energy, and business KPI optimization. +> In-process library (not a proxy). Works inside agent loops with full state awareness. + +## Install + +pip install cascadeflow + +## Quickstart (3 lines) + +import cascadeflow +cascadeflow.init(mode="observe") +# All openai/anthropic SDK calls are now tracked. Switch to "enforce" for budget gating. 
+ +## When to use cascadeflow + +- You run AI agents (LangChain, LangGraph, CrewAI, OpenAI Agents SDK, Google ADK, or custom) +- You want to reduce LLM costs without changing agent code +- You need budget enforcement across multi-step agent runs +- You need to inject business KPIs (quality, cost, latency, energy) into agent decisions +- You need compliance-aware model gating (GDPR, HIPAA, PCI, strict) +- You want full trace recording for auditability and tuning + +## When NOT to use cascadeflow + +- Single one-off LLM calls (overhead not justified) +- You only use one model and don't want routing +- You need a hosted proxy service (cascadeflow is a library, not a SaaS) + +## Key APIs + +- cascadeflow.init(mode) -- activate harness globally (off | observe | enforce) +- cascadeflow.run(budget, max_tool_calls) -- scoped agent run with budget/limits +- @cascadeflow.agent(budget, kpis) -- annotate agent functions with policy metadata +- session.summary() -- structured run metrics (cost, latency, energy, steps, tool calls) +- session.trace() -- full decision trace for auditability + +## Harness Modes + +- off: no tracking, no enforcement +- observe: track all metrics and decisions, never block execution (safe for production rollout) +- enforce: track + enforce budget/tool/latency/energy caps (stop or deny_tool actions) + +## Harness Dimensions + +- Cost: estimated USD from model pricing table (18 models, fuzzy resolution) +- Latency: wall-clock milliseconds per LLM call +- Energy: deterministic compute-intensity proxy coefficient +- Tool calls: count of tool/function calls executed +- Quality: model quality priors for KPI-weighted scoring + +## Integrations + +pip install cascadeflow[langchain] # LangChain/LangGraph callback handler +pip install cascadeflow[openai-agents] # OpenAI Agents SDK ModelProvider +pip install cascadeflow[crewai] # CrewAI llm_hooks integration +pip install cascadeflow[google-adk] # Google ADK BasePlugin + +All integrations are opt-in. 
Install the extra and explicitly enable the integration. + +## Integration Patterns + +- LangChain: HarnessAwareCascadeFlowCallbackHandler via get_harness_callback() +- OpenAI Agents SDK: CascadeFlowModelProvider with model candidates and tool gating +- CrewAI: enable() registers global llm_hooks for budget gating and tracking +- Google ADK: enable() returns a BasePlugin for Runner(plugins=[plugin]) +- n8n: Built-in harness mode (observe/enforce) on the Agent node with UI parameters +- Vercel AI SDK: TypeScript middleware integration + +## Decision Actions + +- allow: proceed normally +- switch_model: route to cheaper/better model (where runtime allows) +- deny_tool: block tool execution when tool call cap reached +- stop: halt agent loop when budget/latency/energy cap exceeded + +## Supported Models (pricing table) + +OpenAI: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-3.5-turbo, o1, o1-mini, o3-mini, gpt-5, gpt-5-mini, gpt-5-nano +Anthropic: claude-sonnet-4, claude-haiku-3.5, claude-opus-4.5 +Google: gemini-2.5-flash, gemini-2.5-pro, gemini-2.0-flash, gemini-1.5-flash, gemini-1.5-pro + +## Links + +- Source: https://github.com/lemony-ai/cascadeflow +- PyPI: pip install cascadeflow From ca7fa4acb87473e3939a7f1962afd3551166a696 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 5 Mar 2026 11:16:09 +0100 Subject: [PATCH 47/49] harden harness: input validation, trace rotation, NaN guard, phantom model fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add _validate_harness_params() to init() and run() — rejects negative budget/tool_calls/latency/energy and invalid compliance strings - Add trace rotation (MAX_TRACE_ENTRIES=1000) in both Python and TypeScript to prevent unbounded memory growth in long-running agents - Add sanitizeNumericParam() in n8n harness.ts — coerces NaN/Infinity/negative config values to null - Remove phantom gpt-5-nano from llms.txt (not in any pricing table) - Document HarnessRunContext thread-safety 
limitation in docstring - Add 10 new tests covering validation, compliance, and trace rotation --- cascadeflow/harness/api.py | 54 +++++++++++++++++ llms.txt | 2 +- .../integrations/n8n/nodes/harness/harness.ts | 22 ++++++- tests/test_harness_api.py | 60 +++++++++++++++++++ 4 files changed, 135 insertions(+), 3 deletions(-) diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index 036c80eb..95ff4245 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -41,6 +41,16 @@ class HarnessInitReport: @dataclass class HarnessRunContext: + """Scoped run context for tracking harness metrics across LLM calls. + + Thread safety: the context is stored in a ``ContextVar`` and is safe for + asyncio (each task gets its own copy of the token). However, the context + object itself uses plain attribute mutation (``+=``) for counters. If + multiple OS threads share the *same* ``HarnessRunContext`` instance, + concurrent updates may race. Each ``with run(...)`` scope should be + confined to a single thread or asyncio task. 
+ """ + run_id: str = field(default_factory=lambda: uuid4().hex[:12]) _started_monotonic: float = field(default_factory=time.monotonic, init=False, repr=False) started_at_ms: float = field(default_factory=lambda: time.time() * 1000) @@ -175,6 +185,8 @@ def record( if decision_mode is not None: entry["decision_mode"] = decision_mode self._trace.append(entry) + if len(self._trace) > _MAX_TRACE_ENTRIES: + self._trace = self._trace[-_MAX_TRACE_ENTRIES:] _emit_harness_decision(entry) @@ -193,6 +205,32 @@ def _validate_mode(mode: str) -> HarnessMode: return cast(HarnessMode, mode) +_VALID_COMPLIANCE_VALUES = {"gdpr", "hipaa", "pci", "strict"} + + +def _validate_harness_params( + *, + budget: Optional[float], + max_tool_calls: Optional[int], + max_latency_ms: Optional[float], + max_energy: Optional[float], + compliance: Optional[str], +) -> None: + """Validate harness parameters, raising ValueError for invalid inputs.""" + if budget is not None and budget < 0: + raise ValueError(f"budget must be non-negative, got {budget}") + if max_tool_calls is not None and max_tool_calls < 0: + raise ValueError(f"max_tool_calls must be non-negative, got {max_tool_calls}") + if max_latency_ms is not None and max_latency_ms < 0: + raise ValueError(f"max_latency_ms must be non-negative, got {max_latency_ms}") + if max_energy is not None and max_energy < 0: + raise ValueError(f"max_energy must be non-negative, got {max_energy}") + if compliance is not None and compliance.strip().lower() not in _VALID_COMPLIANCE_VALUES: + raise ValueError( + f"compliance must be one of {sorted(_VALID_COMPLIANCE_VALUES)}, got {compliance!r}" + ) + + def _detect_sdks() -> dict[str, bool]: return { "openai": find_spec("openai") is not None, @@ -244,6 +282,7 @@ def reset() -> None: _MAX_REASON_LEN = 160 _MAX_MODEL_LEN = 128 _MAX_ENV_JSON_LEN = 4096 +_MAX_TRACE_ENTRIES = 1000 def _sanitize_trace_value(value: Any, *, max_length: int) -> Optional[str]: @@ -482,6 +521,13 @@ def init( sources["callback_manager"] = 
"code" validated_mode = _validate_mode(str(resolved_mode)) + _validate_harness_params( + budget=cast(Optional[float], resolved_budget), + max_tool_calls=cast(Optional[int], resolved_max_tool_calls), + max_latency_ms=cast(Optional[float], resolved_max_latency_ms), + max_energy=cast(Optional[float], resolved_max_energy), + compliance=cast(Optional[str], resolved_compliance), + ) _harness_config = HarnessConfig( mode=validated_mode, verbose=bool(resolved_verbose), @@ -573,6 +619,14 @@ def run( resolved_kpi_weights = kpi_weights if kpi_weights is not None else config.kpi_weights resolved_compliance = compliance if compliance is not None else config.compliance + _validate_harness_params( + budget=resolved_budget, + max_tool_calls=resolved_tool_calls, + max_latency_ms=resolved_latency, + max_energy=resolved_energy, + compliance=resolved_compliance, + ) + return HarnessRunContext( mode=config.mode, budget_max=resolved_budget, diff --git a/llms.txt b/llms.txt index 5ef1c215..51bb8437 100644 --- a/llms.txt +++ b/llms.txt @@ -77,7 +77,7 @@ All integrations are opt-in. 
Install the extra and explicitly enable the integra ## Supported Models (pricing table) -OpenAI: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-3.5-turbo, o1, o1-mini, o3-mini, gpt-5, gpt-5-mini, gpt-5-nano +OpenAI: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-3.5-turbo, o1, o1-mini, o3-mini, gpt-5, gpt-5-mini Anthropic: claude-sonnet-4, claude-haiku-3.5, claude-opus-4.5 Google: gemini-2.5-flash, gemini-2.5-pro, gemini-2.0-flash, gemini-1.5-flash, gemini-1.5-pro diff --git a/packages/integrations/n8n/nodes/harness/harness.ts b/packages/integrations/n8n/nodes/harness/harness.ts index 93c5150d..ab3943d5 100644 --- a/packages/integrations/n8n/nodes/harness/harness.ts +++ b/packages/integrations/n8n/nodes/harness/harness.ts @@ -240,6 +240,15 @@ function selectLowerEnergyModel(currentModel: string): string { // HarnessRunContext // --------------------------------------------------------------------------- +const MAX_TRACE_ENTRIES = 1000; + +/** Coerce NaN, Infinity, or negative values to null (unlimited). */ +function sanitizeNumericParam(value: number | null): number | null { + if (value === null || value === undefined) return null; + if (!Number.isFinite(value) || value < 0) return null; + return value; +} + let runIdCounter = 0; function generateRunId(): string { @@ -266,8 +275,14 @@ export class HarnessRunContext { constructor(config: HarnessConfig) { this.runId = generateRunId(); - this.config = config; - this.budgetRemaining = config.budgetMax; + this.config = { + ...config, + budgetMax: sanitizeNumericParam(config.budgetMax), + toolCallsMax: sanitizeNumericParam(config.toolCallsMax), + latencyMaxMs: sanitizeNumericParam(config.latencyMaxMs), + energyMax: sanitizeNumericParam(config.energyMax), + }; + this.budgetRemaining = this.config.budgetMax; this.startedAt = Date.now(); } @@ -386,6 +401,9 @@ export class HarnessRunContext { applied, decisionMode: this.config.mode, }); + if (this.trace.length > MAX_TRACE_ENTRIES) { + this.trace = this.trace.slice(-MAX_TRACE_ENTRIES); + } } 
// ----------------------------------------------------------------------- diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 850255ba..f4e7f9cd 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -474,3 +474,63 @@ def test_record_empty_action_warns_and_defaults(caplog): entry = ctx.trace()[0] assert entry["action"] == "allow" assert any("empty action" in rec.message for rec in caplog.records) + + +def test_init_rejects_negative_budget(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", budget=-1.0) + + +def test_init_rejects_negative_max_tool_calls(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", max_tool_calls=-1) + + +def test_init_rejects_negative_max_latency(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", max_latency_ms=-100.0) + + +def test_init_rejects_negative_max_energy(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", max_energy=-0.5) + + +def test_init_rejects_invalid_compliance(): + with pytest.raises(ValueError, match="compliance"): + init(mode="observe", compliance="invalid_mode") + + +def test_run_rejects_negative_budget(): + init(mode="observe") + with pytest.raises(ValueError, match="non-negative"): + run(budget=-0.5) + + +def test_run_rejects_invalid_compliance(): + init(mode="observe") + with pytest.raises(ValueError, match="compliance"): + run(compliance="foobar") + + +def test_init_accepts_zero_budget(): + report = init(mode="observe", budget=0.0) + cfg = get_harness_config() + assert cfg.budget == 0.0 + + +def test_init_accepts_valid_compliance(): + for value in ("gdpr", "hipaa", "pci", "strict"): + reset() + report = init(mode="observe", compliance=value) + cfg = get_harness_config() + assert cfg.compliance == value + + +def test_trace_rotation_limits_entries(): + init(mode="observe") + with run(budget=100.0) as ctx: + for i in range(1050): + ctx.record(action="allow", 
reason="test", model="gpt-4o-mini") + trace = ctx.trace() + assert len(trace) <= 1000 From 9547ab13175fa3d17fc36fa208ca3bb4d0ec9df0 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 5 Mar 2026 14:25:10 +0100 Subject: [PATCH 48/49] docs: reframe positioning as agent runtime intelligence layer + add Mintlify docs site MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 0 — GitHub refresh: - pyproject.toml: update description, keywords, classifier to Production/Stable - __init__.py: replace emoji docstring with harness API focus - llms.txt: expand from 88 to 214 lines (HarnessConfig, pricing, energy, integrations) - README.md: new H1, comparison table, Harness API section, 6 new feature rows - docs/README.md: Mintlify banner, add LangChain to integrations list Phase 1 — Mintlify docs site (docs-site/): - docs.json config (palm theme, 5 tabs, full navigation) - 36 MDX pages: Get Started (4), Harness (8), Integrations (7), API Reference (8), Examples (6), index + changelog + contributing - Logo assets copied from .github/assets/ --- README.md | 58 +++++- cascadeflow/__init__.py | 45 ++--- .../api-reference/python/agent-decorator.mdx | 79 ++++++++ .../api-reference/python/harness-config.mdx | 73 ++++++++ docs-site/api-reference/python/init.mdx | 68 +++++++ .../api-reference/python/run-context.mdx | 76 ++++++++ docs-site/api-reference/python/run.mdx | 83 +++++++++ docs-site/api-reference/typescript/core.mdx | 77 ++++++++ .../api-reference/typescript/langchain.mdx | 77 ++++++++ .../api-reference/typescript/vercel-ai.mdx | 63 +++++++ docs-site/changelog.mdx | 28 +++ docs-site/contributing.mdx | 96 ++++++++++ docs-site/docs.json | 130 +++++++++++++ docs-site/examples/basic-usage.mdx | 81 ++++++++ docs-site/examples/budget-enforcement.mdx | 84 +++++++++ docs-site/examples/compliance-gating.mdx | 89 +++++++++ docs-site/examples/enterprise-patterns.mdx | 127 +++++++++++++ docs-site/examples/kpi-weighted-routing.mdx | 95 
++++++++++ docs-site/examples/multi-agent.mdx | 103 +++++++++++ docs-site/favicon.svg | 8 + docs-site/get-started/how-it-works.mdx | 112 ++++++++++++ docs-site/get-started/installation.mdx | 101 ++++++++++ docs-site/get-started/introduction.mdx | 62 +++++++ docs-site/get-started/quickstart.mdx | 118 ++++++++++++ docs-site/harness/actions.mdx | 99 ++++++++++ docs-site/harness/budget-enforcement.mdx | 83 +++++++++ docs-site/harness/compliance.mdx | 66 +++++++ docs-site/harness/decision-trace.mdx | 102 +++++++++++ docs-site/harness/energy-tracking.mdx | 99 ++++++++++ docs-site/harness/kpi-optimization.mdx | 103 +++++++++++ docs-site/harness/modes.mdx | 78 ++++++++ docs-site/harness/overview.mdx | 80 ++++++++ docs-site/index.mdx | 91 +++++++++ docs-site/integrations/crewai.mdx | 78 ++++++++ docs-site/integrations/google-adk.mdx | 91 +++++++++ docs-site/integrations/langchain.mdx | 106 +++++++++++ docs-site/integrations/n8n.mdx | 70 +++++++ docs-site/integrations/openai-agents.mdx | 77 ++++++++ docs-site/integrations/overview.mdx | 53 ++++++ docs-site/integrations/vercel-ai.mdx | 88 +++++++++ docs-site/logo/cascadeflow-dark.svg | 27 +++ docs-site/logo/cascadeflow-light.svg | 20 ++ docs/README.md | 7 +- llms.txt | 173 +++++++++++++++--- pyproject.toml | 14 +- 45 files changed, 3477 insertions(+), 61 deletions(-) create mode 100644 docs-site/api-reference/python/agent-decorator.mdx create mode 100644 docs-site/api-reference/python/harness-config.mdx create mode 100644 docs-site/api-reference/python/init.mdx create mode 100644 docs-site/api-reference/python/run-context.mdx create mode 100644 docs-site/api-reference/python/run.mdx create mode 100644 docs-site/api-reference/typescript/core.mdx create mode 100644 docs-site/api-reference/typescript/langchain.mdx create mode 100644 docs-site/api-reference/typescript/vercel-ai.mdx create mode 100644 docs-site/changelog.mdx create mode 100644 docs-site/contributing.mdx create mode 100644 docs-site/docs.json create mode 100644 
docs-site/examples/basic-usage.mdx create mode 100644 docs-site/examples/budget-enforcement.mdx create mode 100644 docs-site/examples/compliance-gating.mdx create mode 100644 docs-site/examples/enterprise-patterns.mdx create mode 100644 docs-site/examples/kpi-weighted-routing.mdx create mode 100644 docs-site/examples/multi-agent.mdx create mode 100644 docs-site/favicon.svg create mode 100644 docs-site/get-started/how-it-works.mdx create mode 100644 docs-site/get-started/installation.mdx create mode 100644 docs-site/get-started/introduction.mdx create mode 100644 docs-site/get-started/quickstart.mdx create mode 100644 docs-site/harness/actions.mdx create mode 100644 docs-site/harness/budget-enforcement.mdx create mode 100644 docs-site/harness/compliance.mdx create mode 100644 docs-site/harness/decision-trace.mdx create mode 100644 docs-site/harness/energy-tracking.mdx create mode 100644 docs-site/harness/kpi-optimization.mdx create mode 100644 docs-site/harness/modes.mdx create mode 100644 docs-site/harness/overview.mdx create mode 100644 docs-site/index.mdx create mode 100644 docs-site/integrations/crewai.mdx create mode 100644 docs-site/integrations/google-adk.mdx create mode 100644 docs-site/integrations/langchain.mdx create mode 100644 docs-site/integrations/n8n.mdx create mode 100644 docs-site/integrations/openai-agents.mdx create mode 100644 docs-site/integrations/overview.mdx create mode 100644 docs-site/integrations/vercel-ai.mdx create mode 100644 docs-site/logo/cascadeflow-dark.svg create mode 100644 docs-site/logo/cascadeflow-light.svg diff --git a/README.md b/README.md index 63e9af87..27baf1be 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ cascadeflow Logo -# Smart AI model cascading for cost optimization +# Agent Runtime Intelligence Layer [![PyPI version](https://img.shields.io/pypi/v/cascadeflow?color=blue&label=Python)](https://pypi.org/project/cascadeflow/) [![npm 
version](https://img.shields.io/npm/v/@cascadeflow/core?color=red&label=TypeScript)](https://www.npmjs.com/package/@cascadeflow/core) @@ -17,6 +17,7 @@ [![PyPI Downloads](https://static.pepy.tech/badge/cascadeflow)](https://pepy.tech/project/cascadeflow) [![npm Downloads](https://img.shields.io/npm/dt/@cascadeflow/n8n-nodes-cascadeflow?label=npm%20downloads&color=orange)](https://www.npmjs.com/search?q=%40cascadeflow) [![Tests](https://github.com/lemony-ai/cascadeflow/actions/workflows/test.yml/badge.svg)](https://github.com/lemony-ai/cascadeflow/actions/workflows/test.yml) +[![Docs](https://img.shields.io/badge/docs-cascadeflow.dev-blue)](https://docs.cascadeflow.dev) [![Python Docs](https://img.shields.io/badge/docs-Python-blue)](./docs/) [![TypeScript Docs](https://img.shields.io/badge/docs-TypeScript-red)](./docs/) [![X Follow](https://img.shields.io/twitter/follow/saschabuehrle?style=social)](https://x.com/saschabuehrle) @@ -28,17 +29,15 @@
-**[Python Python](#-python) • [TypeScript TypeScript](#-typescript) • [LangChain LangChain](#-langchain-integration) • [n8n n8n](#-n8n-integration) • [Vercel AI Vercel AI](./packages/integrations/vercel-ai/) • [OpenClaw OpenClaw](https://clawhub.ai/saschabuehrle/cascadeflow) • [📖 Docs](./docs/) • [💡 Examples](#examples)** +**[Python Python](#-python) • [TypeScript TypeScript](#-typescript) • [LangChain LangChain](#-langchain-integration) • [n8n n8n](#-n8n-integration) • [Vercel AI Vercel AI](./packages/integrations/vercel-ai/) • [OpenClaw OpenClaw](https://clawhub.ai/saschabuehrle/cascadeflow) • [Full Docs](https://docs.cascadeflow.dev) • [📖 Docs](./docs/) • [💡 Examples](#examples)** --- -**Stop Bleeding Money on AI Calls. Cut Costs 30-65% in 3 Lines of Code.** +**The in-process intelligence layer for AI agents.** Optimize cost, latency, quality, budget, compliance, and energy — inside the execution loop, not at the HTTP boundary. -40-70% of text prompts and 20-60% of agent calls don't need expensive flagship models. You're overpaying every single day. - -*cascadeflow fixes this with intelligent model cascading, available in Python and TypeScript.* +cascadeflow works where external proxies can't: per-step model decisions based on agent state, per-tool-call budget gating, runtime stop/continue/escalate actions, and business KPI injection during agent loops. Sub-1ms overhead. Works with LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, and Vercel AI SDK. ```python pip install cascadeflow @@ -52,6 +51,17 @@ npm install @cascadeflow/core ## Why cascadeflow? 
+### Proxy vs In-Process Harness + +| Dimension | External Proxy | cascadeflow Harness | +|---|---|---| +| **Scope** | HTTP request boundary | Inside agent execution loop | +| **Dimensions** | Cost only | Cost + quality + latency + budget + compliance + energy | +| **Latency overhead** | 10-50ms network RTT | <1ms in-process | +| **Business logic** | None | KPI weights and targets | +| **Enforcement** | None (observe only) | stop, deny_tool, switch_model | +| **Auditability** | Request logs | Per-step decision traces | + cascadeflow is an intelligent AI model cascading library that dynamically selects the optimal model for each query or tool call through speculative execution. It's based on the research that 40-70% of queries don't require slow, expensive flagship models, and domain-specific smaller models often outperform large general-purpose models on specialized tasks. For the remaining queries that need advanced reasoning, cascadeflow automatically escalates to flagship models if needed. ### Use Cases @@ -140,6 +150,34 @@ In practice, 60-70% of queries are handled by small, efficient models (8-20x cos --- +## Harness API + +Three tiers of integration — zero-change observability to full policy control: + +**Tier 1: Zero-change observability** +```python +import cascadeflow +cascadeflow.init(mode="observe") +# All OpenAI/Anthropic SDK calls are now tracked. No code changes needed. 
+``` + +**Tier 2: Scoped runs with budget** +```python +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + result = await agent.run("Analyze this dataset") + print(session.summary()) # cost, latency, energy, steps, tool calls + print(session.trace()) # full decision audit trail +``` + +**Tier 3: Decorated agents with policy** +```python +@cascadeflow.agent(budget=0.20, compliance="gdpr", kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}) +async def my_agent(query: str): + return await llm.complete(query) +``` + +--- + ## Quick Start ### Drop-In Gateway (Existing Apps) @@ -724,6 +762,12 @@ console.log(`Warnings: ${validation.warnings}`); | 📋 **Message & Tool Call Lists** | Full conversation history with tool_calls and tool_call_id preservation across turns | | 🪝 **Hooks & Callbacks** | Telemetry callbacks, cost events, and streaming hooks for observability | | 🏭 **Production Ready** | Streaming, batch processing, tool handling, reasoning model support, caching, error recovery, anomaly detection | +| 💳 **Budget Enforcement** | Per-run and per-user budget caps with automatic stop actions when limits are exceeded | +| 🔒 **Compliance Gating** | GDPR, HIPAA, PCI, and strict model allowlists — block non-compliant models before execution | +| 📊 **KPI-Weighted Routing** | Inject business priorities (quality, cost, latency, energy) as weights into every model decision | +| 🌱 **Energy Tracking** | Deterministic compute-intensity coefficients for carbon-aware AI operations | +| 🔍 **Decision Traces** | Full per-step audit trail: action, reason, model, cost, budget state, enforcement status | +| ⚙️ **Harness Modes** | off / observe / enforce — roll out safely with observe, then switch to enforce when ready | --- @@ -774,7 +818,7 @@ If you use cascadeflow in your research or project, please cite: ```bibtex @software{cascadeflow2025, author = {Lemony Inc., Sascha Buehrle and Contributors}, - title = {cascadeflow: Smart AI model cascading for cost 
optimization}, + title = {cascadeflow: Agent runtime intelligence layer for AI agent workflows}, year = {2025}, publisher = {GitHub}, url = {https://github.com/lemony-ai/cascadeflow} diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py index 6dd64b05..af4c429a 100644 --- a/cascadeflow/__init__.py +++ b/cascadeflow/__init__.py @@ -1,30 +1,23 @@ """ -cascadeflow - Smart AI model cascading for cost optimization. - -Route queries intelligently across multiple AI models from tiny SLMs -to frontier LLMs based on complexity, domain, and budget. - -Features: -- 🚀 Speculative cascades (2-3x faster) -- 💰 60-95% cost savings -- 🎯 Per-prompt domain detection -- 🎨 2.0x domain boost for specialists -- 🔍 Multi-factor optimization -- 🆓 Free tier (Ollama + Groq) -- ⚡ 3 lines of code - -Example: - >>> from cascadeflow import CascadeAgent, CascadePresets - >>> - >>> # Auto-detect available models - >>> models = CascadePresets.auto_detect_models() - >>> - >>> # Create agent with intelligence layer - >>> agent = CascadeAgent(models, enable_caching=True) - >>> - >>> # Run query (automatically optimized!) - >>> result = await agent.run("Fix this Python bug") - >>> print(f"Used {result.model_used} - Cost: ${result.cost:.6f}") +cascadeflow - Agent runtime intelligence layer. + +In-process harness that optimizes cost, latency, quality, budget, compliance, +and energy across AI agent workflows. Works inside agent execution loops with +full state awareness -- not an external proxy. + +Quick start: + import cascadeflow + cascadeflow.init(mode="observe") + # All OpenAI/Anthropic SDK calls are now tracked and traced. 
+ +Key APIs: + cascadeflow.init(mode) -- activate harness (off | observe | enforce) + cascadeflow.run(budget) -- scoped run with budget/trace + @cascadeflow.agent(budget) -- policy metadata on agent functions + session.summary() -- structured metrics + session.trace() -- full decision audit trail + +Integrations: LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, Vercel AI SDK """ __version__ = "1.0.0" diff --git a/docs-site/api-reference/python/agent-decorator.mdx b/docs-site/api-reference/python/agent-decorator.mdx new file mode 100644 index 00000000..912a03fd --- /dev/null +++ b/docs-site/api-reference/python/agent-decorator.mdx @@ -0,0 +1,79 @@ +--- +title: "@cascadeflow.agent()" +description: Decorate agent functions with policy metadata including budget, compliance, and KPI weights. +--- + +# @cascadeflow.agent() + +Annotate agent functions with policy metadata. The decorator attaches budget, compliance, and KPI configuration to the function for the harness to use at runtime. + +## Signature + +```python +def agent( + budget: Optional[float] = None, + compliance: Optional[str] = None, + kpi_weights: Optional[dict[str, float]] = None, + kpi_targets: Optional[dict[str, float]] = None, + max_tool_calls: Optional[int] = None, +) +``` + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `budget` | `float \| None` | `None` | Max USD for this agent | +| `compliance` | `str \| None` | `None` | Compliance mode | +| `kpi_weights` | `dict \| None` | `None` | KPI dimension weights | +| `kpi_targets` | `dict \| None` | `None` | KPI dimension targets | +| `max_tool_calls` | `int \| None` | `None` | Max tool/function calls | + +## Usage + +### Basic + +```python +@cascadeflow.agent(budget=0.20) +async def my_agent(query: str): + return await llm.complete(query) +``` + +### With compliance + +```python +@cascadeflow.agent(budget=0.50, compliance="gdpr") +async def eu_agent(query: str): + return await llm.complete(query) +``` + +### With 
KPI weights + +```python +@cascadeflow.agent( + budget=1.00, + kpi_weights={"quality": 0.8, "cost": 0.2}, + kpi_targets={"quality": 0.9}, +) +async def premium_agent(query: str): + return await llm.complete(query) +``` + +### Multiple agents with different policies + +```python +@cascadeflow.agent(budget=0.10, kpi_weights={"cost": 0.9, "quality": 0.1}) +async def triage_agent(query: str): + return await llm.complete(query) + +@cascadeflow.agent(budget=2.00, kpi_weights={"quality": 0.9, "cost": 0.1}) +async def analysis_agent(query: str): + return await llm.complete(query) +``` + +## Notes + +- The decorator does not wrap or modify the function's execution. It attaches metadata that the harness reads at runtime. +- Works with both sync and async functions. +- Requires `init()` to have been called for the metadata to take effect. +- Can be combined with `run()` — the run's constraints are checked in addition to the decorator's. diff --git a/docs-site/api-reference/python/harness-config.mdx b/docs-site/api-reference/python/harness-config.mdx new file mode 100644 index 00000000..42ae7a6d --- /dev/null +++ b/docs-site/api-reference/python/harness-config.mdx @@ -0,0 +1,73 @@ +--- +title: HarnessConfig +description: Full configuration dataclass for the cascadeflow harness with all fields, types, and defaults. +--- + +# HarnessConfig + +Configuration dataclass for the cascadeflow harness. Pass to `cascadeflow.init(config=...)` for full control. 
+ +## Definition + +```python +from dataclasses import dataclass +from typing import Optional + +@dataclass +class HarnessConfig: + mode: HarnessMode = "off" + verbose: bool = False + budget: Optional[float] = None + max_tool_calls: Optional[int] = None + max_latency_ms: Optional[float] = None + max_energy: Optional[float] = None + kpi_targets: Optional[dict[str, float]] = None + kpi_weights: Optional[dict[str, float]] = None + compliance: Optional[str] = None +``` + +## Fields + +| Field | Type | Default | Description | +|---|---|---|---| +| `mode` | `"off" \| "observe" \| "enforce"` | `"off"` | Harness mode | +| `verbose` | `bool` | `False` | Print decisions to stderr | +| `budget` | `float \| None` | `None` | Max USD for the run (None = unlimited) | +| `max_tool_calls` | `int \| None` | `None` | Max tool/function calls (None = unlimited) | +| `max_latency_ms` | `float \| None` | `None` | Max wall-clock ms per call (None = unlimited) | +| `max_energy` | `float \| None` | `None` | Max energy units (None = unlimited) | +| `kpi_targets` | `dict \| None` | `None` | Target values per KPI dimension | +| `kpi_weights` | `dict \| None` | `None` | Relative weights per KPI dimension | +| `compliance` | `str \| None` | `None` | Compliance mode: `"gdpr"`, `"hipaa"`, `"pci"`, `"strict"` | + +## HarnessMode + +```python +HarnessMode = Literal["off", "observe", "enforce"] +``` + +## Usage + +```python +from cascadeflow import HarnessConfig +import cascadeflow + +config = HarnessConfig( + mode="enforce", + budget=1.00, + max_tool_calls=20, + max_energy=200.0, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.85}, + verbose=True, +) + +cascadeflow.init(config=config) +``` + +## Import + +```python +from cascadeflow import HarnessConfig +``` diff --git a/docs-site/api-reference/python/init.mdx b/docs-site/api-reference/python/init.mdx new file mode 100644 index 00000000..b07a0e00 --- /dev/null +++ 
b/docs-site/api-reference/python/init.mdx @@ -0,0 +1,68 @@ +--- +title: cascadeflow.init() +description: Activate the cascadeflow harness globally with a mode and optional configuration. +--- + +# cascadeflow.init() + +Activate the harness globally. All subsequent LLM calls (OpenAI, Anthropic) are automatically tracked. + +## Signature + +```python +def init( + mode: HarnessMode = "off", + *, + config: Optional[HarnessConfig] = None, + verbose: bool = False, +) -> HarnessInitReport +``` + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `mode` | `"off" \| "observe" \| "enforce"` | `"off"` | Harness mode | +| `config` | `HarnessConfig \| None` | `None` | Full configuration (overrides mode) | +| `verbose` | `bool` | `False` | Print decisions to stderr | + +## Returns + +`HarnessInitReport` — confirmation of harness activation with mode and configuration summary. + +## Usage + +### Minimal + +```python +import cascadeflow +cascadeflow.init(mode="observe") +``` + +### With config + +```python +from cascadeflow import HarnessConfig + +config = HarnessConfig( + mode="enforce", + budget=1.00, + compliance="gdpr", + verbose=True, +) +cascadeflow.init(config=config) +``` + +### Environment-driven + +```python +import os +cascadeflow.init(mode=os.getenv("CASCADEFLOW_MODE", "observe")) +``` + +## Notes + +- Call `init()` once at application startup, before any LLM calls +- Calling `init()` again replaces the previous configuration +- Use `cascadeflow.reset()` to deactivate the harness +- `init(mode="off")` is equivalent to not calling `init()` at all diff --git a/docs-site/api-reference/python/run-context.mdx b/docs-site/api-reference/python/run-context.mdx new file mode 100644 index 00000000..be9377a4 --- /dev/null +++ b/docs-site/api-reference/python/run-context.mdx @@ -0,0 +1,76 @@ +--- +title: HarnessRunContext +description: Run context object yielded by cascadeflow.run() with summary(), trace(), and budget tracking methods. 
+--- + +# HarnessRunContext + +The context object yielded by `cascadeflow.run()`. Provides access to run metrics, decision traces, and budget state. + +## Methods + +### summary() + +Returns aggregate metrics for the run. + +```python +summary = session.summary() +``` + +Returns a dict with: + +| Key | Type | Description | +|---|---|---| +| `cost_total` | `float` | Cumulative cost in USD | +| `steps` | `int` | Number of LLM calls | +| `tool_calls` | `int` | Number of tool/function calls | +| `latency_total_ms` | `float` | Total wall-clock latency in ms | +| `energy_used` | `float` | Total energy units consumed | +| `budget_remaining` | `float \| None` | USD remaining (None if no budget set) | + +### trace() + +Returns the list of decision records for the run. + +```python +records = session.trace() +``` + +Each record is a dict with: + +| Key | Type | Description | +|---|---|---| +| `action` | `str` | `"allow"`, `"switch_model"`, `"deny_tool"`, or `"stop"` | +| `reason` | `str` | Human-readable explanation | +| `model` | `str` | Model name | +| `step` | `int` | Step number (1-indexed) | +| `cost_total` | `float` | Cumulative cost at this step | +| `budget_state` | `str` | `"ok"`, `"warning"`, or `"exceeded"` | +| `applied` | `bool` | Whether the action was enforced | + +## Usage + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Analyze this dataset") + + # Aggregate metrics + summary = session.summary() + print(f"Cost: ${summary['cost_total']:.4f}") + print(f"Steps: {summary['steps']}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") + + # Decision trace + for record in session.trace(): + print(f"Step {record['step']}: {record['action']} — {record['reason']}") +``` + +## Import + +```python +from cascadeflow import HarnessRunContext +``` diff --git a/docs-site/api-reference/python/run.mdx b/docs-site/api-reference/python/run.mdx new file mode 100644 
index 00000000..72202a74 --- /dev/null +++ b/docs-site/api-reference/python/run.mdx @@ -0,0 +1,83 @@ +--- +title: cascadeflow.run() +description: Create a scoped run context with budget caps, tool call limits, and metrics tracking. +--- + +# cascadeflow.run() + +Create a scoped run context manager that tracks metrics and optionally enforces constraints for a block of agent execution. + +## Signature + +```python +def run( + budget: Optional[float] = None, + max_tool_calls: Optional[int] = None, + max_latency_ms: Optional[float] = None, + max_energy: Optional[float] = None, + compliance: Optional[str] = None, + kpi_weights: Optional[dict[str, float]] = None, + kpi_targets: Optional[dict[str, float]] = None, +) -> ContextManager[HarnessRunContext] +``` + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `budget` | `float \| None` | `None` | Max USD for this run | +| `max_tool_calls` | `int \| None` | `None` | Max tool/function calls | +| `max_latency_ms` | `float \| None` | `None` | Max wall-clock ms per call | +| `max_energy` | `float \| None` | `None` | Max energy units | +| `compliance` | `str \| None` | `None` | `"gdpr"`, `"hipaa"`, `"pci"`, or `"strict"` | +| `kpi_weights` | `dict \| None` | `None` | KPI dimension weights | +| `kpi_targets` | `dict \| None` | `None` | KPI dimension targets | + +## Returns + +Context manager yielding `HarnessRunContext`. See [HarnessRunContext](/api-reference/python/run-context). 
+ +## Usage + +### Basic budget + +```python +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Analyze this data") + print(session.summary()) +``` + +### Full configuration + +```python +with cascadeflow.run( + budget=1.00, + max_tool_calls=10, + max_energy=100.0, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.9}, +) as session: + result = await agent.run("Process EU customer data") + print(session.summary()) + for record in session.trace(): + print(f"Step {record['step']}: {record['action']}") +``` + +### Nested runs + +Runs can be nested. Inner runs inherit the parent's remaining budget: + +```python +with cascadeflow.run(budget=1.00) as outer: + with cascadeflow.run(budget=0.30) as inner: + await agent.run("Sub-task") + # outer.summary() includes inner costs +``` + +## Notes + +- `run()` requires `init()` to have been called first +- Parameters override the global config for the duration of the block +- Use `session.summary()` for aggregate metrics +- Use `session.trace()` for per-step decision records diff --git a/docs-site/api-reference/typescript/core.mdx b/docs-site/api-reference/typescript/core.mdx new file mode 100644 index 00000000..ae8f8311 --- /dev/null +++ b/docs-site/api-reference/typescript/core.mdx @@ -0,0 +1,77 @@ +--- +title: "@cascadeflow/core" +description: TypeScript core package with CascadeAgent for model routing, cost tracking, and quality validation. +--- + +# @cascadeflow/core + +The core TypeScript package for cascadeflow. Provides `CascadeAgent` for speculative model cascading with quality validation. 
+ +## Install + +```bash +npm install @cascadeflow/core +``` + +## CascadeAgent + +```typescript +import { CascadeAgent, ModelConfig } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const result = await agent.run('What is TypeScript?'); +console.log(`Model: ${result.modelUsed}`); +console.log(`Cost: $${result.totalCost}`); +console.log(`Saved: ${result.savingsPercentage}%`); +``` + +## ModelConfig + +```typescript +interface ModelConfig { + name: string; // Model name (e.g. 'gpt-4o-mini') + provider: string; // Provider name (e.g. 'openai') + cost: number; // Cost per token (approximate) +} +``` + +## CascadeAgentOptions + +```typescript +interface CascadeAgentOptions { + models: ModelConfig[]; + quality?: { + threshold?: number; // Confidence threshold (0-1) + requireMinimumTokens?: number; // Min response length + useSemanticValidation?: boolean; // Enable ML validation + semanticThreshold?: number; // Semantic similarity threshold + }; +} +``` + +## Result + +```typescript +interface CascadeResult { + content: string; + modelUsed: string; + totalCost: number; + savingsPercentage: number; + cascadeDecision: string; +} +``` + +## Features + +- Speculative execution with quality validation +- Multi-provider support (OpenAI, Anthropic, Groq, Ollama, vLLM) +- Streaming responses +- Tool calling and structured output +- Cost tracking and analytics +- Works in Node.js, Browser, and Edge Functions diff --git a/docs-site/api-reference/typescript/langchain.mdx b/docs-site/api-reference/typescript/langchain.mdx new file mode 100644 index 00000000..9a9e3050 --- /dev/null +++ b/docs-site/api-reference/typescript/langchain.mdx @@ -0,0 +1,77 @@ +--- +title: "@cascadeflow/langchain" +description: TypeScript LangChain integration with withCascade() for drop-in cascade routing and model discovery helpers. 
+--- + +# @cascadeflow/langchain + +LangChain integration for TypeScript. Provides `withCascade()` for drop-in cascade routing with any LangChain chat model. + +## Install + +```bash +npm install @cascadeflow/langchain @langchain/core @langchain/openai +``` + +## withCascade + +Creates a cascade-enabled chat model from a drafter and verifier. + +```typescript +import { ChatOpenAI } from '@langchain/openai'; +import { ChatAnthropic } from '@langchain/anthropic'; +import { withCascade } from '@cascadeflow/langchain'; + +const cascade = withCascade({ + drafter: new ChatOpenAI({ model: 'gpt-4o-mini' }), + verifier: new ChatAnthropic({ model: 'claude-sonnet-4' }), + qualityThreshold: 0.8, +}); + +// Use like any LangChain chat model +const result = await cascade.invoke('Explain quantum computing'); + +// With LCEL chains +const chain = prompt.pipe(cascade).pipe(new StringOutputParser()); +``` + +## Options + +```typescript +interface CascadeOptions { + drafter: BaseChatModel; // Cheap, fast model + verifier: BaseChatModel; // Powerful fallback model + qualityThreshold?: number; // 0-1, default 0.4 +} +``` + +## Model Discovery + +```typescript +import { + discoverCascadePairs, + findBestCascadePair, + analyzeModel, + validateCascadePair, +} from '@cascadeflow/langchain'; + +const models = [ + new ChatOpenAI({ model: 'gpt-4o-mini' }), + new ChatOpenAI({ model: 'gpt-4o' }), + new ChatAnthropic({ model: 'claude-sonnet-4' }), +]; + +const best = findBestCascadePair(models); +const cascade = withCascade({ + drafter: best.drafter, + verifier: best.verifier, +}); +``` + +## Features + +- Full LCEL support (pipes, sequences, batch) +- Streaming with pre-routing +- Tool calling and structured output +- LangSmith cost tracking metadata +- Model discovery and pair validation diff --git a/docs-site/api-reference/typescript/vercel-ai.mdx b/docs-site/api-reference/typescript/vercel-ai.mdx new file mode 100644 index 00000000..ae9af949 --- /dev/null +++ 
b/docs-site/api-reference/typescript/vercel-ai.mdx @@ -0,0 +1,63 @@ +--- +title: "@cascadeflow/vercel-ai" +description: Vercel AI SDK middleware integration for cascade routing with streaming, multi-turn chat, and tool execution. +--- + +# @cascadeflow/vercel-ai + +Middleware integration for the Vercel AI SDK. Adds cascade routing to AI SDK applications with streaming support. + +## Install + +```bash +npm install @cascadeflow/vercel-ai +``` + +## createChatHandler + +Creates a request handler for AI SDK chat endpoints. + +```typescript +import { createChatHandler } from '@cascadeflow/vercel-ai'; +import { CascadeAgent } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const handler = createChatHandler(agent, { + protocol: 'data', + tools, + toolHandlers, + maxSteps: 5, +}); +``` + +## Options + +```typescript +interface ChatHandlerOptions { + protocol: 'data' | 'ui'; // AI SDK stream protocol + tools?: ToolDefinition[]; // Tool definitions + toolHandlers?: Record; // Server-side tool execution + toolExecutor?: Function; // Universal tool executor + maxSteps?: number; // Multi-step tool loop limit + forceDirect?: boolean; // Skip cascade, use verifier + allowOverrides?: string[]; // Request-level override keys + overrideSecret?: string; // Shared secret for overrides +} +``` + +## Features + +- AI SDK v4 `data` stream and v5/v6 UI streams +- `useChat` multi-turn support +- `parts` message format (AI SDK v6) +- Tool call streaming visibility +- Server-side tool execution loops +- Multi-step controls +- Cascade decision stream parts +- Request-level overrides with allowlist diff --git a/docs-site/changelog.mdx b/docs-site/changelog.mdx new file mode 100644 index 00000000..2cda1c2f --- /dev/null +++ b/docs-site/changelog.mdx @@ -0,0 +1,28 @@ +--- +title: Changelog +description: Release history and changelog for 
cascadeflow. +--- + +# Changelog + +For the full release history, see [GitHub Releases](https://github.com/lemony-ai/cascadeflow/releases). + +## Recent Highlights + +- **v1.0.0** — Agent runtime intelligence layer with harness API, 6 framework integrations, compliance gating, KPI-weighted routing, energy tracking, decision traces +- Agent loops and multi-agent orchestration +- Tool execution engine with parallel execution and risk gating +- Hooks and callbacks for telemetry and observability +- Vercel AI SDK integration (17+ additional providers) +- OpenClaw provider for custom deployments +- Gateway server (drop-in OpenAI/Anthropic-compatible endpoint) +- User tier management with per-user budgets +- Semantic quality validators via FastEmbed +- Domain-aware cascading with 16 domain classifications +- Benchmark reports (MMLU, GSM8K, MT-Bench, HumanEval, TruthfulQA) + +## Links + +- [GitHub Releases](https://github.com/lemony-ai/cascadeflow/releases) +- [PyPI](https://pypi.org/project/cascadeflow/) +- [npm](https://www.npmjs.com/package/@cascadeflow/core) diff --git a/docs-site/contributing.mdx b/docs-site/contributing.mdx new file mode 100644 index 00000000..ff45625e --- /dev/null +++ b/docs-site/contributing.mdx @@ -0,0 +1,96 @@ +--- +title: Contributing +description: How to contribute to cascadeflow — development setup, code style, testing, and pull request process. +--- + +# Contributing + +We welcome contributions to cascadeflow. This guide covers development setup for both Python and TypeScript. 
+ +## Monorepo Structure + +``` +cascadeflow/ + cascadeflow/ # Python package + packages/ + core/ # TypeScript core + langchain-cascadeflow/ # LangChain TypeScript + integrations/ + vercel-ai/ # Vercel AI SDK + n8n/ # n8n community nodes + tests/ # Python tests + examples/ # Python examples + docs/ # Documentation + docs-site/ # Mintlify docs site +``` + +## Python Development + +### Setup + +```bash +git clone https://github.com/lemony-ai/cascadeflow.git +cd cascadeflow +python -m venv .venv +source .venv/bin/activate +pip install -e ".[dev]" +pre-commit install +``` + +### Code Style + +- **Formatter**: Black (line length 100) +- **Linter**: Ruff +- **Type checker**: mypy +- **Import sorting**: isort + +```bash +black cascadeflow/ tests/ +ruff check cascadeflow/ tests/ +mypy cascadeflow/ +``` + +### Testing + +```bash +pytest tests/ -x -q # Run all tests +pytest tests/ -m "not integration" # Skip integration tests +pytest tests/ --cov=cascadeflow # With coverage +``` + +## TypeScript Development + +### Setup + +```bash +cd packages/core +pnpm install +pnpm build +pnpm test +``` + +### Code Style + +- **Linter**: ESLint +- **Language**: TypeScript (strict mode) +- **Indentation**: 2 spaces + +## Making Changes + +1. Create a branch from `main` +2. Make changes with clear, descriptive commits +3. Follow commit conventions: `feat:`, `fix:`, `docs:`, `test:`, `refactor:`, `chore:` +4. Add tests for new functionality +5. 
Ensure all tests pass + +## Pull Requests + +- All PRs require review approval +- Linear history enforced (no merge commits) +- CI must pass before merge + +## Links + +- [GitHub Issues](https://github.com/lemony-ai/cascadeflow/issues) — Bug reports and feature requests +- [GitHub Discussions](https://github.com/lemony-ai/cascadeflow/discussions) — Questions and community +- [Email](mailto:hello@lemony.ai) — Direct support diff --git a/docs-site/docs.json b/docs-site/docs.json new file mode 100644 index 00000000..1e441f37 --- /dev/null +++ b/docs-site/docs.json @@ -0,0 +1,130 @@ +{ + "$schema": "https://mintlify.com/docs.json", + "theme": "palm", + "name": "cascadeflow", + "colors": { + "primary": "#0E7490", + "light": "#22D3EE", + "dark": "#0E7490" + }, + "logo": { + "light": "/logo/cascadeflow-light.svg", + "dark": "/logo/cascadeflow-dark.svg" + }, + "favicon": "/favicon.svg", + "tabs": [ + { "id": "get-started", "name": "Get Started" }, + { "id": "harness", "name": "Harness" }, + { "id": "integrations", "name": "Integrations" }, + { "id": "api-reference", "name": "API Reference" }, + { "id": "examples", "name": "Examples" } + ], + "navigation": { + "get-started": [ + { + "group": "Get Started", + "pages": [ + "get-started/introduction", + "get-started/quickstart", + "get-started/installation", + "get-started/how-it-works" + ] + }, + { + "group": "Resources", + "pages": [ + "changelog", + "contributing" + ] + } + ], + "harness": [ + { + "group": "Harness", + "pages": [ + "harness/overview", + "harness/modes", + "harness/budget-enforcement", + "harness/compliance", + "harness/kpi-optimization", + "harness/energy-tracking", + "harness/decision-trace", + "harness/actions" + ] + } + ], + "integrations": [ + { + "group": "Integrations", + "pages": [ + "integrations/overview", + "integrations/langchain", + "integrations/openai-agents", + "integrations/crewai", + "integrations/google-adk", + "integrations/n8n", + "integrations/vercel-ai" + ] + } + ], + "api-reference": 
[ + { + "group": "Python", + "pages": [ + "api-reference/python/init", + "api-reference/python/run", + "api-reference/python/agent-decorator", + "api-reference/python/harness-config", + "api-reference/python/run-context" + ] + }, + { + "group": "TypeScript", + "pages": [ + "api-reference/typescript/core", + "api-reference/typescript/vercel-ai", + "api-reference/typescript/langchain" + ] + } + ], + "examples": [ + { + "group": "Examples", + "pages": [ + "examples/basic-usage", + "examples/budget-enforcement", + "examples/compliance-gating", + "examples/kpi-weighted-routing", + "examples/multi-agent", + "examples/enterprise-patterns" + ] + } + ] + }, + "topbarLinks": [ + { + "name": "GitHub", + "url": "https://github.com/lemony-ai/cascadeflow" + } + ], + "topbarCtaButton": { + "name": "Get Started", + "url": "/get-started/quickstart" + }, + "footerSocials": { + "github": "https://github.com/lemony-ai/cascadeflow", + "x": "https://x.com/saschabuehrle" + }, + "anchors": [ + { + "name": "GitHub", + "icon": "github", + "url": "https://github.com/lemony-ai/cascadeflow" + }, + { + "name": "PyPI", + "icon": "python", + "url": "https://pypi.org/project/cascadeflow/" + } + ] +} diff --git a/docs-site/examples/basic-usage.mdx b/docs-site/examples/basic-usage.mdx new file mode 100644 index 00000000..9cf838d0 --- /dev/null +++ b/docs-site/examples/basic-usage.mdx @@ -0,0 +1,81 @@ +--- +title: Basic Usage +description: Simple cascade setup with OpenAI models showing speculative execution, cost tracking, and savings calculation. +--- + +# Basic Usage + +A minimal example showing cascadeflow's speculative cascade with two OpenAI models. + +## Setup + +```bash +pip install "cascadeflow[openai]" +export OPENAI_API_KEY="sk-..." 
+``` + +## Code + +```python +import asyncio +from cascadeflow import CascadeAgent, ModelConfig + +agent = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +queries = [ + "What's the capital of France?", # Simple — draft model handles + "Explain quantum computing", # Medium — may escalate + "Write a Python function to sort a list", # Code — domain routing +] + +async def main(): + total_cost = 0 + baseline_cost = 0 + + for query in queries: + result = await agent.run(query) + total_cost += result.total_cost + baseline_cost += result.total_cost if result.model_used == "gpt-4o" else result.total_cost * (0.00625 / 0.000375) + + print(f"Query: {query[:40]}...") + print(f" Model: {result.model_used}") + print(f" Cost: ${result.total_cost:.6f}") + print() + + savings = (1 - total_cost / baseline_cost) * 100 if baseline_cost > 0 else 0 + print(f"Total cost: ${total_cost:.6f}") + print(f"Savings: {savings:.0f}%") + +asyncio.run(main()) +``` + +## How It Works + +1. `gpt-4o-mini` (draft model) handles the query first +2. Quality validation checks the response +3. If quality passes, the draft response is returned (60-70% of queries) +4. If quality fails, `gpt-4o` (verifier model) handles the query +5. 
Cost tracking reports per-query and aggregate metrics + +## TypeScript + +```typescript +import { CascadeAgent } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const result = await agent.run('What is TypeScript?'); +console.log(`Model: ${result.modelUsed}, Cost: $${result.totalCost}`); +``` + +## Source + +[examples/basic_usage.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/basic_usage.py) diff --git a/docs-site/examples/budget-enforcement.mdx b/docs-site/examples/budget-enforcement.mdx new file mode 100644 index 00000000..dab52ed9 --- /dev/null +++ b/docs-site/examples/budget-enforcement.mdx @@ -0,0 +1,84 @@ +--- +title: Budget Enforcement +description: Per-run and per-user budget caps with enforcement callbacks, cost tracking, and automatic stop actions. +--- + +# Budget Enforcement + +Enforce spending limits on agent runs with automatic stop actions when budget is exceeded. 
+ +## Basic Budget Cap + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Research and summarize this topic") + + summary = session.summary() + print(f"Cost: ${summary['cost_total']:.4f}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") + print(f"Steps completed: {summary['steps']}") +``` + +## Budget with Tool Call Limit + +```python +with cascadeflow.run(budget=1.00, max_tool_calls=5) as session: + result = await agent.run("Search and analyze this dataset") + # Stops when either budget or tool call limit is hit +``` + +## Per-Agent Budgets + +```python +@cascadeflow.agent(budget=0.10) +async def triage_agent(query: str): + """Cheap triage — $0.10 max.""" + return await llm.complete(query) + +@cascadeflow.agent(budget=2.00) +async def research_agent(query: str): + """Deep research — $2.00 max.""" + return await llm.complete(query) +``` + +## Cost Tracking (Legacy API) + +For pre-harness budget enforcement using the telemetry API: + +```python +from cascadeflow.telemetry import BudgetConfig, CostTracker, strict_budget_enforcement + +tracker = CostTracker( + budget_config=BudgetConfig( + daily_limit=10.0, + per_query_limit=0.50, + alert_threshold=0.8, + ), + enforcement_callback=strict_budget_enforcement, +) + +# Track costs manually +tracker.track(model="gpt-4o", cost=0.003) +print(f"Daily spend: ${tracker.daily_spend:.4f}") +``` + +## Decision Trace + +```python +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Multi-step analysis") + + for record in session.trace(): + if record['action'] == 'stop': + print(f"Stopped at step {record['step']}: {record['reason']}") + else: + print(f"Step {record['step']}: {record['action']} (${record['cost_total']:.4f})") +``` + +## Source + +[examples/enforcement/basic_enforcement.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/enforcement/basic_enforcement.py) diff --git 
a/docs-site/examples/compliance-gating.mdx b/docs-site/examples/compliance-gating.mdx new file mode 100644 index 00000000..19f9fbd3 --- /dev/null +++ b/docs-site/examples/compliance-gating.mdx @@ -0,0 +1,89 @@ +--- +title: Compliance Gating +description: GDPR, HIPAA, PCI, and strict model allowlists with enforcement examples for regulated agent workflows. +--- + +# Compliance Gating + +Restrict which models can be used based on compliance requirements. + +## GDPR Compliance + +Only allow models approved for EU data processing: + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(compliance="gdpr") as session: + # Only gpt-4o, gpt-4o-mini, gpt-3.5-turbo are allowed + result = await agent.run("Process this EU customer feedback") + + for record in session.trace(): + if record['action'] == 'switch_model': + print(f"Model switched: {record['reason']}") +``` + +## HIPAA Compliance + +For healthcare data — stricter allowlist: + +```python +with cascadeflow.run(compliance="hipaa") as session: + # Only gpt-4o, gpt-4o-mini are allowed + result = await agent.run("Summarize this patient record") +``` + +## PCI Compliance + +For payment card data: + +```python +with cascadeflow.run(compliance="pci") as session: + # Only gpt-4o-mini, gpt-3.5-turbo are allowed + result = await agent.run("Analyze this transaction") +``` + +## Strict Mode + +Maximum restriction — single model only: + +```python +with cascadeflow.run(compliance="strict") as session: + # Only gpt-4o is allowed + result = await agent.run("Classify this sensitive document") +``` + +## Compliance Allowlists + +| Mode | Allowed Models | +|---|---| +| `gdpr` | gpt-4o, gpt-4o-mini, gpt-3.5-turbo | +| `hipaa` | gpt-4o, gpt-4o-mini | +| `pci` | gpt-4o-mini, gpt-3.5-turbo | +| `strict` | gpt-4o | + +## Combining with Budget + +```python +@cascadeflow.agent(budget=1.00, compliance="gdpr") +async def eu_data_agent(query: str): + """Process EU data within budget using only GDPR-approved 
models.""" + return await llm.complete(query) +``` + +## Observe Mode for Audit + +Use `observe` mode to audit which models would be blocked without affecting production: + +```python +cascadeflow.init(mode="observe") + +with cascadeflow.run(compliance="hipaa") as session: + result = await agent.run("Process health data") + + # Check which calls would have been blocked + violations = [r for r in session.trace() if r['action'] == 'switch_model'] + print(f"Compliance violations detected: {len(violations)}") +``` diff --git a/docs-site/examples/enterprise-patterns.mdx b/docs-site/examples/enterprise-patterns.mdx new file mode 100644 index 00000000..5949972c --- /dev/null +++ b/docs-site/examples/enterprise-patterns.mdx @@ -0,0 +1,127 @@ +--- +title: Enterprise Patterns +description: Production-ready patterns including retry logic, rate limiting, budget management, circuit breakers, caching, and health monitoring. +--- + +# Enterprise Patterns + +Production patterns for deploying cascadeflow at scale. 
+ +## Retry with Exponential Backoff + +```python +import asyncio +from cascadeflow import CascadeAgent + +async def execute_with_retry(agent, query, max_retries=3, base_delay=1.0): + for attempt in range(max_retries): + try: + return await agent.run(query) + except Exception as e: + if attempt == max_retries - 1: + raise + delay = base_delay * (2 ** attempt) + await asyncio.sleep(delay) +``` + +## Rate Limiting + +```python +import time +from collections import deque + +class RateLimiter: + def __init__(self, max_requests: int, window_seconds: float): + self.max_requests = max_requests + self.window = window_seconds + self.requests = deque() + + async def acquire(self): + now = time.monotonic() + while self.requests and self.requests[0] < now - self.window: + self.requests.popleft() + if len(self.requests) >= self.max_requests: + wait = self.requests[0] + self.window - now + await asyncio.sleep(wait) + self.requests.append(time.monotonic()) +``` + +## Budget Management + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +# Per-user daily budget +async def handle_user_request(user_id: str, query: str): + user_budget = get_user_remaining_budget(user_id) + + with cascadeflow.run(budget=min(user_budget, 0.50)) as session: + result = await agent.run(query) + + spent = session.summary()['cost_total'] + update_user_budget(user_id, spent) + return result +``` + +## Circuit Breaker + +```python +from cascadeflow import CircuitBreaker, CircuitBreakerConfig + +config = CircuitBreakerConfig( + failure_threshold=5, + recovery_timeout=30.0, + half_open_max_calls=2, +) + +breaker = CircuitBreaker(config=config) + +async def safe_call(agent, query): + if not breaker.allow_request(): + return fallback_response(query) + try: + result = await agent.run(query) + breaker.record_success() + return result + except Exception as e: + breaker.record_failure() + raise +``` + +## Response Caching + +```python +from cascadeflow import ResponseCache + +cache = 
ResponseCache(max_size=1000, ttl_seconds=300) + +async def cached_run(agent, query): + cached = cache.get(query) + if cached: + return cached + result = await agent.run(query) + cache.set(query, result) + return result +``` + +## Health Monitoring + +```python +with cascadeflow.run(budget=10.00) as session: + for query in production_queries: + result = await agent.run(query) + + summary = session.summary() + + # Alert on anomalies + if summary['cost_total'] > 8.0: + alert("Budget 80% consumed") + if summary['steps'] > 100: + alert("High step count") +``` + +## Source + +[examples/production_patterns.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/production_patterns.py) diff --git a/docs-site/examples/kpi-weighted-routing.mdx b/docs-site/examples/kpi-weighted-routing.mdx new file mode 100644 index 00000000..5bab7689 --- /dev/null +++ b/docs-site/examples/kpi-weighted-routing.mdx @@ -0,0 +1,95 @@ +--- +title: KPI-Weighted Routing +description: Configure quality, cost, latency, and energy weights to encode business priorities into model routing decisions. +--- + +# KPI-Weighted Routing + +Inject business priorities into every model decision using KPI weights. 
+ +## Quality-First (Premium Workload) + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + budget=2.00, + kpi_weights={"quality": 0.8, "cost": 0.1, "latency": 0.1}, + kpi_targets={"quality": 0.9} +) as session: + # Routes to highest-quality models within budget + result = await agent.run("Draft a legal contract clause") + print(session.summary()) +``` + +## Cost-First (High-Volume Batch) + +```python +with cascadeflow.run( + budget=5.00, + kpi_weights={"cost": 0.7, "quality": 0.2, "latency": 0.1} +) as session: + # Routes to cheapest models that meet quality floor + for query in batch_queries: + result = await agent.run(query) + print(f"Total cost: ${session.summary()['cost_total']:.4f}") +``` + +## Latency-First (Real-Time) + +```python +with cascadeflow.run( + kpi_weights={"latency": 0.7, "quality": 0.2, "cost": 0.1}, + max_latency_ms=2000.0 +) as session: + # Routes to fastest models, hard cap at 2 seconds + result = await agent.run("Quick classification task") +``` + +## Energy-Aware (Carbon-Conscious) + +```python +with cascadeflow.run( + kpi_weights={"quality": 0.4, "energy": 0.3, "cost": 0.3}, + max_energy=100.0 +) as session: + # Balances quality with energy efficiency + result = await agent.run("Summarize this report") + print(f"Energy used: {session.summary()['energy_used']:.1f} units") +``` + +## Per-Agent Profiles + +```python +@cascadeflow.agent( + budget=0.10, + kpi_weights={"cost": 0.9, "quality": 0.1} +) +async def triage_agent(query: str): + """Quick classification — prioritize cost.""" + return await llm.complete(query) + +@cascadeflow.agent( + budget=2.00, + kpi_weights={"quality": 0.9, "cost": 0.1}, + kpi_targets={"quality": 0.95} +) +async def analysis_agent(query: str): + """Deep analysis — prioritize quality.""" + return await llm.complete(query) +``` + +## Quality Priors + +The harness uses built-in quality priors for scoring: + +| Model | Quality Prior | Latency Prior | +|---|---|---| +| o1 | 0.95 
| 0.40 | +| gpt-4o | 0.90 | 0.72 | +| gpt-4-turbo | 0.88 | 0.66 | +| gpt-5-mini | 0.86 | 0.84 | +| gpt-4o-mini | 0.75 | 0.93 | +| gpt-3.5-turbo | 0.65 | 1.00 | diff --git a/docs-site/examples/multi-agent.mdx b/docs-site/examples/multi-agent.mdx new file mode 100644 index 00000000..06b9598b --- /dev/null +++ b/docs-site/examples/multi-agent.mdx @@ -0,0 +1,103 @@ +--- +title: Multi-Agent Orchestration +description: Multi-turn tool execution with agent-as-a-tool delegation and budget tracking across agent boundaries. +--- + +# Multi-Agent Orchestration + +cascadeflow supports multi-agent patterns with tool execution, delegation, and budget tracking across agent boundaries. + +## Tool Execution Loop + +```python +import asyncio +from cascadeflow import CascadeAgent, ModelConfig +from cascadeflow.tools import ToolConfig, ToolExecutor + +# Define tools +tools = [ + ToolConfig( + name="calculator", + description="Evaluate a math expression", + parameters={"expression": {"type": "string"}}, + handler=lambda expression: str(eval(expression)), + ), + ToolConfig( + name="search", + description="Search the web", + parameters={"query": {"type": "string"}}, + handler=lambda query: f"Results for: {query}", + ), +] + +agent = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +executor = ToolExecutor(tools=tools) + +async def main(): + result = await agent.run( + "Calculate 15% of 250 and search for tax rates", + tools=tools, + tool_executor=executor, + max_steps=5, + ) + print(result.content) + +asyncio.run(main()) +``` + +## With Harness Budget Tracking + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=1.00, max_tool_calls=10) as session: + result = await agent.run( + "Research this topic using multiple tools", + tools=tools, + tool_executor=executor, + max_steps=10, + ) + + summary = session.summary() + print(f"Cost: 
${summary['cost_total']:.4f}") + print(f"Tool calls: {summary['tool_calls']}") + print(f"Steps: {summary['steps']}") +``` + +## Agent-as-a-Tool Delegation + +```python +# Define a researcher agent as a tool +researcher = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +async def research_handler(query: str) -> str: + result = await researcher.run(query) + return result.content + +# Main agent can delegate to researcher +tools = [ + ToolConfig( + name="research", + description="Delegate research to a specialist agent", + parameters={"query": {"type": "string"}}, + handler=research_handler, + ), +] + +# Budget tracks across both agents +with cascadeflow.run(budget=2.00) as session: + result = await main_agent.run("Analyze and research this topic", tools=tools) +``` + +## Source + +[examples/agentic_multi_agent.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/agentic_multi_agent.py) diff --git a/docs-site/favicon.svg b/docs-site/favicon.svg new file mode 100644 index 00000000..496df9f5 --- /dev/null +++ b/docs-site/favicon.svg @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/docs-site/get-started/how-it-works.mdx b/docs-site/get-started/how-it-works.mdx new file mode 100644 index 00000000..721feef6 --- /dev/null +++ b/docs-site/get-started/how-it-works.mdx @@ -0,0 +1,112 @@ +--- +title: How It Works +description: Architecture of cascadeflow's two engines — Cascade for speculative model routing and Harness for agent runtime intelligence. +--- + +# How It Works + +cascadeflow ships two complementary engines that can be used independently or together. + +## Cascade Engine + +The Cascade Engine optimizes model selection through **speculative execution with quality validation**: + +1. **Speculatively executes** small, fast models first — optimistic execution ($0.15-0.30/1M tokens) +2. 
**Validates quality** of responses using configurable thresholds (completeness, confidence, correctness) +3. **Dynamically escalates** to larger models only when quality validation fails ($1.25-3.00/1M tokens) +4. **Learns patterns** to optimize future cascading decisions and domain-specific routing + +In practice, 60-70% of queries are handled by small, efficient models without escalation. + +**Result:** 40-85% cost reduction, 2-10x faster responses, zero quality loss. + +``` +Query → Domain Detection → Try Draft Model → Quality Check + │ + Pass ───┘─── Fail + │ │ + Return Escalate to + Result Verifier Model +``` + +## Harness Engine + +The Harness Engine provides **agent runtime intelligence** — budget enforcement, compliance gating, KPI-weighted routing, energy tracking, and decision traces. + +Unlike the Cascade Engine which routes between models, the Harness Engine wraps existing agent execution and makes decisions at every step: + +``` +Agent Step → Harness Decision → allow / switch_model / deny_tool / stop + │ + ├── Check budget remaining + ├── Check compliance allowlist + ├── Score KPI dimensions + ├── Check tool call cap + ├── Check latency cap + └── Check energy cap +``` + +### Decision Flow + +For each LLM call or tool execution inside an agent loop, the harness: + +1. **Records** the model, step number, and cumulative metrics +2. **Evaluates** all configured constraints (budget, compliance, tool calls, latency, energy) +3. **Scores** the call against KPI weights if configured +4. **Decides** an action: `allow`, `switch_model`, `deny_tool`, or `stop` +5. **Enforces** the action if in `enforce` mode (logs only in `observe` mode) +6. 
**Appends** a trace record for auditability + +### HarnessConfig + +All harness behavior is configured through a single dataclass: + +```python +HarnessConfig( + mode="enforce", # off | observe | enforce + budget=0.50, # Max USD for the run + max_tool_calls=10, # Max tool/function calls + max_latency_ms=5000.0, # Max wall-clock ms per call + max_energy=100.0, # Max energy units + compliance="gdpr", # gdpr | hipaa | pci | strict + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.9}, +) +``` + +## Combined Usage + +When both engines are active, the Cascade Engine handles model selection while the Harness Engine enforces constraints: + +```python +import cascadeflow +from cascadeflow import CascadeAgent, ModelConfig + +# Harness: enforce budget and compliance +cascadeflow.init(mode="enforce") + +# Cascade: speculative model routing +agent = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +with cascadeflow.run(budget=1.00) as session: + result = await agent.run("Analyze this contract for GDPR compliance") + print(session.summary()) +``` + +## Provider Abstraction + +cascadeflow supports 17+ providers through a unified interface: + +| Provider | Type | Package | +|---|---|---| +| OpenAI | API | `cascadeflow[openai]` | +| Anthropic | API | `cascadeflow[anthropic]` | +| Groq | API | `cascadeflow[groq]` | +| Together | API | `cascadeflow[together]` | +| Hugging Face | API | `cascadeflow[huggingface]` | +| Ollama | Local | Built-in (HTTP) | +| vLLM | Local | `cascadeflow[vllm]` | +| Vercel AI SDK | TypeScript | `@cascadeflow/vercel-ai` | diff --git a/docs-site/get-started/installation.mdx b/docs-site/get-started/installation.mdx new file mode 100644 index 00000000..ff6b8583 --- /dev/null +++ b/docs-site/get-started/installation.mdx @@ -0,0 +1,101 @@ +--- +title: Installation +description: Install cascadeflow with pip extras 
for Python or npm packages for TypeScript, including provider-specific setup. +--- + +# Installation + +## Python + +### Minimal install + +```bash +pip install cascadeflow +``` + +Core dependencies: `pydantic>=2.0.0`, `httpx>=0.25.0`, `tiktoken>=0.5.0`, `rich>=13.0.0`. + +### With providers + +```bash +pip install "cascadeflow[providers]" # OpenAI + Anthropic + Groq +``` + +Individual providers: + +```bash +pip install "cascadeflow[openai]" # OpenAI +pip install "cascadeflow[anthropic]" # Anthropic +pip install "cascadeflow[groq]" # Groq +pip install "cascadeflow[huggingface]" # Hugging Face +pip install "cascadeflow[together]" # Together AI +``` + +### With framework integrations + +```bash +pip install "cascadeflow[langchain]" # LangChain/LangGraph +pip install "cascadeflow[openai-agents]" # OpenAI Agents SDK +pip install "cascadeflow[crewai]" # CrewAI (Python 3.10+) +pip install "cascadeflow[google-adk]" # Google ADK (Python 3.10+) +``` + +### Local inference + +```bash +pip install "cascadeflow[vllm]" # vLLM (Python 3.10-3.13) +``` + +Ollama does not need a Python package — cascadeflow communicates with Ollama via HTTP at `localhost:11434`. Install Ollama separately from [ollama.ai](https://ollama.ai). + +### Everything + +```bash +pip install "cascadeflow[all]" # All providers + semantic routing +``` + +### Development + +```bash +git clone https://github.com/lemony-ai/cascadeflow.git +cd cascadeflow +pip install -e ".[dev]" +``` + +## TypeScript + +### Core + +```bash +npm install @cascadeflow/core +``` + +### Framework packages + +```bash +npm install @cascadeflow/langchain # LangChain integration +npm install @cascadeflow/vercel-ai # Vercel AI SDK middleware +npm install @cascadeflow/n8n-nodes-cascadeflow # n8n community node +``` + +## Provider Setup + +Set API keys as environment variables: + +```bash +export OPENAI_API_KEY="sk-..." +export ANTHROPIC_API_KEY="sk-ant-..." +export GROQ_API_KEY="gsk_..." 
+``` + +cascadeflow auto-detects available providers based on which API keys are set. + +## Verify Installation + +```bash +python -c "import cascadeflow; print(cascadeflow.__version__)" +``` + +```bash +python -c "from cascadeflow import init, run, HarnessConfig, HarnessRunContext; print('OK')" +``` diff --git a/docs-site/get-started/introduction.mdx b/docs-site/get-started/introduction.mdx new file mode 100644 index 00000000..39c2f74c --- /dev/null +++ b/docs-site/get-started/introduction.mdx @@ -0,0 +1,62 @@ +--- +title: Introduction +description: What cascadeflow is, how it differs from external proxies, and when to use it for agent runtime intelligence. +--- + +# Introduction + +cascadeflow is an in-process intelligence layer that sits inside AI agent execution loops. Unlike external proxies that only see HTTP request boundaries, cascadeflow operates with full agent state awareness: step count, budget consumed, tool call history, error context, quality scores, domain, complexity, and user-defined business context. + +## What makes cascadeflow different + +**1. Inside-the-loop control.** Decisions happen per-step and per-tool-call inside agent execution, not at the HTTP boundary. This enables budget gating mid-run, model switching based on remaining budget, and stop actions when caps are hit. + +**2. Multi-dimensional optimization.** Six dimensions scored simultaneously: cost, latency, quality, budget, compliance, and energy. Not just cost routing. + +**3. Business logic injection.** KPI weights and targets let teams encode business priorities (e.g. 60% quality, 30% cost, 10% latency) into every model decision. + +**4. Actionable decisions.** Four actions: `allow`, `switch_model`, `deny_tool`, `stop`. The harness does not just observe — it controls execution flow. + +**5. Full transparency.** Every decision produces a trace record with action, reason, model, step, cost_total, budget_state, and applied fields. Audit-ready. + +**6. 
Measurable value.** Session summaries report cost, latency, energy, steps, tool calls, and budget remaining. Before/after comparison is built in. + +**7. Cross-framework policy layer.** Unified KPI semantics across LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, and Vercel AI SDK. + +**8. Latency advantage.** In-process instrumentation adds less than 1ms overhead per call. External proxies add 10-50ms of network round-trip latency per LLM call. + +## Proxy vs In-Process Harness + +| Dimension | External Proxy | cascadeflow Harness | +|---|---|---| +| **Scope** | HTTP request boundary | Inside agent execution loop | +| **Dimensions** | Cost only | Cost + quality + latency + budget + compliance + energy | +| **Latency overhead** | 10-50ms network RTT | <1ms in-process | +| **Business logic** | None | KPI weights and targets | +| **Enforcement** | None (observe only) | stop, deny_tool, switch_model | +| **Auditability** | Request logs | Per-step decision traces | + +## When to use cascadeflow + +- You run AI agents (LangChain, LangGraph, CrewAI, OpenAI Agents SDK, Google ADK, or custom) +- You want to reduce LLM costs without changing agent code +- You need budget enforcement across multi-step agent runs +- You need to inject business KPIs (quality, cost, latency, energy) into agent decisions +- You need compliance-aware model gating (GDPR, HIPAA, PCI, strict) +- You want full trace recording for auditability and tuning + +## When NOT to use cascadeflow + +- Single one-off LLM calls (overhead not justified) +- You only use one model and don't want routing +- You need a hosted proxy service (cascadeflow is a library, not a SaaS) + +## Two Engines + +cascadeflow ships two complementary engines: + +**Cascade Engine** — Speculative execution with quality validation. Tries cheap models first, validates quality, escalates only when needed. Achieves 40-85% cost savings on typical workloads. + +**Harness Engine** — Agent runtime intelligence. 
Budget enforcement, compliance gating, KPI-weighted routing, energy tracking, and decision traces. Works inside agent loops with full state awareness. + +Both engines can be used independently or together. diff --git a/docs-site/get-started/quickstart.mdx b/docs-site/get-started/quickstart.mdx new file mode 100644 index 00000000..64189077 --- /dev/null +++ b/docs-site/get-started/quickstart.mdx @@ -0,0 +1,118 @@ +--- +title: Quickstart +description: Get cascadeflow running in 3 minutes with zero code changes using the harness API. +--- + +# Quickstart + +Three tiers of integration — pick the one that matches your needs. + +## Install + + + +```bash pip +pip install "cascadeflow[openai]" +``` + +```bash With integrations +pip install "cascadeflow[langchain]" # LangChain/LangGraph +pip install "cascadeflow[openai-agents]" # OpenAI Agents SDK +pip install "cascadeflow[crewai]" # CrewAI +pip install "cascadeflow[google-adk]" # Google ADK +``` + +```bash npm +npm install @cascadeflow/core +``` + + + +## Tier 1: Zero-Change Observability + +Add two lines. All OpenAI and Anthropic SDK calls are automatically tracked. + +```python +import cascadeflow + +cascadeflow.init(mode="observe") + +# Your existing code — no changes needed +import openai +client = openai.OpenAI() +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "What is cascadeflow?"}] +) +# cascadeflow is now tracking cost, latency, energy, and model usage. +``` + +## Tier 2: Scoped Runs with Budget + +Wrap agent execution in a `run()` context manager for budget tracking and enforcement. 
+ +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + # Your agent code here + result = await agent.run("Analyze this dataset and create a report") + + # After execution, inspect metrics + summary = session.summary() + print(f"Cost: ${summary['cost_total']:.4f}") + print(f"Steps: {summary['steps']}") + print(f"Tool calls: {summary['tool_calls']}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") + + # Full decision audit trail + for decision in session.trace(): + print(f" Step {decision['step']}: {decision['action']} — {decision['reason']}") +``` + +## Tier 3: Decorated Agents with Policy + +Annotate agent functions with budget, compliance, and KPI metadata. + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +@cascadeflow.agent( + budget=0.20, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1} +) +async def research_agent(query: str): + return await llm.complete(query) +``` + +## Harness Modes + +| Mode | Tracking | Enforcement | Use Case | +|---|---|---|---| +| `off` | No | No | Disabled | +| `observe` | Yes | No | Safe production rollout, metrics collection | +| `enforce` | Yes | Yes | Budget caps, compliance gating, stop actions | + +Start with `observe` in production. Switch to `enforce` once you've validated the metrics. + +## Next Steps + + + + All pip extras, npm packages, and provider setup. + + + Architecture of the Cascade and Harness engines. + + + Per-run and per-user budget caps. + + + LangChain, OpenAI Agents, CrewAI, Google ADK, n8n, Vercel AI. + + diff --git a/docs-site/harness/actions.mdx b/docs-site/harness/actions.mdx new file mode 100644 index 00000000..a904eed8 --- /dev/null +++ b/docs-site/harness/actions.mdx @@ -0,0 +1,99 @@ +--- +title: Decision Actions +description: Four harness actions — allow, switch_model, deny_tool, and stop — and when each is triggered. 
+--- + +# Decision Actions + +The harness makes one of four decisions at every step. Actions are computed in both `observe` and `enforce` modes, but only applied in `enforce` mode. + +## Actions + +### `allow` + +Proceed normally. No constraints are violated. + +``` +Step 1: allow — budget ok, model compliant +``` + +This is the most common action. It means all hard caps (budget, tool calls, latency, energy) are within limits and compliance is satisfied. + +### `switch_model` + +Route to a different model. Triggered when: +- The current model is not in the compliance allowlist +- KPI scoring indicates a better model choice +- Budget pressure suggests a cheaper alternative + +``` +Step 3: switch_model — compliance violation, switching to gpt-4o-mini (gdpr allowlist) +``` + +In `enforce` mode, the harness substitutes the model. In `observe` mode, the original model is used and the trace records what would have happened. + +### `deny_tool` + +Block a tool/function call. Triggered when `max_tool_calls` is reached. + +``` +Step 5: deny_tool — tool call cap reached (10/10) +``` + +In `enforce` mode, the tool call is blocked. The agent receives a signal that the tool was denied. + +### `stop` + +Halt agent execution. Triggered when: +- Budget is exceeded +- Latency cap is exceeded +- Energy cap is exceeded + +``` +Step 7: stop — budget exceeded ($0.52 > $0.50 cap) +``` + +In `enforce` mode, the agent loop is stopped. In `observe` mode, execution continues and the trace records the violation. + +## Decision Priority + +When multiple constraints are violated simultaneously, the harness applies this priority: + +1. **Compliance** — check first (switch_model or stop) +2. **Budget** — check second (stop) +3. **Tool calls** — check third (deny_tool) +4. **Latency** — check fourth (stop) +5. **Energy** — check fifth (stop) +6. 
**KPI scoring** — soft optimization (switch_model or allow) + +## Hard vs Soft Controls + +**Hard controls** trigger `stop` or `deny_tool` when limits are exceeded: +- `budget` — max USD +- `max_tool_calls` — max tool/function calls +- `max_latency_ms` — max wall-clock ms per call +- `max_energy` — max energy units +- `compliance` — model allowlist + +**Soft controls** influence model selection through KPI weights but never block execution: +- `kpi_weights` — relative importance of quality, cost, latency, energy +- `kpi_targets` — target values for KPI dimensions + +## Example: Combined Constraints + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + budget=1.00, + max_tool_calls=5, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.4} +) as session: + result = await agent.run("Process EU customer data") + + for record in session.trace(): + print(f"Step {record['step']}: {record['action']} — {record['reason']}") +``` diff --git a/docs-site/harness/budget-enforcement.mdx b/docs-site/harness/budget-enforcement.mdx new file mode 100644 index 00000000..079752ac --- /dev/null +++ b/docs-site/harness/budget-enforcement.mdx @@ -0,0 +1,83 @@ +--- +title: Budget Enforcement +description: Configure budget enforcement with per-run caps and automatic stop actions when budget is exceeded. +--- + +# Budget Enforcement + +The harness tracks cumulative cost across all LLM calls in a run and enforces budget caps in `enforce` mode. 
+ +## Per-Run Budget + +Set a budget cap on a scoped run: + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + # Agent executes multiple LLM calls + result = await agent.run("Research and summarize this topic") + + summary = session.summary() + print(f"Total cost: ${summary['cost_total']:.4f}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") +``` + +When cumulative cost exceeds the budget: +- In `observe` mode: the trace records `action: "stop"` with `applied: false` +- In `enforce` mode: the harness stops execution with `action: "stop"` and `applied: true` + +## Per-Agent Budget + +Attach budget metadata to agent functions: + +```python +@cascadeflow.agent(budget=0.20) +async def cheap_agent(query: str): + return await llm.complete(query) + +@cascadeflow.agent(budget=2.00) +async def premium_agent(query: str): + return await llm.complete(query) +``` + +## Budget Pressure Routing + +When budget is partially consumed, the harness can route to cheaper models. This happens automatically when KPI weights include a cost dimension: + +```python +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + budget=1.00, + kpi_weights={"quality": 0.5, "cost": 0.5} +) as session: + # Early calls may use gpt-4o (high quality) + # As budget pressure increases, routing shifts toward gpt-4o-mini (lower cost) + for query in queries: + result = await agent.run(query) +``` + +## Cost Calculation + +Cost is estimated from the built-in pricing table: + +``` +cost = (input_tokens / 1_000_000) * input_price + (output_tokens / 1_000_000) * output_price +``` + +The pricing table covers 18 models across OpenAI, Anthropic, and Google. Unknown models are resolved via fuzzy matching. 
+ +## Combining with Tool Call Caps + +Budget and tool call caps work together: + +```python +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + # Stops when either limit is hit + result = await agent.run("Analyze this data") +``` + +The harness checks all constraints at every step. The first constraint that is violated triggers the corresponding action (`stop` for budget, `deny_tool` for tool calls). diff --git a/docs-site/harness/compliance.mdx b/docs-site/harness/compliance.mdx new file mode 100644 index 00000000..febb0de5 --- /dev/null +++ b/docs-site/harness/compliance.mdx @@ -0,0 +1,66 @@ +--- +title: Compliance Gating +description: GDPR, HIPAA, PCI, and strict model allowlists for compliance-aware model gating in agent workflows. +--- + +# Compliance Gating + +The harness enforces model allowlists based on compliance requirements. When a compliance mode is set, only models in the corresponding allowlist are permitted. + +## Compliance Modes + +| Mode | Allowed Models | Use Case | +|---|---|---| +| `gdpr` | gpt-4o, gpt-4o-mini, gpt-3.5-turbo | EU data protection | +| `hipaa` | gpt-4o, gpt-4o-mini | Healthcare data | +| `pci` | gpt-4o-mini, gpt-3.5-turbo | Payment card data | +| `strict` | gpt-4o | Maximum restriction | + +## Usage + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +# GDPR compliance — only gpt-4o, gpt-4o-mini, gpt-3.5-turbo allowed +with cascadeflow.run(compliance="gdpr") as session: + result = await agent.run("Process this EU customer data") +``` + +Or as agent metadata: + +```python +@cascadeflow.agent(compliance="hipaa") +async def medical_agent(query: str): + return await llm.complete(query) +``` + +## Enforcement Behavior + +When a model outside the allowlist is requested: + +- In `observe` mode: the trace records `action: "switch_model"` with the suggested compliant alternative, but execution continues with the original model +- In `enforce` mode: the harness blocks the non-compliant model and either 
switches to a compliant alternative or stops execution + +## Combining with Budget + +Compliance and budget constraints are independent. Both are checked at every step: + +```python +with cascadeflow.run(budget=0.50, compliance="gdpr") as session: + # Must stay within budget AND use only GDPR-approved models + result = await agent.run("Analyze EU customer feedback") +``` + +## Custom Allowlists + +The built-in allowlists cover common regulations. For custom requirements, set compliance at the integration level or use the `HarnessConfig` directly: + +```python +config = HarnessConfig( + mode="enforce", + compliance="strict", # Only gpt-4o +) +cascadeflow.init(config=config) +``` diff --git a/docs-site/harness/decision-trace.mdx b/docs-site/harness/decision-trace.mdx new file mode 100644 index 00000000..2b1b14a6 --- /dev/null +++ b/docs-site/harness/decision-trace.mdx @@ -0,0 +1,102 @@ +--- +title: Decision Traces +description: Per-step audit trail of every harness decision — action, reason, model, cost, budget state, and enforcement status. +--- + +# Decision Traces + +Every harness decision produces a trace record. Traces provide a full audit trail for debugging, compliance reporting, and performance tuning. 
+ +## Trace Format + +Each trace record contains: + +| Field | Type | Description | +|---|---|---| +| `action` | string | `"allow"`, `"switch_model"`, `"deny_tool"`, or `"stop"` | +| `reason` | string | Human-readable explanation of the decision | +| `model` | string | Model name used for the call | +| `step` | int | Step number in the run (1-indexed) | +| `cost_total` | float | Cumulative cost in USD at this step | +| `budget_state` | string | `"ok"`, `"warning"`, or `"exceeded"` | +| `applied` | bool | `true` if the action was enforced, `false` in observe mode | + +## Accessing Traces + +```python +import cascadeflow + +cascadeflow.init(mode="observe") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Research this topic") + + # Full decision trace + for record in session.trace(): + print(f"Step {record['step']}: {record['action']} — {record['reason']}") + print(f" Model: {record['model']}, Cost: ${record['cost_total']:.4f}") + print(f" Budget: {record['budget_state']}, Applied: {record['applied']}") +``` + +Example output: + +``` +Step 1: allow — budget ok, model compliant + Model: gpt-4o-mini, Cost: $0.0003 + Budget: ok, Applied: false +Step 2: allow — budget ok, model compliant + Model: gpt-4o-mini, Cost: $0.0007 + Budget: ok, Applied: false +Step 3: switch_model — budget pressure, routing to cheaper model + Model: gpt-4o, Cost: $0.0032 + Budget: warning, Applied: false +``` + +## Observe vs Enforce + +In `observe` mode, traces record what the harness *would* do: +- `applied` is always `false` +- Agent execution continues regardless of the action + +In `enforce` mode, traces record what the harness *did*: +- `applied` is `true` when the action was enforced +- `stop` actions halt execution +- `deny_tool` actions block tool calls + +## Privacy + +Decision traces do not contain prompt content, response content, or user data. 
They only contain: +- Model names and step numbers +- Cost and budget metrics +- Action decisions and reasons + +This makes traces safe for logging, external storage, and compliance reporting without data classification concerns. + +## Callbacks + +Register callbacks to receive trace records in real time: + +```python +from cascadeflow import get_harness_callback_manager, set_harness_callback_manager + +cb_manager = get_harness_callback_manager() + +# Traces are emitted through the callback system +# Use framework-specific integrations for structured access +``` + +## Session Summary + +In addition to per-step traces, `session.summary()` provides aggregate metrics: + +```python +summary = session.summary() +# { +# "cost_total": 0.0032, +# "steps": 3, +# "tool_calls": 1, +# "latency_total_ms": 1250.0, +# "energy_used": 45.2, +# "budget_remaining": 0.4968, +# } +``` diff --git a/docs-site/harness/energy-tracking.mdx b/docs-site/harness/energy-tracking.mdx new file mode 100644 index 00000000..a3d292ee --- /dev/null +++ b/docs-site/harness/energy-tracking.mdx @@ -0,0 +1,99 @@ +--- +title: Energy Tracking +description: Deterministic compute-intensity coefficients for carbon-aware AI operations, with energy caps and per-model coefficients. +--- + +# Energy Tracking + +The harness tracks energy consumption using deterministic compute-intensity coefficients. This provides a proxy for carbon impact without requiring real-time power measurement. + +## Energy Formula + +``` +energy_units = coefficient * (input_tokens + output_tokens * 1.5) +``` + +Output tokens are weighted 1.5x because generation is more compute-intensive than prompt processing. 
+ +## Energy Coefficients + +| Model | Coefficient | Relative Cost | +|---|---|---| +| gpt-3.5-turbo | 0.20 | Lowest | +| gemini-1.5-flash | 0.20 | Lowest | +| gemini-2.0-flash | 0.25 | Very low | +| claude-haiku-3.5 | 0.30 | Low | +| gemini-2.5-flash | 0.30 | Low | +| gpt-4o-mini | 0.30 | Low | +| gpt-5-mini | 0.35 | Low | +| o3-mini | 0.50 | Medium | +| o1-mini | 0.80 | Medium-high | +| gpt-4o | 1.00 | Baseline | +| claude-sonnet-4 | 1.00 | Baseline | +| gemini-1.5-pro | 1.00 | Baseline | +| gpt-5 | 1.20 | High | +| gemini-2.5-pro | 1.20 | High | +| gpt-4-turbo | 1.50 | High | +| gpt-4 | 1.50 | High | +| claude-opus-4.5 | 1.80 | Very high | +| o1 | 2.00 | Highest | + +## Energy Caps + +Set a maximum energy budget for a run: + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(max_energy=100.0) as session: + result = await agent.run("Process this large dataset") + + summary = session.summary() + print(f"Energy used: {summary['energy_used']:.1f} units") +``` + +When energy exceeds the cap: +- In `observe` mode: logged but not enforced +- In `enforce` mode: execution stops with `action: "stop"` + +## Energy-Aware KPI Weights + +Include energy in KPI weights for carbon-aware routing: + +```python +with cascadeflow.run( + kpi_weights={"quality": 0.4, "cost": 0.3, "energy": 0.3} +) as session: + # Routes toward lower-energy models when quality allows + result = await agent.run("Summarize this article") +``` + +## Pricing Table + +Full pricing for all 18 supported models (USD per 1M tokens): + +| Model | Input | Output | +|---|---|---| +| **OpenAI** | | | +| gpt-4o | $2.50 | $10.00 | +| gpt-4o-mini | $0.15 | $0.60 | +| gpt-5 | $1.25 | $10.00 | +| gpt-5-mini | $0.20 | $0.80 | +| gpt-4-turbo | $10.00 | $30.00 | +| gpt-4 | $30.00 | $60.00 | +| gpt-3.5-turbo | $0.50 | $1.50 | +| o1 | $15.00 | $60.00 | +| o1-mini | $3.00 | $12.00 | +| o3-mini | $1.10 | $4.40 | +| **Anthropic** | | | +| claude-sonnet-4 | $3.00 | $15.00 | +| 
claude-haiku-3.5 | $1.00 | $5.00 | +| claude-opus-4.5 | $5.00 | $25.00 | +| **Google** | | | +| gemini-2.5-flash | $0.15 | $0.60 | +| gemini-2.5-pro | $1.25 | $10.00 | +| gemini-2.0-flash | $0.10 | $0.40 | +| gemini-1.5-flash | $0.075 | $0.30 | +| gemini-1.5-pro | $1.25 | $5.00 | diff --git a/docs-site/harness/kpi-optimization.mdx b/docs-site/harness/kpi-optimization.mdx new file mode 100644 index 00000000..e07e1023 --- /dev/null +++ b/docs-site/harness/kpi-optimization.mdx @@ -0,0 +1,103 @@ +--- +title: KPI-Weighted Routing +description: Inject business priorities as quality, cost, latency, and energy weights into every model routing decision. +--- + +# KPI-Weighted Routing + +The harness scores each model decision against configurable KPI weights. This lets teams encode business priorities into agent behavior without changing agent code. + +## KPI Dimensions + +| Dimension | Score Source | Range | What it means | +|---|---|---|---| +| `quality` | Model quality priors | 0.0-1.0 | Higher = better output quality | +| `cost` | Inverse of model cost | 0.0-1.0 | Higher = cheaper model | +| `latency` | Model latency priors | 0.0-1.0 | Higher = faster response | +| `energy` | Inverse of energy coefficient | 0.0-1.0 | Higher = lower compute intensity | + +## Configuration + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.9} +) as session: + result = await agent.run("Analyze this legal document") +``` + +### Weights + +Weights are relative — they don't need to sum to 1.0 (they are normalized internally). They control the relative importance of each dimension in the composite score. 
+ +```python +# Quality-first (premium workload) +kpi_weights = {"quality": 0.8, "cost": 0.1, "latency": 0.1} + +# Cost-first (high-volume batch) +kpi_weights = {"quality": 0.2, "cost": 0.7, "latency": 0.1} + +# Balanced +kpi_weights = {"quality": 0.4, "cost": 0.3, "latency": 0.2, "energy": 0.1} +``` + +### Targets + +Targets set minimum acceptable values. If a model's score for a dimension falls below the target, it is penalized in the composite score. + +```python +kpi_targets = { + "quality": 0.9, # Require high quality + "latency": 0.7, # Require reasonable speed +} +``` + +## Scoring Formula + +The composite score for a model is: + +``` +score = quality_prior * w_quality + cost_utility * w_cost + latency_prior * w_latency + energy_utility * w_energy +``` + +Where `w_*` are the normalized weights and utility values are computed from model priors. + +## Quality Priors + +Built-in quality priors for common models (OpenAI): + +| Model | Quality | Latency | +|---|---|---| +| o1 | 0.95 | 0.40 | +| gpt-4o | 0.90 | 0.72 | +| gpt-4-turbo | 0.88 | 0.66 | +| gpt-4 | 0.87 | 0.52 | +| gpt-5-mini | 0.86 | 0.84 | +| o1-mini | 0.82 | 0.60 | +| o3-mini | 0.80 | 0.78 | +| gpt-4o-mini | 0.75 | 0.93 | +| gpt-3.5-turbo | 0.65 | 1.00 | + +## Per-Agent KPI Weights + +Different agents can have different priorities: + +```python +@cascadeflow.agent( + budget=0.50, + kpi_weights={"quality": 0.8, "cost": 0.2} +) +async def quality_agent(query: str): + return await llm.complete(query) + +@cascadeflow.agent( + budget=0.10, + kpi_weights={"cost": 0.8, "quality": 0.2} +) +async def budget_agent(query: str): + return await llm.complete(query) +``` diff --git a/docs-site/harness/modes.mdx b/docs-site/harness/modes.mdx new file mode 100644 index 00000000..46a86840 --- /dev/null +++ b/docs-site/harness/modes.mdx @@ -0,0 +1,78 @@ +--- +title: Harness Modes +description: Three harness modes — off, observe, and enforce — with rollout guidance for production deployments. 
+--- + +# Harness Modes + +cascadeflow operates in one of three modes, set at initialization. + +## Modes + +### `off` + +No tracking, no enforcement. The harness is completely disabled. This is the default. + +```python +cascadeflow.init(mode="off") +``` + +### `observe` + +Track all metrics and decisions, but never block execution. Every LLM call and tool execution is recorded with full decision traces. Actions are computed but not enforced — `applied` is always `false` in trace records. + +```python +cascadeflow.init(mode="observe") +``` + +Use `observe` for: +- Initial production rollout to validate metrics before enforcing +- Shadow-mode testing to understand what the harness would do +- Cost and usage analytics without affecting agent behavior + +### `enforce` + +Track all metrics and enforce constraints. When a hard cap is hit (budget, tool calls, latency, energy) or a compliance violation is detected, the harness takes action: `stop`, `deny_tool`, or `switch_model`. + +```python +cascadeflow.init(mode="enforce") +``` + +Use `enforce` when: +- You have validated metrics in `observe` mode +- You need hard budget caps to prevent runaway costs +- Compliance requirements mandate model gating + +## Rollout Guidance + +Recommended rollout sequence for production: + +1. **Deploy with `observe`** — No risk to agent behavior. Collect metrics, review decision traces, validate that the harness sees what you expect. + +2. **Review traces** — Check that compliance allowlists, budget calculations, and KPI scoring match your expectations. + +3. **Switch to `enforce`** — Once validated, change the mode. The harness will now enforce constraints. + +4. **Monitor** — Use `session.summary()` and `session.trace()` to monitor enforcement in production. 
+ +```python +import os + +# Environment-driven mode selection +mode = os.getenv("CASCADEFLOW_MODE", "observe") +cascadeflow.init(mode=mode) +``` + +## Mode Behavior Matrix + +| Behavior | `off` | `observe` | `enforce` | +|---|---|---|---| +| Cost tracking | No | Yes | Yes | +| Latency tracking | No | Yes | Yes | +| Energy tracking | No | Yes | Yes | +| Decision traces | No | Yes | Yes | +| Budget enforcement | No | No | Yes | +| Tool call gating | No | No | Yes | +| Compliance gating | No | No | Yes | +| `session.summary()` | Empty | Full metrics | Full metrics | +| `session.trace()` | Empty | Decisions (applied=false) | Decisions (applied=true) | diff --git a/docs-site/harness/overview.mdx b/docs-site/harness/overview.mdx new file mode 100644 index 00000000..8486c8c4 --- /dev/null +++ b/docs-site/harness/overview.mdx @@ -0,0 +1,80 @@ +--- +title: Harness Overview +description: Overview of the cascadeflow harness — six optimization dimensions, HarnessConfig surface, and high-level decision flow. +--- + +# Harness Overview + +The cascadeflow harness is an in-process intelligence layer that wraps AI agent execution. It tracks, scores, and optionally enforces constraints across six dimensions for every LLM call and tool execution inside agent loops. 
+ +## Six Dimensions + +| Dimension | What it measures | Hard cap | Soft scoring | +|---|---|---|---| +| **Cost** | Estimated USD from the pricing table | `budget` | `kpi_weights.cost` | +| **Latency** | Wall-clock milliseconds per LLM call | `max_latency_ms` | `kpi_weights.latency` | +| **Quality** | Model quality priors (0-1 score) | -- | `kpi_weights.quality` | +| **Tool calls** | Count of tool/function calls | `max_tool_calls` | -- | +| **Energy** | Compute-intensity coefficient | `max_energy` | `kpi_weights.energy` | +| **Compliance** | Model allowlist per regulation | `compliance` | -- | + +## HarnessConfig + +All harness behavior is configured through a single dataclass: + +```python +from cascadeflow import HarnessConfig + +config = HarnessConfig( + mode="enforce", # "off" | "observe" | "enforce" + verbose=False, # Print decisions to stderr + budget=0.50, # Max USD for the run (None = unlimited) + max_tool_calls=10, # Max tool/function calls (None = unlimited) + max_latency_ms=5000.0, # Max wall-clock ms per call (None = unlimited) + max_energy=100.0, # Max energy units (None = unlimited) + kpi_targets={"quality": 0.9}, # Target values for KPI dimensions + kpi_weights={ # Relative importance of each dimension + "quality": 0.6, + "cost": 0.3, + "latency": 0.1, + }, + compliance="gdpr", # "gdpr" | "hipaa" | "pci" | "strict" | None +) +``` + +## Activation + +```python +import cascadeflow + +# Global activation +cascadeflow.init(mode="observe") + +# Scoped run with overrides +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + # agent code + pass + +# Decorated agent function +@cascadeflow.agent(budget=0.20, compliance="gdpr") +async def my_agent(query: str): + pass +``` + +## Decision Flow + +For each LLM call or tool execution: + +1. **Record** model, step number, cumulative cost, latency, energy +2. **Check compliance** — is the model in the allowlist for the configured regulation? +3. 
**Check hard caps** — budget, tool calls, latency, energy +4. **Score KPI dimensions** — quality, cost, latency, energy weighted by `kpi_weights` +5. **Decide action** — `allow`, `switch_model`, `deny_tool`, or `stop` +6. **Enforce or log** — enforce in `enforce` mode, log only in `observe` mode +7. **Append trace** — full decision record for auditability + +## Supported Models + +The harness includes a built-in pricing table for 18 models across OpenAI, Anthropic, and Google. Unknown models are resolved via fuzzy matching (e.g. `gpt-5-mini` matches even before official pricing is announced). + +See [Energy Tracking](/harness/energy-tracking) for the full pricing and energy coefficients table. diff --git a/docs-site/index.mdx b/docs-site/index.mdx new file mode 100644 index 00000000..2e99a0a2 --- /dev/null +++ b/docs-site/index.mdx @@ -0,0 +1,91 @@ +--- +title: cascadeflow +description: Agent runtime intelligence layer — optimize cost, latency, quality, budget, compliance, and energy across AI agent workflows. +--- + +# cascadeflow + +The in-process intelligence layer for AI agents. Optimize cost, latency, quality, budget, compliance, and energy — inside the execution loop, not at the HTTP boundary. + + + + Get running in 3 minutes with zero code changes. + + + Two engines: Cascade for model routing, Harness for agent intelligence. + + + Budget enforcement, compliance gating, KPI-weighted routing, energy tracking. + + + LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, Vercel AI SDK. + + + +## Install + + + +```bash pip +pip install cascadeflow +``` + +```bash npm +npm install @cascadeflow/core +``` + + + +## Quick Start + + + +```python Observe (zero-change) +import cascadeflow +cascadeflow.init(mode="observe") +# All OpenAI/Anthropic SDK calls are now tracked. 
+``` + +```python Scoped Run +import cascadeflow +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Analyze this dataset") + print(session.summary()) +``` + +```python Decorated Agent +import cascadeflow +cascadeflow.init(mode="enforce") + +@cascadeflow.agent(budget=0.20, compliance="gdpr") +async def my_agent(query: str): + return await llm.complete(query) +``` + + + +## Supported Frameworks + +| Framework | Python | TypeScript | Integration Type | +|---|---|---|---| +| LangChain / LangGraph | `cascadeflow[langchain]` | `@cascadeflow/langchain` | Callback handler | +| OpenAI Agents SDK | `cascadeflow[openai-agents]` | -- | ModelProvider | +| CrewAI | `cascadeflow[crewai]` | -- | llm_hooks | +| Google ADK | `cascadeflow[google-adk]` | -- | BasePlugin | +| n8n | -- | `@cascadeflow/n8n-nodes-cascadeflow` | Community node | +| Vercel AI SDK | -- | `@cascadeflow/vercel-ai` | Middleware | + +## Six Dimensions + +cascadeflow optimizes across six dimensions simultaneously: + +| Dimension | What it controls | Example | +|---|---|---| +| **Cost** | USD per LLM call from pricing table | Budget cap of $0.50 per run | +| **Latency** | Wall-clock milliseconds per call | Max 2000ms per call | +| **Quality** | Model quality priors for routing | 60% weight on quality KPI | +| **Budget** | Cumulative spend tracking and caps | Per-user daily limits | +| **Compliance** | Model allowlists per regulation | GDPR: only gpt-4o, gpt-4o-mini | +| **Energy** | Compute-intensity coefficients | Carbon-aware model selection | diff --git a/docs-site/integrations/crewai.mdx b/docs-site/integrations/crewai.mdx new file mode 100644 index 00000000..1fae1fde --- /dev/null +++ b/docs-site/integrations/crewai.mdx @@ -0,0 +1,78 @@ +--- +title: CrewAI +description: Hook-based harness integration for CrewAI with budget gating, metrics tracking, and decision traces across crew steps. 
+--- + +# CrewAI Integration + +cascadeflow integrates with CrewAI through the native `llm_hooks` system. Call `enable()` to register global hooks that track all crew steps, enforce budget caps, and record decision traces. + +## Install + +```bash +pip install "cascadeflow[crewai]" +``` + +## Quick Start + +```python +from crewai import Agent, Crew, Process, Task +import cascadeflow +from cascadeflow.integrations.crewai import CrewAIHarnessConfig, enable + +cascadeflow.init(mode="observe") + +# Enable harness hooks +config = CrewAIHarnessConfig( + fail_open=True, + budget_gate=True, +) +enable(config=config) + +# Define agents and tasks as usual +researcher = Agent( + role="Researcher", + goal="Find relevant information", + llm="gpt-4o-mini", +) + +task = Task( + description="Research the topic of AI agent frameworks", + agent=researcher, +) + +crew = Crew( + agents=[researcher], + tasks=[task], + process=Process.sequential, +) + +# Run with budget tracking +with cascadeflow.run(budget=1.00) as session: + result = crew.kickoff() + print(session.summary()) + for record in session.trace(): + print(f"Step {record['step']}: {record['action']} — {record['reason']}") +``` + +## Configuration + +```python +config = CrewAIHarnessConfig( + fail_open=True, # Continue on harness errors + budget_gate=True, # Enforce budget caps +) +``` + +## Features + +- Tracks all crew steps automatically via `llm_hooks` +- Budget gating stops crew execution when budget is exceeded +- Full decision trace across all agents in the crew +- Fail-open mode for production safety +- No changes to existing CrewAI agent or task definitions + +## Limitations + +- Tool-level gating is not currently applied (CrewAI hooks operate at the LLM call level) +- Model switching depends on CrewAI's model configuration diff --git a/docs-site/integrations/google-adk.mdx b/docs-site/integrations/google-adk.mdx new file mode 100644 index 00000000..8b6f3403 --- /dev/null +++ b/docs-site/integrations/google-adk.mdx @@ 
-0,0 +1,91 @@
+---
+title: Google ADK
+description: Plugin-based harness integration for Google Agent Development Kit with budget enforcement and metrics tracking.
+---
+
+# Google ADK Integration
+
+cascadeflow integrates with Google's Agent Development Kit (ADK) through the `BasePlugin` system. Call `enable()` to get a plugin that plugs into `Runner(plugins=[...])`.
+
+## Install
+
+```bash
+pip install "cascadeflow[google-adk]"
+```
+
+Requires Python 3.10+.
+
+## Quick Start
+
+```python
+import asyncio
+from google.adk.agents import Agent
+from google.adk.runners import Runner
+from google.adk.sessions import InMemorySessionService
+from google.genai.types import Content, Part
+
+import cascadeflow
+from cascadeflow.integrations.google_adk import GoogleADKHarnessConfig, enable
+
+cascadeflow.init(mode="observe")
+
+# Enable harness plugin
+config = GoogleADKHarnessConfig(
+    fail_open=True,
+    enable_budget_gate=True,
+)
+plugin = enable(config=config)
+
+# Create ADK agent
+agent = Agent(
+    name="research_agent",
+    model="gemini-2.5-flash",
+    instruction="You are a helpful research assistant.",
+)
+
+# Run with plugin
+session_service = InMemorySessionService()
+runner = Runner(agent=agent, session_service=session_service, plugins=[plugin])
+
+async def main():
+    with cascadeflow.run(budget=0.50) as session:
+        user_content = Content(parts=[Part(text="Explain cascadeflow")])
+        async for event in runner.run_async(
+            session_id="test",
+            user_id="user-1",
+            new_message=user_content,
+        ):
+            pass  # Process streaming events
+
+        print(session.summary())
+
+asyncio.run(main())
+```
+
+## Configuration
+
+```python
+config = GoogleADKHarnessConfig(
+    fail_open=True,  # Continue on harness errors
+    enable_budget_gate=True,  # Enforce budget caps
+)
+```
+
+## Supported Gemini Models
+
+| Model | Input $/1M | Output $/1M | Energy Coeff |
+|---|---|---|---|
+| gemini-2.5-flash | $0.15 | $0.60 | 0.30 |
+| gemini-2.5-pro | $1.25 | $10.00 | 1.20 |
+| gemini-2.0-flash | $0.10 | $0.40 | 0.25 |
+| 
gemini-1.5-flash | $0.075 | $0.30 | 0.20 | +| gemini-1.5-pro | $1.25 | $5.00 | 1.00 | + +## Budget Enforcement + +When budget is exceeded in `enforce` mode, the plugin returns an `LlmResponse` with `error_code="BUDGET_EXCEEDED"`. The ADK runner handles this as a graceful stop. + +## Limitations + +- Tool gating is not applied (intentional design choice — ADK manages tool execution internally) +- Model switching depends on ADK's model configuration diff --git a/docs-site/integrations/langchain.mdx b/docs-site/integrations/langchain.mdx new file mode 100644 index 00000000..2f29062f --- /dev/null +++ b/docs-site/integrations/langchain.mdx @@ -0,0 +1,106 @@ +--- +title: LangChain +description: Harness-aware callback handler for LangChain and LangGraph with budget tracking, cost analytics, and decision traces. +--- + +# LangChain Integration + +cascadeflow integrates with LangChain through a callback handler that wraps any `BaseChatModel`. Works with LCEL chains, streaming, tool calling, structured output, and LangGraph agents. 
+ +## Install + + + +```bash Python +pip install "cascadeflow[langchain]" +``` + +```bash TypeScript +npm install @cascadeflow/langchain @langchain/core @langchain/openai +``` + + + +## Quick Start + + + +```python Python — Harness callback +import cascadeflow +from cascadeflow.integrations.langchain import get_harness_callback +from langchain_openai import ChatOpenAI + +cascadeflow.init(mode="observe") + +model = ChatOpenAI(model="gpt-4o") +cb = get_harness_callback() + +with cascadeflow.run(budget=0.50) as session: + result = await model.ainvoke("Explain quantum computing", config={"callbacks": [cb]}) + print(session.summary()) +``` + +```python Python — Cascade routing +from langchain_openai import ChatOpenAI +from langchain_anthropic import ChatAnthropic +from cascadeflow.integrations.langchain import CascadeFlow + +cascade = CascadeFlow( + drafter=ChatOpenAI(model="gpt-4o-mini"), + verifier=ChatAnthropic(model="claude-sonnet-4"), + quality_threshold=0.8, +) + +result = await cascade.ainvoke("Explain quantum computing") +``` + +```typescript TypeScript — Drop-in cascade +import { ChatOpenAI } from '@langchain/openai'; +import { ChatAnthropic } from '@langchain/anthropic'; +import { withCascade } from '@cascadeflow/langchain'; + +const cascade = withCascade({ + drafter: new ChatOpenAI({ model: 'gpt-4o-mini' }), + verifier: new ChatAnthropic({ model: 'claude-sonnet-4' }), + qualityThreshold: 0.8, +}); + +const result = await cascade.invoke('Explain quantum computing'); +``` + + + +## Features + +- Full LCEL support (pipes, sequences, batch) +- Streaming with pre-routing +- Tool calling and structured output +- LangSmith cost tracking metadata +- Cost tracking callbacks +- Domain policies with `cascadeflow_domain` metadata + +## Cost Tracking Callback + +```python +from cascadeflow.integrations.langchain.langchain_callbacks import get_cascade_callback + +with get_cascade_callback() as cb: + response = await cascade.ainvoke("What is Python?") + print(f"Total cost: 
${cb.total_cost:.6f}") + print(f"Drafter cost: ${cb.drafter_cost:.6f}") + print(f"Verifier cost: ${cb.verifier_cost:.6f}") +``` + +## LangSmith Integration + +When LangSmith tracing is enabled, cascadeflow adds metadata to runs: +- `cascade_decision`: whether the drafter was accepted +- `modelUsed`: which model produced the final response +- `drafterQuality`: quality score from validation +- `savingsPercentage`: cost savings achieved + +```bash +export LANGSMITH_API_KEY="..." +export LANGSMITH_PROJECT="my-project" +export LANGSMITH_TRACING=true +``` diff --git a/docs-site/integrations/n8n.mdx b/docs-site/integrations/n8n.mdx new file mode 100644 index 00000000..efb89f51 --- /dev/null +++ b/docs-site/integrations/n8n.mdx @@ -0,0 +1,70 @@ +--- +title: n8n +description: cascadeflow community nodes for n8n with cascade model routing, tool gating, and harness modes for no-code AI workflows. +--- + +# n8n Integration + +cascadeflow provides two community nodes for n8n workflows: a Model sub-node for drop-in cascade routing and an Agent node for standalone multi-step reasoning. + +## Install + +In n8n: +1. Go to **Settings** > **Community Nodes** +2. Search for: `@cascadeflow/n8n-nodes-cascadeflow` +3. Click **Install** + +Or via npm: +```bash +npm install @cascadeflow/n8n-nodes-cascadeflow +``` + +## Two Nodes + +| Node | Type | Use Case | +|---|---|---| +| **CascadeFlow (Model)** | Language Model sub-node | Drop-in for any Chain/LLM node | +| **CascadeFlow Agent** | Standalone agent | Tool calling, memory, multi-step reasoning | + +## CascadeFlow (Model) + +Drop-in replacement for any AI Chat Model in n8n chains: + +1. Add two **AI Chat Model** nodes (cheap drafter + powerful verifier) +2. Add **CascadeFlow (Model)** and connect both models +3. Connect to a **Basic LLM Chain** or **Chain** node +4. 
Check the **Logs tab** to see cascade decisions + +**Features:** +- Quality threshold (default: 0.4) +- 16 supported domains (Code, Math, Data, Legal, Medical, Financial, etc.) +- Complexity thresholds for automatic routing + +## CascadeFlow Agent + +Standalone agent with tool calling and multi-step reasoning: + +1. Add a **Chat Trigger** node +2. Add **CascadeFlow Agent** and connect to the trigger +3. Connect **Drafter**, **Verifier**, optional **Memory** and **Tools** +4. Check the **Output tab** for cascade metadata and decision trace + +**Features:** +- Harness mode: `observe` or `enforce` +- Budget caps and tool call limits +- Tool routing rules: Cascade (default) or Verifier (for high-stakes tools) +- Tool call validation with JSON schema checking + +## Complexity Thresholds + +| Level | Threshold | Routing | +|---|---|---| +| Trivial | 0.25 | Always use drafter | +| Simple | 0.40 | Prefer drafter | +| Moderate | 0.55 | Quality-dependent | +| Hard | 0.70 | Prefer verifier | +| Expert | 0.80 | Always use verifier | + +## Result + +40-85% cost savings in n8n workflows with zero changes to existing chains. diff --git a/docs-site/integrations/openai-agents.mdx b/docs-site/integrations/openai-agents.mdx new file mode 100644 index 00000000..1a189a6b --- /dev/null +++ b/docs-site/integrations/openai-agents.mdx @@ -0,0 +1,77 @@ +--- +title: OpenAI Agents SDK +description: CascadeFlowModelProvider for OpenAI Agents SDK with model candidates, tool gating, and budget tracking. +--- + +# OpenAI Agents SDK Integration + +cascadeflow provides a `CascadeFlowModelProvider` that integrates with the OpenAI Agents SDK as an explicit `ModelProvider`. Supports model candidates, tool gating, and scoped budget tracking. 
+ +## Install + +```bash +pip install "cascadeflow[openai-agents]" +``` + +## Quick Start + +```python +import asyncio +from agents import Agent, Runner +import cascadeflow +from cascadeflow.integrations.openai_agents import ( + CascadeFlowModelProvider, + OpenAIAgentsIntegrationConfig, +) + +cascadeflow.init(mode="observe") + +# Configure integration +config = OpenAIAgentsIntegrationConfig( + model_candidates=["gpt-4o-mini", "gpt-4o"], + enable_tool_gating=True, +) + +provider = CascadeFlowModelProvider(config=config) + +agent = Agent( + name="research_agent", + instructions="You are a helpful research assistant.", + model_provider=provider, +) + +async def main(): + with cascadeflow.run(budget=0.50) as session: + result = await Runner.run(agent, "Explain cascadeflow") + print(result.final_output) + print(session.summary()) + +asyncio.run(main()) +``` + +## Features + +- **Model candidates**: List of models the provider can select from based on harness scoring +- **Tool gating**: Block tool calls when `max_tool_calls` is reached +- **Scoped runs**: Use `cascadeflow.run()` for per-task budget tracking +- **Decision traces**: Full audit trail of model selection and tool gating decisions +- **Fail-open**: If the harness encounters an error, execution continues with the default model + +## Configuration + +```python +config = OpenAIAgentsIntegrationConfig( + model_candidates=["gpt-4o-mini", "gpt-4o"], # Models to choose from + enable_tool_gating=True, # Block tools at cap +) +``` + +## Session Metrics + +After a run, `session.summary()` includes: +- `cost_total`: cumulative USD spent +- `budget_remaining`: USD left in the budget +- `step_count`: number of LLM calls +- `tool_calls`: number of tool executions +- `latency_used_ms`: total latency +- `energy_used`: total energy units diff --git a/docs-site/integrations/overview.mdx b/docs-site/integrations/overview.mdx new file mode 100644 index 00000000..92bda53e --- /dev/null +++ b/docs-site/integrations/overview.mdx @@ 
-0,0 +1,53 @@ +--- +title: Integrations Overview +description: Matrix of all cascadeflow framework integrations with supported features, languages, and integration patterns. +--- + +# Integrations Overview + +cascadeflow integrates with six agent frameworks. All integrations are opt-in — install the extra and explicitly enable. + +## Integration Matrix + +| Framework | Language | Package | Integration Type | Budget Gating | Tool Gating | Traces | +|---|---|---|---|---|---|---| +| [LangChain](/integrations/langchain) | Python, TS | `cascadeflow[langchain]`, `@cascadeflow/langchain` | Callback handler | Yes | No | Yes | +| [OpenAI Agents SDK](/integrations/openai-agents) | Python | `cascadeflow[openai-agents]` | ModelProvider | Yes | Yes | Yes | +| [CrewAI](/integrations/crewai) | Python | `cascadeflow[crewai]` | llm_hooks | Yes | No | Yes | +| [Google ADK](/integrations/google-adk) | Python | `cascadeflow[google-adk]` | BasePlugin | Yes | No | Yes | +| [n8n](/integrations/n8n) | TypeScript | `@cascadeflow/n8n-nodes-cascadeflow` | Community node | Yes | Yes | Yes | +| [Vercel AI SDK](/integrations/vercel-ai) | TypeScript | `@cascadeflow/vercel-ai` | Middleware | Yes | No | Yes | + +## Integration Patterns + +Each integration follows the same principle: wrap the framework's extension point with cascadeflow's harness, without modifying agent code. 
+ +### Python + +```python +import cascadeflow +cascadeflow.init(mode="observe") + +# Framework-specific activation +from cascadeflow.integrations.langchain import get_harness_callback +from cascadeflow.integrations.openai_agents import CascadeFlowModelProvider +from cascadeflow.integrations.crewai import enable as enable_crewai +from cascadeflow.integrations.google_adk import enable as enable_adk +``` + +### TypeScript + +```bash +npm install @cascadeflow/langchain +npm install @cascadeflow/vercel-ai +npm install @cascadeflow/n8n-nodes-cascadeflow +``` + +## Choosing an Integration + +- **LangChain/LangGraph**: Use if you have existing LangChain chains or agents. The callback handler wraps any `BaseChatModel`. +- **OpenAI Agents SDK**: Use if you're building with OpenAI's Agents SDK. The `ModelProvider` supports model candidates and tool gating. +- **CrewAI**: Use if you're building multi-agent crews. The `llm_hooks` integration tracks all crew steps. +- **Google ADK**: Use if you're building with Google's Agent Development Kit. The plugin integrates with `Runner`. +- **n8n**: Use if you're building no-code workflows. The community node adds cascade routing to any n8n flow. +- **Vercel AI SDK**: Use if you're building TypeScript server-side agents. The middleware wraps AI SDK streams. diff --git a/docs-site/integrations/vercel-ai.mdx b/docs-site/integrations/vercel-ai.mdx new file mode 100644 index 00000000..9b2d9257 --- /dev/null +++ b/docs-site/integrations/vercel-ai.mdx @@ -0,0 +1,88 @@ +--- +title: Vercel AI SDK +description: TypeScript middleware integration for Vercel AI SDK with cascade routing, multi-turn chat, and tool execution. +--- + +# Vercel AI SDK Integration + +cascadeflow integrates with the Vercel AI SDK as middleware, providing cascade routing for server-side AI applications with streaming support. 
+ +## Install + +```bash +npm install @cascadeflow/vercel-ai +``` + +## Quick Start + +```typescript +import { createChatHandler } from '@cascadeflow/vercel-ai'; +import { CascadeAgent } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const handler = createChatHandler(agent, { + protocol: 'data', // AI SDK v4 data stream + tools, // Tool definitions + toolHandlers, // Server-side tool execution + maxSteps: 5, // Multi-step tool loops +}); + +// Use in Next.js API route, Express, or any Node.js server +export const POST = handler; +``` + +## Features + +- **AI SDK v4 `data` stream** and **AI SDK v5/v6 UI streams** +- **`useChat` multi-turn support** — conversation history preserved +- **`parts` message format** (AI SDK v6) +- **Tool call streaming visibility** — see tool calls as they happen +- **Server-side tool execution** via `toolExecutor` or `toolHandlers` +- **Multi-step controls**: `maxSteps`, `forceDirect` +- **Cascade decision stream parts** — optional metadata in the stream +- **Request-level overrides** with allowlist + shared-secret guard + +## Multi-Turn Chat + +```typescript +import { useChat } from 'ai/react'; + +export default function Chat() { + const { messages, input, handleSubmit, handleInputChange } = useChat({ + api: '/api/chat', + }); + + return ( +
+    <div>
+      {messages.map((m) => (
+        <div key={m.id}>{m.content}</div>
+      ))}
+      <form onSubmit={handleSubmit}>
+        <input value={input} onChange={handleInputChange} />
+      </form>
+    </div>
+ ); +} +``` + +## Request Overrides + +Override cascade behavior per request (protected by shared secret): + +```typescript +const handler = createChatHandler(agent, { + protocol: 'data', + allowOverrides: ['forceDirect', 'maxSteps'], + overrideSecret: process.env.OVERRIDE_SECRET, +}); +``` + +## Result + +40-85% cost savings for Vercel AI SDK applications with streaming support and zero client-side changes. diff --git a/docs-site/logo/cascadeflow-dark.svg b/docs-site/logo/cascadeflow-dark.svg new file mode 100644 index 00000000..3c1a2870 --- /dev/null +++ b/docs-site/logo/cascadeflow-dark.svg @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs-site/logo/cascadeflow-light.svg b/docs-site/logo/cascadeflow-light.svg new file mode 100644 index 00000000..8ca48234 --- /dev/null +++ b/docs-site/logo/cascadeflow-light.svg @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index 5280a562..08c5c0c8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,8 +1,10 @@ # cascadeflow Documentation -Welcome to cascadeflow documentation! 🌊 +> **Full documentation is now at [docs.cascadeflow.dev](https://docs.cascadeflow.dev)** — the Mintlify-powered docs site is the primary reference for cascadeflow's agent runtime intelligence layer. The guides below remain for quick reference and deep links. -## 📖 Quick Links +Agent runtime intelligence layer — optimize cost, latency, quality, budget, compliance, and energy across AI agent workflows. In-process harness, not a proxy. + +## Quick Links - [Installation Guide](INSTALLATION.md) - [Quick Start Guide](guides/quickstart.md) @@ -40,6 +42,7 @@ Welcome to cascadeflow documentation! 
🌊 - [Agent Intelligence V2/V2.1 Plan](strategy/agent-intelligence-v2-plan.md) - Unified strategic and execution plan for in-process agent intelligence harness delivery ### Integrations +- [LangChain Integration](guides/langchain_integration.md) - Callback handler for LangChain/LangGraph with harness-aware cascading - [OpenAI Agents SDK Integration](guides/openai_agents_integration.md) - Harness-aware model provider for existing OpenAI Agents apps - [CrewAI Integration](guides/crewai_integration.md) - Hook-based harness metrics + budget gating (opt-in) - [Google ADK Integration](guides/google_adk_integration.md) - Plugin-based harness integration for ADK runners (opt-in) diff --git a/llms.txt b/llms.txt index 51bb8437..dbba72ca 100644 --- a/llms.txt +++ b/llms.txt @@ -1,7 +1,7 @@ # cascadeflow -> Agent intelligence harness for cost, quality, latency, energy, and business KPI optimization. -> In-process library (not a proxy). Works inside agent loops with full state awareness. +> Agent runtime intelligence layer for AI agent workflows. +> In-process harness (not a proxy). Works inside agent loops with full state awareness. ## Install @@ -13,6 +13,41 @@ import cascadeflow cascadeflow.init(mode="observe") # All openai/anthropic SDK calls are now tracked. Switch to "enforce" for budget gating. +## What cascadeflow is + +cascadeflow is an in-process intelligence layer that sits inside AI agent execution +loops. Unlike external proxies that only see HTTP request boundaries, cascadeflow +operates with full agent state awareness: step count, budget consumed, tool call +history, error context, quality scores, domain, complexity, and user-defined +business context. + +Eight things make cascadeflow different: + +1. Inside-the-loop control. Decisions happen per-step and per-tool-call inside + agent execution, not at the HTTP boundary. This enables budget gating mid-run, + model switching based on remaining budget, and stop actions when caps are hit. + +2. 
Multi-dimensional optimization. Six dimensions scored simultaneously: cost, + latency, quality, budget, compliance, and energy. Not just cost routing. + +3. Business logic injection. KPI weights and targets let teams encode business + priorities (e.g. 60% quality, 30% cost, 10% latency) into every model decision. + +4. Actionable decisions. Four actions: allow, switch_model, deny_tool, stop. + The harness does not just observe — it controls execution flow. + +5. Full transparency. Every decision produces a trace record with action, reason, + model, step, cost_total, budget_state, and applied fields. Audit-ready. + +6. Measurable value. Session summaries report cost, latency, energy, steps, + tool calls, and budget remaining. Before/after comparison is built in. + +7. Category leadership. Cross-framework policy layer with unified KPI semantics + across LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, Vercel AI SDK. + +8. Latency advantage. In-process instrumentation adds <1ms overhead per call. + External proxies add 10-50ms of network round-trip latency per LLM call. 
+ ## When to use cascadeflow - You run AI agents (LangChain, LangGraph, CrewAI, OpenAI Agents SDK, Google ADK, or custom) @@ -28,6 +63,17 @@ cascadeflow.init(mode="observe") - You only use one model and don't want routing - You need a hosted proxy service (cascadeflow is a library, not a SaaS) +## Proxy vs cascadeflow + +| Dimension | External proxy | cascadeflow harness | +|--------------------|----------------------------|------------------------------| +| Scope | HTTP request boundary | Inside agent execution loop | +| Dimensions | Cost only | Cost + quality + latency + budget + compliance + energy | +| Latency overhead | 10-50ms network RTT | <1ms in-process | +| Business logic | None | KPI weights and targets | +| Enforcement | None (observe only) | stop, deny_tool, switch_model | +| Auditability | Request logs | Per-step decision traces | + ## Key APIs - cascadeflow.init(mode) -- activate harness globally (off | observe | enforce) @@ -36,6 +82,20 @@ cascadeflow.init(mode="observe") - session.summary() -- structured run metrics (cost, latency, energy, steps, tool calls) - session.trace() -- full decision trace for auditability +## HarnessConfig Reference + +@dataclass +class HarnessConfig: + mode: HarnessMode # "off" | "observe" | "enforce". Default: "off" + verbose: bool # Print decisions to stderr. Default: False + budget: Optional[float] # Max USD for the run. Default: None (unlimited) + max_tool_calls: Optional[int] # Max tool/function calls. Default: None + max_latency_ms: Optional[float] # Max wall-clock ms per call. Default: None + max_energy: Optional[float] # Max energy units. 
Default: None + kpi_targets: Optional[dict] # {"quality": 0.9, "cost": 0.5, ...} + kpi_weights: Optional[dict] # {"quality": 0.6, "cost": 0.3, "latency": 0.1} + compliance: Optional[str] # "gdpr" | "hipaa" | "pci" | "strict" + ## Harness Modes - off: no tracking, no enforcement @@ -50,6 +110,31 @@ cascadeflow.init(mode="observe") - Tool calls: count of tool/function calls executed - Quality: model quality priors for KPI-weighted scoring +## Decision Actions + +- allow: proceed normally +- switch_model: route to cheaper/better model (where runtime allows) +- deny_tool: block tool execution when tool call cap reached +- stop: halt agent loop when budget/latency/energy cap exceeded + +## Decision Trace Format + +Each decision produces a record with these fields: +- action: "allow" | "switch_model" | "deny_tool" | "stop" +- reason: human-readable explanation +- model: model name used for the call +- step: integer step number in the run +- cost_total: cumulative cost in USD at this step +- budget_state: "ok" | "warning" | "exceeded" +- applied: true if the action was enforced (false in observe mode) + +## Compliance Model Allowlists + +- gdpr: gpt-4o, gpt-4o-mini, gpt-3.5-turbo +- hipaa: gpt-4o, gpt-4o-mini +- pci: gpt-4o-mini, gpt-3.5-turbo +- strict: gpt-4o only + ## Integrations pip install cascadeflow[langchain] # LangChain/LangGraph callback handler @@ -57,31 +142,73 @@ pip install cascadeflow[openai-agents] # OpenAI Agents SDK ModelProvider pip install cascadeflow[crewai] # CrewAI llm_hooks integration pip install cascadeflow[google-adk] # Google ADK BasePlugin -All integrations are opt-in. Install the extra and explicitly enable the integration. 
- -## Integration Patterns +npm install @cascadeflow/core # TypeScript core +npm install @cascadeflow/langchain # LangChain TypeScript +npm install @cascadeflow/vercel-ai # Vercel AI SDK middleware +npm install @cascadeflow/n8n-nodes-cascadeflow # n8n community node -- LangChain: HarnessAwareCascadeFlowCallbackHandler via get_harness_callback() -- OpenAI Agents SDK: CascadeFlowModelProvider with model candidates and tool gating -- CrewAI: enable() registers global llm_hooks for budget gating and tracking -- Google ADK: enable() returns a BasePlugin for Runner(plugins=[plugin]) -- n8n: Built-in harness mode (observe/enforce) on the Agent node with UI parameters -- Vercel AI SDK: TypeScript middleware integration - -## Decision Actions - -- allow: proceed normally -- switch_model: route to cheaper/better model (where runtime allows) -- deny_tool: block tool execution when tool call cap reached -- stop: halt agent loop when budget/latency/energy cap exceeded - -## Supported Models (pricing table) +All integrations are opt-in. Install the extra and explicitly enable the integration. 
-OpenAI: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-3.5-turbo, o1, o1-mini, o3-mini, gpt-5, gpt-5-mini -Anthropic: claude-sonnet-4, claude-haiku-3.5, claude-opus-4.5 -Google: gemini-2.5-flash, gemini-2.5-pro, gemini-2.0-flash, gemini-1.5-flash, gemini-1.5-pro +## Integration Code Snippets + +LangChain: + from cascadeflow.integrations.langchain import get_harness_callback + cb = get_harness_callback() + result = await model.ainvoke("query", config={"callbacks": [cb]}) + +OpenAI Agents SDK: + from cascadeflow.integrations.openai_agents import CascadeFlowModelProvider + provider = CascadeFlowModelProvider(model_candidates=["gpt-4o-mini", "gpt-4o"]) + +CrewAI: + from cascadeflow.integrations.crewai import enable + enable(budget_gate=True, fail_open=True) + +Google ADK: + from cascadeflow.integrations.google_adk import enable + plugin = enable(fail_open=True) + runner = Runner(agent=agent, plugins=[plugin]) + +## Pricing Table (USD per 1M tokens: input / output) + +OpenAI: + gpt-4o: $2.50 / $10.00 + gpt-4o-mini: $0.15 / $0.60 + gpt-5: $1.25 / $10.00 + gpt-5-mini: $0.20 / $0.80 + gpt-4-turbo: $10.00 / $30.00 + gpt-4: $30.00 / $60.00 + gpt-3.5-turbo: $0.50 / $1.50 + o1: $15.00 / $60.00 + o1-mini: $3.00 / $12.00 + o3-mini: $1.10 / $4.40 + +Anthropic: + claude-sonnet-4: $3.00 / $15.00 + claude-haiku-3.5: $1.00 / $5.00 + claude-opus-4.5: $5.00 / $25.00 + +Google: + gemini-2.5-flash: $0.15 / $0.60 + gemini-2.5-pro: $1.25 / $10.00 + gemini-2.0-flash: $0.10 / $0.40 + gemini-1.5-flash: $0.075 / $0.30 + gemini-1.5-pro: $1.25 / $5.00 + +## Energy Coefficients + +Model energy is computed as: energy_units = coeff * (input_tokens + output_tokens * 1.5) + + gpt-4o: 1.0 gpt-4o-mini: 0.3 gpt-5: 1.2 + gpt-5-mini: 0.35 gpt-4-turbo: 1.5 gpt-4: 1.5 + gpt-3.5-turbo: 0.2 o1: 2.0 o1-mini: 0.8 + o3-mini: 0.5 claude-sonnet-4: 1.0 claude-haiku-3.5: 0.3 + claude-opus-4.5: 1.8 gemini-2.5-flash: 0.3 gemini-2.5-pro: 1.2 + gemini-2.0-flash: 0.25 gemini-1.5-flash: 0.2 gemini-1.5-pro: 1.0 ## Links +- Docs: 
https://docs.cascadeflow.dev - Source: https://github.com/lemony-ai/cascadeflow - PyPI: pip install cascadeflow +- npm: npm install @cascadeflow/core diff --git a/pyproject.toml b/pyproject.toml index b746a6e0..bc7c7072 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "cascadeflow" version = "1.0.0" -description = "Smart AI model cascading for cost optimization - Save 40-85% on LLM costs with 2-6x faster responses. Available for Python and TypeScript/JavaScript." +description = "Agent runtime intelligence layer — optimize cost, latency, quality, budget, compliance, and energy across AI agent workflows." readme = "README.md" requires-python = ">=3.9" license = "MIT" @@ -32,9 +32,17 @@ keywords = [ "javascript", "browser", "edge-functions", + "agent-intelligence", + "runtime-optimization", + "budget-enforcement", + "compliance", + "harness", + "agent-runtime", + "kpi", + "energy-tracking", ] classifiers = [ - "Development Status :: 4 - Beta", + "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", @@ -141,7 +149,7 @@ all = [ [project.urls] Homepage = "https://lemony.ai" -Documentation = "https://github.com/lemony-ai/cascadeflow" +Documentation = "https://docs.cascadeflow.dev" Repository = "https://github.com/lemony-ai/cascadeflow" "Bug Tracker" = "https://github.com/lemony-ai/cascadeflow/issues" Changelog = "https://github.com/lemony-ai/cascadeflow/releases" From adbf47eee1b5bebb8921c6a96f7204a40bd79b08 Mon Sep 17 00:00:00 2001 From: saschabuehrle Date: Thu, 5 Mar 2026 17:15:21 +0100 Subject: [PATCH 49/49] fix: switch GitHub Stars badge from social to flat style Social-style shields.io badges intermittently render as "invalid" due to GitHub API rate limiting. Flat style is more reliable. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 27baf1be..51de5118 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ [![Python Docs](https://img.shields.io/badge/docs-Python-blue)](./docs/) [![TypeScript Docs](https://img.shields.io/badge/docs-TypeScript-red)](./docs/) [![X Follow](https://img.shields.io/twitter/follow/saschabuehrle?style=social)](https://x.com/saschabuehrle) -[![GitHub Stars](https://img.shields.io/github/stars/lemony-ai/cascadeflow?style=social)](https://github.com/lemony-ai/cascadeflow) +[![GitHub Stars](https://img.shields.io/github/stars/lemony-ai/cascadeflow?style=flat&color=yellow&label=Stars)](https://github.com/lemony-ai/cascadeflow/stargazers)