diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4b2411d5..3138b54f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,6 +47,52 @@ jobs: fail_ci_if_error: false token: ${{ secrets.CODECOV_TOKEN }} + # Python opt-in integration install + focused tests + test-python-optional-integrations: + name: Python Optional Integrations (${{ matrix.integration }} / py${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - integration: openai-agents + python-version: '3.9' + extras: ".[dev,openai,openai-agents]" + tests: "tests/test_openai_agents_integration.py" + - integration: openai-agents + python-version: '3.11' + extras: ".[dev,openai,openai-agents]" + tests: "tests/test_openai_agents_integration.py" + - integration: crewai + python-version: '3.11' + extras: ".[dev,crewai,openai]" + tests: "tests/test_crewai_integration.py" + - integration: google-adk + python-version: '3.11' + extras: ".[dev,google-adk]" + tests: "tests/test_google_adk_integration.py" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + + - name: Install integration dependencies + run: | + python -m pip install --upgrade pip + pip install -e "${{ matrix.extras }}" + + - name: Run focused integration tests + run: | + pytest ${{ matrix.tests }} -v + env: + PYTHONPATH: ${{ github.workspace }} + # TypeScript Core Tests test-typescript-core: name: TypeScript Core Tests diff --git a/README.md b/README.md index 63e9af87..51de5118 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ cascadeflow Logo -# Smart AI model cascading for cost optimization +# Agent Runtime Intelligence Layer [![PyPI version](https://img.shields.io/pypi/v/cascadeflow?color=blue&label=Python)](https://pypi.org/project/cascadeflow/) [![npm 
version](https://img.shields.io/npm/v/@cascadeflow/core?color=red&label=TypeScript)](https://www.npmjs.com/package/@cascadeflow/core) @@ -17,10 +17,11 @@ [![PyPI Downloads](https://static.pepy.tech/badge/cascadeflow)](https://pepy.tech/project/cascadeflow) [![npm Downloads](https://img.shields.io/npm/dt/@cascadeflow/n8n-nodes-cascadeflow?label=npm%20downloads&color=orange)](https://www.npmjs.com/search?q=%40cascadeflow) [![Tests](https://github.com/lemony-ai/cascadeflow/actions/workflows/test.yml/badge.svg)](https://github.com/lemony-ai/cascadeflow/actions/workflows/test.yml) +[![Docs](https://img.shields.io/badge/docs-cascadeflow.dev-blue)](https://docs.cascadeflow.dev) [![Python Docs](https://img.shields.io/badge/docs-Python-blue)](./docs/) [![TypeScript Docs](https://img.shields.io/badge/docs-TypeScript-red)](./docs/) [![X Follow](https://img.shields.io/twitter/follow/saschabuehrle?style=social)](https://x.com/saschabuehrle) -[![GitHub Stars](https://img.shields.io/github/stars/lemony-ai/cascadeflow?style=social)](https://github.com/lemony-ai/cascadeflow) +[![GitHub Stars](https://img.shields.io/github/stars/lemony-ai/cascadeflow?style=flat&color=yellow&label=Stars)](https://github.com/lemony-ai/cascadeflow/stargazers)
@@ -28,17 +29,15 @@
-**[Python Python](#-python) • [TypeScript TypeScript](#-typescript) • [LangChain LangChain](#-langchain-integration) • [n8n n8n](#-n8n-integration) • [Vercel AI Vercel AI](./packages/integrations/vercel-ai/) • [OpenClaw OpenClaw](https://clawhub.ai/saschabuehrle/cascadeflow) • [📖 Docs](./docs/) • [💡 Examples](#examples)** +**[Python Python](#-python) • [TypeScript TypeScript](#-typescript) • [LangChain LangChain](#-langchain-integration) • [n8n n8n](#-n8n-integration) • [Vercel AI Vercel AI](./packages/integrations/vercel-ai/) • [OpenClaw OpenClaw](https://clawhub.ai/saschabuehrle/cascadeflow) • [Full Docs](https://docs.cascadeflow.dev) • [📖 Docs](./docs/) • [💡 Examples](#examples)** --- -**Stop Bleeding Money on AI Calls. Cut Costs 30-65% in 3 Lines of Code.** +**The in-process intelligence layer for AI agents.** Optimize cost, latency, quality, budget, compliance, and energy — inside the execution loop, not at the HTTP boundary. -40-70% of text prompts and 20-60% of agent calls don't need expensive flagship models. You're overpaying every single day. - -*cascadeflow fixes this with intelligent model cascading, available in Python and TypeScript.* +cascadeflow works where external proxies can't: per-step model decisions based on agent state, per-tool-call budget gating, runtime stop/continue/escalate actions, and business KPI injection during agent loops. Sub-1ms overhead. Works with LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, and Vercel AI SDK. ```python pip install cascadeflow @@ -52,6 +51,17 @@ npm install @cascadeflow/core ## Why cascadeflow? 
+### Proxy vs In-Process Harness + +| Dimension | External Proxy | cascadeflow Harness | +|---|---|---| +| **Scope** | HTTP request boundary | Inside agent execution loop | +| **Dimensions** | Cost only | Cost + quality + latency + budget + compliance + energy | +| **Latency overhead** | 10-50ms network RTT | <1ms in-process | +| **Business logic** | None | KPI weights and targets | +| **Enforcement** | None (observe only) | stop, deny_tool, switch_model | +| **Auditability** | Request logs | Per-step decision traces | + cascadeflow is an intelligent AI model cascading library that dynamically selects the optimal model for each query or tool call through speculative execution. It's based on the research that 40-70% of queries don't require slow, expensive flagship models, and domain-specific smaller models often outperform large general-purpose models on specialized tasks. For the remaining queries that need advanced reasoning, cascadeflow automatically escalates to flagship models if needed. ### Use Cases @@ -140,6 +150,34 @@ In practice, 60-70% of queries are handled by small, efficient models (8-20x cos --- +## Harness API + +Three tiers of integration — zero-change observability to full policy control: + +**Tier 1: Zero-change observability** +```python +import cascadeflow +cascadeflow.init(mode="observe") +# All OpenAI/Anthropic SDK calls are now tracked. No code changes needed. 
+``` + +**Tier 2: Scoped runs with budget** +```python +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + result = await agent.run("Analyze this dataset") + print(session.summary()) # cost, latency, energy, steps, tool calls + print(session.trace()) # full decision audit trail +``` + +**Tier 3: Decorated agents with policy** +```python +@cascadeflow.agent(budget=0.20, compliance="gdpr", kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}) +async def my_agent(query: str): + return await llm.complete(query) +``` + +--- + ## Quick Start ### Drop-In Gateway (Existing Apps) @@ -724,6 +762,12 @@ console.log(`Warnings: ${validation.warnings}`); | 📋 **Message & Tool Call Lists** | Full conversation history with tool_calls and tool_call_id preservation across turns | | 🪝 **Hooks & Callbacks** | Telemetry callbacks, cost events, and streaming hooks for observability | | 🏭 **Production Ready** | Streaming, batch processing, tool handling, reasoning model support, caching, error recovery, anomaly detection | +| 💳 **Budget Enforcement** | Per-run and per-user budget caps with automatic stop actions when limits are exceeded | +| 🔒 **Compliance Gating** | GDPR, HIPAA, PCI, and strict model allowlists — block non-compliant models before execution | +| 📊 **KPI-Weighted Routing** | Inject business priorities (quality, cost, latency, energy) as weights into every model decision | +| 🌱 **Energy Tracking** | Deterministic compute-intensity coefficients for carbon-aware AI operations | +| 🔍 **Decision Traces** | Full per-step audit trail: action, reason, model, cost, budget state, enforcement status | +| ⚙️ **Harness Modes** | off / observe / enforce — roll out safely with observe, then switch to enforce when ready | --- @@ -774,7 +818,7 @@ If you use cascadeflow in your research or project, please cite: ```bibtex @software{cascadeflow2025, author = {Lemony Inc., Sascha Buehrle and Contributors}, - title = {cascadeflow: Smart AI model cascading for cost 
optimization}, + title = {cascadeflow: Agent runtime intelligence layer for AI agent workflows}, year = {2025}, publisher = {GitHub}, url = {https://github.com/lemony-ai/cascadeflow} diff --git a/cascadeflow/__init__.py b/cascadeflow/__init__.py index 1b61a9f3..af4c429a 100644 --- a/cascadeflow/__init__.py +++ b/cascadeflow/__init__.py @@ -1,30 +1,23 @@ """ -cascadeflow - Smart AI model cascading for cost optimization. - -Route queries intelligently across multiple AI models from tiny SLMs -to frontier LLMs based on complexity, domain, and budget. - -Features: -- 🚀 Speculative cascades (2-3x faster) -- 💰 60-95% cost savings -- 🎯 Per-prompt domain detection -- 🎨 2.0x domain boost for specialists -- 🔍 Multi-factor optimization -- 🆓 Free tier (Ollama + Groq) -- ⚡ 3 lines of code - -Example: - >>> from cascadeflow import CascadeAgent, CascadePresets - >>> - >>> # Auto-detect available models - >>> models = CascadePresets.auto_detect_models() - >>> - >>> # Create agent with intelligence layer - >>> agent = CascadeAgent(models, enable_caching=True) - >>> - >>> # Run query (automatically optimized!) - >>> result = await agent.run("Fix this Python bug") - >>> print(f"Used {result.model_used} - Cost: ${result.cost:.6f}") +cascadeflow - Agent runtime intelligence layer. + +In-process harness that optimizes cost, latency, quality, budget, compliance, +and energy across AI agent workflows. Works inside agent execution loops with +full state awareness -- not an external proxy. + +Quick start: + import cascadeflow + cascadeflow.init(mode="observe") + # All OpenAI/Anthropic SDK calls are now tracked and traced. 
+ +Key APIs: + cascadeflow.init(mode) -- activate harness (off | observe | enforce) + cascadeflow.run(budget) -- scoped run with budget/trace + @cascadeflow.agent(budget) -- policy metadata on agent functions + session.summary() -- structured metrics + session.trace() -- full decision audit trail + +Integrations: LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, Vercel AI SDK """ __version__ = "1.0.0" @@ -240,6 +233,10 @@ ) # NEW: Harness API scaffold (V2 core branch) +# NOTE: harness.agent is NOT re-exported here — it would shadow the +# cascadeflow.agent *module* and break dotted-path resolution +# (e.g. patch("cascadeflow.agent.PROVIDER_REGISTRY")). +# Use ``from cascadeflow.harness import agent`` instead. from .harness import ( HarnessConfig, HarnessInitReport, @@ -247,7 +244,6 @@ init, reset, run, - agent as harness_agent, get_harness_config, get_current_run, ) @@ -401,7 +397,6 @@ "init", "reset", "run", - "harness_agent", "get_harness_config", "get_current_run", # ===== PROVIDERS ===== diff --git a/cascadeflow/harness/__init__.py b/cascadeflow/harness/__init__.py index 43a03662..74c07219 100644 --- a/cascadeflow/harness/__init__.py +++ b/cascadeflow/harness/__init__.py @@ -14,11 +14,13 @@ HarnessInitReport, HarnessRunContext, agent, + get_harness_callback_manager, get_current_run, get_harness_config, init, reset, run, + set_harness_callback_manager, ) __all__ = [ @@ -29,6 +31,8 @@ "run", "agent", "get_current_run", + "get_harness_callback_manager", "get_harness_config", + "set_harness_callback_manager", "reset", ] diff --git a/cascadeflow/harness/api.py b/cascadeflow/harness/api.py index a71d5f5a..95ff4245 100644 --- a/cascadeflow/harness/api.py +++ b/cascadeflow/harness/api.py @@ -4,8 +4,10 @@ import json import logging import os +import time from contextvars import ContextVar, Token from dataclasses import dataclass, field +from functools import wraps from importlib.util import find_spec from pathlib import Path from typing import Any, Callable, 
Literal, Optional, TypeVar, cast @@ -39,7 +41,21 @@ class HarnessInitReport: @dataclass class HarnessRunContext: + """Scoped run context for tracking harness metrics across LLM calls. + + Thread safety: the context is stored in a ``ContextVar`` and is safe for + asyncio (each task gets its own copy of the token). However, the context + object itself uses plain attribute mutation (``+=``) for counters. If + multiple OS threads share the *same* ``HarnessRunContext`` instance, + concurrent updates may race. Each ``with run(...)`` scope should be + confined to a single thread or asyncio task. + """ + run_id: str = field(default_factory=lambda: uuid4().hex[:12]) + _started_monotonic: float = field(default_factory=time.monotonic, init=False, repr=False) + started_at_ms: float = field(default_factory=lambda: time.time() * 1000) + ended_at_ms: Optional[float] = None + duration_ms: Optional[float] = None mode: HarnessMode = "off" budget_max: Optional[float] = None tool_calls_max: Optional[int] = None @@ -73,6 +89,9 @@ def __enter__(self) -> HarnessRunContext: return self def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None: + self.ended_at_ms = time.time() * 1000 + self.duration_ms = max(0.0, (time.monotonic() - self._started_monotonic) * 1000.0) + self._log_summary() if self._token is not None: _current_run.reset(self._token) self._token = None @@ -86,6 +105,44 @@ async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None: def trace(self) -> list[dict[str, Any]]: return list(self._trace) + def summary(self) -> dict[str, Any]: + return { + "run_id": self.run_id, + "mode": self.mode, + "step_count": self.step_count, + "tool_calls": self.tool_calls, + "cost": self.cost, + "savings": self.savings, + "latency_used_ms": self.latency_used_ms, + "energy_used": self.energy_used, + "budget_max": self.budget_max, + "budget_remaining": self.budget_remaining, + "last_action": self.last_action, + "model_used": self.model_used, + "duration_ms": self.duration_ms, + } + + 
def _log_summary(self) -> None: + if self.mode == "off" or self.step_count <= 0: + return + logger.info( + ( + "harness run summary run_id=%s mode=%s steps=%d tool_calls=%d " + "cost=%.6f latency_ms=%.2f energy=%.4f last_action=%s model=%s " + "budget_remaining=%s" + ), + self.run_id, + self.mode, + self.step_count, + self.tool_calls, + self.cost, + self.latency_used_ms, + self.energy_used, + self.last_action, + self.model_used, + self.budget_remaining, + ) + def record( self, action: str, @@ -95,19 +152,42 @@ def record( applied: Optional[bool] = None, decision_mode: Optional[str] = None, ) -> None: - self.last_action = action - self.model_used = model + safe_action = _sanitize_trace_value(action, max_length=_MAX_ACTION_LEN) + if not safe_action: + logger.warning("record() called with empty action, defaulting to 'allow'") + safe_action = "allow" + safe_reason = _sanitize_trace_value(reason, max_length=_MAX_REASON_LEN) or "unspecified" + safe_model = ( + _sanitize_trace_value(model, max_length=_MAX_MODEL_LEN) if model is not None else None + ) + + self.last_action = safe_action + self.model_used = safe_model entry: dict[str, Any] = { - "action": action, - "reason": reason, - "model": model, + "action": safe_action, + "reason": safe_reason, + "model": safe_model, "run_id": self.run_id, + "mode": self.mode, + "step": self.step_count, + "timestamp_ms": time.time() * 1000, + "tool_calls_total": self.tool_calls, + "cost_total": self.cost, + "latency_used_ms": self.latency_used_ms, + "energy_used": self.energy_used, + "budget_state": { + "max": self.budget_max, + "remaining": self.budget_remaining, + }, } if applied is not None: entry["applied"] = applied if decision_mode is not None: entry["decision_mode"] = decision_mode self._trace.append(entry) + if len(self._trace) > _MAX_TRACE_ENTRIES: + self._trace = self._trace[-_MAX_TRACE_ENTRIES:] + _emit_harness_decision(entry) _harness_config: HarnessConfig = HarnessConfig() @@ -115,6 +195,7 @@ def record( 
"cascadeflow_harness_run", default=None ) _is_instrumented: bool = False +_harness_callback_manager: Any = None _UNSET = object() @@ -124,6 +205,32 @@ def _validate_mode(mode: str) -> HarnessMode: return cast(HarnessMode, mode) +_VALID_COMPLIANCE_VALUES = {"gdpr", "hipaa", "pci", "strict"} + + +def _validate_harness_params( + *, + budget: Optional[float], + max_tool_calls: Optional[int], + max_latency_ms: Optional[float], + max_energy: Optional[float], + compliance: Optional[str], +) -> None: + """Validate harness parameters, raising ValueError for invalid inputs.""" + if budget is not None and budget < 0: + raise ValueError(f"budget must be non-negative, got {budget}") + if max_tool_calls is not None and max_tool_calls < 0: + raise ValueError(f"max_tool_calls must be non-negative, got {max_tool_calls}") + if max_latency_ms is not None and max_latency_ms < 0: + raise ValueError(f"max_latency_ms must be non-negative, got {max_latency_ms}") + if max_energy is not None and max_energy < 0: + raise ValueError(f"max_energy must be non-negative, got {max_energy}") + if compliance is not None and compliance.strip().lower() not in _VALID_COMPLIANCE_VALUES: + raise ValueError( + f"compliance must be one of {sorted(_VALID_COMPLIANCE_VALUES)}, got {compliance!r}" + ) + + def _detect_sdks() -> dict[str, bool]: return { "openai": find_spec("openai") is not None, @@ -139,6 +246,15 @@ def get_current_run() -> Optional[HarnessRunContext]: return _current_run.get() +def get_harness_callback_manager() -> Any: + return _harness_callback_manager + + +def set_harness_callback_manager(callback_manager: Any) -> None: + global _harness_callback_manager + _harness_callback_manager = callback_manager + + def reset() -> None: """ Reset harness global state and unpatch instrumented clients. 
@@ -148,15 +264,72 @@ def reset() -> None: global _harness_config global _is_instrumented + global _harness_callback_manager + global _cached_cascade_decision_event - from cascadeflow.harness.instrument import unpatch_openai + from cascadeflow.harness.instrument import unpatch_anthropic, unpatch_openai unpatch_openai() + unpatch_anthropic() _harness_config = HarnessConfig() _is_instrumented = False + _harness_callback_manager = None + _cached_cascade_decision_event = None _current_run.set(None) +_MAX_ACTION_LEN = 64 +_MAX_REASON_LEN = 160 +_MAX_MODEL_LEN = 128 +_MAX_ENV_JSON_LEN = 4096 +_MAX_TRACE_ENTRIES = 1000 + + +def _sanitize_trace_value(value: Any, *, max_length: int) -> Optional[str]: + if value is None: + return None + text = str(value).replace("\n", " ").replace("\r", " ").strip() + text = "".join(c for c in text if c.isprintable()) + if len(text) > max_length: + text = text[: max_length - 3] + "..." + return text or None + + +_cached_cascade_decision_event: Any = None + + +def _emit_harness_decision(entry: dict[str, Any]) -> None: + global _cached_cascade_decision_event + + manager = get_harness_callback_manager() + if manager is None: + return + + trigger = getattr(manager, "trigger", None) + if not callable(trigger): + logger.debug("harness callback manager has no trigger() method") + return + + if _cached_cascade_decision_event is None: + try: + from cascadeflow.telemetry.callbacks import CallbackEvent + + _cached_cascade_decision_event = CallbackEvent.CASCADE_DECISION + except Exception: + logger.debug("telemetry callbacks unavailable for harness decision emit", exc_info=True) + return + + try: + trigger( + _cached_cascade_decision_event, + query="[harness]", + data=dict(entry), + workflow="harness", + ) + except Exception: + logger.debug("failed to emit harness decision callback", exc_info=True) + + def _parse_bool(raw: str) -> bool: normalized = raw.strip().lower() return normalized in {"1", "true", "yes", "on"} @@ -171,6 +344,8 @@ def 
_parse_int(raw: str) -> int: def _parse_json_dict(raw: str) -> dict[str, float]: + if len(raw) > _MAX_ENV_JSON_LEN: + raise ValueError(f"JSON config exceeds {_MAX_ENV_JSON_LEN} characters for harness env var") value = json.loads(raw) if not isinstance(value, dict): raise ValueError("expected JSON object") @@ -305,9 +480,12 @@ def init( kpi_targets: Optional[dict[str, float]] | object = _UNSET, kpi_weights: Optional[dict[str, float]] | object = _UNSET, compliance: Optional[str] | object = _UNSET, + callback_manager: Any | object = _UNSET, ) -> HarnessInitReport: """ - Initialize global harness settings and instrument detected SDK clients. + Initialize global harness settings. + + This is a scaffold API for V2 work and intentionally performs no request patching yet. """ global _harness_config @@ -338,8 +516,18 @@ def init( resolved_compliance = _resolve_value( "compliance", compliance, env_config, file_config, None, sources ) + if callback_manager is not _UNSET: + set_harness_callback_manager(callback_manager) + sources["callback_manager"] = "code" validated_mode = _validate_mode(str(resolved_mode)) + _validate_harness_params( + budget=cast(Optional[float], resolved_budget), + max_tool_calls=cast(Optional[int], resolved_max_tool_calls), + max_latency_ms=cast(Optional[float], resolved_max_latency_ms), + max_energy=cast(Optional[float], resolved_max_energy), + compliance=cast(Optional[str], resolved_compliance), + ) _harness_config = HarnessConfig( mode=validated_mode, verbose=bool(resolved_verbose), @@ -361,13 +549,29 @@ def init( if patch_openai(): instrumented.append("openai") - elif validated_mode == "off": - from cascadeflow.harness.instrument import is_patched, unpatch_openai + else: + detected_but_not_instrumented.append("openai") + + if validated_mode != "off" and sdk_presence["anthropic"]: + from cascadeflow.harness.instrument import patch_anthropic - if is_patched(): + if patch_anthropic(): + instrumented.append("anthropic") + else: + 
detected_but_not_instrumented.append("anthropic") + + if validated_mode == "off": + from cascadeflow.harness.instrument import ( + is_anthropic_patched, + is_openai_patched, + unpatch_anthropic, + unpatch_openai, + ) + + if is_openai_patched(): unpatch_openai() - if sdk_presence["anthropic"]: - detected_but_not_instrumented.append("anthropic") + if is_anthropic_patched(): + unpatch_anthropic() if _is_instrumented: logger.debug("harness init called again; instrumentation remains idempotent") @@ -415,6 +619,14 @@ def run( resolved_kpi_weights = kpi_weights if kpi_weights is not None else config.kpi_weights resolved_compliance = compliance if compliance is not None else config.compliance + _validate_harness_params( + budget=resolved_budget, + max_tool_calls=resolved_tool_calls, + max_latency_ms=resolved_latency, + max_energy=resolved_energy, + compliance=resolved_compliance, + ) + return HarnessRunContext( mode=config.mode, budget_max=resolved_budget, @@ -453,18 +665,18 @@ def decorator(func: F) -> F: if inspect.iscoroutinefunction(func): + @wraps(func) async def async_wrapper(*args: Any, **kwargs: Any) -> Any: return await func(*args, **kwargs) async_wrapper.__cascadeflow_agent_policy__ = metadata # type: ignore[attr-defined] - async_wrapper.__name__ = getattr(func, "__name__", "wrapped_agent") return cast(F, async_wrapper) + @wraps(func) def sync_wrapper(*args: Any, **kwargs: Any) -> Any: return func(*args, **kwargs) sync_wrapper.__cascadeflow_agent_policy__ = metadata # type: ignore[attr-defined] - sync_wrapper.__name__ = getattr(func, "__name__", "wrapped_agent") return cast(F, sync_wrapper) return decorator diff --git a/cascadeflow/harness/instrument.py b/cascadeflow/harness/instrument.py index c2fbd7ab..4b08b9f6 100644 --- a/cascadeflow/harness/instrument.py +++ b/cascadeflow/harness/instrument.py @@ -1,11 +1,10 @@ -"""OpenAI Python client auto-instrumentation for cascadeflow harness. +"""Python SDK auto-instrumentation for cascadeflow harness. 
-Patches ``openai.resources.chat.completions.Completions.create`` (sync) and -``AsyncCompletions.create`` (async) to intercept LLM calls for observe/enforce -modes. +Patches OpenAI and Anthropic SDK request methods to intercept LLM calls for +observe/enforce modes. -This module is called internally by ``cascadeflow.harness.init()``. Users -should not call ``patch_openai`` / ``unpatch_openai`` directly. +This module is called internally by ``cascadeflow.harness.init()``. Users +should not call patch/unpatch helpers directly. Implementation notes: - Patching is class-level (all current and future client instances). @@ -51,6 +50,9 @@ _openai_patched: bool = False _original_sync_create: Any = None _original_async_create: Any = None +_anthropic_patched: bool = False +_original_anthropic_sync_create: Any = None +_original_anthropic_async_create: Any = None _MODEL_TOTAL_COSTS: dict[str, float] = { name: _model_total_price_shared(name) for name in _PRICING_MODELS @@ -140,7 +142,7 @@ def _estimate_energy(model: str, prompt_tokens: int, completion_tokens: int) -> return _estimate_energy_shared(model, prompt_tokens, completion_tokens) -def _count_tool_calls_in_response(response: Any) -> int: +def _count_tool_calls_in_openai_response(response: Any) -> int: """Count tool calls in a non-streaming ChatCompletion response.""" choices = getattr(response, "choices", None) if not choices: @@ -154,7 +156,7 @@ def _count_tool_calls_in_response(response: Any) -> int: return len(tool_calls) -def _extract_usage(response: Any) -> tuple[int, int]: +def _extract_openai_usage(response: Any) -> tuple[int, int]: """Extract (prompt_tokens, completion_tokens) from a response.""" usage = getattr(response, "usage", None) if usage is None: @@ -165,6 +167,29 @@ def _extract_usage(response: Any) -> tuple[int, int]: ) +def _extract_anthropic_usage(response: Any) -> tuple[int, int]: + """Extract (input_tokens, output_tokens) from an Anthropic response.""" + usage = getattr(response, "usage", None) + if 
usage is None: + return 0, 0 + return ( + getattr(usage, "input_tokens", 0) or 0, + getattr(usage, "output_tokens", 0) or 0, + ) + + +def _count_tool_calls_in_anthropic_response(response: Any) -> int: + """Count Anthropic ``tool_use`` blocks in a non-streaming response.""" + content = getattr(response, "content", None) + if not content: + return 0 + count = 0 + for block in content: + if getattr(block, "type", None) == "tool_use": + count += 1 + return count + + def _model_total_cost(model: str) -> float: return _MODEL_TOTAL_COSTS.get(model, _model_total_price_shared(model)) @@ -596,6 +621,9 @@ def __next__(self) -> Any: except StopIteration: self._finalize() raise + except Exception: + self._finalize() + raise def __enter__(self) -> _InstrumentedStream: if hasattr(self._stream, "__enter__"): @@ -625,6 +653,9 @@ async def __anext__(self) -> Any: except StopAsyncIteration: self._finalize() raise + except Exception: + self._finalize() + raise async def __aenter__(self) -> _InstrumentedAsyncStream: if hasattr(self._stream, "__aenter__"): @@ -638,6 +669,174 @@ async def __aexit__(self, *args: Any) -> bool: return False +class _InstrumentedAnthropicStreamBase: + """Shared stream-wrapper logic for sync and async Anthropic streams.""" + + __slots__ = ( + "_stream", + "_ctx", + "_model", + "_start_time", + "_pre_action", + "_pre_reason", + "_pre_model", + "_pre_applied", + "_decision_mode", + "_input_tokens", + "_output_tokens", + "_tool_call_count", + "_finalized", + ) + + def __init__( + self, + stream: Any, + ctx: Any, + model: str, + start_time: float, + pre_action: str = "allow", + pre_reason: str = "observe", + pre_model: str | None = None, + pre_applied: bool = True, + decision_mode: str = "observe", + ) -> None: + self._stream = stream + self._ctx = ctx + self._model = model + self._start_time = start_time + self._pre_action = pre_action + self._pre_reason = pre_reason + self._pre_model = pre_model or model + self._pre_applied = pre_applied + self._decision_mode = 
decision_mode + self._input_tokens: int = 0 + self._output_tokens: int = 0 + self._tool_call_count: int = 0 + self._finalized: bool = False + + def close(self) -> None: + self._finalize() + if hasattr(self._stream, "close"): + self._stream.close() + + def _inspect_event(self, event: Any) -> None: + event_type = getattr(event, "type", None) + + if event_type == "message_start": + message = getattr(event, "message", None) + usage = getattr(message, "usage", None) + if usage is not None: + input_tokens = getattr(usage, "input_tokens", None) + output_tokens = getattr(usage, "output_tokens", None) + if isinstance(input_tokens, (int, float)): + self._input_tokens = int(input_tokens) if input_tokens > 0 else 0 + if isinstance(output_tokens, (int, float)): + self._output_tokens = int(output_tokens) if output_tokens > 0 else 0 + return + + usage = getattr(event, "usage", None) + if usage is not None: + input_tokens = getattr(usage, "input_tokens", None) + output_tokens = getattr(usage, "output_tokens", None) + if isinstance(input_tokens, (int, float)) and input_tokens > 0: + self._input_tokens = int(input_tokens) + if isinstance(output_tokens, (int, float)): + self._output_tokens = int(output_tokens) if output_tokens > 0 else 0 + + if event_type == "content_block_start": + content_block = getattr(event, "content_block", None) + block_type = getattr(content_block, "type", None) + if block_type in {"tool_use", "server_tool_use"}: + self._tool_call_count += 1 + + def _finalize(self) -> None: + if self._finalized: + return + self._finalized = True + + if self._ctx is None: + return + + elapsed_ms = (time.monotonic() - self._start_time) * 1000 + _update_context( + self._ctx, + self._model, + self._input_tokens, + self._output_tokens, + self._tool_call_count, + elapsed_ms, + action=self._pre_action, + action_reason=self._pre_reason, + action_model=self._pre_model, + applied=self._pre_applied, + decision_mode=self._decision_mode, + ) + + +class 
_InstrumentedAnthropicStream(_InstrumentedAnthropicStreamBase): + """Wraps an Anthropic sync stream and tracks usage at stream end.""" + + __slots__ = () + + def __iter__(self) -> _InstrumentedAnthropicStream: + return self + + def __next__(self) -> Any: + try: + event = next(self._stream) + self._inspect_event(event) + return event + except StopIteration: + self._finalize() + raise + except Exception: + self._finalize() + raise + + def __enter__(self) -> _InstrumentedAnthropicStream: + if hasattr(self._stream, "__enter__"): + self._stream.__enter__() + return self + + def __exit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__exit__"): + return self._stream.__exit__(*args) # type: ignore[no-any-return] + return False + + +class _InstrumentedAnthropicAsyncStream(_InstrumentedAnthropicStreamBase): + """Wraps an Anthropic async stream and tracks usage at stream end.""" + + __slots__ = () + + def __aiter__(self) -> _InstrumentedAnthropicAsyncStream: + return self + + async def __anext__(self) -> Any: + try: + event = await self._stream.__anext__() + self._inspect_event(event) + return event + except StopAsyncIteration: + self._finalize() + raise + except Exception: + self._finalize() + raise + + async def __aenter__(self) -> _InstrumentedAnthropicAsyncStream: + if hasattr(self._stream, "__aenter__"): + await self._stream.__aenter__() + return self + + async def __aexit__(self, *args: Any) -> bool: + self._finalize() + if hasattr(self._stream, "__aexit__"): + return await self._stream.__aexit__(*args) # type: ignore[no-any-return] + return False + + # --------------------------------------------------------------------------- # Wrapper factories # --------------------------------------------------------------------------- @@ -713,8 +912,8 @@ def _finalize_interception( if (not state.is_stream) and ctx: elapsed_ms = (time.monotonic() - state.start_time) * 1000 - prompt_tokens, completion_tokens = _extract_usage(response) - tool_call_count = 
_count_tool_calls_in_response(response) + prompt_tokens, completion_tokens = _extract_openai_usage(response) + tool_call_count = _count_tool_calls_in_openai_response(response) _update_context( ctx, state.model, @@ -810,6 +1009,158 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: return wrapper +def _make_patched_anthropic_create(original_fn: Any) -> Any: + """Create a patched version of ``anthropic.Messages.create``.""" + + @functools.wraps(original_fn) + def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model + pre_applied = True + + if ctx: + kwargs, model, pre_action, pre_reason, pre_model, pre_applied = ( + _resolve_pre_call_decision( + ctx, + mode, + model, + kwargs, + ) + ) + + is_stream = bool(kwargs.get("stream", False)) + start_time = time.monotonic() + response = original_fn(self, *args, **kwargs) + + if not ctx: + logger.debug( + "harness %s (anthropic): model=%s (no active run scope, metrics not tracked)", + mode, + model, + ) + return response + + if is_stream: + return _InstrumentedAnthropicStream( + response, + ctx, + model, + start_time, + pre_action, + pre_reason, + pre_model, + pre_applied, + mode, + ) + + elapsed_ms = (time.monotonic() - start_time) * 1000 + input_tokens, output_tokens = _extract_anthropic_usage(response) + tool_call_count = _count_tool_calls_in_anthropic_response(response) + _update_context( + ctx, + model, + input_tokens, + output_tokens, + tool_call_count, + elapsed_ms, + action=pre_action, + action_reason=pre_reason, + action_model=pre_model, + applied=pre_applied, + decision_mode=mode, + ) + return response + + return wrapper + + +def 
_make_patched_anthropic_async_create(original_fn: Any) -> Any: + """Create a patched version of ``anthropic.AsyncMessages.create``.""" + + @functools.wraps(original_fn) + async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + from cascadeflow.harness.api import get_current_run, get_harness_config + + config = get_harness_config() + ctx = get_current_run() + mode = ctx.mode if ctx else config.mode + + if mode == "off": + return await original_fn(self, *args, **kwargs) + + model: str = kwargs.get("model", "unknown") + pre_action = "allow" + pre_reason = mode + pre_model = model + pre_applied = True + + if ctx: + kwargs, model, pre_action, pre_reason, pre_model, pre_applied = ( + _resolve_pre_call_decision( + ctx, + mode, + model, + kwargs, + ) + ) + + is_stream = bool(kwargs.get("stream", False)) + start_time = time.monotonic() + response = await original_fn(self, *args, **kwargs) + + if not ctx: + logger.debug( + "harness %s async (anthropic): model=%s (no active run scope, metrics not tracked)", + mode, + model, + ) + return response + + if is_stream: + return _InstrumentedAnthropicAsyncStream( + response, + ctx, + model, + start_time, + pre_action, + pre_reason, + pre_model, + pre_applied, + mode, + ) + + elapsed_ms = (time.monotonic() - start_time) * 1000 + input_tokens, output_tokens = _extract_anthropic_usage(response) + tool_call_count = _count_tool_calls_in_anthropic_response(response) + _update_context( + ctx, + model, + input_tokens, + output_tokens, + tool_call_count, + elapsed_ms, + action=pre_action, + action_reason=pre_reason, + action_model=pre_model, + applied=pre_applied, + decision_mode=mode, + ) + return response + + return wrapper + + # --------------------------------------------------------------------------- # Public API (called by cascadeflow.harness.api) # --------------------------------------------------------------------------- @@ -846,6 +1197,37 @@ def patch_openai() -> bool: return True +def patch_anthropic() -> bool: + 
"""Patch the Anthropic Python client for harness instrumentation. + + Returns ``True`` if patching succeeded, ``False`` if anthropic is not + installed. Idempotent: safe to call multiple times. + """ + global _anthropic_patched, _original_anthropic_sync_create, _original_anthropic_async_create + + if _anthropic_patched: + logger.debug("anthropic already patched, skipping") + return True + + try: + from anthropic.resources.messages import AsyncMessages, Messages + except ImportError: + logger.debug("anthropic package not available, skipping instrumentation") + return False + + _original_anthropic_sync_create = Messages.create + _original_anthropic_async_create = AsyncMessages.create + + Messages.create = _make_patched_anthropic_create(_original_anthropic_sync_create) # type: ignore[assignment] + AsyncMessages.create = _make_patched_anthropic_async_create( # type: ignore[assignment] + _original_anthropic_async_create, + ) + + _anthropic_patched = True + logger.info("anthropic client instrumented (sync + async)") + return True + + def unpatch_openai() -> None: """Restore original OpenAI client methods. @@ -873,6 +1255,43 @@ def unpatch_openai() -> None: logger.info("openai client unpatched") -def is_patched() -> bool: +def unpatch_anthropic() -> None: + """Restore original Anthropic client methods. + + Safe to call even if not patched. Used by ``reset()`` and tests. 
+ """ + global _anthropic_patched, _original_anthropic_sync_create, _original_anthropic_async_create + + if not _anthropic_patched: + return + + try: + from anthropic.resources.messages import AsyncMessages, Messages + except ImportError: + _anthropic_patched = False + return + + if _original_anthropic_sync_create is not None: + Messages.create = _original_anthropic_sync_create # type: ignore[assignment] + if _original_anthropic_async_create is not None: + AsyncMessages.create = _original_anthropic_async_create # type: ignore[assignment] + + _original_anthropic_sync_create = None + _original_anthropic_async_create = None + _anthropic_patched = False + logger.info("anthropic client unpatched") + + +def is_openai_patched() -> bool: """Return whether the OpenAI client is currently patched.""" return _openai_patched + + +def is_anthropic_patched() -> bool: + """Return whether the Anthropic client is currently patched.""" + return _anthropic_patched + + +def is_patched() -> bool: + """Return whether any supported Python SDK is currently patched.""" + return _openai_patched or _anthropic_patched diff --git a/cascadeflow/harness/pricing.py b/cascadeflow/harness/pricing.py index bd86323e..81a1de06 100644 --- a/cascadeflow/harness/pricing.py +++ b/cascadeflow/harness/pricing.py @@ -1,11 +1,17 @@ """Shared harness pricing and energy profiles. This module centralizes model-cost and energy-estimation defaults used by -harness integrations (OpenAI auto-instrumentation, OpenAI Agents SDK, CrewAI). +harness integrations (OpenAI auto-instrumentation, OpenAI Agents SDK, CrewAI, +Google ADK). + +A future pricing registry will consolidate with ``cascadeflow.pricing`` +and LiteLLM live data. Until then this module is the canonical source +for harness-level cost/energy estimation. """ from __future__ import annotations +import re as _re from typing import Final # USD per 1M tokens (input, output). 
@@ -21,15 +27,22 @@ "o1": (15.00, 60.00), "o1-mini": (3.00, 12.00), "o3-mini": (1.10, 4.40), - # Anthropic aliases used by CrewAI model names. + # Anthropic "claude-sonnet-4": (3.00, 15.00), "claude-haiku-3.5": (1.00, 5.00), "claude-opus-4.5": (5.00, 25.00), + # Google Gemini + "gemini-2.5-flash": (0.15, 0.60), + "gemini-2.5-pro": (1.25, 10.00), + "gemini-2.0-flash": (0.10, 0.40), + "gemini-1.5-flash": (0.075, 0.30), + "gemini-1.5-pro": (1.25, 5.00), } DEFAULT_PRICING_USD_PER_M: Final[tuple[float, float]] = (2.50, 10.00) # Deterministic proxy coefficients for energy tracking. ENERGY_COEFFICIENTS: Final[dict[str, float]] = { + # OpenAI "gpt-4o": 1.0, "gpt-4o-mini": 0.3, "gpt-5": 1.2, @@ -40,6 +53,16 @@ "o1": 2.0, "o1-mini": 0.8, "o3-mini": 0.5, + # Anthropic + "claude-sonnet-4": 1.0, + "claude-haiku-3.5": 0.3, + "claude-opus-4.5": 1.8, + # Google Gemini + "gemini-2.5-flash": 0.3, + "gemini-2.5-pro": 1.2, + "gemini-2.0-flash": 0.25, + "gemini-1.5-flash": 0.2, + "gemini-1.5-pro": 1.0, } DEFAULT_ENERGY_COEFFICIENT: Final[float] = 1.0 ENERGY_OUTPUT_WEIGHT: Final[float] = 1.5 @@ -60,19 +83,85 @@ ) +# --------------------------------------------------------------------------- +# Fuzzy model-name resolution +# --------------------------------------------------------------------------- + +# Pre-compiled pattern for stripping version/preview/date suffixes. +# Matches: -preview, -preview-05-20, -20250120, -latest, -exp-0827, etc. +_VERSION_SUFFIX_RE = _re.compile( + r"(-preview(?:-\d{2,4}-\d{2})?|-\d{8,}|-latest|-exp(?:-\d+)?|-it)$" +) + +# Cache for resolved model → pricing key lookups. +_pricing_key_cache: dict[str, str | None] = {} + + +def _resolve_pricing_key(model: str) -> str | None: + """Resolve a model name to a known pricing table key. + + Tries exact match first, then strips version/preview/date suffixes, + then tries longest-prefix match against known model names. + Returns ``None`` when no match is found (caller should use defaults). 
+ """ + if model in _pricing_key_cache: + return _pricing_key_cache[model] + + # Exact match + if model in PRICING_USD_PER_M: + _pricing_key_cache[model] = model + return model + + # Strip version suffixes and retry + stripped = _VERSION_SUFFIX_RE.sub("", model) + if stripped != model and stripped in PRICING_USD_PER_M: + _pricing_key_cache[model] = stripped + return stripped + + # Longest-prefix match (e.g. "gemini-2.5-flash-8b" → "gemini-2.5-flash") + best: str | None = None + best_len = 0 + for known in PRICING_USD_PER_M: + if model.startswith(known) and len(known) > best_len: + best = known + best_len = len(known) + if best is not None: + _pricing_key_cache[model] = best + return best + + _pricing_key_cache[model] = None + return None + + +# --------------------------------------------------------------------------- +# Public estimation helpers +# --------------------------------------------------------------------------- + + def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float: """Estimate USD cost from token usage.""" - in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) + key = _resolve_pricing_key(model) + in_price, out_price = ( + PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M + ) return (input_tokens / 1_000_000.0) * in_price + (output_tokens / 1_000_000.0) * out_price def estimate_energy(model: str, input_tokens: int, output_tokens: int) -> float: """Estimate deterministic proxy energy units.""" - coefficient = ENERGY_COEFFICIENTS.get(model, DEFAULT_ENERGY_COEFFICIENT) - return coefficient * (input_tokens + (output_tokens * ENERGY_OUTPUT_WEIGHT)) + key = _resolve_pricing_key(model) + coeff = ( + ENERGY_COEFFICIENTS.get(key, DEFAULT_ENERGY_COEFFICIENT) + if key + else DEFAULT_ENERGY_COEFFICIENT + ) + return coeff * (input_tokens + (output_tokens * ENERGY_OUTPUT_WEIGHT)) def model_total_price(model: str) -> float: """Return total (input + output) price per 1M 
tokens.""" - in_price, out_price = PRICING_USD_PER_M.get(model, DEFAULT_PRICING_USD_PER_M) + key = _resolve_pricing_key(model) + in_price, out_price = ( + PRICING_USD_PER_M.get(key, DEFAULT_PRICING_USD_PER_M) if key else DEFAULT_PRICING_USD_PER_M + ) return in_price + out_price diff --git a/cascadeflow/integrations/__init__.py b/cascadeflow/integrations/__init__.py index 33552773..61c3ebbd 100644 --- a/cascadeflow/integrations/__init__.py +++ b/cascadeflow/integrations/__init__.py @@ -185,6 +185,28 @@ crewai_is_enabled = None crewai_get_config = None +# Try to import Google ADK integration +try: + from .google_adk import ( + GOOGLE_ADK_AVAILABLE, + GoogleADKHarnessConfig, + CascadeFlowADKPlugin, + enable as google_adk_enable, + disable as google_adk_disable, + is_available as google_adk_is_available, + is_enabled as google_adk_is_enabled, + get_config as google_adk_get_config, + ) +except ImportError: + GOOGLE_ADK_AVAILABLE = False + GoogleADKHarnessConfig = None + CascadeFlowADKPlugin = None + google_adk_enable = None + google_adk_disable = None + google_adk_is_available = None + google_adk_is_enabled = None + google_adk_get_config = None + __all__ = [] if LITELLM_AVAILABLE: @@ -285,6 +307,20 @@ ] ) +if GOOGLE_ADK_AVAILABLE: + __all__.extend( + [ + "GOOGLE_ADK_AVAILABLE", + "GoogleADKHarnessConfig", + "CascadeFlowADKPlugin", + "google_adk_enable", + "google_adk_disable", + "google_adk_is_available", + "google_adk_is_enabled", + "google_adk_get_config", + ] + ) + # Integration capabilities INTEGRATION_CAPABILITIES = { "litellm": LITELLM_AVAILABLE, @@ -294,6 +330,7 @@ "openclaw": OPENCLAW_AVAILABLE, "paygentic": PAYGENTIC_AVAILABLE, "crewai": CREWAI_AVAILABLE, + "google_adk": GOOGLE_ADK_AVAILABLE, } @@ -319,4 +356,5 @@ def get_integration_info(): "openclaw_available": OPENCLAW_AVAILABLE, "paygentic_available": PAYGENTIC_AVAILABLE, "crewai_available": CREWAI_AVAILABLE, + "google_adk_available": GOOGLE_ADK_AVAILABLE, } diff --git 
a/cascadeflow/integrations/google_adk.py b/cascadeflow/integrations/google_adk.py new file mode 100644 index 00000000..325d21b2 --- /dev/null +++ b/cascadeflow/integrations/google_adk.py @@ -0,0 +1,486 @@ +"""Google ADK (Agent Development Kit) harness integration for cascadeflow. + +Uses ADK's ``BasePlugin`` system to intercept all LLM calls across all agents +in a Runner, feeding metrics into ``cascadeflow.harness`` run contexts. + +This module is optional — ``pip install cascadeflow[google-adk]`` pulls in the +google-adk dependency. When google-adk is not installed the public helpers +return gracefully and ``GOOGLE_ADK_AVAILABLE`` is ``False``. + +Integration surface: + - ``enable()``: create and return a plugin instance + - ``disable()``: deactivate the plugin and clean up + - ``CascadeFlowADKPlugin``: BasePlugin subclass for Runner(plugins=[...]) + +Unlike CrewAI (global hooks), ADK plugins are registered per-Runner. +``enable()`` returns the plugin instance; the user passes it to +``Runner(plugins=[plugin])``. + +Design note — no tool gating: + ADK's ``tools_dict`` is part of agent definition, not per-call. + Budget gate via ``before_model_callback`` provides sufficient cost control. +""" + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass +from importlib.util import find_spec +from typing import Any, Optional + +from cascadeflow.harness.api import get_current_run +from cascadeflow.harness.pricing import estimate_cost, estimate_energy + +logger = logging.getLogger("cascadeflow.integrations.google_adk") + +GOOGLE_ADK_AVAILABLE = find_spec("google.adk") is not None + +# Resolve the base class: use ADK's BasePlugin when available, else object. 
+_ADKBasePlugin: type +if GOOGLE_ADK_AVAILABLE: + try: + from google.adk.plugins import BasePlugin as _ADKBasePlugin # type: ignore[assignment] + except ImportError: + _ADKBasePlugin = object # type: ignore[assignment,misc] + GOOGLE_ADK_AVAILABLE = False +else: + _ADKBasePlugin = object # type: ignore[assignment,misc] + + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + + +@dataclass +class GoogleADKHarnessConfig: + """Runtime configuration for the Google ADK harness integration. + + fail_open: + If ``True`` (default), errors inside callbacks never break ADK + execution — they are logged and swallowed. + enable_budget_gate: + If ``True`` (default), ``before_model_callback`` blocks calls when + the harness run budget is exhausted (enforce mode only). + """ + + fail_open: bool = True + enable_budget_gate: bool = True + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _normalize_model_name(model: str) -> str: + """Strip LiteLlm-style provider prefix (``openai/gpt-4o`` → ``gpt-4o``). + + Also handles ``models/gemini-2.5-flash`` → ``gemini-2.5-flash``. 
+ """ + if "/" in model: + return model.rsplit("/", 1)[-1] + return model + + +def _count_function_calls(content: Any) -> int: + """Count ``function_call`` parts in an ADK LlmResponse content.""" + if content is None: + return 0 + parts = getattr(content, "parts", None) + if not parts: + return 0 + count = 0 + for part in parts: + if getattr(part, "function_call", None) is not None: + count += 1 + return count + + +# --------------------------------------------------------------------------- +# Plugin +# --------------------------------------------------------------------------- + + +class CascadeFlowADKPlugin(_ADKBasePlugin): # type: ignore[misc] + """Google ADK BasePlugin with cascadeflow harness awareness. + + Intercepts every LLM call across all agents in a Runner to provide: + - Budget enforcement (enforce mode: short-circuits with error response) + - Cost, latency, and energy tracking + - Tool call counting + - Full trace recording into HarnessRunContext + """ + + def __init__(self, config: Optional[GoogleADKHarnessConfig] = None) -> None: + # google-adk BasePlugin requires a stable plugin name. + try: + super().__init__(name="cascadeflow_harness") + except TypeError: + # Fallback for local test environments where BasePlugin is ``object``. + super().__init__() + self.name = "cascadeflow_harness" + self._config = config or GoogleADKHarnessConfig() + self._active = True + self._call_seq: int = 0 + # Track call metadata between before/after callbacks. + # Keyed by id(callback_context) to guarantee uniqueness even when + # two concurrent calls share (invocation_id, agent_name). + self._call_start_times: dict[int, float] = {} + self._call_models: dict[int, str] = {} + # Fallback mapping for runtimes that provide distinct callback_context + # objects between before/after callbacks. 
+ self._call_fallback_keys: dict[tuple[str, str], list[int]] = {} + + @staticmethod + def _callback_key(callback_context: Any) -> int: + """Return a unique key for a callback_context object. + + Uses ``id()`` which is guaranteed unique for the lifetime of the + object — ADK keeps the same CallbackContext alive across the + before/after/error callback sequence for a single LLM call. + """ + return id(callback_context) + + @staticmethod + def _fallback_key(callback_context: Any) -> tuple[str, str]: + """Return a stable fallback key for correlation across callbacks.""" + invocation_id = str(getattr(callback_context, "invocation_id", "") or "") + agent_name = str(getattr(callback_context, "agent_name", "") or "") + return (invocation_id, agent_name) + + def _track_call_key(self, callback_context: Any, key: int) -> None: + """Register key in fallback queue for cross-object callback matching.""" + fallback_key = self._fallback_key(callback_context) + if not fallback_key[0] and not fallback_key[1]: + return + self._call_fallback_keys.setdefault(fallback_key, []).append(key) + + def _resolve_call_key(self, callback_context: Any) -> int | None: + """Resolve stored key for callback context across runtime variants.""" + key = self._callback_key(callback_context) + if key in self._call_models or key in self._call_start_times: + return key + + fallback_key = self._fallback_key(callback_context) + keys = self._call_fallback_keys.get(fallback_key) + if not keys: + return None + + resolved = keys.pop(0) + if not keys: + self._call_fallback_keys.pop(fallback_key, None) + return resolved + + async def before_model_callback( + self, + callback_context: Any, + llm_request: Any, + ) -> Any: + """Budget gate and timing setup. + + Returns ``None`` to proceed normally, or an ``LlmResponse`` with + an error to short-circuit the call when budget is exhausted. 
+ """ + if not self._active: + return None + + try: + ctx = get_current_run() + if ctx is None: + return None + if ctx.mode == "off": + return None + + # Extract model name from request + model_raw = getattr(llm_request, "model", None) or "unknown" + model = _normalize_model_name(str(model_raw)) + + key = self._callback_key(callback_context) + + # Budget gate in enforce mode + if ( + self._config.enable_budget_gate + and ctx.mode == "enforce" + and ctx.budget_max is not None + and ctx.cost >= ctx.budget_max + ): + logger.warning( + "google-adk: blocking LLM call — budget exhausted " + "(spent $%.4f of $%.4f max)", + ctx.cost, + ctx.budget_max, + ) + ctx.record(action="stop", reason="budget_exhausted", model=model) + return self._make_budget_error_response(ctx) + + # Record start time and model for after_model_callback + self._call_start_times[key] = time.monotonic() + self._call_models[key] = model + self._track_call_key(callback_context, key) + + return None + except Exception: + if self._config.fail_open: + logger.debug("google-adk before_model_callback error (fail_open)", exc_info=True) + return None + raise + + async def after_model_callback( + self, + callback_context: Any, + llm_response: Any, + ) -> Any: + """Extract tokens, count tool calls, estimate cost/energy, update run context.""" + if not self._active: + return None + + try: + ctx = get_current_run() + if ctx is None: + return None + if ctx.mode == "off": + return None + + key = self._resolve_call_key(callback_context) + + # Recover model name stored during before_model_callback + model = self._call_models.pop(key, "unknown") if key is not None else "unknown" + + # Extract token counts from usage_metadata + input_tokens, output_tokens = self._extract_tokens(llm_response) + + # Count function_call parts in response content + content = getattr(llm_response, "content", None) + tool_calls = _count_function_calls(content) + + # Cost and energy estimation + cost = estimate_cost(model, input_tokens, 
output_tokens) + energy = estimate_energy(model, input_tokens, output_tokens) + + # Latency + start_time = self._call_start_times.pop(key, None) if key is not None else None + elapsed_ms = (time.monotonic() - start_time) * 1000 if start_time else 0.0 + + # Update run context + ctx.cost += cost + ctx.step_count += 1 + ctx.latency_used_ms += elapsed_ms + ctx.energy_used += energy + ctx.tool_calls += tool_calls + + if ctx.budget_max is not None: + ctx.budget_remaining = ctx.budget_max - ctx.cost + + ctx.model_used = model + ctx.record(action="allow", reason=ctx.mode, model=model) + + logger.debug( + "google-adk: tracked call model=%s cost=$%.6f latency=%.0fms tools=%d", + model, + cost, + elapsed_ms, + tool_calls, + ) + + return None + except Exception: + if self._config.fail_open: + logger.debug("google-adk after_model_callback error (fail_open)", exc_info=True) + return None + raise + + async def on_model_error_callback( + self, + callback_context: Any, + llm_request: Any = None, + error: Exception | None = None, + ) -> Any: + """Record error in trace and clean up timing state.""" + if not self._active: + return None + + try: + # Backward-compatible calling form used in existing tests: + # on_model_error_callback(callback_context, error) + if error is None and isinstance(llm_request, Exception): + error = llm_request + + key = self._resolve_call_key(callback_context) + model = self._call_models.pop(key, "unknown") if key is not None else "unknown" + if key is not None: + self._call_start_times.pop(key, None) + + ctx = get_current_run() + if ctx is not None and error is not None: + error_type = type(error).__name__ + ctx.record( + action="error", + reason=f"model_error:{error_type}", + model=model, + ) + + return None + except Exception: + if self._config.fail_open: + logger.debug("google-adk on_model_error_callback error (fail_open)", exc_info=True) + return None + raise + + def deactivate(self) -> None: + """Make all callbacks no-ops without unregistering from 
Runner.""" + self._active = False + self._call_seq = 0 + self._call_start_times.clear() + self._call_models.clear() + self._call_fallback_keys.clear() + + @staticmethod + def _extract_tokens(llm_response: Any) -> tuple[int, int]: + """Extract input/output token counts from an ADK LlmResponse. + + ADK responses carry ``usage_metadata`` with ``prompt_token_count`` + and ``candidates_token_count``. Falls back to estimating from + content text (4 chars ≈ 1 token). + """ + usage = getattr(llm_response, "usage_metadata", None) + if usage is not None: + input_tokens = getattr(usage, "prompt_token_count", 0) or 0 + output_tokens = getattr(usage, "candidates_token_count", 0) or 0 + if input_tokens > 0 or output_tokens > 0: + return int(input_tokens), int(output_tokens) + + # Fallback: estimate from content text + content = getattr(llm_response, "content", None) + if content is not None: + parts = getattr(content, "parts", None) + if parts: + text_chars = sum(len(getattr(p, "text", "") or "") for p in parts) + return 0, max(text_chars // 4, 1) + + return 0, 0 + + @staticmethod + def _make_budget_error_response(ctx: Any) -> Any: + """Build an LlmResponse that short-circuits the LLM call. + + When ADK is available we return a real ``LlmResponse``. When not + (shouldn't happen in practice), we return a sentinel dict. + + The user-facing message is intentionally generic to avoid leaking + internal spend/limit numbers. Exact figures are logged separately. + """ + # Generic message safe for end-user exposure. + msg = "cascadeflow harness budget exceeded" + # Detailed figures for operators only. 
+ logger.warning( + "google-adk: budget exceeded — spent $%.4f of $%.4f max", + ctx.cost, + ctx.budget_max, + ) + if GOOGLE_ADK_AVAILABLE: + try: + from google.adk.models import LlmResponse # type: ignore[import-untyped] + from google.genai.types import Content, Part # type: ignore[import-untyped] + + return LlmResponse( + content=Content(parts=[Part(text=msg)]), + error_code="BUDGET_EXCEEDED", + error_message=msg, + ) + except ImportError: + pass + + return {"error_code": "BUDGET_EXCEEDED", "error_message": msg} + + +# --------------------------------------------------------------------------- +# Module-level state +# --------------------------------------------------------------------------- + +_config: GoogleADKHarnessConfig = GoogleADKHarnessConfig() +_plugin_instance: Optional[CascadeFlowADKPlugin] = None +_enabled: bool = False + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def is_available() -> bool: + """Return whether the google-adk package is installed.""" + return GOOGLE_ADK_AVAILABLE + + +def is_enabled() -> bool: + """Return whether a plugin instance has been created via ``enable()``.""" + return _enabled + + +def get_config() -> GoogleADKHarnessConfig: + """Return a copy of the current configuration.""" + return GoogleADKHarnessConfig( + fail_open=_config.fail_open, + enable_budget_gate=_config.enable_budget_gate, + ) + + +def enable( + config: Optional[GoogleADKHarnessConfig] = None, +) -> CascadeFlowADKPlugin: + """Create a cascadeflow-instrumented ADK plugin instance. + + Unlike CrewAI (global hooks), ADK plugins are per-Runner. Pass the + returned plugin to ``Runner(plugins=[plugin])``. + + Idempotent: returns the same instance on repeated calls unless + ``disable()`` was called in between. + + Args: + config: Optional configuration overrides. 
+ + Returns: + ``CascadeFlowADKPlugin`` instance ready for ``Runner(plugins=[...])``. + """ + global _config, _plugin_instance, _enabled + + if _enabled and _plugin_instance is not None: + logger.debug("google-adk plugin already enabled; returning existing instance") + return _plugin_instance + + if config is not None: + _config = config + + _plugin_instance = CascadeFlowADKPlugin(config=_config) + _enabled = True + logger.info("google-adk harness plugin created") + return _plugin_instance + + +def disable() -> None: + """Deactivate the plugin and clear module state. + + Safe to call even if not enabled. + """ + global _plugin_instance, _enabled + + if _plugin_instance is not None: + _plugin_instance.deactivate() + + _plugin_instance = None + _enabled = False + logger.info("google-adk harness plugin disabled") + + +__all__ = [ + "GOOGLE_ADK_AVAILABLE", + "GoogleADKHarnessConfig", + "CascadeFlowADKPlugin", + "enable", + "disable", + "is_available", + "is_enabled", + "get_config", +] diff --git a/cascadeflow/integrations/langchain/__init__.py b/cascadeflow/integrations/langchain/__init__.py index 45c6ea2f..7b3f9551 100644 --- a/cascadeflow/integrations/langchain/__init__.py +++ b/cascadeflow/integrations/langchain/__init__.py @@ -54,6 +54,14 @@ CascadeFlowCallbackHandler, get_cascade_callback, ) +from .harness_callback import ( + HarnessAwareCascadeFlowCallbackHandler, + get_harness_callback, +) +from .harness_state import ( + apply_langgraph_state, + extract_langgraph_state, +) __all__ = [ # Main classes @@ -93,4 +101,8 @@ # LangChain callback handlers "CascadeFlowCallbackHandler", "get_cascade_callback", + "HarnessAwareCascadeFlowCallbackHandler", + "get_harness_callback", + "extract_langgraph_state", + "apply_langgraph_state", ] diff --git a/cascadeflow/integrations/langchain/harness_callback.py b/cascadeflow/integrations/langchain/harness_callback.py new file mode 100644 index 00000000..01f08d8c --- /dev/null +++ 
b/cascadeflow/integrations/langchain/harness_callback.py @@ -0,0 +1,248 @@ +"""Harness-aware callbacks for LangChain/LangGraph integration. + +Enforce-mode limitations (LangChain callback architecture): + - ``stop`` (budget/latency/energy exceeded): fully enforced — raises + BudgetExceededError or HarnessStopError from ``on_llm_start``. + - ``deny_tool`` (tool-call cap): fully enforced at the tool level via + ``on_tool_start`` — raises HarnessStopError before tool execution. + - ``switch_model``: **observe-only** — LangChain dispatches the LLM call + before ``on_llm_start`` returns, so the callback cannot redirect to a + different model. The decision is recorded with ``applied=False``. + - ``deny_tool`` at LLM level (pre-call decision): **observe-only** — the + callback cannot strip tools from an already-dispatched LLM request. + The decision is recorded with ``applied=False``. +""" + +from __future__ import annotations + +import logging +import time +from contextlib import contextmanager +from typing import Any, Optional + +from cascadeflow.harness import get_current_run +from cascadeflow.harness.pricing import estimate_cost, estimate_energy +from cascadeflow.schema.exceptions import HarnessStopError + +from .harness_state import apply_langgraph_state, extract_langgraph_state +from .langchain_callbacks import CascadeFlowCallbackHandler +from .utils import extract_token_usage + +logger = logging.getLogger("cascadeflow.harness.langchain") + + +class HarnessAwareCascadeFlowCallbackHandler(CascadeFlowCallbackHandler): + """LangChain callback that bridges native lifecycle events into HarnessRunContext. + + See module docstring for enforce-mode limitations on ``switch_model`` + and LLM-level ``deny_tool``. 
+ """ + + def __init__(self, *, fail_open: bool = True): + super().__init__() + self.fail_open = fail_open + self._llm_started_at: Optional[float] = None + self._pre_action: str = "allow" + self._pre_reason: str = "allow" + self._pre_model: Optional[str] = None + self._pre_recorded: bool = False + + def _handle_harness_error(self, error: Exception) -> None: + if self.fail_open: + logger.exception("langchain harness callback failed (fail-open)", exc_info=error) + return + raise error + + def _sync_state(self, payload: dict[str, Any]) -> None: + run_ctx = get_current_run() + if run_ctx is None: + return + state = extract_langgraph_state(payload) + if state: + apply_langgraph_state(run_ctx, state) + + def on_llm_start(self, serialized: dict[str, Any], prompts: list[str], **kwargs: Any) -> None: + super().on_llm_start(serialized=serialized, prompts=prompts, **kwargs) + self._llm_started_at = time.monotonic() + self._pre_action = "allow" + self._pre_reason = "allow" + self._pre_model = self.current_model + self._pre_recorded = False + + try: + self._sync_state(kwargs) + + run_ctx = get_current_run() + if run_ctx is None: + return + + model_name = self.current_model or "unknown" + invocation_params = kwargs.get("invocation_params") + has_tools = False + if isinstance(invocation_params, dict): + has_tools = bool(invocation_params.get("tools")) + if not has_tools: + has_tools = bool(kwargs.get("tools")) + + from cascadeflow.harness.instrument import ( + _evaluate_pre_call_decision, + _raise_stop_error, + ) # noqa: I001 + + decision = _evaluate_pre_call_decision(run_ctx, model_name, has_tools=has_tools) + self._pre_action = decision.action + self._pre_reason = decision.reason + self._pre_model = decision.target_model + + if run_ctx.mode == "observe": + if decision.action != "allow": + run_ctx.record( + action=decision.action, + reason=decision.reason, + model=decision.target_model, + applied=False, + decision_mode="observe", + ) + self._pre_recorded = True + return + + if 
run_ctx.mode != "enforce": + return + + if decision.action == "stop": + run_ctx.record( + action="stop", + reason=decision.reason, + model=model_name, + applied=True, + decision_mode="enforce", + ) + self._pre_recorded = True + _raise_stop_error(run_ctx, decision.reason) + + if decision.action == "switch_model": + run_ctx.record( + action="switch_model", + reason=decision.reason, + model=decision.target_model, + applied=False, + decision_mode="enforce", + ) + self._pre_recorded = True + + if decision.action == "deny_tool" and has_tools: + run_ctx.record( + action="deny_tool", + reason=decision.reason, + model=model_name, + applied=False, + decision_mode="enforce", + ) + self._pre_recorded = True + + except Exception as exc: + self._handle_harness_error(exc) + + def on_llm_end(self, response: Any, **kwargs: Any) -> None: + super().on_llm_end(response=response, **kwargs) + + try: + self._sync_state(kwargs) + run_ctx = get_current_run() + if run_ctx is None: + return + + model_name = self.current_model + if not model_name and getattr(response, "llm_output", None): + model_name = response.llm_output.get("model_name") + model_name = model_name or "unknown" + + token_usage = extract_token_usage(response) + prompt_tokens = int(token_usage["input"]) + completion_tokens = int(token_usage["output"]) + elapsed_ms = 0.0 + if self._llm_started_at is not None: + elapsed_ms = (time.monotonic() - self._llm_started_at) * 1000.0 + + run_ctx.step_count += 1 + run_ctx.cost += estimate_cost(model_name, prompt_tokens, completion_tokens) + run_ctx.energy_used += estimate_energy(model_name, prompt_tokens, completion_tokens) + run_ctx.latency_used_ms += elapsed_ms + + if run_ctx.budget_max is not None: + run_ctx.budget_remaining = run_ctx.budget_max - run_ctx.cost + + if self._pre_action == "allow": + run_ctx.record( + action="allow", + reason="langchain_step", + model=model_name, + applied=True, + decision_mode=run_ctx.mode, + ) + elif not self._pre_recorded: + run_ctx.record( + 
action=self._pre_action, + reason=self._pre_reason, + model=self._pre_model or model_name, + applied=False, + decision_mode=run_ctx.mode, + ) + + except Exception as exc: + self._handle_harness_error(exc) + finally: + self._llm_started_at = None + self._pre_action = "allow" + self._pre_reason = "allow" + self._pre_model = None + self._pre_recorded = False + + def on_tool_start(self, serialized: dict[str, Any], input_str: str, **kwargs: Any) -> Any: + try: + self._sync_state(kwargs) + run_ctx = get_current_run() + if run_ctx is None: + return None + if run_ctx.tool_calls_max is None: + return None + + if run_ctx.tool_calls >= run_ctx.tool_calls_max: + if run_ctx.mode == "observe": + run_ctx.record( + action="deny_tool", + reason="max_tool_calls_reached", + model=self.current_model, + applied=False, + decision_mode="observe", + ) + return None + if run_ctx.mode == "enforce": + run_ctx.record( + action="deny_tool", + reason="max_tool_calls_reached", + model=self.current_model, + applied=True, + decision_mode="enforce", + ) + raise HarnessStopError( + "cascadeflow harness deny_tool: max tool calls reached", + reason="max_tool_calls_reached", + ) + + # Track executed tools (not predicted tool calls in LLM output). 
+ run_ctx.tool_calls += 1 + return None + except Exception as exc: + self._handle_harness_error(exc) + return None + + +@contextmanager +def get_harness_callback(*, fail_open: bool = True): + """Context manager that yields a harness-aware LangChain callback handler.""" + callback = HarnessAwareCascadeFlowCallbackHandler(fail_open=fail_open) + yield callback + + +__all__ = ["HarnessAwareCascadeFlowCallbackHandler", "get_harness_callback"] diff --git a/cascadeflow/integrations/langchain/harness_state.py b/cascadeflow/integrations/langchain/harness_state.py new file mode 100644 index 00000000..b4b40da5 --- /dev/null +++ b/cascadeflow/integrations/langchain/harness_state.py @@ -0,0 +1,124 @@ +"""LangGraph/LangChain state extraction helpers for harness integration.""" + +from __future__ import annotations + +from typing import Any, Mapping, Optional + + +def _as_int(value: Any) -> Optional[int]: + try: + if value is None: + return None + return int(value) + except (TypeError, ValueError): + return None + + +def _as_float(value: Any) -> Optional[float]: + try: + if value is None: + return None + return float(value) + except (TypeError, ValueError): + return None + + +def _extract_candidate_state(source: Any) -> Optional[Mapping[str, Any]]: + """Extract a named state container from a mapping. + + Only returns state from explicitly named keys (langgraph_state, graph_state, + state). Returns None when no named key matches — avoids treating arbitrary + kwargs as harness state. 
+ """ + if not isinstance(source, Mapping): + return None + + for key in ("langgraph_state", "graph_state", "state"): + candidate = source.get(key) + if isinstance(candidate, Mapping): + return candidate + + return None + + +def extract_langgraph_state(payload: Any) -> dict[str, Any]: + """Extract normalized harness-relevant fields from LangGraph-style state payloads.""" + + candidates: list[Mapping[str, Any]] = [] + root = _extract_candidate_state(payload) + if root is not None: + candidates.append(root) + + if isinstance(payload, Mapping): + metadata = payload.get("metadata") + if isinstance(metadata, Mapping): + state_from_metadata = _extract_candidate_state(metadata) + if state_from_metadata is not None: + candidates.append(state_from_metadata) + + configurable = payload.get("configurable") + if isinstance(configurable, Mapping): + state_from_configurable = _extract_candidate_state(configurable) + if state_from_configurable is not None: + candidates.append(state_from_configurable) + + merged: dict[str, Any] = {} + for source in candidates: + if "agent_id" in source and isinstance(source.get("agent_id"), str): + merged["agent_id"] = source["agent_id"] + if "model" in source and isinstance(source.get("model"), str): + merged["model_used"] = source["model"] + if "model_used" in source and isinstance(source.get("model_used"), str): + merged["model_used"] = source["model_used"] + + step_count = _as_int(source.get("step_count", source.get("step"))) + if step_count is not None: + merged["step_count"] = step_count + + tool_calls = _as_int(source.get("tool_calls")) + if tool_calls is not None: + merged["tool_calls"] = tool_calls + + budget_remaining = _as_float(source.get("budget_remaining")) + if budget_remaining is not None: + merged["budget_remaining"] = budget_remaining + + latency_used_ms = _as_float(source.get("latency_used_ms", source.get("latency_ms"))) + if latency_used_ms is not None: + merged["latency_used_ms"] = latency_used_ms + + energy_used = 
_as_float(source.get("energy_used", source.get("energy"))) + if energy_used is not None: + merged["energy_used"] = energy_used + + return merged + + +def apply_langgraph_state(run_ctx: Any, state: Mapping[str, Any]) -> None: + """Apply extracted state fields onto an active HarnessRunContext.""" + if run_ctx is None or not isinstance(state, Mapping): + return + + step_count = _as_int(state.get("step_count")) + if step_count is not None and step_count > getattr(run_ctx, "step_count", 0): + run_ctx.step_count = step_count + + tool_calls = _as_int(state.get("tool_calls")) + if tool_calls is not None and tool_calls > getattr(run_ctx, "tool_calls", 0): + run_ctx.tool_calls = tool_calls + + latency_used_ms = _as_float(state.get("latency_used_ms")) + if latency_used_ms is not None and latency_used_ms > getattr(run_ctx, "latency_used_ms", 0.0): + run_ctx.latency_used_ms = latency_used_ms + + energy_used = _as_float(state.get("energy_used")) + if energy_used is not None and energy_used > getattr(run_ctx, "energy_used", 0.0): + run_ctx.energy_used = energy_used + + budget_remaining = _as_float(state.get("budget_remaining")) + if budget_remaining is not None: + run_ctx.budget_remaining = budget_remaining + + model_used = state.get("model_used") + if isinstance(model_used, str) and model_used: + run_ctx.model_used = model_used diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py new file mode 100644 index 00000000..9ba062e5 --- /dev/null +++ b/cascadeflow/integrations/langchain/tests/test_langchain_harness_callback.py @@ -0,0 +1,213 @@ +"""Tests for harness-aware LangChain callback integration.""" + +from __future__ import annotations + +import pytest +from langchain_core.messages import AIMessage +from langchain_core.outputs import ChatGeneration, LLMResult + +from cascadeflow.harness import init, reset, run +from cascadeflow.integrations.langchain.harness_callback 
import ( + HarnessAwareCascadeFlowCallbackHandler, +) +from cascadeflow.integrations.langchain.harness_state import ( + apply_langgraph_state, + extract_langgraph_state, +) +from cascadeflow.integrations.langchain.utils import extract_tool_calls +from cascadeflow.schema.exceptions import BudgetExceededError, HarnessStopError + + +@pytest.fixture(autouse=True) +def _reset_harness_state() -> None: + reset() + + +def _llm_result(model_name: str, prompt_tokens: int, completion_tokens: int) -> LLMResult: + generation = ChatGeneration(message=AIMessage(content="ok"), generation_info={}) + return LLMResult( + generations=[[generation]], + llm_output={ + "model_name": model_name, + "token_usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + }, + ) + + +def test_harness_callback_updates_active_run_metrics() -> None: + init(mode="observe", budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler() + + with run(budget=1.0) as ctx: + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o-mini"}, + ) + handler.on_llm_end(_llm_result("gpt-4o-mini", 120, 80)) + + assert ctx.step_count == 1 + assert ctx.cost > 0 + assert ctx.energy_used > 0 + assert ctx.budget_remaining is not None + assert ctx.budget_remaining < 1.0 + assert ctx.last_action == "allow" + assert ctx.model_used == "gpt-4o-mini" + + +def test_harness_callback_enforce_raises_when_budget_exhausted() -> None: + init(mode="enforce", budget=0.1) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(budget=0.1) as ctx: + ctx.cost = 0.1 + ctx.budget_remaining = 0.0 + + with pytest.raises(BudgetExceededError): + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o-mini"}, + ) + + trace = ctx.trace() + assert trace + assert trace[-1]["action"] == "stop" + assert trace[-1]["reason"] == "budget_exceeded" + assert 
trace[-1]["applied"] is True + + +def test_harness_callback_observe_records_non_applied_decisions() -> None: + init(mode="observe", budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler() + + with run(budget=1.0) as ctx: + ctx.cost = 0.9 + ctx.budget_remaining = 0.1 + + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o", "tools": [{"name": "lookup"}]}, + ) + + trace = ctx.trace() + assert trace + assert trace[-1]["action"] in {"switch_model", "deny_tool"} + assert trace[-1]["applied"] is False + assert trace[-1]["decision_mode"] == "observe" + + +def test_harness_callback_enforce_denies_tool_when_limit_reached() -> None: + init(mode="enforce", max_tool_calls=0, budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(max_tool_calls=0, budget=1.0) as ctx: + with pytest.raises(HarnessStopError, match="max tool calls"): + handler.on_tool_start(serialized={"name": "search"}, input_str="query") + + trace = ctx.trace() + assert trace + assert trace[-1]["action"] == "deny_tool" + assert trace[-1]["applied"] is True + assert trace[-1]["decision_mode"] == "enforce" + + +def test_on_llm_end_no_run_context_is_safe() -> None: + handler = HarnessAwareCascadeFlowCallbackHandler() + handler.on_llm_start( + serialized={}, + prompts=["hello"], + invocation_params={"model": "gpt-4o-mini"}, + ) + handler.on_llm_end(_llm_result("gpt-4o-mini", 10, 5)) + + +def test_on_tool_start_no_run_context_is_safe() -> None: + handler = HarnessAwareCascadeFlowCallbackHandler() + handler.on_tool_start(serialized={"name": "search"}, input_str="query") + + +def test_extract_state_ignores_plain_kwargs() -> None: + """Kwargs without a named state key should not leak into state.""" + state = extract_langgraph_state({"model": "gpt-4o", "invocation_params": {"tools": []}}) + assert state == {} + + +def test_tool_deny_uses_run_ctx_tool_calls() -> None: + """Tool gating should use run_ctx.tool_calls, not a local 
counter.""" + init(mode="enforce", max_tool_calls=2, budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(max_tool_calls=2, budget=1.0) as ctx: + # Simulate tool calls already counted by on_llm_end or other integrations + ctx.tool_calls = 2 + + with pytest.raises(HarnessStopError, match="max tool calls"): + handler.on_tool_start(serialized={"name": "search"}, input_str="query") + + +def test_tool_start_counts_executions_and_blocks_after_limit() -> None: + init(mode="enforce", max_tool_calls=1, budget=1.0) + handler = HarnessAwareCascadeFlowCallbackHandler(fail_open=False) + + with run(max_tool_calls=1, budget=1.0) as ctx: + assert ctx.tool_calls == 0 + assert handler.on_tool_start(serialized={"name": "search"}, input_str="first") is None + assert ctx.tool_calls == 1 + + with pytest.raises(HarnessStopError, match="max tool calls"): + handler.on_tool_start(serialized={"name": "search"}, input_str="second") + + assert ctx.tool_calls == 1 + trace = ctx.trace() + assert trace[-1]["action"] == "deny_tool" + assert trace[-1]["applied"] is True + + +def test_extract_tool_calls_supports_llm_result_nested_generations() -> None: + generation = ChatGeneration( + message=AIMessage( + content="", tool_calls=[{"name": "search", "args": {"q": "x"}, "id": "t1"}] + ), + generation_info={}, + ) + llm_result = LLMResult(generations=[[generation]], llm_output={"model_name": "gpt-4o-mini"}) + tool_calls = extract_tool_calls(llm_result) + assert len(tool_calls) == 1 + assert tool_calls[0]["name"] == "search" + + +def test_extract_and_apply_langgraph_state() -> None: + state = extract_langgraph_state( + { + "metadata": { + "langgraph_state": { + "step": 4, + "tool_calls": 3, + "budget_remaining": 0.42, + "latency_ms": 130.0, + "energy": 77.0, + "model": "gpt-4o-mini", + } + } + } + ) + + assert state["step_count"] == 4 + assert state["tool_calls"] == 3 + assert state["model_used"] == "gpt-4o-mini" + + init(mode="observe", budget=1.0) + with 
run(budget=1.0) as ctx: + apply_langgraph_state(ctx, state) + assert ctx.step_count == 4 + assert ctx.tool_calls == 3 + assert ctx.budget_remaining == pytest.approx(0.42) + assert ctx.latency_used_ms == pytest.approx(130.0) + assert ctx.energy_used == pytest.approx(77.0) + assert ctx.model_used == "gpt-4o-mini" diff --git a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py index fdbcff1d..0f051519 100644 --- a/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py +++ b/cascadeflow/integrations/langchain/tests/test_langchain_integration_features.py @@ -4,7 +4,11 @@ from langchain_core.messages import AIMessage, BaseMessage, HumanMessage from langchain_core.outputs import ChatGeneration, ChatResult +from cascadeflow.harness import init, reset, run from cascadeflow.integrations.langchain import CascadeFlow +from cascadeflow.integrations.langchain.harness_callback import ( + HarnessAwareCascadeFlowCallbackHandler, +) class MockSequenceChatModel(BaseChatModel): @@ -116,3 +120,38 @@ def test_domain_policy_direct_to_verifier_skips_drafter() -> None: assert drafter.calls == 0 assert verifier.calls == 1 assert result.llm_output["cascade"]["routing_reason"] == "domain_policy_direct" + + +def test_wrapper_only_auto_adds_harness_callback_inside_active_run_scope() -> None: + reset() + init(mode="observe") + drafter = MockSequenceChatModel("draft") + verifier = MockSequenceChatModel("verify") + cascade = CascadeFlow(drafter=drafter, verifier=verifier, enable_pre_router=False) + + outside_callbacks = cascade._resolve_callbacks([]) + assert not any( + isinstance(cb, HarnessAwareCascadeFlowCallbackHandler) for cb in outside_callbacks + ) + + with run(): + inside_callbacks = cascade._resolve_callbacks([]) + assert any( + isinstance(cb, HarnessAwareCascadeFlowCallbackHandler) for cb in inside_callbacks + ) + + +def 
test_wrapper_does_not_duplicate_harness_callback() -> None: + reset() + init(mode="observe") + drafter = MockSequenceChatModel("draft") + verifier = MockSequenceChatModel("verify") + cascade = CascadeFlow(drafter=drafter, verifier=verifier, enable_pre_router=False) + existing = HarnessAwareCascadeFlowCallbackHandler() + + with run(): + callbacks = cascade._resolve_callbacks([existing]) + assert ( + len([cb for cb in callbacks if isinstance(cb, HarnessAwareCascadeFlowCallbackHandler)]) + == 1 + ) diff --git a/cascadeflow/integrations/langchain/utils.py b/cascadeflow/integrations/langchain/utils.py index fe47a353..04f3e4a5 100644 --- a/cascadeflow/integrations/langchain/utils.py +++ b/cascadeflow/integrations/langchain/utils.py @@ -195,6 +195,10 @@ def extract_tool_calls(response: Any) -> list[dict[str, Any]]: msg = None if hasattr(response, "generations") and response.generations: generation = response.generations[0] + # LLMResult.generations is often list[list[Generation]], while ChatResult + # uses list[Generation]. Support both shapes. 
+ if isinstance(generation, list) and generation: + generation = generation[0] msg = getattr(generation, "message", None) else: msg = getattr(response, "message", None) or response diff --git a/cascadeflow/integrations/langchain/wrapper.py b/cascadeflow/integrations/langchain/wrapper.py index ed6d554b..f108d60f 100644 --- a/cascadeflow/integrations/langchain/wrapper.py +++ b/cascadeflow/integrations/langchain/wrapper.py @@ -169,6 +169,35 @@ def _split_runnable_config( model_kwargs[key] = value return model_kwargs, config + def _resolve_callbacks(self, raw_callbacks: Any) -> list[Any]: + if raw_callbacks is None: + callbacks: list[Any] = [] + elif isinstance(raw_callbacks, list): + callbacks = list(raw_callbacks) + elif isinstance(raw_callbacks, tuple): + callbacks = list(raw_callbacks) + else: + callbacks = [raw_callbacks] + + try: + from cascadeflow.harness import get_current_run, get_harness_config + + harness_config = get_harness_config() + run_ctx = get_current_run() + if harness_config.mode == "off" or run_ctx is None or run_ctx.mode == "off": + return callbacks + + from .harness_callback import HarnessAwareCascadeFlowCallbackHandler + + if any(isinstance(cb, HarnessAwareCascadeFlowCallbackHandler) for cb in callbacks): + return callbacks + + callbacks.append(HarnessAwareCascadeFlowCallbackHandler()) + return callbacks + except Exception: + # Preserve existing behavior for users who do not enable harness flows. 
+ return callbacks + def _generate( self, messages: list[BaseMessage], @@ -202,7 +231,7 @@ def _generate( merged_kwargs["stop"] = stop # Extract callbacks before filtering (need to pass them explicitly to nested models) - callbacks = merged_kwargs.get("callbacks", []) + callbacks = self._resolve_callbacks(merged_kwargs.get("callbacks", [])) existing_tags = merged_kwargs.get("tags", []) or [] base_tags = existing_tags + ["cascadeflow"] if existing_tags else ["cascadeflow"] @@ -599,7 +628,7 @@ async def _agenerate( merged_kwargs["stop"] = stop # Extract callbacks before filtering (need to pass them explicitly to nested models) - callbacks = merged_kwargs.get("callbacks", []) + callbacks = self._resolve_callbacks(merged_kwargs.get("callbacks", [])) existing_tags = merged_kwargs.get("tags", []) or [] base_tags = existing_tags + ["cascadeflow"] if existing_tags else ["cascadeflow"] @@ -1001,7 +1030,7 @@ def _stream( stream_kwargs, base_config = self._split_runnable_config(merged_kwargs) base_tags = (base_config.get("tags") or []) + ["cascadeflow"] existing_metadata = base_config.get("metadata", {}) or {} - callbacks = base_config.get("callbacks", []) + callbacks = self._resolve_callbacks(base_config.get("callbacks", [])) resolved_domain = self._resolve_domain(messages, existing_metadata) effective_quality_threshold = self._effective_quality_threshold(resolved_domain) force_verifier_for_domain = self._domain_forces_verifier(resolved_domain) @@ -1324,7 +1353,7 @@ async def _astream( stream_kwargs, base_config = self._split_runnable_config(merged_kwargs) base_tags = (base_config.get("tags") or []) + ["cascadeflow"] existing_metadata = base_config.get("metadata", {}) or {} - callbacks = base_config.get("callbacks", []) + callbacks = self._resolve_callbacks(base_config.get("callbacks", [])) safe_kwargs = { k: v for k, v in stream_kwargs.items() diff --git a/docs-site/api-reference/python/agent-decorator.mdx b/docs-site/api-reference/python/agent-decorator.mdx new file mode 
100644 index 00000000..912a03fd --- /dev/null +++ b/docs-site/api-reference/python/agent-decorator.mdx @@ -0,0 +1,79 @@ +--- +title: "@cascadeflow.agent()" +description: Decorate agent functions with policy metadata including budget, compliance, and KPI weights. +--- + +# @cascadeflow.agent() + +Annotate agent functions with policy metadata. The decorator attaches budget, compliance, and KPI configuration to the function for the harness to use at runtime. + +## Signature + +```python +def agent( + budget: Optional[float] = None, + compliance: Optional[str] = None, + kpi_weights: Optional[dict[str, float]] = None, + kpi_targets: Optional[dict[str, float]] = None, + max_tool_calls: Optional[int] = None, +) +``` + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `budget` | `float \| None` | `None` | Max USD for this agent | +| `compliance` | `str \| None` | `None` | Compliance mode | +| `kpi_weights` | `dict \| None` | `None` | KPI dimension weights | +| `kpi_targets` | `dict \| None` | `None` | KPI dimension targets | +| `max_tool_calls` | `int \| None` | `None` | Max tool/function calls | + +## Usage + +### Basic + +```python +@cascadeflow.agent(budget=0.20) +async def my_agent(query: str): + return await llm.complete(query) +``` + +### With compliance + +```python +@cascadeflow.agent(budget=0.50, compliance="gdpr") +async def eu_agent(query: str): + return await llm.complete(query) +``` + +### With KPI weights + +```python +@cascadeflow.agent( + budget=1.00, + kpi_weights={"quality": 0.8, "cost": 0.2}, + kpi_targets={"quality": 0.9}, +) +async def premium_agent(query: str): + return await llm.complete(query) +``` + +### Multiple agents with different policies + +```python +@cascadeflow.agent(budget=0.10, kpi_weights={"cost": 0.9, "quality": 0.1}) +async def triage_agent(query: str): + return await llm.complete(query) + +@cascadeflow.agent(budget=2.00, kpi_weights={"quality": 0.9, "cost": 0.1}) +async def analysis_agent(query: 
str): + return await llm.complete(query) +``` + +## Notes + +- The decorator does not wrap or modify the function's execution. It attaches metadata that the harness reads at runtime. +- Works with both sync and async functions. +- Requires `init()` to have been called for the metadata to take effect. +- Can be combined with `run()` — the run's constraints are checked in addition to the decorator's. diff --git a/docs-site/api-reference/python/harness-config.mdx b/docs-site/api-reference/python/harness-config.mdx new file mode 100644 index 00000000..42ae7a6d --- /dev/null +++ b/docs-site/api-reference/python/harness-config.mdx @@ -0,0 +1,73 @@ +--- +title: HarnessConfig +description: Full configuration dataclass for the cascadeflow harness with all fields, types, and defaults. +--- + +# HarnessConfig + +Configuration dataclass for the cascadeflow harness. Pass to `cascadeflow.init(config=...)` for full control. + +## Definition + +```python +from dataclasses import dataclass +from typing import Optional + +@dataclass +class HarnessConfig: + mode: HarnessMode = "off" + verbose: bool = False + budget: Optional[float] = None + max_tool_calls: Optional[int] = None + max_latency_ms: Optional[float] = None + max_energy: Optional[float] = None + kpi_targets: Optional[dict[str, float]] = None + kpi_weights: Optional[dict[str, float]] = None + compliance: Optional[str] = None +``` + +## Fields + +| Field | Type | Default | Description | +|---|---|---|---| +| `mode` | `"off" \| "observe" \| "enforce"` | `"off"` | Harness mode | +| `verbose` | `bool` | `False` | Print decisions to stderr | +| `budget` | `float \| None` | `None` | Max USD for the run (None = unlimited) | +| `max_tool_calls` | `int \| None` | `None` | Max tool/function calls (None = unlimited) | +| `max_latency_ms` | `float \| None` | `None` | Max wall-clock ms per call (None = unlimited) | +| `max_energy` | `float \| None` | `None` | Max energy units (None = unlimited) | +| `kpi_targets` | `dict \| None` | 
`None` | Target values per KPI dimension | +| `kpi_weights` | `dict \| None` | `None` | Relative weights per KPI dimension | +| `compliance` | `str \| None` | `None` | Compliance mode: `"gdpr"`, `"hipaa"`, `"pci"`, `"strict"` | + +## HarnessMode + +```python +HarnessMode = Literal["off", "observe", "enforce"] +``` + +## Usage + +```python +from cascadeflow import HarnessConfig +import cascadeflow + +config = HarnessConfig( + mode="enforce", + budget=1.00, + max_tool_calls=20, + max_energy=200.0, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.85}, + verbose=True, +) + +cascadeflow.init(config=config) +``` + +## Import + +```python +from cascadeflow import HarnessConfig +``` diff --git a/docs-site/api-reference/python/init.mdx b/docs-site/api-reference/python/init.mdx new file mode 100644 index 00000000..b07a0e00 --- /dev/null +++ b/docs-site/api-reference/python/init.mdx @@ -0,0 +1,68 @@ +--- +title: cascadeflow.init() +description: Activate the cascadeflow harness globally with a mode and optional configuration. +--- + +# cascadeflow.init() + +Activate the harness globally. All subsequent LLM calls (OpenAI, Anthropic) are automatically tracked. + +## Signature + +```python +def init( + mode: HarnessMode = "off", + *, + config: Optional[HarnessConfig] = None, + verbose: bool = False, +) -> HarnessInitReport +``` + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `mode` | `"off" \| "observe" \| "enforce"` | `"off"` | Harness mode | +| `config` | `HarnessConfig \| None` | `None` | Full configuration (overrides mode) | +| `verbose` | `bool` | `False` | Print decisions to stderr | + +## Returns + +`HarnessInitReport` — confirmation of harness activation with mode and configuration summary. 
+ +## Usage + +### Minimal + +```python +import cascadeflow +cascadeflow.init(mode="observe") +``` + +### With config + +```python +from cascadeflow import HarnessConfig + +config = HarnessConfig( + mode="enforce", + budget=1.00, + compliance="gdpr", + verbose=True, +) +cascadeflow.init(config=config) +``` + +### Environment-driven + +```python +import os +cascadeflow.init(mode=os.getenv("CASCADEFLOW_MODE", "observe")) +``` + +## Notes + +- Call `init()` once at application startup, before any LLM calls +- Calling `init()` again replaces the previous configuration +- Use `cascadeflow.reset()` to deactivate the harness +- `init(mode="off")` is equivalent to not calling `init()` at all diff --git a/docs-site/api-reference/python/run-context.mdx b/docs-site/api-reference/python/run-context.mdx new file mode 100644 index 00000000..be9377a4 --- /dev/null +++ b/docs-site/api-reference/python/run-context.mdx @@ -0,0 +1,76 @@ +--- +title: HarnessRunContext +description: Run context object yielded by cascadeflow.run() with summary(), trace(), and budget tracking methods. +--- + +# HarnessRunContext + +The context object yielded by `cascadeflow.run()`. Provides access to run metrics, decision traces, and budget state. + +## Methods + +### summary() + +Returns aggregate metrics for the run. + +```python +summary = session.summary() +``` + +Returns a dict with: + +| Key | Type | Description | +|---|---|---| +| `cost_total` | `float` | Cumulative cost in USD | +| `steps` | `int` | Number of LLM calls | +| `tool_calls` | `int` | Number of tool/function calls | +| `latency_total_ms` | `float` | Total wall-clock latency in ms | +| `energy_used` | `float` | Total energy units consumed | +| `budget_remaining` | `float \| None` | USD remaining (None if no budget set) | + +### trace() + +Returns the list of decision records for the run. 
+ +```python +records = session.trace() +``` + +Each record is a dict with: + +| Key | Type | Description | +|---|---|---| +| `action` | `str` | `"allow"`, `"switch_model"`, `"deny_tool"`, or `"stop"` | +| `reason` | `str` | Human-readable explanation | +| `model` | `str` | Model name | +| `step` | `int` | Step number (1-indexed) | +| `cost_total` | `float` | Cumulative cost at this step | +| `budget_state` | `str` | `"ok"`, `"warning"`, or `"exceeded"` | +| `applied` | `bool` | Whether the action was enforced | + +## Usage + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Analyze this dataset") + + # Aggregate metrics + summary = session.summary() + print(f"Cost: ${summary['cost_total']:.4f}") + print(f"Steps: {summary['steps']}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") + + # Decision trace + for record in session.trace(): + print(f"Step {record['step']}: {record['action']} — {record['reason']}") +``` + +## Import + +```python +from cascadeflow import HarnessRunContext +``` diff --git a/docs-site/api-reference/python/run.mdx b/docs-site/api-reference/python/run.mdx new file mode 100644 index 00000000..72202a74 --- /dev/null +++ b/docs-site/api-reference/python/run.mdx @@ -0,0 +1,83 @@ +--- +title: cascadeflow.run() +description: Create a scoped run context with budget caps, tool call limits, and metrics tracking. +--- + +# cascadeflow.run() + +Create a scoped run context manager that tracks metrics and optionally enforces constraints for a block of agent execution. 
+ +## Signature + +```python +def run( + budget: Optional[float] = None, + max_tool_calls: Optional[int] = None, + max_latency_ms: Optional[float] = None, + max_energy: Optional[float] = None, + compliance: Optional[str] = None, + kpi_weights: Optional[dict[str, float]] = None, + kpi_targets: Optional[dict[str, float]] = None, +) -> ContextManager[HarnessRunContext] +``` + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `budget` | `float \| None` | `None` | Max USD for this run | +| `max_tool_calls` | `int \| None` | `None` | Max tool/function calls | +| `max_latency_ms` | `float \| None` | `None` | Max wall-clock ms per call | +| `max_energy` | `float \| None` | `None` | Max energy units | +| `compliance` | `str \| None` | `None` | `"gdpr"`, `"hipaa"`, `"pci"`, or `"strict"` | +| `kpi_weights` | `dict \| None` | `None` | KPI dimension weights | +| `kpi_targets` | `dict \| None` | `None` | KPI dimension targets | + +## Returns + +Context manager yielding `HarnessRunContext`. See [HarnessRunContext](/api-reference/python/run-context). + +## Usage + +### Basic budget + +```python +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Analyze this data") + print(session.summary()) +``` + +### Full configuration + +```python +with cascadeflow.run( + budget=1.00, + max_tool_calls=10, + max_energy=100.0, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.9}, +) as session: + result = await agent.run("Process EU customer data") + print(session.summary()) + for record in session.trace(): + print(f"Step {record['step']}: {record['action']}") +``` + +### Nested runs + +Runs can be nested. 
Inner runs inherit the parent's remaining budget: + +```python +with cascadeflow.run(budget=1.00) as outer: + with cascadeflow.run(budget=0.30) as inner: + await agent.run("Sub-task") + # outer.summary() includes inner costs +``` + +## Notes + +- `run()` requires `init()` to have been called first +- Parameters override the global config for the duration of the block +- Use `session.summary()` for aggregate metrics +- Use `session.trace()` for per-step decision records diff --git a/docs-site/api-reference/typescript/core.mdx b/docs-site/api-reference/typescript/core.mdx new file mode 100644 index 00000000..ae8f8311 --- /dev/null +++ b/docs-site/api-reference/typescript/core.mdx @@ -0,0 +1,77 @@ +--- +title: "@cascadeflow/core" +description: TypeScript core package with CascadeAgent for model routing, cost tracking, and quality validation. +--- + +# @cascadeflow/core + +The core TypeScript package for cascadeflow. Provides `CascadeAgent` for speculative model cascading with quality validation. + +## Install + +```bash +npm install @cascadeflow/core +``` + +## CascadeAgent + +```typescript +import { CascadeAgent, ModelConfig } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const result = await agent.run('What is TypeScript?'); +console.log(`Model: ${result.modelUsed}`); +console.log(`Cost: $${result.totalCost}`); +console.log(`Saved: ${result.savingsPercentage}%`); +``` + +## ModelConfig + +```typescript +interface ModelConfig { + name: string; // Model name (e.g. 'gpt-4o-mini') + provider: string; // Provider name (e.g. 
'openai') + cost: number; // Cost per token (approximate) +} +``` + +## CascadeAgentOptions + +```typescript +interface CascadeAgentOptions { + models: ModelConfig[]; + quality?: { + threshold?: number; // Confidence threshold (0-1) + requireMinimumTokens?: number; // Min response length + useSemanticValidation?: boolean; // Enable ML validation + semanticThreshold?: number; // Semantic similarity threshold + }; +} +``` + +## Result + +```typescript +interface CascadeResult { + content: string; + modelUsed: string; + totalCost: number; + savingsPercentage: number; + cascadeDecision: string; +} +``` + +## Features + +- Speculative execution with quality validation +- Multi-provider support (OpenAI, Anthropic, Groq, Ollama, vLLM) +- Streaming responses +- Tool calling and structured output +- Cost tracking and analytics +- Works in Node.js, Browser, and Edge Functions diff --git a/docs-site/api-reference/typescript/langchain.mdx b/docs-site/api-reference/typescript/langchain.mdx new file mode 100644 index 00000000..9a9e3050 --- /dev/null +++ b/docs-site/api-reference/typescript/langchain.mdx @@ -0,0 +1,77 @@ +--- +title: "@cascadeflow/langchain" +description: TypeScript LangChain integration with withCascade() for drop-in cascade routing and model discovery helpers. +--- + +# @cascadeflow/langchain + +LangChain integration for TypeScript. Provides `withCascade()` for drop-in cascade routing with any LangChain chat model. + +## Install + +```bash +npm install @cascadeflow/langchain @langchain/core @langchain/openai +``` + +## withCascade + +Creates a cascade-enabled chat model from a drafter and verifier. 
+ +```typescript +import { ChatOpenAI } from '@langchain/openai'; +import { ChatAnthropic } from '@langchain/anthropic'; +import { withCascade } from '@cascadeflow/langchain'; + +const cascade = withCascade({ + drafter: new ChatOpenAI({ model: 'gpt-4o-mini' }), + verifier: new ChatAnthropic({ model: 'claude-sonnet-4' }), + qualityThreshold: 0.8, +}); + +// Use like any LangChain chat model +const result = await cascade.invoke('Explain quantum computing'); + +// With LCEL chains +const chain = prompt.pipe(cascade).pipe(new StringOutputParser()); +``` + +## Options + +```typescript +interface CascadeOptions { + drafter: BaseChatModel; // Cheap, fast model + verifier: BaseChatModel; // Powerful fallback model + qualityThreshold?: number; // 0-1, default 0.4 +} +``` + +## Model Discovery + +```typescript +import { + discoverCascadePairs, + findBestCascadePair, + analyzeModel, + validateCascadePair, +} from '@cascadeflow/langchain'; + +const models = [ + new ChatOpenAI({ model: 'gpt-4o-mini' }), + new ChatOpenAI({ model: 'gpt-4o' }), + new ChatAnthropic({ model: 'claude-sonnet-4' }), +]; + +const best = findBestCascadePair(models); +const cascade = withCascade({ + drafter: best.drafter, + verifier: best.verifier, +}); +``` + +## Features + +- Full LCEL support (pipes, sequences, batch) +- Streaming with pre-routing +- Tool calling and structured output +- LangSmith cost tracking metadata +- Model discovery and pair validation diff --git a/docs-site/api-reference/typescript/vercel-ai.mdx b/docs-site/api-reference/typescript/vercel-ai.mdx new file mode 100644 index 00000000..ae9af949 --- /dev/null +++ b/docs-site/api-reference/typescript/vercel-ai.mdx @@ -0,0 +1,63 @@ +--- +title: "@cascadeflow/vercel-ai" +description: Vercel AI SDK middleware integration for cascade routing with streaming, multi-turn chat, and tool execution. +--- + +# @cascadeflow/vercel-ai + +Middleware integration for the Vercel AI SDK. 
Adds cascade routing to AI SDK applications with streaming support.
+
+## Install
+
+```bash
+npm install @cascadeflow/vercel-ai
+```
+
+## createChatHandler
+
+Creates a request handler for AI SDK chat endpoints.
+
+```typescript
+import { createChatHandler } from '@cascadeflow/vercel-ai';
+import { CascadeAgent } from '@cascadeflow/core';
+
+const agent = new CascadeAgent({
+  models: [
+    { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 },
+    { name: 'gpt-4o', provider: 'openai', cost: 0.00625 },
+  ],
+});
+
+const handler = createChatHandler(agent, {
+  protocol: 'data',
+  tools,
+  toolHandlers,
+  maxSteps: 5,
+});
+```
+
+## Options
+
+```typescript
+interface ChatHandlerOptions {
+  protocol: 'data' | 'ui';                  // AI SDK stream protocol
+  tools?: ToolDefinition[];                 // Tool definitions
+  toolHandlers?: Record<string, Function>;  // Server-side tool execution
+  toolExecutor?: Function;                  // Universal tool executor
+  maxSteps?: number;                        // Multi-step tool loop limit
+  forceDirect?: boolean;                    // Skip cascade, use verifier
+  allowOverrides?: string[];                // Request-level override keys
+  overrideSecret?: string;                  // Shared secret for overrides
+}
+```
+
+## Features
+
+- AI SDK v4 `data` stream and v5/v6 UI streams
+- `useChat` multi-turn support
+- `parts` message format (AI SDK v6)
+- Tool call streaming visibility
+- Server-side tool execution loops
+- Multi-step controls
+- Cascade decision stream parts
+- Request-level overrides with allowlist
diff --git a/docs-site/changelog.mdx b/docs-site/changelog.mdx
new file mode 100644
index 00000000..2cda1c2f
--- /dev/null
+++ b/docs-site/changelog.mdx
@@ -0,0 +1,28 @@
+---
+title: Changelog
+description: Release history and changelog for cascadeflow.
+---
+
+# Changelog
+
+For the full release history, see [GitHub Releases](https://github.com/lemony-ai/cascadeflow/releases).
+ +## Recent Highlights + +- **v1.0.0** — Agent runtime intelligence layer with harness API, 6 framework integrations, compliance gating, KPI-weighted routing, energy tracking, decision traces +- Agent loops and multi-agent orchestration +- Tool execution engine with parallel execution and risk gating +- Hooks and callbacks for telemetry and observability +- Vercel AI SDK integration (17+ additional providers) +- OpenClaw provider for custom deployments +- Gateway server (drop-in OpenAI/Anthropic-compatible endpoint) +- User tier management with per-user budgets +- Semantic quality validators via FastEmbed +- Domain-aware cascading with 16 domain classifications +- Benchmark reports (MMLU, GSM8K, MT-Bench, HumanEval, TruthfulQA) + +## Links + +- [GitHub Releases](https://github.com/lemony-ai/cascadeflow/releases) +- [PyPI](https://pypi.org/project/cascadeflow/) +- [npm](https://www.npmjs.com/package/@cascadeflow/core) diff --git a/docs-site/contributing.mdx b/docs-site/contributing.mdx new file mode 100644 index 00000000..ff45625e --- /dev/null +++ b/docs-site/contributing.mdx @@ -0,0 +1,96 @@ +--- +title: Contributing +description: How to contribute to cascadeflow — development setup, code style, testing, and pull request process. +--- + +# Contributing + +We welcome contributions to cascadeflow. This guide covers development setup for both Python and TypeScript. 
+ +## Monorepo Structure + +``` +cascadeflow/ + cascadeflow/ # Python package + packages/ + core/ # TypeScript core + langchain-cascadeflow/ # LangChain TypeScript + integrations/ + vercel-ai/ # Vercel AI SDK + n8n/ # n8n community nodes + tests/ # Python tests + examples/ # Python examples + docs/ # Documentation + docs-site/ # Mintlify docs site +``` + +## Python Development + +### Setup + +```bash +git clone https://github.com/lemony-ai/cascadeflow.git +cd cascadeflow +python -m venv .venv +source .venv/bin/activate +pip install -e ".[dev]" +pre-commit install +``` + +### Code Style + +- **Formatter**: Black (line length 100) +- **Linter**: Ruff +- **Type checker**: mypy +- **Import sorting**: isort + +```bash +black cascadeflow/ tests/ +ruff check cascadeflow/ tests/ +mypy cascadeflow/ +``` + +### Testing + +```bash +pytest tests/ -x -q # Run all tests +pytest tests/ -m "not integration" # Skip integration tests +pytest tests/ --cov=cascadeflow # With coverage +``` + +## TypeScript Development + +### Setup + +```bash +cd packages/core +pnpm install +pnpm build +pnpm test +``` + +### Code Style + +- **Linter**: ESLint +- **Language**: TypeScript (strict mode) +- **Indentation**: 2 spaces + +## Making Changes + +1. Create a branch from `main` +2. Make changes with clear, descriptive commits +3. Follow commit conventions: `feat:`, `fix:`, `docs:`, `test:`, `refactor:`, `chore:` +4. Add tests for new functionality +5. 
Ensure all tests pass + +## Pull Requests + +- All PRs require review approval +- Linear history enforced (no merge commits) +- CI must pass before merge + +## Links + +- [GitHub Issues](https://github.com/lemony-ai/cascadeflow/issues) — Bug reports and feature requests +- [GitHub Discussions](https://github.com/lemony-ai/cascadeflow/discussions) — Questions and community +- [Email](mailto:hello@lemony.ai) — Direct support diff --git a/docs-site/docs.json b/docs-site/docs.json new file mode 100644 index 00000000..1e441f37 --- /dev/null +++ b/docs-site/docs.json @@ -0,0 +1,130 @@ +{ + "$schema": "https://mintlify.com/docs.json", + "theme": "palm", + "name": "cascadeflow", + "colors": { + "primary": "#0E7490", + "light": "#22D3EE", + "dark": "#0E7490" + }, + "logo": { + "light": "/logo/cascadeflow-light.svg", + "dark": "/logo/cascadeflow-dark.svg" + }, + "favicon": "/favicon.svg", + "tabs": [ + { "id": "get-started", "name": "Get Started" }, + { "id": "harness", "name": "Harness" }, + { "id": "integrations", "name": "Integrations" }, + { "id": "api-reference", "name": "API Reference" }, + { "id": "examples", "name": "Examples" } + ], + "navigation": { + "get-started": [ + { + "group": "Get Started", + "pages": [ + "get-started/introduction", + "get-started/quickstart", + "get-started/installation", + "get-started/how-it-works" + ] + }, + { + "group": "Resources", + "pages": [ + "changelog", + "contributing" + ] + } + ], + "harness": [ + { + "group": "Harness", + "pages": [ + "harness/overview", + "harness/modes", + "harness/budget-enforcement", + "harness/compliance", + "harness/kpi-optimization", + "harness/energy-tracking", + "harness/decision-trace", + "harness/actions" + ] + } + ], + "integrations": [ + { + "group": "Integrations", + "pages": [ + "integrations/overview", + "integrations/langchain", + "integrations/openai-agents", + "integrations/crewai", + "integrations/google-adk", + "integrations/n8n", + "integrations/vercel-ai" + ] + } + ], + "api-reference": 
[ + { + "group": "Python", + "pages": [ + "api-reference/python/init", + "api-reference/python/run", + "api-reference/python/agent-decorator", + "api-reference/python/harness-config", + "api-reference/python/run-context" + ] + }, + { + "group": "TypeScript", + "pages": [ + "api-reference/typescript/core", + "api-reference/typescript/vercel-ai", + "api-reference/typescript/langchain" + ] + } + ], + "examples": [ + { + "group": "Examples", + "pages": [ + "examples/basic-usage", + "examples/budget-enforcement", + "examples/compliance-gating", + "examples/kpi-weighted-routing", + "examples/multi-agent", + "examples/enterprise-patterns" + ] + } + ] + }, + "topbarLinks": [ + { + "name": "GitHub", + "url": "https://github.com/lemony-ai/cascadeflow" + } + ], + "topbarCtaButton": { + "name": "Get Started", + "url": "/get-started/quickstart" + }, + "footerSocials": { + "github": "https://github.com/lemony-ai/cascadeflow", + "x": "https://x.com/saschabuehrle" + }, + "anchors": [ + { + "name": "GitHub", + "icon": "github", + "url": "https://github.com/lemony-ai/cascadeflow" + }, + { + "name": "PyPI", + "icon": "python", + "url": "https://pypi.org/project/cascadeflow/" + } + ] +} diff --git a/docs-site/examples/basic-usage.mdx b/docs-site/examples/basic-usage.mdx new file mode 100644 index 00000000..9cf838d0 --- /dev/null +++ b/docs-site/examples/basic-usage.mdx @@ -0,0 +1,81 @@ +--- +title: Basic Usage +description: Simple cascade setup with OpenAI models showing speculative execution, cost tracking, and savings calculation. +--- + +# Basic Usage + +A minimal example showing cascadeflow's speculative cascade with two OpenAI models. + +## Setup + +```bash +pip install "cascadeflow[openai]" +export OPENAI_API_KEY="sk-..." 
+``` + +## Code + +```python +import asyncio +from cascadeflow import CascadeAgent, ModelConfig + +agent = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +queries = [ + "What's the capital of France?", # Simple — draft model handles + "Explain quantum computing", # Medium — may escalate + "Write a Python function to sort a list", # Code — domain routing +] + +async def main(): + total_cost = 0 + baseline_cost = 0 + + for query in queries: + result = await agent.run(query) + total_cost += result.total_cost + baseline_cost += result.total_cost if result.model_used == "gpt-4o" else result.total_cost * (0.00625 / 0.000375) + + print(f"Query: {query[:40]}...") + print(f" Model: {result.model_used}") + print(f" Cost: ${result.total_cost:.6f}") + print() + + savings = (1 - total_cost / baseline_cost) * 100 if baseline_cost > 0 else 0 + print(f"Total cost: ${total_cost:.6f}") + print(f"Savings: {savings:.0f}%") + +asyncio.run(main()) +``` + +## How It Works + +1. `gpt-4o-mini` (draft model) handles the query first +2. Quality validation checks the response +3. If quality passes, the draft response is returned (60-70% of queries) +4. If quality fails, `gpt-4o` (verifier model) handles the query +5. 
Cost tracking reports per-query and aggregate metrics + +## TypeScript + +```typescript +import { CascadeAgent } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const result = await agent.run('What is TypeScript?'); +console.log(`Model: ${result.modelUsed}, Cost: $${result.totalCost}`); +``` + +## Source + +[examples/basic_usage.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/basic_usage.py) diff --git a/docs-site/examples/budget-enforcement.mdx b/docs-site/examples/budget-enforcement.mdx new file mode 100644 index 00000000..dab52ed9 --- /dev/null +++ b/docs-site/examples/budget-enforcement.mdx @@ -0,0 +1,84 @@ +--- +title: Budget Enforcement +description: Per-run and per-user budget caps with enforcement callbacks, cost tracking, and automatic stop actions. +--- + +# Budget Enforcement + +Enforce spending limits on agent runs with automatic stop actions when budget is exceeded. 
+ +## Basic Budget Cap + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Research and summarize this topic") + + summary = session.summary() + print(f"Cost: ${summary['cost_total']:.4f}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") + print(f"Steps completed: {summary['steps']}") +``` + +## Budget with Tool Call Limit + +```python +with cascadeflow.run(budget=1.00, max_tool_calls=5) as session: + result = await agent.run("Search and analyze this dataset") + # Stops when either budget or tool call limit is hit +``` + +## Per-Agent Budgets + +```python +@cascadeflow.agent(budget=0.10) +async def triage_agent(query: str): + """Cheap triage — $0.10 max.""" + return await llm.complete(query) + +@cascadeflow.agent(budget=2.00) +async def research_agent(query: str): + """Deep research — $2.00 max.""" + return await llm.complete(query) +``` + +## Cost Tracking (Legacy API) + +For pre-harness budget enforcement using the telemetry API: + +```python +from cascadeflow.telemetry import BudgetConfig, CostTracker, strict_budget_enforcement + +tracker = CostTracker( + budget_config=BudgetConfig( + daily_limit=10.0, + per_query_limit=0.50, + alert_threshold=0.8, + ), + enforcement_callback=strict_budget_enforcement, +) + +# Track costs manually +tracker.track(model="gpt-4o", cost=0.003) +print(f"Daily spend: ${tracker.daily_spend:.4f}") +``` + +## Decision Trace + +```python +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Multi-step analysis") + + for record in session.trace(): + if record['action'] == 'stop': + print(f"Stopped at step {record['step']}: {record['reason']}") + else: + print(f"Step {record['step']}: {record['action']} (${record['cost_total']:.4f})") +``` + +## Source + +[examples/enforcement/basic_enforcement.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/enforcement/basic_enforcement.py) diff --git 
a/docs-site/examples/compliance-gating.mdx b/docs-site/examples/compliance-gating.mdx new file mode 100644 index 00000000..19f9fbd3 --- /dev/null +++ b/docs-site/examples/compliance-gating.mdx @@ -0,0 +1,89 @@ +--- +title: Compliance Gating +description: GDPR, HIPAA, PCI, and strict model allowlists with enforcement examples for regulated agent workflows. +--- + +# Compliance Gating + +Restrict which models can be used based on compliance requirements. + +## GDPR Compliance + +Only allow models approved for EU data processing: + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(compliance="gdpr") as session: + # Only gpt-4o, gpt-4o-mini, gpt-3.5-turbo are allowed + result = await agent.run("Process this EU customer feedback") + + for record in session.trace(): + if record['action'] == 'switch_model': + print(f"Model switched: {record['reason']}") +``` + +## HIPAA Compliance + +For healthcare data — stricter allowlist: + +```python +with cascadeflow.run(compliance="hipaa") as session: + # Only gpt-4o, gpt-4o-mini are allowed + result = await agent.run("Summarize this patient record") +``` + +## PCI Compliance + +For payment card data: + +```python +with cascadeflow.run(compliance="pci") as session: + # Only gpt-4o-mini, gpt-3.5-turbo are allowed + result = await agent.run("Analyze this transaction") +``` + +## Strict Mode + +Maximum restriction — single model only: + +```python +with cascadeflow.run(compliance="strict") as session: + # Only gpt-4o is allowed + result = await agent.run("Classify this sensitive document") +``` + +## Compliance Allowlists + +| Mode | Allowed Models | +|---|---| +| `gdpr` | gpt-4o, gpt-4o-mini, gpt-3.5-turbo | +| `hipaa` | gpt-4o, gpt-4o-mini | +| `pci` | gpt-4o-mini, gpt-3.5-turbo | +| `strict` | gpt-4o | + +## Combining with Budget + +```python +@cascadeflow.agent(budget=1.00, compliance="gdpr") +async def eu_data_agent(query: str): + """Process EU data within budget using only GDPR-approved 
models.""" + return await llm.complete(query) +``` + +## Observe Mode for Audit + +Use `observe` mode to audit which models would be blocked without affecting production: + +```python +cascadeflow.init(mode="observe") + +with cascadeflow.run(compliance="hipaa") as session: + result = await agent.run("Process health data") + + # Check which calls would have been blocked + violations = [r for r in session.trace() if r['action'] == 'switch_model'] + print(f"Compliance violations detected: {len(violations)}") +``` diff --git a/docs-site/examples/enterprise-patterns.mdx b/docs-site/examples/enterprise-patterns.mdx new file mode 100644 index 00000000..5949972c --- /dev/null +++ b/docs-site/examples/enterprise-patterns.mdx @@ -0,0 +1,127 @@ +--- +title: Enterprise Patterns +description: Production-ready patterns including retry logic, rate limiting, budget management, circuit breakers, caching, and health monitoring. +--- + +# Enterprise Patterns + +Production patterns for deploying cascadeflow at scale. 
+ +## Retry with Exponential Backoff + +```python +import asyncio +from cascadeflow import CascadeAgent + +async def execute_with_retry(agent, query, max_retries=3, base_delay=1.0): + for attempt in range(max_retries): + try: + return await agent.run(query) + except Exception as e: + if attempt == max_retries - 1: + raise + delay = base_delay * (2 ** attempt) + await asyncio.sleep(delay) +``` + +## Rate Limiting + +```python +import time +from collections import deque + +class RateLimiter: + def __init__(self, max_requests: int, window_seconds: float): + self.max_requests = max_requests + self.window = window_seconds + self.requests = deque() + + async def acquire(self): + now = time.monotonic() + while self.requests and self.requests[0] < now - self.window: + self.requests.popleft() + if len(self.requests) >= self.max_requests: + wait = self.requests[0] + self.window - now + await asyncio.sleep(wait) + self.requests.append(time.monotonic()) +``` + +## Budget Management + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +# Per-user daily budget +async def handle_user_request(user_id: str, query: str): + user_budget = get_user_remaining_budget(user_id) + + with cascadeflow.run(budget=min(user_budget, 0.50)) as session: + result = await agent.run(query) + + spent = session.summary()['cost_total'] + update_user_budget(user_id, spent) + return result +``` + +## Circuit Breaker + +```python +from cascadeflow import CircuitBreaker, CircuitBreakerConfig + +config = CircuitBreakerConfig( + failure_threshold=5, + recovery_timeout=30.0, + half_open_max_calls=2, +) + +breaker = CircuitBreaker(config=config) + +async def safe_call(agent, query): + if not breaker.allow_request(): + return fallback_response(query) + try: + result = await agent.run(query) + breaker.record_success() + return result + except Exception as e: + breaker.record_failure() + raise +``` + +## Response Caching + +```python +from cascadeflow import ResponseCache + +cache = 
ResponseCache(max_size=1000, ttl_seconds=300) + +async def cached_run(agent, query): + cached = cache.get(query) + if cached: + return cached + result = await agent.run(query) + cache.set(query, result) + return result +``` + +## Health Monitoring + +```python +with cascadeflow.run(budget=10.00) as session: + for query in production_queries: + result = await agent.run(query) + + summary = session.summary() + + # Alert on anomalies + if summary['cost_total'] > 8.0: + alert("Budget 80% consumed") + if summary['steps'] > 100: + alert("High step count") +``` + +## Source + +[examples/production_patterns.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/production_patterns.py) diff --git a/docs-site/examples/kpi-weighted-routing.mdx b/docs-site/examples/kpi-weighted-routing.mdx new file mode 100644 index 00000000..5bab7689 --- /dev/null +++ b/docs-site/examples/kpi-weighted-routing.mdx @@ -0,0 +1,95 @@ +--- +title: KPI-Weighted Routing +description: Configure quality, cost, latency, and energy weights to encode business priorities into model routing decisions. +--- + +# KPI-Weighted Routing + +Inject business priorities into every model decision using KPI weights. 
+ +## Quality-First (Premium Workload) + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + budget=2.00, + kpi_weights={"quality": 0.8, "cost": 0.1, "latency": 0.1}, + kpi_targets={"quality": 0.9} +) as session: + # Routes to highest-quality models within budget + result = await agent.run("Draft a legal contract clause") + print(session.summary()) +``` + +## Cost-First (High-Volume Batch) + +```python +with cascadeflow.run( + budget=5.00, + kpi_weights={"cost": 0.7, "quality": 0.2, "latency": 0.1} +) as session: + # Routes to cheapest models that meet quality floor + for query in batch_queries: + result = await agent.run(query) + print(f"Total cost: ${session.summary()['cost_total']:.4f}") +``` + +## Latency-First (Real-Time) + +```python +with cascadeflow.run( + kpi_weights={"latency": 0.7, "quality": 0.2, "cost": 0.1}, + max_latency_ms=2000.0 +) as session: + # Routes to fastest models, hard cap at 2 seconds + result = await agent.run("Quick classification task") +``` + +## Energy-Aware (Carbon-Conscious) + +```python +with cascadeflow.run( + kpi_weights={"quality": 0.4, "energy": 0.3, "cost": 0.3}, + max_energy=100.0 +) as session: + # Balances quality with energy efficiency + result = await agent.run("Summarize this report") + print(f"Energy used: {session.summary()['energy_used']:.1f} units") +``` + +## Per-Agent Profiles + +```python +@cascadeflow.agent( + budget=0.10, + kpi_weights={"cost": 0.9, "quality": 0.1} +) +async def triage_agent(query: str): + """Quick classification — prioritize cost.""" + return await llm.complete(query) + +@cascadeflow.agent( + budget=2.00, + kpi_weights={"quality": 0.9, "cost": 0.1}, + kpi_targets={"quality": 0.95} +) +async def analysis_agent(query: str): + """Deep analysis — prioritize quality.""" + return await llm.complete(query) +``` + +## Quality Priors + +The harness uses built-in quality priors for scoring: + +| Model | Quality Prior | Latency Prior | +|---|---|---| +| o1 | 0.95 
| 0.40 | +| gpt-4o | 0.90 | 0.72 | +| gpt-4-turbo | 0.88 | 0.66 | +| gpt-5-mini | 0.86 | 0.84 | +| gpt-4o-mini | 0.75 | 0.93 | +| gpt-3.5-turbo | 0.65 | 1.00 | diff --git a/docs-site/examples/multi-agent.mdx b/docs-site/examples/multi-agent.mdx new file mode 100644 index 00000000..06b9598b --- /dev/null +++ b/docs-site/examples/multi-agent.mdx @@ -0,0 +1,103 @@ +--- +title: Multi-Agent Orchestration +description: Multi-turn tool execution with agent-as-a-tool delegation and budget tracking across agent boundaries. +--- + +# Multi-Agent Orchestration + +cascadeflow supports multi-agent patterns with tool execution, delegation, and budget tracking across agent boundaries. + +## Tool Execution Loop + +```python +import asyncio +from cascadeflow import CascadeAgent, ModelConfig +from cascadeflow.tools import ToolConfig, ToolExecutor + +# Define tools +tools = [ + ToolConfig( + name="calculator", + description="Evaluate a math expression", + parameters={"expression": {"type": "string"}}, + handler=lambda expression: str(eval(expression)), + ), + ToolConfig( + name="search", + description="Search the web", + parameters={"query": {"type": "string"}}, + handler=lambda query: f"Results for: {query}", + ), +] + +agent = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +executor = ToolExecutor(tools=tools) + +async def main(): + result = await agent.run( + "Calculate 15% of 250 and search for tax rates", + tools=tools, + tool_executor=executor, + max_steps=5, + ) + print(result.content) + +asyncio.run(main()) +``` + +## With Harness Budget Tracking + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=1.00, max_tool_calls=10) as session: + result = await agent.run( + "Research this topic using multiple tools", + tools=tools, + tool_executor=executor, + max_steps=10, + ) + + summary = session.summary() + print(f"Cost: 
${summary['cost_total']:.4f}") + print(f"Tool calls: {summary['tool_calls']}") + print(f"Steps: {summary['steps']}") +``` + +## Agent-as-a-Tool Delegation + +```python +# Define a researcher agent as a tool +researcher = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +async def research_handler(query: str) -> str: + result = await researcher.run(query) + return result.content + +# Main agent can delegate to researcher +tools = [ + ToolConfig( + name="research", + description="Delegate research to a specialist agent", + parameters={"query": {"type": "string"}}, + handler=research_handler, + ), +] + +# Budget tracks across both agents +with cascadeflow.run(budget=2.00) as session: + result = await main_agent.run("Analyze and research this topic", tools=tools) +``` + +## Source + +[examples/agentic_multi_agent.py](https://github.com/lemony-ai/cascadeflow/blob/main/examples/agentic_multi_agent.py) diff --git a/docs-site/favicon.svg b/docs-site/favicon.svg new file mode 100644 index 00000000..496df9f5 --- /dev/null +++ b/docs-site/favicon.svg @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/docs-site/get-started/how-it-works.mdx b/docs-site/get-started/how-it-works.mdx new file mode 100644 index 00000000..721feef6 --- /dev/null +++ b/docs-site/get-started/how-it-works.mdx @@ -0,0 +1,112 @@ +--- +title: How It Works +description: Architecture of cascadeflow's two engines — Cascade for speculative model routing and Harness for agent runtime intelligence. +--- + +# How It Works + +cascadeflow ships two complementary engines that can be used independently or together. + +## Cascade Engine + +The Cascade Engine optimizes model selection through **speculative execution with quality validation**: + +1. **Speculatively executes** small, fast models first — optimistic execution ($0.15-0.30/1M tokens) +2. 
**Validates quality** of responses using configurable thresholds (completeness, confidence, correctness) +3. **Dynamically escalates** to larger models only when quality validation fails ($1.25-3.00/1M tokens) +4. **Learns patterns** to optimize future cascading decisions and domain-specific routing + +In practice, 60-70% of queries are handled by small, efficient models without escalation. + +**Result:** 40-85% cost reduction, 2-10x faster responses, zero quality loss. + +``` +Query → Domain Detection → Try Draft Model → Quality Check + │ + Pass ───┘─── Fail + │ │ + Return Escalate to + Result Verifier Model +``` + +## Harness Engine + +The Harness Engine provides **agent runtime intelligence** — budget enforcement, compliance gating, KPI-weighted routing, energy tracking, and decision traces. + +Unlike the Cascade Engine which routes between models, the Harness Engine wraps existing agent execution and makes decisions at every step: + +``` +Agent Step → Harness Decision → allow / switch_model / deny_tool / stop + │ + ├── Check budget remaining + ├── Check compliance allowlist + ├── Score KPI dimensions + ├── Check tool call cap + ├── Check latency cap + └── Check energy cap +``` + +### Decision Flow + +For each LLM call or tool execution inside an agent loop, the harness: + +1. **Records** the model, step number, and cumulative metrics +2. **Evaluates** all configured constraints (budget, compliance, tool calls, latency, energy) +3. **Scores** the call against KPI weights if configured +4. **Decides** an action: `allow`, `switch_model`, `deny_tool`, or `stop` +5. **Enforces** the action if in `enforce` mode (logs only in `observe` mode) +6. 
**Appends** a trace record for auditability + +### HarnessConfig + +All harness behavior is configured through a single dataclass: + +```python +HarnessConfig( + mode="enforce", # off | observe | enforce + budget=0.50, # Max USD for the run + max_tool_calls=10, # Max tool/function calls + max_latency_ms=5000.0, # Max wall-clock ms per call + max_energy=100.0, # Max energy units + compliance="gdpr", # gdpr | hipaa | pci | strict + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.9}, +) +``` + +## Combined Usage + +When both engines are active, the Cascade Engine handles model selection while the Harness Engine enforces constraints: + +```python +import cascadeflow +from cascadeflow import CascadeAgent, ModelConfig + +# Harness: enforce budget and compliance +cascadeflow.init(mode="enforce") + +# Cascade: speculative model routing +agent = CascadeAgent(models=[ + ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375), + ModelConfig(name="gpt-4o", provider="openai", cost=0.00625), +]) + +with cascadeflow.run(budget=1.00) as session: + result = await agent.run("Analyze this contract for GDPR compliance") + print(session.summary()) +``` + +## Provider Abstraction + +cascadeflow supports 17+ providers through a unified interface: + +| Provider | Type | Package | +|---|---|---| +| OpenAI | API | `cascadeflow[openai]` | +| Anthropic | API | `cascadeflow[anthropic]` | +| Groq | API | `cascadeflow[groq]` | +| Together | API | `cascadeflow[together]` | +| Hugging Face | API | `cascadeflow[huggingface]` | +| Ollama | Local | Built-in (HTTP) | +| vLLM | Local | `cascadeflow[vllm]` | +| Vercel AI SDK | TypeScript | `@cascadeflow/vercel-ai` | diff --git a/docs-site/get-started/installation.mdx b/docs-site/get-started/installation.mdx new file mode 100644 index 00000000..ff6b8583 --- /dev/null +++ b/docs-site/get-started/installation.mdx @@ -0,0 +1,101 @@ +--- +title: Installation +description: Install cascadeflow with pip extras 
for Python or npm packages for TypeScript, including provider-specific setup. +--- + +# Installation + +## Python + +### Minimal install + +```bash +pip install cascadeflow +``` + +Core dependencies: `pydantic>=2.0.0`, `httpx>=0.25.0`, `tiktoken>=0.5.0`, `rich>=13.0.0`. + +### With providers + +```bash +pip install "cascadeflow[providers]" # OpenAI + Anthropic + Groq +``` + +Individual providers: + +```bash +pip install "cascadeflow[openai]" # OpenAI +pip install "cascadeflow[anthropic]" # Anthropic +pip install "cascadeflow[groq]" # Groq +pip install "cascadeflow[huggingface]" # Hugging Face +pip install "cascadeflow[together]" # Together AI +``` + +### With framework integrations + +```bash +pip install "cascadeflow[langchain]" # LangChain/LangGraph +pip install "cascadeflow[openai-agents]" # OpenAI Agents SDK +pip install "cascadeflow[crewai]" # CrewAI (Python 3.10+) +pip install "cascadeflow[google-adk]" # Google ADK (Python 3.10+) +``` + +### Local inference + +```bash +pip install "cascadeflow[vllm]" # vLLM (Python 3.10-3.13) +``` + +Ollama does not need a Python package — cascadeflow communicates with Ollama via HTTP at `localhost:11434`. Install Ollama separately from [ollama.ai](https://ollama.ai). + +### Everything + +```bash +pip install "cascadeflow[all]" # All providers + semantic routing +``` + +### Development + +```bash +git clone https://github.com/lemony-ai/cascadeflow.git +cd cascadeflow +pip install -e ".[dev]" +``` + +## TypeScript + +### Core + +```bash +npm install @cascadeflow/core +``` + +### Framework packages + +```bash +npm install @cascadeflow/langchain # LangChain integration +npm install @cascadeflow/vercel-ai # Vercel AI SDK middleware +npm install @cascadeflow/n8n-nodes-cascadeflow # n8n community node +``` + +## Provider Setup + +Set API keys as environment variables: + +```bash +export OPENAI_API_KEY="sk-..." +export ANTHROPIC_API_KEY="sk-ant-..." +export GROQ_API_KEY="gsk_..." 
+``` + +cascadeflow auto-detects available providers based on which API keys are set. + +## Verify Installation + +```bash +python -c "import cascadeflow; print(cascadeflow.__version__)" +``` + +```bash +python -c "from cascadeflow import init, run, HarnessConfig, HarnessRunContext; print('OK')" +``` diff --git a/docs-site/get-started/introduction.mdx b/docs-site/get-started/introduction.mdx new file mode 100644 index 00000000..39c2f74c --- /dev/null +++ b/docs-site/get-started/introduction.mdx @@ -0,0 +1,62 @@ +--- +title: Introduction +description: What cascadeflow is, how it differs from external proxies, and when to use it for agent runtime intelligence. +--- + +# Introduction + +cascadeflow is an in-process intelligence layer that sits inside AI agent execution loops. Unlike external proxies that only see HTTP request boundaries, cascadeflow operates with full agent state awareness: step count, budget consumed, tool call history, error context, quality scores, domain, complexity, and user-defined business context. + +## What makes cascadeflow different + +**1. Inside-the-loop control.** Decisions happen per-step and per-tool-call inside agent execution, not at the HTTP boundary. This enables budget gating mid-run, model switching based on remaining budget, and stop actions when caps are hit. + +**2. Multi-dimensional optimization.** Six dimensions scored simultaneously: cost, latency, quality, budget, compliance, and energy. Not just cost routing. + +**3. Business logic injection.** KPI weights and targets let teams encode business priorities (e.g. 60% quality, 30% cost, 10% latency) into every model decision. + +**4. Actionable decisions.** Four actions: `allow`, `switch_model`, `deny_tool`, `stop`. The harness does not just observe — it controls execution flow. + +**5. Full transparency.** Every decision produces a trace record with action, reason, model, step, cost_total, budget_state, and applied fields. Audit-ready. + +**6. 
Measurable value.** Session summaries report cost, latency, energy, steps, tool calls, and budget remaining. Before/after comparison is built in. + +**7. Cross-framework policy layer.** Unified KPI semantics across LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, and Vercel AI SDK. + +**8. Latency advantage.** In-process instrumentation adds less than 1ms overhead per call. External proxies add 10-50ms of network round-trip latency per LLM call. + +## Proxy vs In-Process Harness + +| Dimension | External Proxy | cascadeflow Harness | +|---|---|---| +| **Scope** | HTTP request boundary | Inside agent execution loop | +| **Dimensions** | Cost only | Cost + quality + latency + budget + compliance + energy | +| **Latency overhead** | 10-50ms network RTT | <1ms in-process | +| **Business logic** | None | KPI weights and targets | +| **Enforcement** | None (observe only) | stop, deny_tool, switch_model | +| **Auditability** | Request logs | Per-step decision traces | + +## When to use cascadeflow + +- You run AI agents (LangChain, LangGraph, CrewAI, OpenAI Agents SDK, Google ADK, or custom) +- You want to reduce LLM costs without changing agent code +- You need budget enforcement across multi-step agent runs +- You need to inject business KPIs (quality, cost, latency, energy) into agent decisions +- You need compliance-aware model gating (GDPR, HIPAA, PCI, strict) +- You want full trace recording for auditability and tuning + +## When NOT to use cascadeflow + +- Single one-off LLM calls (overhead not justified) +- You only use one model and don't want routing +- You need a hosted proxy service (cascadeflow is a library, not a SaaS) + +## Two Engines + +cascadeflow ships two complementary engines: + +**Cascade Engine** — Speculative execution with quality validation. Tries cheap models first, validates quality, escalates only when needed. Achieves 40-85% cost savings on typical workloads. + +**Harness Engine** — Agent runtime intelligence. 
Budget enforcement, compliance gating, KPI-weighted routing, energy tracking, and decision traces. Works inside agent loops with full state awareness. + +Both engines can be used independently or together. diff --git a/docs-site/get-started/quickstart.mdx b/docs-site/get-started/quickstart.mdx new file mode 100644 index 00000000..64189077 --- /dev/null +++ b/docs-site/get-started/quickstart.mdx @@ -0,0 +1,118 @@ +--- +title: Quickstart +description: Get cascadeflow running in 3 minutes with zero code changes using the harness API. +--- + +# Quickstart + +Three tiers of integration — pick the one that matches your needs. + +## Install + + + +```bash pip +pip install "cascadeflow[openai]" +``` + +```bash With integrations +pip install "cascadeflow[langchain]" # LangChain/LangGraph +pip install "cascadeflow[openai-agents]" # OpenAI Agents SDK +pip install "cascadeflow[crewai]" # CrewAI +pip install "cascadeflow[google-adk]" # Google ADK +``` + +```bash npm +npm install @cascadeflow/core +``` + + + +## Tier 1: Zero-Change Observability + +Add two lines. All OpenAI and Anthropic SDK calls are automatically tracked. + +```python +import cascadeflow + +cascadeflow.init(mode="observe") + +# Your existing code — no changes needed +import openai +client = openai.OpenAI() +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "What is cascadeflow?"}] +) +# cascadeflow is now tracking cost, latency, energy, and model usage. +``` + +## Tier 2: Scoped Runs with Budget + +Wrap agent execution in a `run()` context manager for budget tracking and enforcement. 
+ +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + # Your agent code here + result = await agent.run("Analyze this dataset and create a report") + + # After execution, inspect metrics + summary = session.summary() + print(f"Cost: ${summary['cost_total']:.4f}") + print(f"Steps: {summary['steps']}") + print(f"Tool calls: {summary['tool_calls']}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") + + # Full decision audit trail + for decision in session.trace(): + print(f" Step {decision['step']}: {decision['action']} — {decision['reason']}") +``` + +## Tier 3: Decorated Agents with Policy + +Annotate agent functions with budget, compliance, and KPI metadata. + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +@cascadeflow.agent( + budget=0.20, + compliance="gdpr", + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1} +) +async def research_agent(query: str): + return await llm.complete(query) +``` + +## Harness Modes + +| Mode | Tracking | Enforcement | Use Case | +|---|---|---|---| +| `off` | No | No | Disabled | +| `observe` | Yes | No | Safe production rollout, metrics collection | +| `enforce` | Yes | Yes | Budget caps, compliance gating, stop actions | + +Start with `observe` in production. Switch to `enforce` once you've validated the metrics. + +## Next Steps + + + + All pip extras, npm packages, and provider setup. + + + Architecture of the Cascade and Harness engines. + + + Per-run and per-user budget caps. + + + LangChain, OpenAI Agents, CrewAI, Google ADK, n8n, Vercel AI. + + diff --git a/docs-site/harness/actions.mdx b/docs-site/harness/actions.mdx new file mode 100644 index 00000000..a904eed8 --- /dev/null +++ b/docs-site/harness/actions.mdx @@ -0,0 +1,99 @@ +--- +title: Decision Actions +description: Four harness actions — allow, switch_model, deny_tool, and stop — and when each is triggered. 
+--- + +# Decision Actions + +The harness makes one of four decisions at every step. Actions are computed in both `observe` and `enforce` modes, but only applied in `enforce` mode. + +## Actions + +### `allow` + +Proceed normally. No constraints are violated. + +``` +Step 1: allow — budget ok, model compliant +``` + +This is the most common action. It means all hard caps (budget, tool calls, latency, energy) are within limits and compliance is satisfied. + +### `switch_model` + +Route to a different model. Triggered when: +- The current model is not in the compliance allowlist +- KPI scoring indicates a better model choice +- Budget pressure suggests a cheaper alternative + +``` +Step 3: switch_model — compliance violation, switching to gpt-4o-mini (gdpr allowlist) +``` + +In `enforce` mode, the harness substitutes the model. In `observe` mode, the original model is used and the trace records what would have happened. + +### `deny_tool` + +Block a tool/function call. Triggered when `max_tool_calls` is reached. + +``` +Step 5: deny_tool — tool call cap reached (10/10) +``` + +In `enforce` mode, the tool call is blocked. The agent receives a signal that the tool was denied. + +### `stop` + +Halt agent execution. Triggered when: +- Budget is exceeded +- Latency cap is exceeded +- Energy cap is exceeded + +``` +Step 7: stop — budget exceeded ($0.52 > $0.50 cap) +``` + +In `enforce` mode, the agent loop is stopped. In `observe` mode, execution continues and the trace records the violation. + +## Decision Priority + +When multiple constraints are violated simultaneously, the harness applies this priority: + +1. **Compliance** — check first (switch_model or stop) +2. **Budget** — check second (stop) +3. **Tool calls** — check third (deny_tool) +4. **Latency** — check fourth (stop) +5. **Energy** — check fifth (stop) +6. 
**Hard controls** trigger `stop`, `deny_tool`, or `switch_model` when a limit is exceeded or a violation is detected:
+ +## Per-Run Budget + +Set a budget cap on a scoped run: + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + # Agent executes multiple LLM calls + result = await agent.run("Research and summarize this topic") + + summary = session.summary() + print(f"Total cost: ${summary['cost_total']:.4f}") + print(f"Budget remaining: ${summary['budget_remaining']:.4f}") +``` + +When cumulative cost exceeds the budget: +- In `observe` mode: the trace records `action: "stop"` with `applied: false` +- In `enforce` mode: the harness stops execution with `action: "stop"` and `applied: true` + +## Per-Agent Budget + +Attach budget metadata to agent functions: + +```python +@cascadeflow.agent(budget=0.20) +async def cheap_agent(query: str): + return await llm.complete(query) + +@cascadeflow.agent(budget=2.00) +async def premium_agent(query: str): + return await llm.complete(query) +``` + +## Budget Pressure Routing + +When budget is partially consumed, the harness can route to cheaper models. This happens automatically when KPI weights include a cost dimension: + +```python +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + budget=1.00, + kpi_weights={"quality": 0.5, "cost": 0.5} +) as session: + # Early calls may use gpt-4o (high quality) + # As budget pressure increases, routing shifts toward gpt-4o-mini (lower cost) + for query in queries: + result = await agent.run(query) +``` + +## Cost Calculation + +Cost is estimated from the built-in pricing table: + +``` +cost = (input_tokens / 1_000_000) * input_price + (output_tokens / 1_000_000) * output_price +``` + +The pricing table covers 18 models across OpenAI, Anthropic, and Google. Unknown models are resolved via fuzzy matching. 
+ +## Combining with Tool Call Caps + +Budget and tool call caps work together: + +```python +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + # Stops when either limit is hit + result = await agent.run("Analyze this data") +``` + +The harness checks all constraints at every step. The first constraint that is violated triggers the corresponding action (`stop` for budget, `deny_tool` for tool calls). diff --git a/docs-site/harness/compliance.mdx b/docs-site/harness/compliance.mdx new file mode 100644 index 00000000..febb0de5 --- /dev/null +++ b/docs-site/harness/compliance.mdx @@ -0,0 +1,66 @@ +--- +title: Compliance Gating +description: GDPR, HIPAA, PCI, and strict model allowlists for compliance-aware model gating in agent workflows. +--- + +# Compliance Gating + +The harness enforces model allowlists based on compliance requirements. When a compliance mode is set, only models in the corresponding allowlist are permitted. + +## Compliance Modes + +| Mode | Allowed Models | Use Case | +|---|---|---| +| `gdpr` | gpt-4o, gpt-4o-mini, gpt-3.5-turbo | EU data protection | +| `hipaa` | gpt-4o, gpt-4o-mini | Healthcare data | +| `pci` | gpt-4o-mini, gpt-3.5-turbo | Payment card data | +| `strict` | gpt-4o | Maximum restriction | + +## Usage + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +# GDPR compliance — only gpt-4o, gpt-4o-mini, gpt-3.5-turbo allowed +with cascadeflow.run(compliance="gdpr") as session: + result = await agent.run("Process this EU customer data") +``` + +Or as agent metadata: + +```python +@cascadeflow.agent(compliance="hipaa") +async def medical_agent(query: str): + return await llm.complete(query) +``` + +## Enforcement Behavior + +When a model outside the allowlist is requested: + +- In `observe` mode: the trace records `action: "switch_model"` with the suggested compliant alternative, but execution continues with the original model +- In `enforce` mode: the harness blocks the non-compliant model and either 
```python
from cascadeflow import HarnessConfig

config = HarnessConfig(
    mode="enforce",
    compliance="strict",  # Only gpt-4o
)
cascadeflow.init(config=config)
```
+ +## Trace Format + +Each trace record contains: + +| Field | Type | Description | +|---|---|---| +| `action` | string | `"allow"`, `"switch_model"`, `"deny_tool"`, or `"stop"` | +| `reason` | string | Human-readable explanation of the decision | +| `model` | string | Model name used for the call | +| `step` | int | Step number in the run (1-indexed) | +| `cost_total` | float | Cumulative cost in USD at this step | +| `budget_state` | string | `"ok"`, `"warning"`, or `"exceeded"` | +| `applied` | bool | `true` if the action was enforced, `false` in observe mode | + +## Accessing Traces + +```python +import cascadeflow + +cascadeflow.init(mode="observe") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Research this topic") + + # Full decision trace + for record in session.trace(): + print(f"Step {record['step']}: {record['action']} — {record['reason']}") + print(f" Model: {record['model']}, Cost: ${record['cost_total']:.4f}") + print(f" Budget: {record['budget_state']}, Applied: {record['applied']}") +``` + +Example output: + +``` +Step 1: allow — budget ok, model compliant + Model: gpt-4o-mini, Cost: $0.0003 + Budget: ok, Applied: false +Step 2: allow — budget ok, model compliant + Model: gpt-4o-mini, Cost: $0.0007 + Budget: ok, Applied: false +Step 3: switch_model — budget pressure, routing to cheaper model + Model: gpt-4o, Cost: $0.0032 + Budget: warning, Applied: false +``` + +## Observe vs Enforce + +In `observe` mode, traces record what the harness *would* do: +- `applied` is always `false` +- Agent execution continues regardless of the action + +In `enforce` mode, traces record what the harness *did*: +- `applied` is `true` when the action was enforced +- `stop` actions halt execution +- `deny_tool` actions block tool calls + +## Privacy + +Decision traces do not contain prompt content, response content, or user data. 
They only contain: +- Model names and step numbers +- Cost and budget metrics +- Action decisions and reasons + +This makes traces safe for logging, external storage, and compliance reporting without data classification concerns. + +## Callbacks + +Register callbacks to receive trace records in real time: + +```python +from cascadeflow import get_harness_callback_manager, set_harness_callback_manager + +cb_manager = get_harness_callback_manager() + +# Traces are emitted through the callback system +# Use framework-specific integrations for structured access +``` + +## Session Summary + +In addition to per-step traces, `session.summary()` provides aggregate metrics: + +```python +summary = session.summary() +# { +# "cost_total": 0.0032, +# "steps": 3, +# "tool_calls": 1, +# "latency_total_ms": 1250.0, +# "energy_used": 45.2, +# "budget_remaining": 0.4968, +# } +``` diff --git a/docs-site/harness/energy-tracking.mdx b/docs-site/harness/energy-tracking.mdx new file mode 100644 index 00000000..a3d292ee --- /dev/null +++ b/docs-site/harness/energy-tracking.mdx @@ -0,0 +1,99 @@ +--- +title: Energy Tracking +description: Deterministic compute-intensity coefficients for carbon-aware AI operations, with energy caps and per-model coefficients. +--- + +# Energy Tracking + +The harness tracks energy consumption using deterministic compute-intensity coefficients. This provides a proxy for carbon impact without requiring real-time power measurement. + +## Energy Formula + +``` +energy_units = coefficient * (input_tokens + output_tokens * 1.5) +``` + +Output tokens are weighted 1.5x because generation is more compute-intensive than prompt processing. 
| Model | Coefficient | Relative Intensity |
claude-haiku-3.5 | $1.00 | $5.00 | +| claude-opus-4.5 | $5.00 | $25.00 | +| **Google** | | | +| gemini-2.5-flash | $0.15 | $0.60 | +| gemini-2.5-pro | $1.25 | $10.00 | +| gemini-2.0-flash | $0.10 | $0.40 | +| gemini-1.5-flash | $0.075 | $0.30 | +| gemini-1.5-pro | $1.25 | $5.00 | diff --git a/docs-site/harness/kpi-optimization.mdx b/docs-site/harness/kpi-optimization.mdx new file mode 100644 index 00000000..e07e1023 --- /dev/null +++ b/docs-site/harness/kpi-optimization.mdx @@ -0,0 +1,103 @@ +--- +title: KPI-Weighted Routing +description: Inject business priorities as quality, cost, latency, and energy weights into every model routing decision. +--- + +# KPI-Weighted Routing + +The harness scores each model decision against configurable KPI weights. This lets teams encode business priorities into agent behavior without changing agent code. + +## KPI Dimensions + +| Dimension | Score Source | Range | What it means | +|---|---|---|---| +| `quality` | Model quality priors | 0.0-1.0 | Higher = better output quality | +| `cost` | Inverse of model cost | 0.0-1.0 | Higher = cheaper model | +| `latency` | Model latency priors | 0.0-1.0 | Higher = faster response | +| `energy` | Inverse of energy coefficient | 0.0-1.0 | Higher = lower compute intensity | + +## Configuration + +```python +import cascadeflow + +cascadeflow.init(mode="enforce") + +with cascadeflow.run( + kpi_weights={"quality": 0.6, "cost": 0.3, "latency": 0.1}, + kpi_targets={"quality": 0.9} +) as session: + result = await agent.run("Analyze this legal document") +``` + +### Weights + +Weights are relative — they don't need to sum to 1.0 (they are normalized internally). They control the relative importance of each dimension in the composite score. 
+ +```python +# Quality-first (premium workload) +kpi_weights = {"quality": 0.8, "cost": 0.1, "latency": 0.1} + +# Cost-first (high-volume batch) +kpi_weights = {"quality": 0.2, "cost": 0.7, "latency": 0.1} + +# Balanced +kpi_weights = {"quality": 0.4, "cost": 0.3, "latency": 0.2, "energy": 0.1} +``` + +### Targets + +Targets set minimum acceptable values. If a model's score for a dimension falls below the target, it is penalized in the composite score. + +```python +kpi_targets = { + "quality": 0.9, # Require high quality + "latency": 0.7, # Require reasonable speed +} +``` + +## Scoring Formula + +The composite score for a model is: + +``` +score = quality_prior * w_quality + cost_utility * w_cost + latency_prior * w_latency + energy_utility * w_energy +``` + +Where `w_*` are the normalized weights and utility values are computed from model priors. + +## Quality Priors + +Built-in quality priors for common models (OpenAI): + +| Model | Quality | Latency | +|---|---|---| +| o1 | 0.95 | 0.40 | +| gpt-4o | 0.90 | 0.72 | +| gpt-4-turbo | 0.88 | 0.66 | +| gpt-4 | 0.87 | 0.52 | +| gpt-5-mini | 0.86 | 0.84 | +| o1-mini | 0.82 | 0.60 | +| o3-mini | 0.80 | 0.78 | +| gpt-4o-mini | 0.75 | 0.93 | +| gpt-3.5-turbo | 0.65 | 1.00 | + +## Per-Agent KPI Weights + +Different agents can have different priorities: + +```python +@cascadeflow.agent( + budget=0.50, + kpi_weights={"quality": 0.8, "cost": 0.2} +) +async def quality_agent(query: str): + return await llm.complete(query) + +@cascadeflow.agent( + budget=0.10, + kpi_weights={"cost": 0.8, "quality": 0.2} +) +async def budget_agent(query: str): + return await llm.complete(query) +``` diff --git a/docs-site/harness/modes.mdx b/docs-site/harness/modes.mdx new file mode 100644 index 00000000..46a86840 --- /dev/null +++ b/docs-site/harness/modes.mdx @@ -0,0 +1,78 @@ +--- +title: Harness Modes +description: Three harness modes — off, observe, and enforce — with rollout guidance for production deployments. 
+--- + +# Harness Modes + +cascadeflow operates in one of three modes, set at initialization. + +## Modes + +### `off` + +No tracking, no enforcement. The harness is completely disabled. This is the default. + +```python +cascadeflow.init(mode="off") +``` + +### `observe` + +Track all metrics and decisions, but never block execution. Every LLM call and tool execution is recorded with full decision traces. Actions are computed but not enforced — `applied` is always `false` in trace records. + +```python +cascadeflow.init(mode="observe") +``` + +Use `observe` for: +- Initial production rollout to validate metrics before enforcing +- Shadow-mode testing to understand what the harness would do +- Cost and usage analytics without affecting agent behavior + +### `enforce` + +Track all metrics and enforce constraints. When a hard cap is hit (budget, tool calls, latency, energy) or a compliance violation is detected, the harness takes action: `stop`, `deny_tool`, or `switch_model`. + +```python +cascadeflow.init(mode="enforce") +``` + +Use `enforce` when: +- You have validated metrics in `observe` mode +- You need hard budget caps to prevent runaway costs +- Compliance requirements mandate model gating + +## Rollout Guidance + +Recommended rollout sequence for production: + +1. **Deploy with `observe`** — No risk to agent behavior. Collect metrics, review decision traces, validate that the harness sees what you expect. + +2. **Review traces** — Check that compliance allowlists, budget calculations, and KPI scoring match your expectations. + +3. **Switch to `enforce`** — Once validated, change the mode. The harness will now enforce constraints. + +4. **Monitor** — Use `session.summary()` and `session.trace()` to monitor enforcement in production. 
+ +```python +import os + +# Environment-driven mode selection +mode = os.getenv("CASCADEFLOW_MODE", "observe") +cascadeflow.init(mode=mode) +``` + +## Mode Behavior Matrix + +| Behavior | `off` | `observe` | `enforce` | +|---|---|---|---| +| Cost tracking | No | Yes | Yes | +| Latency tracking | No | Yes | Yes | +| Energy tracking | No | Yes | Yes | +| Decision traces | No | Yes | Yes | +| Budget enforcement | No | No | Yes | +| Tool call gating | No | No | Yes | +| Compliance gating | No | No | Yes | +| `session.summary()` | Empty | Full metrics | Full metrics | +| `session.trace()` | Empty | Decisions (applied=false) | Decisions (applied=true) | diff --git a/docs-site/harness/overview.mdx b/docs-site/harness/overview.mdx new file mode 100644 index 00000000..8486c8c4 --- /dev/null +++ b/docs-site/harness/overview.mdx @@ -0,0 +1,80 @@ +--- +title: Harness Overview +description: Overview of the cascadeflow harness — six optimization dimensions, HarnessConfig surface, and high-level decision flow. +--- + +# Harness Overview + +The cascadeflow harness is an in-process intelligence layer that wraps AI agent execution. It tracks, scores, and optionally enforces constraints across six dimensions for every LLM call and tool execution inside agent loops. 
+ +## Six Dimensions + +| Dimension | What it measures | Hard cap | Soft scoring | +|---|---|---|---| +| **Cost** | Estimated USD from the pricing table | `budget` | `kpi_weights.cost` | +| **Latency** | Wall-clock milliseconds per LLM call | `max_latency_ms` | `kpi_weights.latency` | +| **Quality** | Model quality priors (0-1 score) | -- | `kpi_weights.quality` | +| **Tool calls** | Count of tool/function calls | `max_tool_calls` | -- | +| **Energy** | Compute-intensity coefficient | `max_energy` | `kpi_weights.energy` | +| **Compliance** | Model allowlist per regulation | `compliance` | -- | + +## HarnessConfig + +All harness behavior is configured through a single dataclass: + +```python +from cascadeflow import HarnessConfig + +config = HarnessConfig( + mode="enforce", # "off" | "observe" | "enforce" + verbose=False, # Print decisions to stderr + budget=0.50, # Max USD for the run (None = unlimited) + max_tool_calls=10, # Max tool/function calls (None = unlimited) + max_latency_ms=5000.0, # Max wall-clock ms per call (None = unlimited) + max_energy=100.0, # Max energy units (None = unlimited) + kpi_targets={"quality": 0.9}, # Target values for KPI dimensions + kpi_weights={ # Relative importance of each dimension + "quality": 0.6, + "cost": 0.3, + "latency": 0.1, + }, + compliance="gdpr", # "gdpr" | "hipaa" | "pci" | "strict" | None +) +``` + +## Activation + +```python +import cascadeflow + +# Global activation +cascadeflow.init(mode="observe") + +# Scoped run with overrides +with cascadeflow.run(budget=0.50, max_tool_calls=10) as session: + # agent code + pass + +# Decorated agent function +@cascadeflow.agent(budget=0.20, compliance="gdpr") +async def my_agent(query: str): + pass +``` + +## Decision Flow + +For each LLM call or tool execution: + +1. **Record** model, step number, cumulative cost, latency, energy +2. **Check compliance** — is the model in the allowlist for the configured regulation? +3. 
Unknown models are resolved via fuzzy matching (e.g. a dated snapshot such as `gpt-4o-2024-08-06` resolves to the `gpt-4o` pricing entry).
+``` + +```python Scoped Run +import cascadeflow +cascadeflow.init(mode="enforce") + +with cascadeflow.run(budget=0.50) as session: + result = await agent.run("Analyze this dataset") + print(session.summary()) +``` + +```python Decorated Agent +import cascadeflow +cascadeflow.init(mode="enforce") + +@cascadeflow.agent(budget=0.20, compliance="gdpr") +async def my_agent(query: str): + return await llm.complete(query) +``` + + + +## Supported Frameworks + +| Framework | Python | TypeScript | Integration Type | +|---|---|---|---| +| LangChain / LangGraph | `cascadeflow[langchain]` | `@cascadeflow/langchain` | Callback handler | +| OpenAI Agents SDK | `cascadeflow[openai-agents]` | -- | ModelProvider | +| CrewAI | `cascadeflow[crewai]` | -- | llm_hooks | +| Google ADK | `cascadeflow[google-adk]` | -- | BasePlugin | +| n8n | -- | `@cascadeflow/n8n-nodes-cascadeflow` | Community node | +| Vercel AI SDK | -- | `@cascadeflow/vercel-ai` | Middleware | + +## Six Dimensions + +cascadeflow optimizes across six dimensions simultaneously: + +| Dimension | What it controls | Example | +|---|---|---| +| **Cost** | USD per LLM call from pricing table | Budget cap of $0.50 per run | +| **Latency** | Wall-clock milliseconds per call | Max 2000ms per call | +| **Quality** | Model quality priors for routing | 60% weight on quality KPI | +| **Budget** | Cumulative spend tracking and caps | Per-user daily limits | +| **Compliance** | Model allowlists per regulation | GDPR: only gpt-4o, gpt-4o-mini | +| **Energy** | Compute-intensity coefficients | Carbon-aware model selection | diff --git a/docs-site/integrations/crewai.mdx b/docs-site/integrations/crewai.mdx new file mode 100644 index 00000000..1fae1fde --- /dev/null +++ b/docs-site/integrations/crewai.mdx @@ -0,0 +1,78 @@ +--- +title: CrewAI +description: Hook-based harness integration for CrewAI with budget gating, metrics tracking, and decision traces across crew steps. 
--- + +# CrewAI Integration + +cascadeflow integrates with CrewAI through the native `llm_hooks` system. Call `enable()` to register global hooks that track all crew steps, enforce budget caps, and record decision traces. + +## Install + +```bash +pip install "cascadeflow[crewai]" +``` + +## Quick Start + +```python +from crewai import Agent, Crew, Process, Task +import cascadeflow +from cascadeflow.integrations.crewai import CrewAIHarnessConfig, enable + +cascadeflow.init(mode="observe") + +# Enable harness hooks +config = CrewAIHarnessConfig( +    fail_open=True, +    enable_budget_gate=True, +) +enable(config=config) + +# Define agents and tasks as usual +researcher = Agent( +    role="Researcher", +    goal="Find relevant information", +    llm="gpt-4o-mini", +) + +task = Task( +    description="Research the topic of AI agent frameworks", +    agent=researcher, +) + +crew = Crew( +    agents=[researcher], +    tasks=[task], +    process=Process.sequential, +) + +# Run with budget tracking +with cascadeflow.run(budget=1.00) as session: +    result = crew.kickoff() +    print(session.summary()) +    for record in session.trace(): +        print(f"Step {record['step']}: {record['action']} — {record['reason']}") +``` + +## Configuration + +```python +config = CrewAIHarnessConfig( +    fail_open=True,           # Continue on harness errors +    enable_budget_gate=True,  # Enforce budget caps +) +``` + +## Features + +- Tracks all crew steps automatically via `llm_hooks` +- Budget gating stops crew execution when budget is exceeded +- Full decision trace across all agents in the crew +- Fail-open mode for production safety +- No changes to existing CrewAI agent or task definitions + +## Limitations + +- Tool-level gating is not currently applied (CrewAI hooks operate at the LLM call level) +- Model switching depends on CrewAI's model configuration diff --git a/docs-site/integrations/google-adk.mdx b/docs-site/integrations/google-adk.mdx new file mode 100644 index 00000000..8b6f3403 --- /dev/null +++ b/docs-site/integrations/google-adk.mdx @@
-0,0 +1,91 @@ +--- +title: Google ADK +description: Plugin-based harness integration for Google Agent Development Kit with budget enforcement and metrics tracking. +--- + +# Google ADK Integration + +cascadeflow integrates with Google's Agent Development Kit (ADK) through the `BasePlugin` system. Call `enable()` to get a plugin that plugs into `Runner(plugins=[...])`. + +## Install + +```bash +pip install "cascadeflow[google-adk]" +``` + +Requires Python 3.10+. + +## Quick Start + +```python +import asyncio +from google.adk.agents import Agent +from google.adk.runners import Runner +from google.adk.sessions import InMemorySessionService +from google.genai.types import Content, Part + +import cascadeflow +from cascadeflow.integrations.google_adk import GoogleADKHarnessConfig, enable + +cascadeflow.init(mode="observe") + +# Enable harness plugin +config = GoogleADKHarnessConfig( +    fail_open=True, +    enable_budget_gate=True, +) +plugin = enable(config=config) + +# Create ADK agent +agent = Agent( +    name="research_agent", +    model="gemini-2.5-flash", +    instruction="You are a helpful research assistant.", +) + +# Run with plugin +session_service = InMemorySessionService() +runner = Runner( +    agent=agent, +    app_name="test", +    session_service=session_service, +    plugins=[plugin], +) + +async def main(): +    with cascadeflow.run(budget=0.50) as session: +        user_content = Content(parts=[Part(text="Explain cascadeflow")]) +        async for event in runner.run_async( +            session_id="test", +            user_id="user-1", +            new_message=user_content, +        ): +            pass  # Process streaming events + +        print(session.summary()) + +asyncio.run(main()) +``` + +## Configuration + +```python +config = GoogleADKHarnessConfig( +    fail_open=True,           # Continue on harness errors +    enable_budget_gate=True,  # Enforce budget caps +) +``` + +## Supported Gemini Models + +| Model | Input $/1M | Output $/1M | Energy Coeff | +|---|---|---|---| +| gemini-2.5-flash | $0.15 | $0.60 | 0.30 | +| gemini-2.5-pro | $1.25 | $10.00 | 1.20 | +| gemini-2.0-flash | $0.10 | $0.40 | 0.25 | +| 
gemini-1.5-flash | $0.075 | $0.30 | 0.20 | +| gemini-1.5-pro | $1.25 | $5.00 | 1.00 | + +## Budget Enforcement + +When budget is exceeded in `enforce` mode, the plugin returns an `LlmResponse` with `error_code="BUDGET_EXCEEDED"`. The ADK runner handles this as a graceful stop. + +## Limitations + +- Tool gating is not applied (intentional design choice — ADK manages tool execution internally) +- Model switching depends on ADK's model configuration diff --git a/docs-site/integrations/langchain.mdx b/docs-site/integrations/langchain.mdx new file mode 100644 index 00000000..2f29062f --- /dev/null +++ b/docs-site/integrations/langchain.mdx @@ -0,0 +1,106 @@ +--- +title: LangChain +description: Harness-aware callback handler for LangChain and LangGraph with budget tracking, cost analytics, and decision traces. +--- + +# LangChain Integration + +cascadeflow integrates with LangChain through a callback handler that wraps any `BaseChatModel`. Works with LCEL chains, streaming, tool calling, structured output, and LangGraph agents. 
+ +## Install + + + +```bash Python +pip install "cascadeflow[langchain]" +``` + +```bash TypeScript +npm install @cascadeflow/langchain @langchain/core @langchain/openai +``` + + + +## Quick Start + + + +```python Python — Harness callback +import cascadeflow +from cascadeflow.integrations.langchain import get_harness_callback +from langchain_openai import ChatOpenAI + +cascadeflow.init(mode="observe") + +model = ChatOpenAI(model="gpt-4o") +cb = get_harness_callback() + +with cascadeflow.run(budget=0.50) as session: + result = await model.ainvoke("Explain quantum computing", config={"callbacks": [cb]}) + print(session.summary()) +``` + +```python Python — Cascade routing +from langchain_openai import ChatOpenAI +from langchain_anthropic import ChatAnthropic +from cascadeflow.integrations.langchain import CascadeFlow + +cascade = CascadeFlow( + drafter=ChatOpenAI(model="gpt-4o-mini"), + verifier=ChatAnthropic(model="claude-sonnet-4"), + quality_threshold=0.8, +) + +result = await cascade.ainvoke("Explain quantum computing") +``` + +```typescript TypeScript — Drop-in cascade +import { ChatOpenAI } from '@langchain/openai'; +import { ChatAnthropic } from '@langchain/anthropic'; +import { withCascade } from '@cascadeflow/langchain'; + +const cascade = withCascade({ + drafter: new ChatOpenAI({ model: 'gpt-4o-mini' }), + verifier: new ChatAnthropic({ model: 'claude-sonnet-4' }), + qualityThreshold: 0.8, +}); + +const result = await cascade.invoke('Explain quantum computing'); +``` + + + +## Features + +- Full LCEL support (pipes, sequences, batch) +- Streaming with pre-routing +- Tool calling and structured output +- LangSmith cost tracking metadata +- Cost tracking callbacks +- Domain policies with `cascadeflow_domain` metadata + +## Cost Tracking Callback + +```python +from cascadeflow.integrations.langchain.langchain_callbacks import get_cascade_callback + +with get_cascade_callback() as cb: + response = await cascade.ainvoke("What is Python?") + print(f"Total cost: 
${cb.total_cost:.6f}") + print(f"Drafter cost: ${cb.drafter_cost:.6f}") + print(f"Verifier cost: ${cb.verifier_cost:.6f}") +``` + +## LangSmith Integration + +When LangSmith tracing is enabled, cascadeflow adds metadata to runs: +- `cascade_decision`: whether the drafter was accepted +- `modelUsed`: which model produced the final response +- `drafterQuality`: quality score from validation +- `savingsPercentage`: cost savings achieved + +```bash +export LANGSMITH_API_KEY="..." +export LANGSMITH_PROJECT="my-project" +export LANGSMITH_TRACING=true +``` diff --git a/docs-site/integrations/n8n.mdx b/docs-site/integrations/n8n.mdx new file mode 100644 index 00000000..efb89f51 --- /dev/null +++ b/docs-site/integrations/n8n.mdx @@ -0,0 +1,70 @@ +--- +title: n8n +description: cascadeflow community nodes for n8n with cascade model routing, tool gating, and harness modes for no-code AI workflows. +--- + +# n8n Integration + +cascadeflow provides two community nodes for n8n workflows: a Model sub-node for drop-in cascade routing and an Agent node for standalone multi-step reasoning. + +## Install + +In n8n: +1. Go to **Settings** > **Community Nodes** +2. Search for: `@cascadeflow/n8n-nodes-cascadeflow` +3. Click **Install** + +Or via npm: +```bash +npm install @cascadeflow/n8n-nodes-cascadeflow +``` + +## Two Nodes + +| Node | Type | Use Case | +|---|---|---| +| **CascadeFlow (Model)** | Language Model sub-node | Drop-in for any Chain/LLM node | +| **CascadeFlow Agent** | Standalone agent | Tool calling, memory, multi-step reasoning | + +## CascadeFlow (Model) + +Drop-in replacement for any AI Chat Model in n8n chains: + +1. Add two **AI Chat Model** nodes (cheap drafter + powerful verifier) +2. Add **CascadeFlow (Model)** and connect both models +3. Connect to a **Basic LLM Chain** or **Chain** node +4. 
Check the **Logs tab** to see cascade decisions + +**Features:** +- Quality threshold (default: 0.4) +- 16 supported domains (Code, Math, Data, Legal, Medical, Financial, etc.) +- Complexity thresholds for automatic routing + +## CascadeFlow Agent + +Standalone agent with tool calling and multi-step reasoning: + +1. Add a **Chat Trigger** node +2. Add **CascadeFlow Agent** and connect to the trigger +3. Connect **Drafter**, **Verifier**, optional **Memory** and **Tools** +4. Check the **Output tab** for cascade metadata and decision trace + +**Features:** +- Harness mode: `observe` or `enforce` +- Budget caps and tool call limits +- Tool routing rules: Cascade (default) or Verifier (for high-stakes tools) +- Tool call validation with JSON schema checking + +## Complexity Thresholds + +| Level | Threshold | Routing | +|---|---|---| +| Trivial | 0.25 | Always use drafter | +| Simple | 0.40 | Prefer drafter | +| Moderate | 0.55 | Quality-dependent | +| Hard | 0.70 | Prefer verifier | +| Expert | 0.80 | Always use verifier | + +## Result + +40-85% cost savings in n8n workflows with zero changes to existing chains. diff --git a/docs-site/integrations/openai-agents.mdx b/docs-site/integrations/openai-agents.mdx new file mode 100644 index 00000000..1a189a6b --- /dev/null +++ b/docs-site/integrations/openai-agents.mdx @@ -0,0 +1,77 @@ +--- +title: OpenAI Agents SDK +description: CascadeFlowModelProvider for OpenAI Agents SDK with model candidates, tool gating, and budget tracking. +--- + +# OpenAI Agents SDK Integration + +cascadeflow provides a `CascadeFlowModelProvider` that integrates with the OpenAI Agents SDK as an explicit `ModelProvider`. Supports model candidates, tool gating, and scoped budget tracking. 
+ +## Install + +```bash +pip install "cascadeflow[openai-agents]" +``` + +## Quick Start + +```python +import asyncio +from agents import Agent, Runner +import cascadeflow +from cascadeflow.integrations.openai_agents import ( + CascadeFlowModelProvider, + OpenAIAgentsIntegrationConfig, +) + +cascadeflow.init(mode="observe") + +# Configure integration +config = OpenAIAgentsIntegrationConfig( + model_candidates=["gpt-4o-mini", "gpt-4o"], + enable_tool_gating=True, +) + +provider = CascadeFlowModelProvider(config=config) + +agent = Agent( + name="research_agent", + instructions="You are a helpful research assistant.", + model_provider=provider, +) + +async def main(): + with cascadeflow.run(budget=0.50) as session: + result = await Runner.run(agent, "Explain cascadeflow") + print(result.final_output) + print(session.summary()) + +asyncio.run(main()) +``` + +## Features + +- **Model candidates**: List of models the provider can select from based on harness scoring +- **Tool gating**: Block tool calls when `max_tool_calls` is reached +- **Scoped runs**: Use `cascadeflow.run()` for per-task budget tracking +- **Decision traces**: Full audit trail of model selection and tool gating decisions +- **Fail-open**: If the harness encounters an error, execution continues with the default model + +## Configuration + +```python +config = OpenAIAgentsIntegrationConfig( + model_candidates=["gpt-4o-mini", "gpt-4o"], # Models to choose from + enable_tool_gating=True, # Block tools at cap +) +``` + +## Session Metrics + +After a run, `session.summary()` includes: +- `cost_total`: cumulative USD spent +- `budget_remaining`: USD left in the budget +- `step_count`: number of LLM calls +- `tool_calls`: number of tool executions +- `latency_used_ms`: total latency +- `energy_used`: total energy units diff --git a/docs-site/integrations/overview.mdx b/docs-site/integrations/overview.mdx new file mode 100644 index 00000000..92bda53e --- /dev/null +++ b/docs-site/integrations/overview.mdx @@ 
-0,0 +1,53 @@ +--- +title: Integrations Overview +description: Matrix of all cascadeflow framework integrations with supported features, languages, and integration patterns. +--- + +# Integrations Overview + +cascadeflow integrates with six agent frameworks. All integrations are opt-in — install the extra and explicitly enable. + +## Integration Matrix + +| Framework | Language | Package | Integration Type | Budget Gating | Tool Gating | Traces | +|---|---|---|---|---|---|---| +| [LangChain](/integrations/langchain) | Python, TS | `cascadeflow[langchain]`, `@cascadeflow/langchain` | Callback handler | Yes | Yes | Yes | +| [OpenAI Agents SDK](/integrations/openai-agents) | Python | `cascadeflow[openai-agents]` | ModelProvider | Yes | Yes | Yes | +| [CrewAI](/integrations/crewai) | Python | `cascadeflow[crewai]` | llm_hooks | Yes | No | Yes | +| [Google ADK](/integrations/google-adk) | Python | `cascadeflow[google-adk]` | BasePlugin | Yes | No | Yes | +| [n8n](/integrations/n8n) | TypeScript | `@cascadeflow/n8n-nodes-cascadeflow` | Community node | Yes | Yes | Yes | +| [Vercel AI SDK](/integrations/vercel-ai) | TypeScript | `@cascadeflow/vercel-ai` | Middleware | Yes | No | Yes | + +## Integration Patterns + +Each integration follows the same principle: wrap the framework's extension point with cascadeflow's harness, without modifying agent code. 
+ +### Python + +```python +import cascadeflow +cascadeflow.init(mode="observe") + +# Framework-specific activation +from cascadeflow.integrations.langchain import get_harness_callback +from cascadeflow.integrations.openai_agents import CascadeFlowModelProvider +from cascadeflow.integrations.crewai import enable as enable_crewai +from cascadeflow.integrations.google_adk import enable as enable_adk +``` + +### TypeScript + +```bash +npm install @cascadeflow/langchain +npm install @cascadeflow/vercel-ai +npm install @cascadeflow/n8n-nodes-cascadeflow +``` + +## Choosing an Integration + +- **LangChain/LangGraph**: Use if you have existing LangChain chains or agents. The callback handler wraps any `BaseChatModel`. +- **OpenAI Agents SDK**: Use if you're building with OpenAI's Agents SDK. The `ModelProvider` supports model candidates and tool gating. +- **CrewAI**: Use if you're building multi-agent crews. The `llm_hooks` integration tracks all crew steps. +- **Google ADK**: Use if you're building with Google's Agent Development Kit. The plugin integrates with `Runner`. +- **n8n**: Use if you're building no-code workflows. The community node adds cascade routing to any n8n flow. +- **Vercel AI SDK**: Use if you're building TypeScript server-side agents. The middleware wraps AI SDK streams. diff --git a/docs-site/integrations/vercel-ai.mdx b/docs-site/integrations/vercel-ai.mdx new file mode 100644 index 00000000..9b2d9257 --- /dev/null +++ b/docs-site/integrations/vercel-ai.mdx @@ -0,0 +1,88 @@ +--- +title: Vercel AI SDK +description: TypeScript middleware integration for Vercel AI SDK with cascade routing, multi-turn chat, and tool execution. +--- + +# Vercel AI SDK Integration + +cascadeflow integrates with the Vercel AI SDK as middleware, providing cascade routing for server-side AI applications with streaming support. 
+ +## Install + +```bash +npm install @cascadeflow/vercel-ai +``` + +## Quick Start + +```typescript +import { createChatHandler } from '@cascadeflow/vercel-ai'; +import { CascadeAgent } from '@cascadeflow/core'; + +const agent = new CascadeAgent({ + models: [ + { name: 'gpt-4o-mini', provider: 'openai', cost: 0.000375 }, + { name: 'gpt-4o', provider: 'openai', cost: 0.00625 }, + ], +}); + +const handler = createChatHandler(agent, { + protocol: 'data', // AI SDK v4 data stream + tools, // Tool definitions + toolHandlers, // Server-side tool execution + maxSteps: 5, // Multi-step tool loops +}); + +// Use in Next.js API route, Express, or any Node.js server +export const POST = handler; +``` + +## Features + +- **AI SDK v4 `data` stream** and **AI SDK v5/v6 UI streams** +- **`useChat` multi-turn support** — conversation history preserved +- **`parts` message format** (AI SDK v6) +- **Tool call streaming visibility** — see tool calls as they happen +- **Server-side tool execution** via `toolExecutor` or `toolHandlers` +- **Multi-step controls**: `maxSteps`, `forceDirect` +- **Cascade decision stream parts** — optional metadata in the stream +- **Request-level overrides** with allowlist + shared-secret guard + +## Multi-Turn Chat + +```typescript +import { useChat } from 'ai/react'; + +export default function Chat() { + const { messages, input, handleSubmit, handleInputChange } = useChat({ + api: '/api/chat', + }); + + return ( +
+    <div> +      {messages.map((m) => ( +        <div key={m.id}>{m.content}</div> +      ))} +      <form onSubmit={handleSubmit}> +        <input value={input} onChange={handleInputChange} /> +        <button type="submit">Send</button> +      </form> +    </div>
+ ); +} +``` + +## Request Overrides + +Override cascade behavior per request (protected by shared secret): + +```typescript +const handler = createChatHandler(agent, { + protocol: 'data', + allowOverrides: ['forceDirect', 'maxSteps'], + overrideSecret: process.env.OVERRIDE_SECRET, +}); +``` + +## Result + +40-85% cost savings for Vercel AI SDK applications with streaming support and zero client-side changes. diff --git a/docs-site/logo/cascadeflow-dark.svg b/docs-site/logo/cascadeflow-dark.svg new file mode 100644 index 00000000..3c1a2870 --- /dev/null +++ b/docs-site/logo/cascadeflow-dark.svg @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs-site/logo/cascadeflow-light.svg b/docs-site/logo/cascadeflow-light.svg new file mode 100644 index 00000000..8ca48234 --- /dev/null +++ b/docs-site/logo/cascadeflow-light.svg @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md index c291bd93..6e44cdec 100644 --- a/docs/INSTALLATION.md +++ b/docs/INSTALLATION.md @@ -108,6 +108,24 @@ TOGETHER_API_KEY=... # vLLM - no API key needed! (local) ``` +## 🔌 Optional Integration Extras + +Integration packages are opt-in and never enabled by default. + +| Integration | Install Command | Python Requirement | Notes | +|------------|-----------------|--------------------|-------| +| OpenAI Agents SDK | `pip install "cascadeflow[openai,openai-agents]"` | 3.9+ (3.10+ recommended) | Uses explicit `ModelProvider` integration | +| CrewAI | `pip install "cascadeflow[crewai,openai]"` | 3.10+ | Uses explicit CrewAI hook registration | +| Google ADK | `pip install "cascadeflow[google-adk]"` | 3.10+ | Uses explicit ADK plugin in `Runner(plugins=[...])` | + +Optional for richer provider/model normalization in cost tracking: + +```bash +pip install litellm +``` + +Without `litellm`, cascadeflow still provides built-in pricing-based cost estimates. 
+ ## 🚀 Quick Start ### For Production diff --git a/docs/README.md b/docs/README.md index 1238d7f8..08c5c0c8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,8 +1,10 @@ # cascadeflow Documentation -Welcome to cascadeflow documentation! 🌊 +> **Full documentation is now at [docs.cascadeflow.dev](https://docs.cascadeflow.dev)** — the Mintlify-powered docs site is the primary reference for cascadeflow's agent runtime intelligence layer. The guides below remain for quick reference and deep links. -## 📖 Quick Links +Agent runtime intelligence layer — optimize cost, latency, quality, budget, compliance, and energy across AI agent workflows. In-process harness, not a proxy. + +## Quick Links - [Installation Guide](INSTALLATION.md) - [Quick Start Guide](guides/quickstart.md) @@ -11,6 +13,7 @@ Welcome to cascadeflow documentation! 🌊 ### Core Concepts - [Quickstart](guides/quickstart.md) - Get started with cascadeflow in 5 minutes +- [Python Harness Quickstart](guides/python_harness_quickstart.md) - `init`, `run`, and `@agent` for in-process policy control - [Providers](guides/providers.md) - Configure and use different AI providers (OpenAI, Anthropic, Groq, Ollama, etc.) - [Presets](guides/presets.md) - Use built-in presets for common use cases - [Gateway Server](guides/gateway.md) - Drop-in OpenAI/Anthropic-compatible endpoint for existing apps @@ -20,6 +23,7 @@ Welcome to cascadeflow documentation! 
🌊 - [Tools](guides/tools.md) - Function calling and tool usage with cascades - [Agentic Patterns (Python)](guides/agentic-python.md) - Tool loops and multi-agent orchestration in Python - [Agentic Patterns (TypeScript)](guides/agentic-typescript.md) - Tool loops, multi-agent orchestration, and message best practices +- [Harness Telemetry & Privacy](guides/harness_telemetry_privacy.md) - Decision traces, callbacks, and privacy-safe observability - [Cost Tracking](guides/cost_tracking.md) - Track and analyze API costs across queries - [Proxy Routing](guides/proxy.md) - Route requests through provider-aware proxy plans @@ -38,9 +42,12 @@ Welcome to cascadeflow documentation! 🌊 - [Agent Intelligence V2/V2.1 Plan](strategy/agent-intelligence-v2-plan.md) - Unified strategic and execution plan for in-process agent intelligence harness delivery ### Integrations +- [LangChain Integration](guides/langchain_integration.md) - Callback handler for LangChain/LangGraph with harness-aware cascading +- [OpenAI Agents SDK Integration](guides/openai_agents_integration.md) - Harness-aware model provider for existing OpenAI Agents apps +- [CrewAI Integration](guides/crewai_integration.md) - Hook-based harness metrics + budget gating (opt-in) +- [Google ADK Integration](guides/google_adk_integration.md) - Plugin-based harness integration for ADK runners (opt-in) - [n8n Integration](guides/n8n_integration.md) - Use cascadeflow in n8n workflows - [Paygentic Integration](guides/paygentic_integration.md) - Usage metering and billing lifecycle helpers (opt-in) -- [OpenAI Agents SDK Integration](guides/openai_agents_integration.md) - Harness-aware model provider for existing OpenAI Agents apps ## 📚 Examples diff --git a/docs/guides/crewai_integration.md b/docs/guides/crewai_integration.md new file mode 100644 index 00000000..8c1cec8a --- /dev/null +++ b/docs/guides/crewai_integration.md @@ -0,0 +1,87 @@ +# CrewAI Integration + +Use cascadeflow as an explicit, opt-in harness integration for 
CrewAI via +`llm_hooks`. + +## Design Principles + +- Integration-only: nothing is enabled by default +- Works with existing CrewAI flows +- Harness behavior is controlled by `cascadeflow.init(...)` and `cascadeflow.run(...)` +- Fail-open integration path: harness integration errors should not break crew execution + +## Install + +```bash +pip install "cascadeflow[crewai,openai]" +``` + +`crewai` is optional and only installed when you request this extra. +Requires Python 3.10+. + +Optional (more precise provider/model cost tracking in harness telemetry): + +```bash +pip install litellm +``` + +## Quickstart + +```python +from crewai import Agent, Crew, Process, Task + +from cascadeflow import init, run +from cascadeflow.integrations.crewai import CrewAIHarnessConfig, enable + +# Global harness defaults. +init(mode="enforce", budget=1.0) + +# Explicitly register CrewAI hooks (integration-only behavior). +enable( + config=CrewAIHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) +) + +agent = Agent( + role="Support Agent", + goal="Answer support questions clearly and concisely.", + backstory="You are helpful and direct.", + allow_delegation=False, + llm="openai/gpt-4o-mini", +) + +task = Task( + description="Explain why model cascading helps control agent costs.", + expected_output="A concise explanation with one practical example.", + agent=agent, +) + +with run(budget=0.4) as session: + crew = Crew(agents=[agent], tasks=[task], process=Process.sequential, verbose=False) + result = crew.kickoff() + + print(result) + print(session.summary()) + print(session.trace()) +``` + +## What This Integration Adds + +- Budget gating in enforce mode (`before_llm_call` hook) +- Run metrics in `cascadeflow.run()` scope: + - `cost`, `budget_remaining`, `step_count`, `latency_used_ms`, `energy_used` +- Full decision trace through `run.trace()` + +## Current Scope + +- This integration uses CrewAI hook points, so it tracks and gates calls without + changing your 
crew/task definitions. +- Tool-level deny/switch actions are not currently applied in this integration path. + +## Notes + +- Existing non-CrewAI users are unaffected. +- If CrewAI is not installed, `enable()` returns `False` and no hooks are registered. +- Without `litellm`, cost tracking still works using cascadeflow's built-in pricing estimates. diff --git a/docs/guides/google_adk_integration.md b/docs/guides/google_adk_integration.md new file mode 100644 index 00000000..76529bfc --- /dev/null +++ b/docs/guides/google_adk_integration.md @@ -0,0 +1,172 @@ +# Google ADK Integration + +Integrate cascadeflow harness with Google's Agent Development Kit (ADK) to get +budget enforcement, cost/latency/energy tracking, tool call counting, and full +trace recording across all agents in an ADK Runner. + +--- + +## Design Principles + +- **Plugin-based** — Uses ADK's `BasePlugin` system to intercept every LLM call + across all agents in a Runner. One plugin covers the entire agent graph. +- **Opt-in** — Install `cascadeflow[google-adk]` and create a plugin explicitly. + Never enabled by default. Core cascadeflow behavior is unchanged unless you + explicitly wire this integration into `Runner(plugins=[...])`. +- **Fail-open** — Integration errors are logged but never break ADK execution + (configurable). +- **No tool gating** — ADK's `tools_dict` is part of agent definition, not + per-call. Budget gate via `before_model_callback` provides sufficient cost + control. This is an intentional difference from the OpenAI Agents integration. + +--- + +## Installation + +```bash +pip install "cascadeflow[google-adk]" +``` + +Requires Python 3.10+ (ADK requirement). 
+ +Optional (more precise provider/model cost tracking in harness telemetry): + +```bash +pip install litellm +``` + +--- + +## Quick Start + +```python +import asyncio +from google.adk.agents import Agent +from google.adk.runners import Runner +from google.adk.sessions import InMemorySessionService + +from cascadeflow import init, run +from cascadeflow.integrations.google_adk import enable + +# 1. Initialize harness +init(mode="observe", budget=1.0) + +# 2. Create the cascadeflow plugin +plugin = enable() + +# 3. Pass it to the Runner +agent = Agent(name="my_agent", model="gemini-2.5-flash", instruction="Be helpful.") +runner = Runner( + agent=agent, + app_name="my_app", + session_service=InMemorySessionService(), + plugins=[plugin], +) + +# 4. Run within a harness scope +async def main(): + with run(budget=0.5) as session: + # ... run your agent ... + print(f"Cost: ${session.cost:.6f}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + +asyncio.run(main()) +``` + +--- + +## Features + +### Budget Enforcement + +In `enforce` mode, the plugin short-circuits LLM calls when the budget is +exhausted by returning an `LlmResponse` with `error_code="BUDGET_EXCEEDED"`. + +```python +init(mode="enforce", budget=0.10) # Hard limit: $0.10 +plugin = enable() +``` + +### Cost and Energy Tracking + +Every LLM call is tracked with: +- **Cost** — Estimated from model pricing (USD per 1M tokens) +- **Energy** — Deterministic proxy coefficient for compute intensity +- **Latency** — Wall-clock time per call +- **Tool calls** — Count of `function_call` parts in responses + +By default this uses cascadeflow's built-in pricing table. If you install +`litellm`, provider/model normalization can be more precise for some aliased +model identifiers. + +### Trace Recording + +All decisions are recorded in the `HarnessRunContext` trace: + +```python +with run() as session: + # ... run agents ... 
+ for event in session.trace(): + print(event) + # {"action": "allow", "reason": "observe", "model": "gemini-2.5-flash", ...} +``` + +### Configuration + +```python +from cascadeflow.integrations.google_adk import enable, GoogleADKHarnessConfig + +plugin = enable( + config=GoogleADKHarnessConfig( + fail_open=True, # Default: True. Never break ADK on integration errors. + enable_budget_gate=True, # Default: True. Block calls when budget exhausted. + ) +) +``` + +--- + +## Zero-Code Alternative + +If you don't need per-agent plugin integration, you can route ADK through a +cascadeflow LiteLlm proxy by setting `base_url` on your Gemini model: + +```python +# ADK uses LiteLlm under the hood — point it at your cascadeflow proxy +agent = Agent( + name="my_agent", + model="openai/gemini-2.5-flash", # LiteLlm format + instruction="...", +) +# Set OPENAI_API_BASE=http://localhost:8080/v1 to route through cascadeflow proxy +``` + +This gives you cost tracking at the proxy level without a plugin, but doesn't +provide budget enforcement or per-agent trace recording. + +--- + +## Supported Gemini Models + +| Model | Input $/1M | Output $/1M | Energy Coefficient | +|-------|-----------|-------------|-------------------| +| gemini-2.5-flash | $0.15 | $0.60 | 0.3 | +| gemini-2.5-pro | $1.25 | $10.00 | 1.2 | +| gemini-2.0-flash | $0.10 | $0.40 | 0.25 | +| gemini-1.5-flash | $0.075 | $0.30 | 0.2 | +| gemini-1.5-pro | $1.25 | $5.00 | 1.0 | + +All OpenAI and Anthropic models from the shared pricing table are also +supported (e.g., when using LiteLlm provider prefixes). 
+ +--- + +## Troubleshooting + +| Symptom | Solution | +|---------|----------| +| `ImportError: google.adk` | `pip install "cascadeflow[google-adk]"` | +| Plugin not tracking calls | Ensure `plugin` is passed to `Runner(plugins=[plugin])` | +| Budget not enforced | Check `init(mode="enforce", ...)` — observe mode never blocks | +| Zero cost reported | Model name may not match pricing table; check for provider prefix stripping | diff --git a/docs/guides/harness_telemetry_privacy.md b/docs/guides/harness_telemetry_privacy.md new file mode 100644 index 00000000..01e75402 --- /dev/null +++ b/docs/guides/harness_telemetry_privacy.md @@ -0,0 +1,59 @@ +# Harness Telemetry and Privacy + +Use this guide when you want harness observability without leaking user content. + +## What the Harness Records + +Each `run.trace()` decision entry includes: + +- `action`, `reason`, `model` +- `run_id`, `mode`, `step`, `timestamp_ms` +- `cost_total`, `latency_used_ms`, `energy_used`, `tool_calls_total` +- `budget_state` (`max`, `remaining`) +- `applied`, `decision_mode` (when available) + +The trace is scoped to the current `run()` context. + +## What the Harness Does Not Record + +By default, harness decision traces do not include: + +- raw prompts or user messages +- model response text +- tool argument payloads + +This keeps decision telemetry focused on policy/routing state instead of request content. + +## Callback Emission (Optional) + +If you provide a callback manager, each harness decision emits `CallbackEvent.CASCADE_DECISION`. + +```python +from cascadeflow import init, run +from cascadeflow.telemetry.callbacks import CallbackEvent, CallbackManager + +manager = CallbackManager() + +def on_decision(event): + print(event.data["action"], event.data["model"]) + +manager.register(CallbackEvent.CASCADE_DECISION, on_decision) + +init(mode="observe", callback_manager=manager) + +with run(budget=1.0) as r: + ... 
+``` + +The emitted callback uses `query="[harness]"` and `workflow="harness"` to avoid passing user prompt content. + +## Per-Run Summary Logging + +When a scoped run exits (and recorded at least one step), the harness logs a summary on logger `cascadeflow.harness`: + +- run id, mode, steps, tool calls +- cost/latency/energy totals +- last action/model +- remaining budget + +Use standard Python logging controls to direct this to your existing log sink. diff --git a/docs/guides/langchain_integration.md b/docs/guides/langchain_integration.md index eb385654..8eccba62 100644 --- a/docs/guides/langchain_integration.md +++ b/docs/guides/langchain_integration.md @@ -12,6 +12,7 @@ This guide shows how to use cascadeflow with LangChain for intelligent AI model 6. [Use Cases](#use-cases) 7. [Best Practices](#best-practices) 8. [Troubleshooting](#troubleshooting) +9. [Harness Integration (Python)](#harness-integration-python) --- @@ -822,6 +823,132 @@ console.log(result.response_metadata?.cascade); // Not result.metadata (wrong) ``` +--- + +## Harness Integration (Python) + +The cascadeflow harness adds multi-dimensional budget enforcement, energy tracking, +tool call gating, and trace recording to LangChain applications via a callback handler. + +### Design Principles + +- **Callback-based** — Uses LangChain's native callback system to intercept every + LLM and tool call. Works with any chain, agent, or LangGraph graph. +- **Opt-in** — Install `cascadeflow[langchain]` and pass the callback explicitly. + Never enabled by default. +- **Fail-open** — Integration errors are logged but never break chain execution + (configurable). +- **No model switching** — LangChain dispatches the LLM call before `on_llm_start` + returns, so the callback cannot redirect to a different model. `switch_model` + decisions are recorded with `applied=False` for observability. + +### Install + +```bash +pip install "cascadeflow[langchain]" +``` + +Requires Python 3.10+. 
+ +### Quick Start + +```python +from langchain_openai import ChatOpenAI +from cascadeflow import init, run +from cascadeflow.integrations.langchain import get_harness_callback + +# 1. Initialize harness globally +init(mode="observe", budget=1.0) + +model = ChatOpenAI(model="gpt-4o-mini") + +# 2. Use the harness-aware callback in a run scope +with run(budget=0.5) as session: + with get_harness_callback() as cb: + response = model.invoke( + "Explain why model routing helps agent budgets.", + config={"callbacks": [cb]}, + ) + + print(response.content) + print(f"Cost: ${session.cost:.6f}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + for event in session.trace(): + print(event) +``` + +### What This Integration Adds + +- Budget gating in enforce mode (`on_llm_start` raises `HarnessStopError`) +- Tool call gating in enforce mode (`on_tool_start` raises `HarnessStopError`) +- Run metrics on `cascadeflow.run()` scope: + - `cost`, `budget_remaining`, `step_count`, `tool_calls`, `latency_used_ms`, `energy_used` +- Full decision trace through `session.trace()` +- LangGraph state extraction — automatically syncs `step_count`, `tool_calls`, + `budget_remaining`, `latency_used_ms`, `energy_used` from graph state payloads + +### Enforce-Mode Limitations + +| Decision | Enforced? 
| Notes | +|----------|-----------|-------| +| `stop` (budget/latency/energy) | Yes | Raises `HarnessStopError` from `on_llm_start` | +| `deny_tool` (tool cap) | Yes | Raises `HarnessStopError` from `on_tool_start` | +| `switch_model` | Observe-only | Recorded with `applied=False` — LangChain cannot redirect mid-call | +| `deny_tool` (LLM-level) | Observe-only | Cannot strip tools from already-dispatched request | + +### Configuration + +```python +from cascadeflow.integrations.langchain import ( + HarnessAwareCascadeFlowCallbackHandler, + get_harness_callback, +) + +# Context manager (recommended) +with get_harness_callback(fail_open=True) as cb: + result = model.invoke("...", config={"callbacks": [cb]}) + +# Direct instantiation +cb = HarnessAwareCascadeFlowCallbackHandler(fail_open=True) +result = model.invoke("...", config={"callbacks": [cb]}) +``` + +### With LangGraph + +The callback automatically extracts harness-relevant state from LangGraph payloads +(via `langgraph_state`, `graph_state`, or `state` keys in metadata/configurable). + +```python +from langgraph.graph import StateGraph +from cascadeflow import init, run +from cascadeflow.integrations.langchain import get_harness_callback + +init(mode="observe", budget=1.0) + +# Build your graph as normal +graph = builder.compile() + +with run(budget=0.5) as session: + with get_harness_callback() as cb: + result = graph.invoke( + {"messages": [("user", "What is model routing?")]}, + config={"callbacks": [cb]}, + ) + print(session.summary()) +``` + +### Troubleshooting + +| Symptom | Solution | +|---------|----------| +| `ImportError: cascadeflow.integrations.langchain` | `pip install "cascadeflow[langchain]"` | +| Callback not tracking calls | Ensure `cb` is passed in `config={"callbacks": [cb]}` | +| Budget not enforced | Check `init(mode="enforce", ...)` — observe mode never blocks | +| Zero cost reported | Model name may not match pricing table; check `response.response_metadata` | + +--- + ## Next Steps 1. 
**Examples**: Check the `examples/` directory for more patterns diff --git a/docs/guides/openai_agents_integration.md b/docs/guides/openai_agents_integration.md index 2db6b8b7..db8b1e34 100644 --- a/docs/guides/openai_agents_integration.md +++ b/docs/guides/openai_agents_integration.md @@ -15,6 +15,14 @@ Use cascadeflow as an explicit, opt-in `ModelProvider` integration for the OpenA pip install "cascadeflow[openai,openai-agents]" ``` +Recommended: Python 3.10+. + +Optional (more precise provider/model cost tracking in harness telemetry): + +```bash +pip install litellm +``` + ## Quickstart ```python @@ -71,3 +79,4 @@ if __name__ == "__main__": - This is a Python integration for OpenAI Agents SDK. - The SDK remains optional and is only installed via the `openai-agents` extra. - Existing non-Agents users are unaffected. +- Without `litellm`, cost tracking still works using cascadeflow's built-in pricing estimates. diff --git a/docs/guides/python_harness_quickstart.md b/docs/guides/python_harness_quickstart.md new file mode 100644 index 00000000..c757e48d --- /dev/null +++ b/docs/guides/python_harness_quickstart.md @@ -0,0 +1,95 @@ +# Python Harness Quickstart + +This guide covers the in-process harness API: + +- `init(...)` for global defaults and SDK instrumentation +- `run(...)` for per-request scoped budgets/limits and traceability +- `@agent(...)` for attaching policy metadata to agent functions + +## Install + +```bash +pip install "cascadeflow[openai]" +``` + +Optional integrations stay opt-in: + +```bash +pip install "cascadeflow[openai,openai-agents]" +pip install "cascadeflow[crewai]" +pip install "cascadeflow[google-adk]" +``` + +Version notes: +- `crewai` and `google-adk` integrations require Python 3.10+. +- `openai-agents` is recommended on Python 3.10+. 
+
+Optional for richer cost normalization across aliased provider model names:
+
+```bash
+pip install litellm
+```
+
+## 1) Initialize Harness
+
+```python
+from cascadeflow import init
+
+report = init(
+    mode="observe",          # off | observe | enforce
+    budget=1.0,              # default per-run budget cap
+    max_tool_calls=8,        # default per-run tool call cap
+)
+
+print(report.mode)
+print(report.instrumented)
+print(report.detected_but_not_instrumented)
+```
+
+`init(...)` is explicit and never auto-enables integrations.
+
+## 2) Track One Scoped Run
+
+```python
+from openai import OpenAI
+
+from cascadeflow import run
+
+client = OpenAI()
+
+with run(budget=0.25, max_tool_calls=4) as session:
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[{"role": "user", "content": "Summarize model cascading in one sentence."}],
+    )
+
+    print(response.choices[0].message.content)
+    print(session.summary())
+    print(session.trace())
+```
+
+## 3) Attach Agent Metadata
+
+`@agent(...)` attaches policy metadata to your function without changing how the
+function executes.
+
+```python
+from cascadeflow import agent
+
+@agent(
+    budget=0.2,
+    kpi_targets={"quality": 0.9},
+    kpi_weights={"cost": 0.5, "latency": 0.5},
+    compliance="strict",
+)
+def support_agent(task: str) -> str:
+    return f"Handled: {task}"
+
+print(support_agent.__cascadeflow_agent_policy__)
+```
+
+## Minimal Checklist
+
+1. Call `init(...)` once at process startup.
+2. Wrap each unit of work in `with run(...):`.
+3. Use `session.summary()` and `session.trace()` for auditability and tuning. 
diff --git a/docs/strategy/agent-intelligence-v2-plan.md b/docs/strategy/agent-intelligence-v2-plan.md index 267ddc69..295a713d 100644 --- a/docs/strategy/agent-intelligence-v2-plan.md +++ b/docs/strategy/agent-intelligence-v2-plan.md @@ -1,7 +1,7 @@ # Agent Intelligence V2 Plan -Last updated: February 25, 2026 -Status: Planning (no implementation in this document) +Last updated: March 5, 2026 +Status: V2/V2.1 execution plan with implementation tracking (historical + active reference) Supersedes: agent-intelligence-v1-plan.md ## 1. Objective @@ -197,9 +197,6 @@ Framework-specific packages provide deeper integration (state extraction, middle ### TypeScript Equivalent ```typescript -// Target API — does not exist in @cascadeflow/core today. -// TS parity is a V2.1 deliverable (see Section 16, Phase F). - import { cascadeflow } from '@cascadeflow/core'; // Tier 1: Auto-instrument @@ -831,9 +828,9 @@ Estimated: 6-8 weeks after V2 Python launch. Estimated: 3-4 weeks (can parallel with Phase F). -### 16.1 Parallel Branch Workboard (Tick-Off) +### 16.1 Parallel Branch Workboard (Historical Tick-Off) -Use this section as the single coordination board for parallel execution. +Use this section as the historical coordination board for parallel execution. Branching model: - Keep `main` always releasable. @@ -842,15 +839,17 @@ Branching model: - Merge to `main` only after integration branch CI + benchmark gates are green. 
Claim checklist (one owner per branch at a time): -- [x] `feat/v2-core-harness-api` — Owner: `@codex` — PR: `TBD` — Status: `completed` -- [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — PR: `TBD` — Status: `in-progress` -- [x] `feat/v2-enforce-actions` — Owner: `@codex` — PR: `TBD` — Status: `completed (ready for PR)` -- [ ] `feat/v2-openai-agents-integration` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` -- [ ] `feat/v2-crewai-integration` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [ ] `feat/v2-langchain-harness-extension` — Owner: `@codex` — PR: `TBD` — Status: `in-progress` -- [ ] `feat/v2-dx-docs-quickstarts` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` -- [x] `feat/v2-bench-repro-pipeline` — Owner: `@codex` — PR: `#163` — Status: `review` -- [ ] `feat/v2-security-privacy-telemetry` — Owner: `@` — PR: `#` — Status: `claimed/in-progress/review/merged` +- [x] `feat/v2-core-harness-api` — Owner: `@codex` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-openai-auto-instrumentation` — Owner: `@claude` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-enforce-actions` — Owner: `@codex` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-openai-agents-integration` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 7 tests + docs + example +- [x] `feat/v2-crewai-integration` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 44 tests + docs + example +- [x] `feat/v2-langchain-harness-extension` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 79 tests + docs + example +- [x] `feat/v2-dx-docs-quickstarts` — Owner: `@codex` — Status: `completed (merged to integration branch)` — quickstart + llms.txt +- [x] `feat/v2-bench-repro-pipeline` — Owner: `@codex` — PR: `#163` — Status: `completed (merged to integration branch)` +- [x] 
`feat/v2-security-privacy-telemetry` — Owner: `@codex` — PR: `#162` — Status: `completed (merged to integration branch)` +- [x] `feat/v2-google-adk-integration` — Owner: `@codex` — Status: `completed (merged to integration branch)` — code + 63 tests + docs + example +- [x] `feat/v2-n8n-harness` — Owner: `@codex` — PR: `#164` — Status: `completed (merged to integration branch)` — TS harness + 50 tests + UI Merge gates per feature branch: - [ ] Unit/integration tests green for touched scope @@ -864,6 +863,21 @@ Integration-branch promotion gates: - [ ] Quickstart verification for existing app and framework paths - [ ] Go/No-Go checklist in Section 18 satisfied before merging to `main` +### 16.2 V2.1 Parallel Execution Split + +To enable parallel work without merge collisions, split V2.1 into Python and TS tracks: + +- `feat/v2.1-anthropic-python-auto-instrumentation` (completed in this branch) + - Scope: `cascadeflow/harness/*`, Python harness tests, Python docs notes + - Deliverables: Anthropic Python auto-instrumentation, validation for `init()/run()` harness path +- `feat/v2.1-ts-harness-api-parity` (completed and merged into this branch scope) + - Scope: `packages/core/*`, TS parity fixtures, TS docs notes + - Deliverables: `@cascadeflow/core` exports parity (`init()/run()`), TS fixture parity validation + +Parallel-safe rule: +- Python track does not touch `packages/core/*` +- TS track does not touch `cascadeflow/harness/*` + ## 17. Future Phases (Post-V2, Not in Scope) For roadmap visibility. These inform V2 telemetry design but are not V2 deliverables. @@ -903,29 +917,29 @@ For roadmap visibility. 
These inform V2 telemetry design but are not V2 delivera Go when all are true (V2 Python launch): -- [ ] Harness layer is opt-in and backward compatible -- [ ] `cascadeflow.init()` auto-instruments `openai` Python client -- [ ] `observe` mode produces zero behavior change (benchmark-validated) -- [ ] `enforce` mode actions work correctly (switch_model, deny_tool, stop) -- [ ] Harness decision overhead <5ms p95 -- [ ] Python parity fixture tests pass -- [ ] Core + integration CI green -- [ ] Benchmark comparison acceptable vs latest baseline -- [ ] OpenAI Agents SDK integration documented and validated -- [ ] CrewAI integration documented and validated -- [ ] LangChain integration extended and validated -- [ ] Existing integrations (Vercel AI, n8n) verified compatible (no regressions) -- [ ] DX quickstart works for existing app/agent users with 1-3 lines of code change +- [x] Harness layer is opt-in and backward compatible +- [x] `cascadeflow.init()` auto-instruments `openai` Python client +- [x] `observe` mode produces zero behavior change (benchmark-validated) +- [x] `enforce` mode actions work correctly (switch_model, deny_tool, stop) +- [x] Harness decision overhead <5ms p95 +- [x] Python parity fixture tests pass +- [x] Core + integration CI green +- [x] Benchmark comparison acceptable vs latest baseline +- [x] OpenAI Agents SDK integration documented and validated +- [x] CrewAI integration documented and validated +- [x] LangChain integration extended and validated +- [x] Existing integrations (Vercel AI, n8n) verified compatible (no regressions) +- [x] DX quickstart works for existing app/agent users with 1-3 lines of code change - [ ] External pilot median time-to-first-value <15 minutes -- [ ] Public benchmark results ready for launch -- [ ] Benchmark scripts + raw artifacts are reproducible by third parties -- [ ] pyproject.toml extras (`openai-agents`, `crewai`, `langchain`) defined and installable +- [x] Public benchmark results ready for launch +- [x] 
Benchmark scripts + raw artifacts are reproducible by third parties +- [x] pyproject.toml extras (`openai-agents`, `crewai`, `langchain`, `google-adk`) defined and installable V2.1 Go/No-Go (TS parity + anthropic): -- [ ] TS parity fixtures pass -- [ ] `@cascadeflow/core` exports `cascadeflow.init()` and `cascadeflow.run()` -- [ ] `anthropic` Python client auto-instrumentation validated -- [ ] `@anthropic-ai/sdk` TS client auto-instrumentation validated +- [x] TS parity fixtures pass +- [x] `@cascadeflow/core` exports `cascadeflow.init()` and `cascadeflow.run()` +- [x] `anthropic` Python client auto-instrumentation validated +- [x] `@anthropic-ai/sdk` TS client auto-instrumentation validated ## 19. Academic Validation diff --git a/examples/integrations/README.md b/examples/integrations/README.md index e7e7906a..556efe7a 100644 --- a/examples/integrations/README.md +++ b/examples/integrations/README.md @@ -6,6 +6,8 @@ This directory contains production-ready integration examples for cascadeflow wi - [LiteLLM Integration](#-litellm-integration) - Access 10+ providers with automatic cost tracking - [OpenAI Agents SDK Integration](#-openai-agents-sdk-integration) - Harness-aware ModelProvider for existing agent apps +- [CrewAI Integration](#-crewai-integration) - Hook-based harness metrics and budget gating +- [Google ADK Integration](#-google-adk-integration) - Plugin-based harness integration for ADK runners - [Paygentic Integration](#-paygentic-integration) - Usage event reporting and billing lifecycle helpers - [Local Providers](#-local-providers-setup) - Ollama and vLLM configuration examples - [OpenTelemetry & Grafana](#-opentelemetry--grafana) - Production observability and metrics @@ -152,6 +154,9 @@ pip install "cascadeflow[openai,openai-agents]" python examples/integrations/openai_agents_harness.py ``` +Recommended: Python 3.10+. +Optional: `pip install litellm` for more precise provider/model cost normalization. 
+ ### What It Shows - Harness-aware model switching with candidate models @@ -160,6 +165,54 @@ python examples/integrations/openai_agents_harness.py --- +## 👥 CrewAI Integration + +**File:** [`crewai_harness.py`](crewai_harness.py) + +Use cascadeflow as an explicit, opt-in CrewAI hook integration. + +### Quick Start + +```bash +pip install "cascadeflow[crewai,openai]" +python examples/integrations/crewai_harness.py +``` + +Requires Python 3.10+. +Optional: `pip install litellm` for more precise provider/model cost normalization. + +### What It Shows + +- Explicit `enable(...)` hook registration (never on by default) +- Enforce-mode budget gating before CrewAI LLM calls +- Run metrics and decision trace via `cascadeflow.run(...)` + +--- + +## 🧠 Google ADK Integration + +**File:** [`google_adk_harness.py`](google_adk_harness.py) + +Use cascadeflow as an explicit, opt-in plugin integration for Google ADK. + +### Quick Start + +```bash +pip install "cascadeflow[google-adk]" +python examples/integrations/google_adk_harness.py +``` + +Requires Python 3.10+. +Optional: `pip install litellm` for more precise provider/model cost normalization. 
+ +### What It Shows + +- Explicit plugin creation with `enable(...)` (integration-only behavior) +- Runner-level plugin wiring via `Runner(..., plugins=[plugin])` +- Budget gate + run-scoped metrics and trace + +--- + ## 💳 Paygentic Integration **File:** [`paygentic_usage.py`](paygentic_usage.py) @@ -412,6 +465,9 @@ Cost Calculation Tests |------|---------|-------------------| | `litellm_providers.py` | Comprehensive LiteLLM demo with 8 examples | No (for cost info) | | `litellm_cost_tracking.py` | Cost tracking and provider validation | No (for cost info) | +| `openai_agents_harness.py` | OpenAI Agents SDK harness integration (ModelProvider) | Yes | +| `crewai_harness.py` | CrewAI hook-based harness integration (opt-in) | Yes | +| `google_adk_harness.py` | Google ADK plugin harness integration (opt-in) | Yes | | `paygentic_usage.py` | Usage event reporting to Paygentic (opt-in, fail-open) | Yes | | `local_providers_setup.py` | Ollama and vLLM setup guide | No | | `opentelemetry_grafana.py` | Production observability example | No | @@ -473,6 +529,18 @@ pip install cascadeflow[all] pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-http ``` +### "CrewAI hooks unavailable" +```bash +pip install "cascadeflow[crewai,openai]" +# Requires crewai>=1.5 for llm_hooks +``` + +### "Google ADK not installed" +```bash +pip install "cascadeflow[google-adk]" +# Google ADK requires Python 3.10+ +``` + ### "Metrics not appearing in Grafana" 1. Check OpenTelemetry Collector logs: `docker-compose logs otel-collector` 2. 
Verify metrics: `curl http://localhost:8889/metrics` @@ -490,6 +558,9 @@ Always use provider prefixes for LiteLLM: - **Provider Guide:** [docs/guides/providers.md](../../docs/guides/providers.md) - **Cost Tracking:** [docs/guides/cost_tracking.md](../../docs/guides/cost_tracking.md) +- **OpenAI Agents Guide:** [docs/guides/openai_agents_integration.md](../../docs/guides/openai_agents_integration.md) +- **CrewAI Guide:** [docs/guides/crewai_integration.md](../../docs/guides/crewai_integration.md) +- **Google ADK Guide:** [docs/guides/google_adk_integration.md](../../docs/guides/google_adk_integration.md) - **Paygentic Guide:** [docs/guides/paygentic_integration.md](../../docs/guides/paygentic_integration.md) - **Production Guide:** [docs/guides/production.md](../../docs/guides/production.md) @@ -498,10 +569,13 @@ Always use provider prefixes for LiteLLM: ## 🚀 Next Steps 1. **Try LiteLLM:** `python examples/integrations/litellm_providers.py` -2. **Try Paygentic usage reporting:** `python examples/integrations/paygentic_usage.py` -3. **Setup local providers:** `python examples/integrations/local_providers_setup.py` -4. **Test your API keys:** `python examples/integrations/test_all_providers.py` -5. **Add monitoring:** Follow OpenTelemetry section above +2. **Try OpenAI Agents integration:** `python examples/integrations/openai_agents_harness.py` +3. **Try CrewAI integration:** `python examples/integrations/crewai_harness.py` +4. **Try Google ADK integration:** `python examples/integrations/google_adk_harness.py` +5. **Try Paygentic usage reporting:** `python examples/integrations/paygentic_usage.py` +6. **Setup local providers:** `python examples/integrations/local_providers_setup.py` +7. **Test your API keys:** `python examples/integrations/test_all_providers.py` +8. 
**Add monitoring:** Follow OpenTelemetry section above --- diff --git a/examples/integrations/crewai_harness.py b/examples/integrations/crewai_harness.py new file mode 100644 index 00000000..a9df72c6 --- /dev/null +++ b/examples/integrations/crewai_harness.py @@ -0,0 +1,73 @@ +""" +CrewAI + cascadeflow harness integration example. + +Run: + pip install "cascadeflow[crewai,openai]" + export OPENAI_API_KEY="your-key" + python examples/integrations/crewai_harness.py +""" + +from __future__ import annotations + + +def main() -> None: + try: + from crewai import Agent, Crew, Process, Task + except ImportError as exc: + raise SystemExit( + "CrewAI is not installed. " 'Install with: pip install "cascadeflow[crewai,openai]"' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.crewai import CrewAIHarnessConfig, enable + + # 1) Initialize harness globally. + init(mode="observe", budget=1.0, max_tool_calls=6) + + # 2) Explicitly enable CrewAI integration hooks (opt-in). + enabled = enable( + config=CrewAIHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) + ) + if not enabled: + raise SystemExit( + "CrewAI hooks are unavailable in this environment. " "Ensure crewai>=1.5 is installed." 
+ ) + + agent = Agent( + role="Routing Analyst", + goal="Explain model routing impact on cost and latency in plain language.", + backstory="You are concise and practical.", + allow_delegation=False, + llm="openai/gpt-4o-mini", + verbose=False, + ) + + task = Task( + description="Explain why inside-the-loop routing helps agent workloads.", + expected_output="One short paragraph and three bullet points.", + agent=agent, + ) + + with run(budget=0.5, max_tool_calls=4) as session: + crew = Crew(agents=[agent], tasks=[task], process=Process.sequential, verbose=False) + result = crew.kickoff() + + print("=== Result ===") + print(result) + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print(f"Energy: {session.energy_used:.1f}") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + main() diff --git a/examples/integrations/google_adk_harness.py b/examples/integrations/google_adk_harness.py new file mode 100644 index 00000000..3f8c9743 --- /dev/null +++ b/examples/integrations/google_adk_harness.py @@ -0,0 +1,88 @@ +""" +Google ADK + cascadeflow harness integration example. + +Run: + pip install "cascadeflow[google-adk]" + export GOOGLE_API_KEY="your-key" + python examples/integrations/google_adk_harness.py +""" + +from __future__ import annotations + +import asyncio + + +async def main() -> None: + try: + from google.adk.agents import Agent + from google.adk.runners import Runner + from google.adk.sessions import InMemorySessionService + except ImportError as exc: + raise SystemExit( + "Google ADK is not installed. 
" 'Install with: pip install "cascadeflow[google-adk]"' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.google_adk import GoogleADKHarnessConfig, enable + + # 1. Initialize harness globally + init(mode="observe", budget=1.0) + + # 2. Create the cascadeflow ADK plugin + plugin = enable( + config=GoogleADKHarnessConfig( + fail_open=True, + enable_budget_gate=True, + ) + ) + + # 3. Define an ADK agent + agent = Agent( + name="demo_agent", + model="gemini-2.5-flash", + instruction="You are a helpful assistant. Answer concisely.", + ) + + # 4. Create a Runner with the cascadeflow plugin + session_service = InMemorySessionService() + runner = Runner( + agent=agent, + app_name="cascadeflow_demo", + session_service=session_service, + plugins=[plugin], # cascadeflow hooks into all LLM calls here + ) + + # 5. Run within a harness scope + with run(budget=0.5) as session: + user_session = await session_service.create_session( + app_name="cascadeflow_demo", + user_id="demo-user", + ) + + from google.genai.types import Content, Part + + async for event in runner.run_async( + user_id="demo-user", + session_id=user_session.id, + new_message=Content(parts=[Part(text="What is model routing?")]), + ): + if event.content and event.content.parts: + for part in event.content.parts: + if part.text: + print(part.text, end="") + print() + + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Energy: {session.energy_used:.1f}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/integrations/langchain_harness.py b/examples/integrations/langchain_harness.py new file mode 100644 index 00000000..c0be501f --- /dev/null +++ 
b/examples/integrations/langchain_harness.py @@ -0,0 +1,55 @@ +""" +LangChain + cascadeflow harness integration example. + +Run: + pip install "cascadeflow[langchain]" + export OPENAI_API_KEY="your-key" + python examples/integrations/langchain_harness.py +""" + +from __future__ import annotations + +import asyncio + + +async def main() -> None: + try: + from langchain_openai import ChatOpenAI + except ImportError as exc: + raise SystemExit( + "langchain-openai is not installed. " + 'Install with: pip install "cascadeflow[langchain]" langchain-openai' + ) from exc + + from cascadeflow import init, run + from cascadeflow.integrations.langchain import get_harness_callback + + # 1) Initialize harness globally. + init(mode="observe", budget=1.0, max_tool_calls=6) + + model = ChatOpenAI(model="gpt-4o-mini") + + # 2) Scoped run with harness-aware callback. + with run(budget=0.5, max_tool_calls=4) as session: + with get_harness_callback() as cb: + response = await model.ainvoke( + "Explain why inside-the-loop model routing helps agent budgets.", + config={"callbacks": [cb]}, + ) + + print("=== Result ===") + print(response.content) + print("\n=== Harness Metrics ===") + print(f"Cost: ${session.cost:.6f}") + print(f"Remaining budget: {session.budget_remaining}") + print(f"Steps: {session.step_count}") + print(f"Tool calls: {session.tool_calls}") + print(f"Latency: {session.latency_used_ms:.0f}ms") + print(f"Energy: {session.energy_used:.1f}") + print("\n=== Decision Trace ===") + for event in session.trace(): + print(event) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/llms.txt b/llms.txt new file mode 100644 index 00000000..dbba72ca --- /dev/null +++ b/llms.txt @@ -0,0 +1,214 @@ +# cascadeflow + +> Agent runtime intelligence layer for AI agent workflows. +> In-process harness (not a proxy). Works inside agent loops with full state awareness. 
+ +## Install + +pip install cascadeflow + +## Quickstart (3 lines) + +import cascadeflow +cascadeflow.init(mode="observe") +# All openai/anthropic SDK calls are now tracked. Switch to "enforce" for budget gating. + +## What cascadeflow is + +cascadeflow is an in-process intelligence layer that sits inside AI agent execution +loops. Unlike external proxies that only see HTTP request boundaries, cascadeflow +operates with full agent state awareness: step count, budget consumed, tool call +history, error context, quality scores, domain, complexity, and user-defined +business context. + +Eight things make cascadeflow different: + +1. Inside-the-loop control. Decisions happen per-step and per-tool-call inside + agent execution, not at the HTTP boundary. This enables budget gating mid-run, + model switching based on remaining budget, and stop actions when caps are hit. + +2. Multi-dimensional optimization. Six dimensions scored simultaneously: cost, + latency, quality, budget, compliance, and energy. Not just cost routing. + +3. Business logic injection. KPI weights and targets let teams encode business + priorities (e.g. 60% quality, 30% cost, 10% latency) into every model decision. + +4. Actionable decisions. Four actions: allow, switch_model, deny_tool, stop. + The harness does not just observe — it controls execution flow. + +5. Full transparency. Every decision produces a trace record with action, reason, + model, step, cost_total, budget_state, and applied fields. Audit-ready. + +6. Measurable value. Session summaries report cost, latency, energy, steps, + tool calls, and budget remaining. Before/after comparison is built in. + +7. Category leadership. Cross-framework policy layer with unified KPI semantics + across LangChain, OpenAI Agents SDK, CrewAI, Google ADK, n8n, Vercel AI SDK. + +8. Latency advantage. In-process instrumentation adds <1ms overhead per call. + External proxies add 10-50ms of network round-trip latency per LLM call. 
+ +## When to use cascadeflow + +- You run AI agents (LangChain, LangGraph, CrewAI, OpenAI Agents SDK, Google ADK, or custom) +- You want to reduce LLM costs without changing agent code +- You need budget enforcement across multi-step agent runs +- You need to inject business KPIs (quality, cost, latency, energy) into agent decisions +- You need compliance-aware model gating (GDPR, HIPAA, PCI, strict) +- You want full trace recording for auditability and tuning + +## When NOT to use cascadeflow + +- Single one-off LLM calls (overhead not justified) +- You only use one model and don't want routing +- You need a hosted proxy service (cascadeflow is a library, not a SaaS) + +## Proxy vs cascadeflow + +| Dimension | External proxy | cascadeflow harness | +|--------------------|----------------------------|------------------------------| +| Scope | HTTP request boundary | Inside agent execution loop | +| Dimensions | Cost only | Cost + quality + latency + budget + compliance + energy | +| Latency overhead | 10-50ms network RTT | <1ms in-process | +| Business logic | None | KPI weights and targets | +| Enforcement | None (observe only) | stop, deny_tool, switch_model | +| Auditability | Request logs | Per-step decision traces | + +## Key APIs + +- cascadeflow.init(mode) -- activate harness globally (off | observe | enforce) +- cascadeflow.run(budget, max_tool_calls) -- scoped agent run with budget/limits +- @cascadeflow.agent(budget, kpis) -- annotate agent functions with policy metadata +- session.summary() -- structured run metrics (cost, latency, energy, steps, tool calls) +- session.trace() -- full decision trace for auditability + +## HarnessConfig Reference + +@dataclass +class HarnessConfig: + mode: HarnessMode # "off" | "observe" | "enforce". Default: "off" + verbose: bool # Print decisions to stderr. Default: False + budget: Optional[float] # Max USD for the run. Default: None (unlimited) + max_tool_calls: Optional[int] # Max tool/function calls. 
Default: None + max_latency_ms: Optional[float] # Max wall-clock ms per call. Default: None + max_energy: Optional[float] # Max energy units. Default: None + kpi_targets: Optional[dict] # {"quality": 0.9, "cost": 0.5, ...} + kpi_weights: Optional[dict] # {"quality": 0.6, "cost": 0.3, "latency": 0.1} + compliance: Optional[str] # "gdpr" | "hipaa" | "pci" | "strict" + +## Harness Modes + +- off: no tracking, no enforcement +- observe: track all metrics and decisions, never block execution (safe for production rollout) +- enforce: track + enforce budget/tool/latency/energy caps (stop or deny_tool actions) + +## Harness Dimensions + +- Cost: estimated USD from model pricing table (18 models, fuzzy resolution) +- Latency: wall-clock milliseconds per LLM call +- Energy: deterministic compute-intensity proxy coefficient +- Tool calls: count of tool/function calls executed +- Quality: model quality priors for KPI-weighted scoring + +## Decision Actions + +- allow: proceed normally +- switch_model: route to cheaper/better model (where runtime allows) +- deny_tool: block tool execution when tool call cap reached +- stop: halt agent loop when budget/latency/energy cap exceeded + +## Decision Trace Format + +Each decision produces a record with these fields: +- action: "allow" | "switch_model" | "deny_tool" | "stop" +- reason: human-readable explanation +- model: model name used for the call +- step: integer step number in the run +- cost_total: cumulative cost in USD at this step +- budget_state: "ok" | "warning" | "exceeded" +- applied: true if the action was enforced (false in observe mode) + +## Compliance Model Allowlists + +- gdpr: gpt-4o, gpt-4o-mini, gpt-3.5-turbo +- hipaa: gpt-4o, gpt-4o-mini +- pci: gpt-4o-mini, gpt-3.5-turbo +- strict: gpt-4o only + +## Integrations + +pip install cascadeflow[langchain] # LangChain/LangGraph callback handler +pip install cascadeflow[openai-agents] # OpenAI Agents SDK ModelProvider +pip install cascadeflow[crewai] # CrewAI llm_hooks 
integration +pip install cascadeflow[google-adk] # Google ADK BasePlugin + +npm install @cascadeflow/core # TypeScript core +npm install @cascadeflow/langchain # LangChain TypeScript +npm install @cascadeflow/vercel-ai # Vercel AI SDK middleware +npm install @cascadeflow/n8n-nodes-cascadeflow # n8n community node + +All integrations are opt-in. Install the extra and explicitly enable the integration. + +## Integration Code Snippets + +LangChain: + from cascadeflow.integrations.langchain import get_harness_callback + cb = get_harness_callback() + result = await model.ainvoke("query", config={"callbacks": [cb]}) + +OpenAI Agents SDK: + from cascadeflow.integrations.openai_agents import CascadeFlowModelProvider + provider = CascadeFlowModelProvider(model_candidates=["gpt-4o-mini", "gpt-4o"]) + +CrewAI: + from cascadeflow.integrations.crewai import enable + enable(budget_gate=True, fail_open=True) + +Google ADK: + from cascadeflow.integrations.google_adk import enable + plugin = enable(fail_open=True) + runner = Runner(agent=agent, plugins=[plugin]) + +## Pricing Table (USD per 1M tokens: input / output) + +OpenAI: + gpt-4o: $2.50 / $10.00 + gpt-4o-mini: $0.15 / $0.60 + gpt-5: $1.25 / $10.00 + gpt-5-mini: $0.20 / $0.80 + gpt-4-turbo: $10.00 / $30.00 + gpt-4: $30.00 / $60.00 + gpt-3.5-turbo: $0.50 / $1.50 + o1: $15.00 / $60.00 + o1-mini: $3.00 / $12.00 + o3-mini: $1.10 / $4.40 + +Anthropic: + claude-sonnet-4: $3.00 / $15.00 + claude-haiku-3.5: $1.00 / $5.00 + claude-opus-4.5: $5.00 / $25.00 + +Google: + gemini-2.5-flash: $0.15 / $0.60 + gemini-2.5-pro: $1.25 / $10.00 + gemini-2.0-flash: $0.10 / $0.40 + gemini-1.5-flash: $0.075 / $0.30 + gemini-1.5-pro: $1.25 / $5.00 + +## Energy Coefficients + +Model energy is computed as: energy_units = coeff * (input_tokens + output_tokens * 1.5) + + gpt-4o: 1.0 gpt-4o-mini: 0.3 gpt-5: 1.2 + gpt-5-mini: 0.35 gpt-4-turbo: 1.5 gpt-4: 1.5 + gpt-3.5-turbo: 0.2 o1: 2.0 o1-mini: 0.8 + o3-mini: 0.5 claude-sonnet-4: 1.0 claude-haiku-3.5: 0.3 
+ claude-opus-4.5: 1.8 gemini-2.5-flash: 0.3 gemini-2.5-pro: 1.2 + gemini-2.0-flash: 0.25 gemini-1.5-flash: 0.2 gemini-1.5-pro: 1.0 + +## Links + +- Docs: https://docs.cascadeflow.dev +- Source: https://github.com/lemony-ai/cascadeflow +- PyPI: pip install cascadeflow +- npm: npm install @cascadeflow/core diff --git a/packages/core/README.md b/packages/core/README.md index a0918d78..3188df91 100644 --- a/packages/core/README.md +++ b/packages/core/README.md @@ -33,6 +33,23 @@ pnpm add @cascadeflow/core yarn add @cascadeflow/core ``` +## Harness Quick Start (V2.1) + +```typescript +import { cascadeflow } from '@cascadeflow/core'; + +// 1) Turn on in-process harness decisions + SDK auto-instrumentation +cascadeflow.init({ mode: 'enforce', budget: 0.5 }); + +// 2) Scope one run (global defaults are inherited) +const result = await cascadeflow.run({ maxToolCalls: 8 }, async (run) => { + // Any OpenAI / Anthropic SDK calls made here are evaluated by the harness. + return { runId: run.runId }; +}); + +console.log(result); +``` + ## Quick Start ### Recommended Setup (Claude Haiku + GPT-5) diff --git a/packages/core/src/__tests__/harness.test.ts b/packages/core/src/__tests__/harness.test.ts new file mode 100644 index 00000000..bad03376 --- /dev/null +++ b/packages/core/src/__tests__/harness.test.ts @@ -0,0 +1,232 @@ +import { afterEach, describe, expect, it } from 'vitest'; + +import { + BudgetExceededError, + cascadeflow, + getCurrentRun, + getHarnessConfig, + init, + reset, + run, +} from '../harness'; +import { + __resetInstrumentationLoadersForTest, + __resetInstrumentationStateForTest, + __setInstrumentationLoadersForTest, + isAnthropicPatched, + isOpenAIPatched, +} from '../harness-instrument'; + +class FakeOpenAICompletions { + constructor(private readonly calls: Array>) {} + + create(request: Record): Promise> { + this.calls.push({ ...request }); + return Promise.resolve({ + usage: { + prompt_tokens: 100, + completion_tokens: 25, + }, + choices: [ + { + message: { 
+ tool_calls: [{ id: 'tool_1', type: 'function' }], + }, + }, + ], + }); + } +} + +class FakeAnthropicMessages { + constructor(private readonly calls: Array>) {} + + create(request: Record): Promise> { + this.calls.push({ ...request }); + return Promise.resolve({ + usage: { + input_tokens: 120, + output_tokens: 40, + }, + content: [ + { type: 'text', text: 'hello' }, + { type: 'tool_use', id: 'tool_1', name: 'search', input: { q: 'x' } }, + ], + }); + } +} + +afterEach(() => { + reset(); + __resetInstrumentationStateForTest(); + __resetInstrumentationLoadersForTest(); +}); + +describe('harness API (TypeScript parity)', () => { + it('exposes cascadeflow init/run object API', async () => { + expect(typeof cascadeflow.init).toBe('function'); + expect(typeof cascadeflow.run).toBe('function'); + + init({ mode: 'observe' }); + const value = await cascadeflow.run(async (scope) => { + expect(scope.mode).toBe('observe'); + expect(getCurrentRun()).toBe(scope); + return 42; + }); + + expect(value).toBe(42); + expect(getCurrentRun()).toBeNull(); + }); + + it('honors code > env precedence and preserves nested scope isolation', async () => { + const previousMode = process.env.CASCADEFLOW_HARNESS_MODE; + process.env.CASCADEFLOW_HARNESS_MODE = 'observe'; + + init(); + expect(getHarnessConfig().mode).toBe('observe'); + + init({ mode: 'enforce' }); + expect(getHarnessConfig().mode).toBe('enforce'); + + await run({ budget: 1.0 }, async (outer) => { + outer.cost = 0.1; + expect(outer.budgetMax).toBe(1.0); + expect(getCurrentRun()).toBe(outer); + + await run({ budget: 0.25 }, async (inner) => { + expect(getCurrentRun()).toBe(inner); + expect(inner.budgetMax).toBe(0.25); + inner.cost = 0.2; + }); + + expect(getCurrentRun()).toBe(outer); + expect(outer.budgetMax).toBe(1.0); + expect(outer.cost).toBe(0.1); + }); + + if (previousMode == null) { + delete process.env.CASCADEFLOW_HARNESS_MODE; + } else { + process.env.CASCADEFLOW_HARNESS_MODE = previousMode; + } + }); + + it('auto-instruments 
OpenAI and enforces switch_model decisions', async () => { + const openaiCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => ({ + Completions: FakeOpenAICompletions, + }), + anthropic: () => null, + }); + + init({ mode: 'enforce' }); + expect(isOpenAIPatched()).toBe(true); + + await run({ kpiWeights: { cost: 1 } }, async (scope) => { + const client = new FakeOpenAICompletions(openaiCalls); + await client.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + + expect(scope.stepCount).toBe(1); + expect(scope.cost).toBeGreaterThan(0); + expect(scope.toolCalls).toBe(1); + + const trace = scope.trace(); + expect(trace).toHaveLength(1); + expect(trace[0]?.action).toBe('switch_model'); + expect(trace[0]?.applied).toBe(true); + expect(trace[0]?.decisionMode).toBe('enforce'); + }); + + expect(openaiCalls).toHaveLength(1); + expect(openaiCalls[0]?.model).not.toBe('gpt-4o'); + }); + + it('observe mode logs non-allow decisions without mutating request', async () => { + const openaiCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => ({ + Completions: FakeOpenAICompletions, + }), + anthropic: () => null, + }); + + init({ mode: 'observe' }); + + await run({ kpiWeights: { cost: 1 } }, async (scope) => { + const client = new FakeOpenAICompletions(openaiCalls); + await client.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + + const trace = scope.trace(); + expect(trace).toHaveLength(1); + expect(trace[0]?.action).toBe('switch_model'); + expect(trace[0]?.applied).toBe(false); + expect(trace[0]?.decisionMode).toBe('observe'); + }); + + expect(openaiCalls).toHaveLength(1); + expect(openaiCalls[0]?.model).toBe('gpt-4o'); + }); + + it('enforce mode stops calls when budget is exhausted', async () => { + const openaiCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => ({ + Completions: FakeOpenAICompletions, + }), + anthropic: () => null, + }); + + init({ 
mode: 'enforce' }); + + await expect( + run({ budget: 0 }, async () => { + const client = new FakeOpenAICompletions(openaiCalls); + await client.create({ + model: 'gpt-4o', + messages: [{ role: 'user', content: 'hi' }], + }); + }), + ).rejects.toBeInstanceOf(BudgetExceededError); + + expect(openaiCalls).toHaveLength(0); + }); + + it('auto-instruments Anthropic and tracks usage/tool calls', async () => { + const anthropicCalls: Array> = []; + + __setInstrumentationLoadersForTest({ + openai: () => null, + anthropic: () => ({ + Messages: FakeAnthropicMessages, + }), + }); + + init({ mode: 'enforce' }); + expect(isAnthropicPatched()).toBe(true); + + await run(async (scope) => { + const client = new FakeAnthropicMessages(anthropicCalls); + await client.create({ + model: 'claude-sonnet-4-5-20250929', + messages: [{ role: 'user', content: 'hello' }], + }); + + expect(scope.stepCount).toBe(1); + expect(scope.toolCalls).toBe(1); + expect(scope.cost).toBeGreaterThan(0); + expect(scope.trace()[0]?.action).toBe('allow'); + }); + + expect(anthropicCalls).toHaveLength(1); + }); +}); diff --git a/packages/core/src/harness-instrument.ts b/packages/core/src/harness-instrument.ts new file mode 100644 index 00000000..901af4ae --- /dev/null +++ b/packages/core/src/harness-instrument.ts @@ -0,0 +1,746 @@ +type Action = 'allow' | 'switch_model' | 'deny_tool' | 'stop'; + +type CreateFunction = (this: any, ...args: any[]) => any; + +type OpenAIModuleLike = { + Completions?: { + prototype?: { + create?: CreateFunction; + }; + }; +}; + +type AnthropicModuleLike = { + Messages?: { + prototype?: { + create?: CreateFunction; + }; + }; +}; + +type Pricing = { input: number; output: number }; + +type PreCallDecision = { + action: Action; + reason: string; + targetModel: string; +}; + +type HarnessRuntime = { + getCurrentRun: () => HarnessRunContextLike | null; + getHarnessMode: () => HarnessModeLike; + createBudgetExceededError: (message: string, remaining?: number) => Error; + 
createHarnessStopError: (message: string, reason?: string) => Error; +}; + +type HarnessModeLike = 'off' | 'observe' | 'enforce'; + +type HarnessRunContextLike = { + mode: HarnessModeLike; + cost: number; + stepCount: number; + toolCalls: number; + latencyUsedMs: number; + energyUsed: number; + budgetMax?: number; + budgetRemaining?: number; + toolCallsMax?: number; + latencyMaxMs?: number; + energyMax?: number; + compliance?: string; + kpiWeights?: Record; + record: ( + action: string, + reason: string, + model?: string, + options?: { + applied?: boolean; + decisionMode?: HarnessModeLike; + }, + ) => void; +}; + +const MODEL_PRICING_PER_MILLION: Record = { + // OpenAI + 'gpt-5': { input: 1.25, output: 10.0 }, + 'gpt-5-mini': { input: 0.25, output: 2.0 }, + 'gpt-5-nano': { input: 0.05, output: 0.4 }, + 'gpt-4o': { input: 2.5, output: 10.0 }, + 'gpt-4o-mini': { input: 0.15, output: 0.6 }, + 'o1': { input: 15.0, output: 60.0 }, + 'o1-mini': { input: 3.0, output: 12.0 }, + 'o3-mini': { input: 1.0, output: 5.0 }, + + // Anthropic + 'claude-opus-4-5-20251101': { input: 15.0, output: 75.0 }, + 'claude-opus-4-20250514': { input: 15.0, output: 75.0 }, + 'claude-sonnet-4-5-20250929': { input: 3.0, output: 15.0 }, + 'claude-sonnet-4-20250514': { input: 3.0, output: 15.0 }, + 'claude-haiku-4-5-20251001': { input: 1.0, output: 5.0 }, + 'claude-3-5-haiku-20241022': { input: 1.0, output: 5.0 }, +}; + +const ENERGY_COEFFICIENTS: Record = { + 'gpt-5': 1.15, + 'gpt-5-mini': 0.72, + 'gpt-5-nano': 0.45, + 'gpt-4o': 1.0, + 'gpt-4o-mini': 0.55, + 'o1': 1.25, + 'o1-mini': 0.85, + 'o3-mini': 0.75, + 'claude-opus-4-5-20251101': 1.2, + 'claude-opus-4-20250514': 1.15, + 'claude-sonnet-4-5-20250929': 0.95, + 'claude-sonnet-4-20250514': 0.92, + 'claude-haiku-4-5-20251001': 0.7, + 'claude-3-5-haiku-20241022': 0.68, +}; + +const LATENCY_PRIORS: Record = { + 'gpt-5': 0.45, + 'gpt-5-mini': 0.72, + 'gpt-5-nano': 0.9, + 'gpt-4o': 0.58, + 'gpt-4o-mini': 0.82, + 'o1': 0.35, + 'o1-mini': 0.62, + 
'o3-mini': 0.7, + 'claude-opus-4-5-20251101': 0.4, + 'claude-opus-4-20250514': 0.44, + 'claude-sonnet-4-5-20250929': 0.6, + 'claude-sonnet-4-20250514': 0.63, + 'claude-haiku-4-5-20251001': 0.85, + 'claude-3-5-haiku-20241022': 0.86, +}; + +const QUALITY_PRIORS: Record = { + 'gpt-5': 0.95, + 'gpt-5-mini': 0.86, + 'gpt-5-nano': 0.74, + 'gpt-4o': 0.9, + 'gpt-4o-mini': 0.82, + 'o1': 0.93, + 'o1-mini': 0.84, + 'o3-mini': 0.86, + 'claude-opus-4-5-20251101': 0.94, + 'claude-opus-4-20250514': 0.92, + 'claude-sonnet-4-5-20250929': 0.9, + 'claude-sonnet-4-20250514': 0.88, + 'claude-haiku-4-5-20251001': 0.82, + 'claude-3-5-haiku-20241022': 0.8, +}; + +const COMPLIANCE_ALLOWLISTS: Record> = { + strict: new Set(['gpt-4o', 'gpt-4o-mini', 'claude-sonnet-4-5-20250929', 'claude-haiku-4-5-20251001']), + regulated: new Set(['gpt-4o', 'claude-sonnet-4-5-20250929']), +}; + +const DEFAULT_ENERGY_COEFFICIENT = 0.9; +const DEFAULT_OUTPUT_WEIGHT = 1.5; + +const PRICING_MODELS = Object.keys(MODEL_PRICING_PER_MILLION); + +let openAIPatched = false; +let anthropicPatched = false; + +let originalOpenAICreate: CreateFunction | null = null; +let originalAnthropicCreate: CreateFunction | null = null; +let patchedOpenAIClass: { prototype?: { create?: CreateFunction } } | null = null; +let patchedAnthropicClass: { prototype?: { create?: CreateFunction } } | null = null; + +const defaultOpenAILoader = (): OpenAIModuleLike | null => { + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + return require('openai/resources/chat/completions') as OpenAIModuleLike; + } catch { + return null; + } +}; + +const defaultAnthropicLoader = (): AnthropicModuleLike | null => { + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + return require('@anthropic-ai/sdk/resources/messages') as AnthropicModuleLike; + } catch { + return null; + } +}; + +let loadOpenAIModule = defaultOpenAILoader; +let loadAnthropicModule = defaultAnthropicLoader; +let harnessRuntimeBindings: 
HarnessRuntime | null = null; + +function getHarnessRuntime(): HarnessRuntime { + if (!harnessRuntimeBindings) { + throw new Error('Harness runtime bindings not configured'); + } + return harnessRuntimeBindings; +} + +export function setHarnessRuntimeBindingsForInstrumentation(bindings: HarnessRuntime): void { + harnessRuntimeBindings = bindings; +} + +function nowMonotonicMs(): number { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof globalThis !== 'undefined' && (globalThis as any).performance?.now) { + return (globalThis as any).performance.now() as number; + } + + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process !== 'undefined' && process.hrtime?.bigint) { + return Number(process.hrtime.bigint()) / 1_000_000; + } + + return Date.now(); +} + +function normalizeModelName(model: string): string { + return model.trim().toLowerCase(); +} + +function estimateCost(model: string, promptTokens: number, completionTokens: number): number { + const price = MODEL_PRICING_PER_MILLION[normalizeModelName(model)]; + if (!price) { + return 0; + } + + return (promptTokens / 1_000_000) * price.input + (completionTokens / 1_000_000) * price.output; +} + +function estimateEnergy(model: string, promptTokens: number, completionTokens: number): number { + const coefficient = ENERGY_COEFFICIENTS[normalizeModelName(model)] ?? 
DEFAULT_ENERGY_COEFFICIENT; + return coefficient * (promptTokens + completionTokens * DEFAULT_OUTPUT_WEIGHT) / 1000; +} + +function modelTotalCost(model: string): number { + const price = MODEL_PRICING_PER_MILLION[normalizeModelName(model)]; + if (!price) { + return Number.POSITIVE_INFINITY; + } + return price.input + price.output; +} + +function selectCheaperModel(currentModel: string): string { + const currentCost = modelTotalCost(currentModel); + let bestModel = currentModel; + let bestCost = currentCost; + + for (const candidate of PRICING_MODELS) { + const candidateCost = modelTotalCost(candidate); + if (candidateCost < bestCost) { + bestModel = candidate; + bestCost = candidateCost; + } + } + + return bestModel; +} + +function selectLowerEnergyModel(currentModel: string): string { + const currentCoeff = ENERGY_COEFFICIENTS[normalizeModelName(currentModel)] ?? DEFAULT_ENERGY_COEFFICIENT; + let bestModel = currentModel; + let bestCoeff = currentCoeff; + + for (const candidate of PRICING_MODELS) { + const coeff = ENERGY_COEFFICIENTS[candidate] ?? DEFAULT_ENERGY_COEFFICIENT; + if (coeff < bestCoeff) { + bestModel = candidate; + bestCoeff = coeff; + } + } + + return bestModel; +} + +function selectFasterModel(currentModel: string): string { + const currentLatency = LATENCY_PRIORS[normalizeModelName(currentModel)] ?? 0.7; + let bestModel = currentModel; + let bestLatency = currentLatency; + + for (const candidate of PRICING_MODELS) { + const score = LATENCY_PRIORS[candidate] ?? 
0.7; + if (score > bestLatency) { + bestModel = candidate; + bestLatency = score; + } + } + + return bestModel; +} + +function normalizeWeights(weights: Record): Record { + const normalized: Record = {}; + let total = 0; + + for (const [key, value] of Object.entries(weights)) { + if (!Number.isFinite(value) || value <= 0) { + continue; + } + normalized[key] = value; + total += value; + } + + if (total <= 0) { + return {}; + } + + for (const key of Object.keys(normalized)) { + normalized[key] /= total; + } + + return normalized; +} + +function costUtility(model: string): number { + const costs = PRICING_MODELS.map(modelTotalCost).filter(Number.isFinite); + const min = Math.min(...costs); + const max = Math.max(...costs); + const current = modelTotalCost(model); + + if (!Number.isFinite(current) || max === min) { + return 0.5; + } + + return (max - current) / (max - min); +} + +function energyUtility(model: string): number { + const coeffs = PRICING_MODELS.map((name) => ENERGY_COEFFICIENTS[name] ?? DEFAULT_ENERGY_COEFFICIENT); + const min = Math.min(...coeffs); + const max = Math.max(...coeffs); + const current = ENERGY_COEFFICIENTS[normalizeModelName(model)] ?? DEFAULT_ENERGY_COEFFICIENT; + + if (max === min) { + return 0.5; + } + + return (max - current) / (max - min); +} + +function kpiScore(model: string, weights: Record): number { + const normalized = normalizeWeights(weights); + if (Object.keys(normalized).length === 0) { + return 0; + } + + const key = normalizeModelName(model); + const quality = QUALITY_PRIORS[key] ?? 0.7; + const latency = LATENCY_PRIORS[key] ?? 0.7; + const cost = costUtility(key); + const energy = energyUtility(key); + + return ( + (normalized.quality ?? 0) * quality + + (normalized.latency ?? 0) * latency + + (normalized.cost ?? 0) * cost + + (normalized.energy ?? 
0) * energy + ); +} + +function selectKPIWeightedModel(currentModel: string, weights: Record): string { + const normalized = normalizeWeights(weights); + if (Object.keys(normalized).length === 0) { + return currentModel; + } + + let bestModel = currentModel; + let bestScore = kpiScore(currentModel, normalized); + + for (const candidate of PRICING_MODELS) { + const score = kpiScore(candidate, normalized); + if (score > bestScore) { + bestModel = candidate; + bestScore = score; + } + } + + return bestModel; +} + +function extractOpenAIUsage(response: any): [number, number] { + const usage = response?.usage; + if (!usage || typeof usage !== 'object') { + return [0, 0]; + } + const promptTokens = Number(usage.prompt_tokens ?? usage.input_tokens ?? 0); + const completionTokens = Number(usage.completion_tokens ?? usage.output_tokens ?? 0); + return [ + Number.isFinite(promptTokens) ? promptTokens : 0, + Number.isFinite(completionTokens) ? completionTokens : 0, + ]; +} + +function extractAnthropicUsage(response: any): [number, number] { + const usage = response?.usage; + if (!usage || typeof usage !== 'object') { + return [0, 0]; + } + + const inputTokens = Number(usage.input_tokens ?? usage.prompt_tokens ?? 0); + const outputTokens = Number(usage.output_tokens ?? usage.completion_tokens ?? 0); + return [ + Number.isFinite(inputTokens) ? inputTokens : 0, + Number.isFinite(outputTokens) ? 
outputTokens : 0, + ]; +} + +function countOpenAIToolCalls(response: any): number { + const toolCalls = response?.choices?.[0]?.message?.tool_calls; + if (!Array.isArray(toolCalls)) { + return 0; + } + return toolCalls.length; +} + +function countAnthropicToolCalls(response: any): number { + const content = response?.content; + if (!Array.isArray(content)) { + return 0; + } + return content.filter((item: any) => item?.type === 'tool_use').length; +} + +function evaluatePreCallDecision(ctx: HarnessRunContextLike, model: string, hasTools: boolean): PreCallDecision { + if (ctx.budgetMax != null && ctx.cost >= ctx.budgetMax) { + return { action: 'stop', reason: 'budget_exceeded', targetModel: model }; + } + + if (hasTools && ctx.toolCallsMax != null && ctx.toolCalls >= ctx.toolCallsMax) { + return { action: 'deny_tool', reason: 'max_tool_calls_reached', targetModel: model }; + } + + if (ctx.compliance) { + const profile = COMPLIANCE_ALLOWLISTS[ctx.compliance.trim().toLowerCase()]; + if (profile) { + const normalized = normalizeModelName(model); + if (!profile.has(normalized)) { + const next = PRICING_MODELS.find((candidate) => profile.has(candidate)); + if (next) { + return { action: 'switch_model', reason: 'compliance_model_policy', targetModel: next }; + } + return { + action: hasTools ? 'deny_tool' : 'stop', + reason: hasTools ? 
'compliance_no_approved_tool_path' : 'compliance_no_approved_model', + targetModel: model, + }; + } + if (ctx.compliance.trim().toLowerCase() === 'strict' && hasTools) { + return { action: 'deny_tool', reason: 'compliance_tool_restriction', targetModel: model }; + } + } + } + + if (ctx.latencyMaxMs != null && ctx.latencyUsedMs >= ctx.latencyMaxMs) { + const faster = selectFasterModel(model); + if (normalizeModelName(faster) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'latency_limit_exceeded', targetModel: faster }; + } + return { action: 'stop', reason: 'latency_limit_exceeded', targetModel: model }; + } + + if (ctx.energyMax != null && ctx.energyUsed >= ctx.energyMax) { + const lower = selectLowerEnergyModel(model); + if (normalizeModelName(lower) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'energy_limit_exceeded', targetModel: lower }; + } + return { action: 'stop', reason: 'energy_limit_exceeded', targetModel: model }; + } + + if ( + ctx.budgetMax != null + && ctx.budgetMax > 0 + && ctx.budgetRemaining != null + && (ctx.budgetRemaining / ctx.budgetMax) < 0.2 + ) { + const cheaper = selectCheaperModel(model); + if (normalizeModelName(cheaper) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'budget_pressure', targetModel: cheaper }; + } + } + + if (ctx.kpiWeights && Object.keys(ctx.kpiWeights).length > 0) { + const candidate = selectKPIWeightedModel(model, ctx.kpiWeights); + if (normalizeModelName(candidate) !== normalizeModelName(model)) { + return { action: 'switch_model', reason: 'kpi_weight_optimization', targetModel: candidate }; + } + } + + return { action: 'allow', reason: ctx.mode, targetModel: model }; +} + +function raiseStopError(ctx: HarnessRunContextLike, reason: string): never { + const runtime = getHarnessRuntime(); + if (reason === 'budget_exceeded') { + const remaining = Math.max(0, (ctx.budgetMax ?? 
0) - ctx.cost); + throw runtime.createBudgetExceededError( + `Budget exhausted: spent $${ctx.cost.toFixed(4)} of $${(ctx.budgetMax ?? 0).toFixed(4)} max`, + remaining, + ); + } + + throw runtime.createHarnessStopError(`cascadeflow harness stop: ${reason}`, reason); +} + +function updateContext( + ctx: HarnessRunContextLike, + mode: HarnessModeLike, + model: string, + promptTokens: number, + completionTokens: number, + toolCalls: number, + elapsedMs: number, + decision: PreCallDecision, + applied: boolean, +): void { + const cost = estimateCost(model, promptTokens, completionTokens); + const energy = estimateEnergy(model, promptTokens, completionTokens); + + ctx.cost += cost; + ctx.stepCount += 1; + ctx.toolCalls += toolCalls; + ctx.latencyUsedMs += elapsedMs; + ctx.energyUsed += energy; + + if (ctx.budgetMax != null) { + ctx.budgetRemaining = ctx.budgetMax - ctx.cost; + } + + ctx.record(decision.action, decision.reason, decision.targetModel, { + applied, + decisionMode: mode, + }); +} + +function isThenable(value: any): value is Promise { + return Boolean(value) && typeof value.then === 'function'; +} + +function makePatchedCreate(provider: 'openai' | 'anthropic', original: CreateFunction): CreateFunction { + return function patchedCreate(this: any, ...args: any[]): any { + const runtime = getHarnessRuntime(); + const activeRun = runtime.getCurrentRun(); + const mode = activeRun?.mode ?? runtime.getHarnessMode(); + + if (mode === 'off') { + return original.apply(this, args); + } + + const firstArg = args[0]; + const request = firstArg && typeof firstArg === 'object' ? { ...firstArg } : {}; + const model = typeof request.model === 'string' ? request.model : 'unknown'; + const hasTools = Array.isArray(request.tools) && request.tools.length > 0; + + const decision = activeRun ? 
evaluatePreCallDecision(activeRun, model, hasTools) : { + action: 'allow' as const, + reason: mode, + targetModel: model, + }; + + let applied = decision.action === 'allow'; + let effectiveModel = model; + + if (activeRun && mode === 'enforce') { + if (decision.action === 'stop') { + activeRun.record('stop', decision.reason, model, { + applied: true, + decisionMode: mode, + }); + raiseStopError(activeRun, decision.reason); + } + + if (decision.action === 'switch_model') { + if (normalizeModelName(decision.targetModel) !== normalizeModelName(model)) { + request.model = decision.targetModel; + effectiveModel = decision.targetModel; + applied = true; + } else { + applied = false; + } + } + + if (decision.action === 'deny_tool') { + if (Array.isArray(request.tools) && request.tools.length > 0) { + request.tools = []; + applied = true; + } else { + applied = false; + } + } + } else if (decision.action !== 'allow') { + applied = false; + } + + const interceptedArgs = firstArg && typeof firstArg === 'object' + ? 
[request, ...args.slice(1)] + : args; + + const isStream = Boolean(request.stream); + const startedAt = nowMonotonicMs(); + const result = original.apply(this, interceptedArgs); + + if (!activeRun) { + return result; + } + + const finalize = (response: any): any => { + const elapsedMs = Math.max(0, nowMonotonicMs() - startedAt); + + let promptTokens = 0; + let completionTokens = 0; + let toolCallCount = 0; + + if (!isStream) { + if (provider === 'openai') { + [promptTokens, completionTokens] = extractOpenAIUsage(response); + toolCallCount = countOpenAIToolCalls(response); + } else { + [promptTokens, completionTokens] = extractAnthropicUsage(response); + toolCallCount = countAnthropicToolCalls(response); + } + } + + updateContext( + activeRun, + mode, + effectiveModel, + promptTokens, + completionTokens, + toolCallCount, + elapsedMs, + decision, + applied, + ); + + return response; + }; + + if (isThenable(result)) { + result + .then((response) => { + finalize(response); + }) + .catch(() => { + // fail-open: harness instrumentation errors must not crash user flow. 
+ }); + return result; + } + + return finalize(result); + }; +} + +export function detectOpenAIInstrumentationTarget(): boolean { + const module = loadOpenAIModule(); + return Boolean(module?.Completions?.prototype?.create); +} + +export function detectAnthropicInstrumentationTarget(): boolean { + const module = loadAnthropicModule(); + return Boolean(module?.Messages?.prototype?.create); +} + +export function patchOpenAI(): boolean { + if (openAIPatched) { + return true; + } + + const module = loadOpenAIModule(); + const cls = module?.Completions; + const prototype = cls?.prototype; + const create = prototype?.create; + + if (!cls || !prototype || typeof create !== 'function') { + return false; + } + + originalOpenAICreate = create; + patchedOpenAIClass = cls; + prototype.create = makePatchedCreate('openai', create); + openAIPatched = true; + return true; +} + +export function patchAnthropic(): boolean { + if (anthropicPatched) { + return true; + } + + const module = loadAnthropicModule(); + const cls = module?.Messages; + const prototype = cls?.prototype; + const create = prototype?.create; + + if (!cls || !prototype || typeof create !== 'function') { + return false; + } + + originalAnthropicCreate = create; + patchedAnthropicClass = cls; + prototype.create = makePatchedCreate('anthropic', create); + anthropicPatched = true; + return true; +} + +export function unpatchOpenAI(): void { + if (!openAIPatched) { + return; + } + + if (patchedOpenAIClass?.prototype && originalOpenAICreate) { + patchedOpenAIClass.prototype.create = originalOpenAICreate; + } + + openAIPatched = false; + originalOpenAICreate = null; + patchedOpenAIClass = null; +} + +export function unpatchAnthropic(): void { + if (!anthropicPatched) { + return; + } + + if (patchedAnthropicClass?.prototype && originalAnthropicCreate) { + patchedAnthropicClass.prototype.create = originalAnthropicCreate; + } + + anthropicPatched = false; + originalAnthropicCreate = null; + patchedAnthropicClass = null; +} + 
+export function isOpenAIPatched(): boolean { + return openAIPatched; +} + +export function isAnthropicPatched(): boolean { + return anthropicPatched; +} + +export function isPatched(): boolean { + return openAIPatched || anthropicPatched; +} + +export function __setInstrumentationLoadersForTest(loaders: { + openai?: () => OpenAIModuleLike | null; + anthropic?: () => AnthropicModuleLike | null; +}): void { + if (loaders.openai) { + loadOpenAIModule = loaders.openai; + } + if (loaders.anthropic) { + loadAnthropicModule = loaders.anthropic; + } +} + +export function __resetInstrumentationLoadersForTest(): void { + loadOpenAIModule = defaultOpenAILoader; + loadAnthropicModule = defaultAnthropicLoader; +} + +export function __resetInstrumentationStateForTest(): void { + unpatchOpenAI(); + unpatchAnthropic(); +} diff --git a/packages/core/src/harness.ts b/packages/core/src/harness.ts new file mode 100644 index 00000000..3815360e --- /dev/null +++ b/packages/core/src/harness.ts @@ -0,0 +1,754 @@ +import { + __resetInstrumentationStateForTest, + detectAnthropicInstrumentationTarget, + detectOpenAIInstrumentationTarget, + patchAnthropic, + patchOpenAI, + setHarnessRuntimeBindingsForInstrumentation, + unpatchAnthropic, + unpatchOpenAI, +} from './harness-instrument'; + +export type HarnessMode = 'off' | 'observe' | 'enforce'; + +export type HarnessConfig = { + mode: HarnessMode; + verbose: boolean; + budget?: number; + maxToolCalls?: number; + maxLatencyMs?: number; + maxEnergy?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; +}; + +export type HarnessInitOptions = Partial; + +export type HarnessRunOptions = { + budget?: number; + maxToolCalls?: number; + maxLatencyMs?: number; + maxEnergy?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; +}; + +export type HarnessInitReport = { + mode: HarnessMode; + instrumented: string[]; + detectedButNotInstrumented: string[]; + configSources: Record; +}; + +export type 
HarnessRecordOptions = { + applied?: boolean; + decisionMode?: HarnessMode; +}; + +export type HarnessTraceEntry = { + action: string; + reason: string; + model?: string; + runId: string; + mode: HarnessMode; + step: number; + timestampMs: number; + toolCallsTotal: number; + costTotal: number; + latencyUsedMs: number; + energyUsed: number; + budgetState: { + max?: number; + remaining?: number; + }; + applied?: boolean; + decisionMode?: HarnessMode; +}; + +export type HarnessRunSummary = { + runId: string; + mode: HarnessMode; + stepCount: number; + toolCalls: number; + cost: number; + savings: number; + latencyUsedMs: number; + energyUsed: number; + budgetMax?: number; + budgetRemaining?: number; + lastAction: string; + modelUsed?: string; + durationMs?: number; +}; + +export class HarnessStopError extends Error { + reason: string; + + constructor(message: string, reason = 'stop') { + super(message); + this.name = 'HarnessStopError'; + this.reason = reason; + } +} + +export class BudgetExceededError extends HarnessStopError { + remaining: number; + + constructor(message: string, remaining = 0) { + super(message, 'budget_exceeded'); + this.name = 'BudgetExceededError'; + this.remaining = remaining; + } +} + +function randomRunId(): string { + return Math.random().toString(36).slice(2, 14); +} + +function nowMonotonicMs(): number { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof globalThis !== 'undefined' && (globalThis as any).performance?.now) { + return (globalThis as any).performance.now() as number; + } + + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process !== 'undefined' && process.hrtime?.bigint) { + return Number(process.hrtime.bigint()) / 1_000_000; + } + + return Date.now(); +} + +const MAX_ACTION_LEN = 64; +const MAX_REASON_LEN = 160; +const MAX_MODEL_LEN = 128; + +function sanitizeTraceValue(value: unknown, maxLength: number): string | undefined { + if (value == null) { + 
return undefined; + } + + const text = String(value).replace(/\r?\n/g, ' ').trim(); + if (!text) { + return undefined; + } + + if (text.length <= maxLength) { + return text; + } + + return `${text.slice(0, Math.max(0, maxLength - 3))}...`; +} + +export class HarnessRunContext { + runId: string; + startedAtMs: number; + endedAtMs?: number; + durationMs?: number; + + mode: HarnessMode; + budgetMax?: number; + toolCallsMax?: number; + latencyMaxMs?: number; + energyMax?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; + + cost = 0; + savings = 0; + toolCalls = 0; + stepCount = 0; + latencyUsedMs = 0; + energyUsed = 0; + verbose = false; + budgetRemaining?: number; + modelUsed?: string; + lastAction = 'allow'; + draftAccepted?: boolean; + + private readonly _startedMonotonic: number; + private readonly _trace: HarnessTraceEntry[] = []; + private _finalized = false; + + constructor(config: { + mode: HarnessMode; + budgetMax?: number; + toolCallsMax?: number; + latencyMaxMs?: number; + energyMax?: number; + kpiTargets?: Record; + kpiWeights?: Record; + compliance?: string; + verbose?: boolean; + }) { + this.runId = randomRunId(); + this.startedAtMs = Date.now(); + this._startedMonotonic = nowMonotonicMs(); + + this.mode = config.mode; + this.budgetMax = config.budgetMax; + this.toolCallsMax = config.toolCallsMax; + this.latencyMaxMs = config.latencyMaxMs; + this.energyMax = config.energyMax; + this.kpiTargets = config.kpiTargets; + this.kpiWeights = config.kpiWeights; + this.compliance = config.compliance; + this.verbose = Boolean(config.verbose); + + if (config.budgetMax != null) { + this.budgetRemaining = config.budgetMax; + } + } + + finish(): void { + if (this._finalized) { + return; + } + + this._finalized = true; + this.endedAtMs = Date.now(); + this.durationMs = Math.max(0, nowMonotonicMs() - this._startedMonotonic); + + if (this.verbose && this.mode !== 'off' && this.stepCount > 0) { + // Keep logging cheap and controlled. 
+ // eslint-disable-next-line no-console + console.info( + '[cascadeflow.harness] run summary', + { + runId: this.runId, + mode: this.mode, + steps: this.stepCount, + toolCalls: this.toolCalls, + cost: this.cost, + latencyMs: this.latencyUsedMs, + energy: this.energyUsed, + lastAction: this.lastAction, + model: this.modelUsed, + budgetRemaining: this.budgetRemaining, + durationMs: this.durationMs, + }, + ); + } + } + + record(action: string, reason: string, model?: string, options: HarnessRecordOptions = {}): void { + let safeAction = sanitizeTraceValue(action, MAX_ACTION_LEN); + if (!safeAction) { + safeAction = 'allow'; + } + + const safeReason = sanitizeTraceValue(reason, MAX_REASON_LEN) ?? 'unspecified'; + const safeModel = sanitizeTraceValue(model, MAX_MODEL_LEN); + + this.lastAction = safeAction; + this.modelUsed = safeModel; + + const entry: HarnessTraceEntry = { + action: safeAction, + reason: safeReason, + model: safeModel, + runId: this.runId, + mode: this.mode, + step: this.stepCount, + timestampMs: Date.now(), + toolCallsTotal: this.toolCalls, + costTotal: this.cost, + latencyUsedMs: this.latencyUsedMs, + energyUsed: this.energyUsed, + budgetState: { + max: this.budgetMax, + remaining: this.budgetRemaining, + }, + }; + + if (options.applied != null) { + entry.applied = options.applied; + } + + if (options.decisionMode != null) { + entry.decisionMode = options.decisionMode; + } + + this._trace.push(entry); + } + + trace(): HarnessTraceEntry[] { + return [...this._trace]; + } + + summary(): HarnessRunSummary { + return { + runId: this.runId, + mode: this.mode, + stepCount: this.stepCount, + toolCalls: this.toolCalls, + cost: this.cost, + savings: this.savings, + latencyUsedMs: this.latencyUsedMs, + energyUsed: this.energyUsed, + budgetMax: this.budgetMax, + budgetRemaining: this.budgetRemaining, + lastAction: this.lastAction, + modelUsed: this.modelUsed, + durationMs: this.durationMs, + }; + } +} + +type ConfigSource = 'code' | 'env' | 'file' | 'default'; 
+ +type ConfigWithSources = { + config: HarnessConfig; + sources: Record; +}; + +let _harnessConfig: HarnessConfig = { + mode: 'off', + verbose: false, +}; + +let _isInstrumented = false; +let fallbackCurrentRun: HarnessRunContext | null = null; + +let asyncLocalStorageInstance: { run: (store: HarnessRunContext, callback: () => Promise) => Promise; getStore: () => HarnessRunContext | undefined } | null = null; + +function getAsyncLocalStorage(): typeof asyncLocalStorageInstance { + if (asyncLocalStorageInstance) { + return asyncLocalStorageInstance; + } + + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const mod = require('node:async_hooks') as { + AsyncLocalStorage: new () => { run: (store: T, callback: () => Promise) => Promise; getStore: () => T | undefined }; + }; + + asyncLocalStorageInstance = new mod.AsyncLocalStorage(); + } catch { + asyncLocalStorageInstance = null; + } + + return asyncLocalStorageInstance; +} + +function parseBoolean(raw: string): boolean { + const normalized = raw.trim().toLowerCase(); + return normalized === '1' || normalized === 'true' || normalized === 'yes' || normalized === 'on'; +} + +function parseNumber(raw: string): number { + const value = Number(raw); + if (!Number.isFinite(value)) { + throw new Error(`Invalid numeric value: ${raw}`); + } + return value; +} + +function parseJSONMap(raw: string): Record { + const parsed = JSON.parse(raw); + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + throw new Error('Expected object'); + } + + const result: Record = {}; + for (const [key, value] of Object.entries(parsed as Record)) { + result[String(key)] = Number(value); + } + return result; +} + +function normalizeMode(mode: unknown): HarnessMode { + if (mode === 'off' || mode === 'observe' || mode === 'enforce') { + return mode; + } + + throw new Error('mode must be one of: off, observe, enforce'); +} + +function normalizeConfigRecord(raw: Record): HarnessInitOptions { + const out: 
HarnessInitOptions = {}; + + const mode = raw.mode ?? raw.harness_mode; + if (typeof mode === 'string') { + out.mode = normalizeMode(mode); + } + + const verbose = raw.verbose ?? raw.harness_verbose; + if (typeof verbose === 'boolean') { + out.verbose = verbose; + } + + const budget = raw.budget ?? raw.max_budget; + if (typeof budget === 'number') { + out.budget = budget; + } + + const maxToolCalls = raw.maxToolCalls ?? raw.max_tool_calls; + if (typeof maxToolCalls === 'number') { + out.maxToolCalls = maxToolCalls; + } + + const maxLatencyMs = raw.maxLatencyMs ?? raw.max_latency_ms; + if (typeof maxLatencyMs === 'number') { + out.maxLatencyMs = maxLatencyMs; + } + + const maxEnergy = raw.maxEnergy ?? raw.max_energy; + if (typeof maxEnergy === 'number') { + out.maxEnergy = maxEnergy; + } + + const kpiTargets = raw.kpiTargets ?? raw.kpi_targets; + if (kpiTargets && typeof kpiTargets === 'object' && !Array.isArray(kpiTargets)) { + out.kpiTargets = kpiTargets as Record; + } + + const kpiWeights = raw.kpiWeights ?? raw.kpi_weights; + if (kpiWeights && typeof kpiWeights === 'object' && !Array.isArray(kpiWeights)) { + out.kpiWeights = kpiWeights as Record; + } + + const compliance = raw.compliance; + if (typeof compliance === 'string') { + out.compliance = compliance; + } + + return out; +} + +function readEnvConfig(): HarnessInitOptions { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process === 'undefined' || !process.env) { + return {}; + } + + const env = process.env; + const config: HarnessInitOptions = {}; + + const mode = env.CASCADEFLOW_HARNESS_MODE ?? env.CASCADEFLOW_MODE; + if (mode) { + config.mode = normalizeMode(mode); + } + + if (env.CASCADEFLOW_HARNESS_VERBOSE != null) { + config.verbose = parseBoolean(env.CASCADEFLOW_HARNESS_VERBOSE); + } + + const budget = env.CASCADEFLOW_HARNESS_BUDGET ?? 
env.CASCADEFLOW_BUDGET; + if (budget != null) { + config.budget = parseNumber(budget); + } + + if (env.CASCADEFLOW_HARNESS_MAX_TOOL_CALLS != null) { + config.maxToolCalls = parseNumber(env.CASCADEFLOW_HARNESS_MAX_TOOL_CALLS); + } + + if (env.CASCADEFLOW_HARNESS_MAX_LATENCY_MS != null) { + config.maxLatencyMs = parseNumber(env.CASCADEFLOW_HARNESS_MAX_LATENCY_MS); + } + + if (env.CASCADEFLOW_HARNESS_MAX_ENERGY != null) { + config.maxEnergy = parseNumber(env.CASCADEFLOW_HARNESS_MAX_ENERGY); + } + + if (env.CASCADEFLOW_HARNESS_KPI_TARGETS != null) { + config.kpiTargets = parseJSONMap(env.CASCADEFLOW_HARNESS_KPI_TARGETS); + } + + if (env.CASCADEFLOW_HARNESS_KPI_WEIGHTS != null) { + config.kpiWeights = parseJSONMap(env.CASCADEFLOW_HARNESS_KPI_WEIGHTS); + } + + if (env.CASCADEFLOW_HARNESS_COMPLIANCE != null) { + config.compliance = env.CASCADEFLOW_HARNESS_COMPLIANCE; + } + + return config; +} + +function readFileConfig(): HarnessInitOptions { + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (typeof process === 'undefined' || !process.cwd) { + return {}; + } + + try { + // eslint-disable-next-line @typescript-eslint/no-var-requires + const fs = require('node:fs') as typeof import('node:fs'); + // eslint-disable-next-line @typescript-eslint/no-var-requires + const path = require('node:path') as typeof import('node:path'); + + const configuredPath = process.env.CASCADEFLOW_CONFIG; + const candidates = configuredPath + ? [configuredPath] + : ['cascadeflow.json', 'cascadeflow.config.json']; + + for (const candidate of candidates) { + const full = path.isAbsolute(candidate) ? candidate : path.join(process.cwd(), candidate); + if (!fs.existsSync(full)) { + continue; + } + + const content = fs.readFileSync(full, 'utf8'); + const parsed = JSON.parse(content) as Record; + const harnessBlock = ( + parsed.harness && typeof parsed.harness === 'object' && !Array.isArray(parsed.harness) + ) + ? 
(parsed.harness as Record) + : parsed; + + return normalizeConfigRecord(harnessBlock); + } + } catch { + return {}; + } + + return {}; +} + +function resolveConfig(options: HarnessInitOptions): ConfigWithSources { + const env = readEnvConfig(); + const file = readFileConfig(); + const sources: Record = {}; + + const resolve = ( + key: keyof HarnessConfig, + explicit: T | undefined, + envValue: T | undefined, + fileValue: T | undefined, + defaultValue: T, + ): T => { + if (explicit !== undefined) { + sources[key] = 'code'; + return explicit; + } + if (envValue !== undefined) { + sources[key] = 'env'; + return envValue; + } + if (fileValue !== undefined) { + sources[key] = 'file'; + return fileValue; + } + sources[key] = 'default'; + return defaultValue; + }; + + const mode = resolve('mode', options.mode, env.mode, file.mode, 'off'); + const verbose = resolve('verbose', options.verbose, env.verbose, file.verbose, false); + const budget = resolve('budget', options.budget, env.budget, file.budget, undefined); + const maxToolCalls = resolve( + 'maxToolCalls', + options.maxToolCalls, + env.maxToolCalls, + file.maxToolCalls, + undefined, + ); + const maxLatencyMs = resolve( + 'maxLatencyMs', + options.maxLatencyMs, + env.maxLatencyMs, + file.maxLatencyMs, + undefined, + ); + const maxEnergy = resolve('maxEnergy', options.maxEnergy, env.maxEnergy, file.maxEnergy, undefined); + const kpiTargets = resolve( + 'kpiTargets', + options.kpiTargets, + env.kpiTargets, + file.kpiTargets, + undefined, + ); + const kpiWeights = resolve( + 'kpiWeights', + options.kpiWeights, + env.kpiWeights, + file.kpiWeights, + undefined, + ); + const compliance = resolve( + 'compliance', + options.compliance, + env.compliance, + file.compliance, + undefined, + ); + + return { + config: { + mode, + verbose, + budget, + maxToolCalls, + maxLatencyMs, + maxEnergy, + kpiTargets, + kpiWeights, + compliance, + }, + sources, + }; +} + +export function getHarnessConfig(): HarnessConfig { + return { 
..._harnessConfig }; +} + +export function getCurrentRun(): HarnessRunContext | null { + const als = getAsyncLocalStorage(); + if (als) { + return als.getStore() ?? null; + } + + return fallbackCurrentRun; +} + +export function reset(): void { + unpatchOpenAI(); + unpatchAnthropic(); + __resetInstrumentationStateForTest(); + + _harnessConfig = { mode: 'off', verbose: false }; + _isInstrumented = false; + fallbackCurrentRun = null; +} + +export function init(options: HarnessInitOptions = {}): HarnessInitReport { + const { config, sources } = resolveConfig(options); + config.mode = normalizeMode(config.mode); + + _harnessConfig = config; + + const instrumented: string[] = []; + const detectedButNotInstrumented: string[] = []; + + const openaiDetected = detectOpenAIInstrumentationTarget(); + const anthropicDetected = detectAnthropicInstrumentationTarget(); + + if (config.mode !== 'off' && openaiDetected) { + if (patchOpenAI()) { + instrumented.push('openai'); + } else { + detectedButNotInstrumented.push('openai'); + } + } + + if (config.mode !== 'off' && anthropicDetected) { + if (patchAnthropic()) { + instrumented.push('anthropic'); + } else { + detectedButNotInstrumented.push('anthropic'); + } + } + + if (config.mode === 'off') { + unpatchOpenAI(); + unpatchAnthropic(); + } + + _isInstrumented = true; + + if (config.verbose) { + // eslint-disable-next-line no-console + console.info('[cascadeflow.harness] init', { + mode: config.mode, + instrumented, + detectedButNotInstrumented, + }); + } + + return { + mode: config.mode, + instrumented, + detectedButNotInstrumented, + configSources: sources, + }; +} + +type RunCallback = (run: HarnessRunContext) => Promise | T; + +async function executeScopedRun(runContext: HarnessRunContext, fn: RunCallback): Promise { + try { + return await fn(runContext); + } finally { + runContext.finish(); + } +} + +export async function run(callback: RunCallback): Promise; +export async function run(options: HarnessRunOptions, callback: 
RunCallback): Promise; +export async function run( + optionsOrCallback: HarnessRunOptions | RunCallback, + callback?: RunCallback, +): Promise { + const options = typeof optionsOrCallback === 'function' ? {} : optionsOrCallback; + const cb = (typeof optionsOrCallback === 'function' ? optionsOrCallback : callback) as RunCallback | undefined; + + if (!cb) { + throw new Error('run() requires a callback: run(options?, async (run) => { ... })'); + } + + const cfg = getHarnessConfig(); + const runContext = new HarnessRunContext({ + mode: cfg.mode, + budgetMax: options.budget ?? cfg.budget, + toolCallsMax: options.maxToolCalls ?? cfg.maxToolCalls, + latencyMaxMs: options.maxLatencyMs ?? cfg.maxLatencyMs, + energyMax: options.maxEnergy ?? cfg.maxEnergy, + kpiTargets: options.kpiTargets ?? cfg.kpiTargets, + kpiWeights: options.kpiWeights ?? cfg.kpiWeights, + compliance: options.compliance ?? cfg.compliance, + verbose: cfg.verbose, + }); + + const als = getAsyncLocalStorage(); + if (als) { + return als.run(runContext, async () => executeScopedRun(runContext, cb)) as Promise; + } + + const previous = fallbackCurrentRun; + fallbackCurrentRun = runContext; + try { + return await executeScopedRun(runContext, cb); + } finally { + fallbackCurrentRun = previous; + } +} + +export function agent(policy: HarnessRunOptions): any>(fn: T) => T { + return any>(fn: T): T => { + const wrapped = ((...args: any[]) => fn(...args)) as T; + (wrapped as any).__cascadeflow_agent_policy__ = { + budget: policy.budget, + kpiTargets: policy.kpiTargets, + kpiWeights: policy.kpiWeights, + compliance: policy.compliance, + }; + return wrapped; + }; +} + +setHarnessRuntimeBindingsForInstrumentation({ + getCurrentRun, + getHarnessMode: () => getHarnessConfig().mode, + createBudgetExceededError: (message: string, remaining?: number) => + new BudgetExceededError(message, remaining), + createHarnessStopError: (message: string, reason?: string) => + new HarnessStopError(message, reason), +}); + +export const 
cascadeflow = { + init, + run, + agent, + reset, + getHarnessConfig, + getCurrentRun, +}; + +export function isHarnessInstrumented(): boolean { + return _isInstrumented; +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 29819183..c919f67e 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -42,6 +42,31 @@ export { DEFAULT_CASCADE_CONFIG, } from './config'; +// Harness API (v2.1+) +export type { + HarnessMode, + HarnessConfig, + HarnessInitOptions, + HarnessRunOptions, + HarnessInitReport, + HarnessRecordOptions, + HarnessTraceEntry, + HarnessRunSummary, +} from './harness'; +export { + HarnessRunContext, + HarnessStopError, + BudgetExceededError, + init, + run, + agent as harnessAgent, + reset as resetHarness, + getHarnessConfig, + getCurrentRun, + isHarnessInstrumented, + cascadeflow, +} from './harness'; + // Results export type { CascadeResult } from './result'; export { resultToObject } from './result'; diff --git a/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts b/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts index b3f52a60..925a9a96 100644 --- a/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts +++ b/packages/integrations/n8n/nodes/CascadeFlowAgent/CascadeFlowAgent.node.ts @@ -21,6 +21,7 @@ import { type DomainType, getEnabledDomains, } from '../LmChatCascadeFlow/config'; +import { HarnessRunContext, type HarnessConfig, type HarnessMode, type KpiWeights } from '../harness'; // Tool cascade validator - optional import let ToolCascadeValidator: any; @@ -65,6 +66,7 @@ export class CascadeFlowAgentExecutor { private routingRules: Map; private enableToolCascadeValidation: boolean; private toolCascadeValidator: any; + private harnessCtx: HarnessRunContext | null; constructor( private cascadeModel: CascadeChatModel, @@ -72,7 +74,9 @@ export class CascadeFlowAgentExecutor { routingRules: ToolRoutingRule[], private maxIterations: number, 
enableToolCascadeValidation: boolean = false, + harnessCtx: HarnessRunContext | null = null, ) { + this.harnessCtx = harnessCtx; this.toolMap = new Map( tools.filter((tool) => tool?.name).map((tool) => [tool.name as string, tool]) ); @@ -295,6 +299,18 @@ export class CascadeFlowAgentExecutor { let iterations = 0; while (iterations < this.maxIterations) { + // Harness enforce-mode pre-checks + if (this.harnessCtx?.config.mode === 'enforce') { + if (this.harnessCtx.isBudgetExhausted()) { + finalMessage = new AIMessage(`[Harness] Budget exhausted ($${this.harnessCtx.cost.toFixed(4)} of $${this.harnessCtx.config.budgetMax?.toFixed(4)} max). Agent stopped.`); + break; + } + if (this.harnessCtx.isToolCapReached()) { + finalMessage = new AIMessage(`[Harness] Tool call cap reached (${this.harnessCtx.toolCalls} of ${this.harnessCtx.config.toolCallsMax} max). Agent stopped.`); + break; + } + } + const message = await this.cascadeModel.invoke(currentMessages, options); const toolCalls = this.extractToolCalls(message); trace.push(this.buildTraceEntry(message, toolCalls)); @@ -350,6 +366,12 @@ export class CascadeFlowAgentExecutor { ); } + // Track tool calls in harness (CascadeChatModel records LLM token costs; + // agent executor tracks tool-call counts from the loop itself) + if (this.harnessCtx) { + this.harnessCtx.toolCalls += toolCalls.length; + } + if (routing === 'verifier') { const verifierMessage = await this.cascadeModel.invokeVerifierDirect(currentMessages, options); trace.push(this.buildTraceEntry(verifierMessage)); @@ -377,6 +399,7 @@ export class CascadeFlowAgentExecutor { output: finalMessage.content.toString(), message: finalMessage, trace, + harness: this.harnessCtx?.summary() ?? 
null, }; } @@ -753,6 +776,99 @@ export class CascadeFlowAgent implements INodeType { default: '', }, ...generateDomainProperties(), + // ----------------------------------------------------------------- + // Harness: Multi-Dimensional Cascading + // ----------------------------------------------------------------- + { + displayName: 'Harness', + name: 'harnessHeading', + type: 'notice', + default: '', + }, + { + displayName: 'Harness Mode', + name: 'harnessMode', + type: 'options', + options: [ + { name: 'Off', value: 'off', description: 'Harness disabled, zero overhead' }, + { name: 'Observe', value: 'observe', description: 'Track all dimensions, record trace, no enforcement' }, + { name: 'Enforce', value: 'enforce', description: 'Stop agent loop when limits are hit' }, + ], + default: 'observe', + description: 'Harness mode: off (disabled), observe (telemetry only), or enforce (stop when limits hit)', + }, + { + displayName: 'Budget (USD)', + name: 'harnessBudget', + type: 'number', + default: 0, + typeOptions: { minValue: 0, numberPrecision: 4 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max budget in USD. 0 = unlimited.', + }, + { + displayName: 'Max Tool Calls', + name: 'harnessMaxToolCalls', + type: 'number', + default: 0, + typeOptions: { minValue: 0 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max tool call count. 0 = unlimited.', + }, + { + displayName: 'Max Latency (Ms)', + name: 'harnessMaxLatencyMs', + type: 'number', + default: 0, + typeOptions: { minValue: 0 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max cumulative latency in milliseconds. 0 = unlimited.', + }, + { + displayName: 'Max Energy', + name: 'harnessMaxEnergy', + type: 'number', + default: 0, + typeOptions: { minValue: 0, numberPrecision: 2 }, + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Max energy proxy units. 
0 = unlimited.', + }, + { + displayName: 'Compliance', + name: 'harnessCompliance', + type: 'options', + options: [ + { name: 'GDPR', value: 'gdpr' }, + { name: 'HIPAA', value: 'hipaa' }, + { name: 'None', value: '' }, + { name: 'PCI', value: 'pci' }, + { name: 'Strict', value: 'strict' }, + ], + default: '', + displayOptions: { hide: { harnessMode: ['off'] } }, + description: 'Compliance policy to enforce model allowlists', + }, + { + displayName: 'KPI Weights', + name: 'harnessKpiWeights', + type: 'fixedCollection', + typeOptions: { multipleValues: false }, + displayOptions: { hide: { harnessMode: ['off'] } }, + default: { weights: [{ quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }] }, + options: [ + { + name: 'weights', + displayName: 'Weights', + values: [ + { displayName: 'Quality', name: 'quality', type: 'number', default: 0.4, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + { displayName: 'Cost', name: 'cost', type: 'number', default: 0.3, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + { displayName: 'Latency', name: 'latency', type: 'number', default: 0.2, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + { displayName: 'Energy', name: 'energy', type: 'number', default: 0.1, typeOptions: { minValue: 0, maxValue: 1, numberPrecision: 2 } }, + ], + }, + ], + description: 'KPI dimension weights for optimization scoring (normalized automatically)', + }, ], }; @@ -782,6 +898,35 @@ export class CascadeFlowAgent implements INodeType { const toolRoutingRaw = this.getNodeParameter('toolRoutingRules', 0, { rule: [] }) as any; const toolRoutingRules = (toolRoutingRaw?.rule ?? 
[]) as ToolRoutingRule[]; + // Harness parameters + const harnessMode = this.getNodeParameter('harnessMode', 0, 'observe') as HarnessMode; + let harnessCtx: HarnessRunContext | null = null; + if (harnessMode !== 'off') { + const rawBudget = this.getNodeParameter('harnessBudget', 0, 0) as number; + const rawToolCalls = this.getNodeParameter('harnessMaxToolCalls', 0, 0) as number; + const rawLatency = this.getNodeParameter('harnessMaxLatencyMs', 0, 0) as number; + const rawEnergy = this.getNodeParameter('harnessMaxEnergy', 0, 0) as number; + const compliance = this.getNodeParameter('harnessCompliance', 0, '') as string; + const kpiRaw = this.getNodeParameter('harnessKpiWeights', 0, { weights: [{ quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }] }) as any; + const kpiEntry = kpiRaw?.weights?.[0] ?? { quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }; + + const config: HarnessConfig = { + mode: harnessMode, + budgetMax: rawBudget > 0 ? rawBudget : null, + toolCallsMax: rawToolCalls > 0 ? rawToolCalls : null, + latencyMaxMs: rawLatency > 0 ? rawLatency : null, + energyMax: rawEnergy > 0 ? rawEnergy : null, + compliance: compliance || null, + kpiWeights: { + quality: kpiEntry.quality ?? 0.4, + cost: kpiEntry.cost ?? 0.3, + latency: kpiEntry.latency ?? 0.2, + energy: kpiEntry.energy ?? 
0.1, + }, + }; + harnessCtx = new HarnessRunContext(config); + } + // Domain routing parameters const enableDomainRouting = this.getNodeParameter('enableDomainRouting', 0, false) as boolean; @@ -887,12 +1032,18 @@ export class CascadeFlowAgent implements INodeType { domainVerifierGetters, ); + // Wire harness context into cascade model for per-call recording + if (harnessCtx) { + cascadeModel.setHarnessContext(harnessCtx); + } + const agentExecutor = new CascadeFlowAgentExecutor( cascadeModel, tools, toolRoutingRules, maxIterations, enableToolCascadeValidation, + harnessCtx, ); // --- Process each input item --- @@ -933,6 +1084,7 @@ export class CascadeFlowAgent implements INodeType { output: result.output, ...cascadeflowMeta, trace: result.trace, + harness: result.harness ?? null, }, pairedItem: { item: itemIndex }, }); diff --git a/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts b/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts index 8c39ae41..ad2d603e 100644 --- a/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts +++ b/packages/integrations/n8n/nodes/LmChatCascadeFlow/LmChatCascadeFlow.node.ts @@ -23,6 +23,8 @@ import { getEnabledDomains, } from './config'; import { buildCascadeMetadata } from './cascade-metadata'; +import { estimateCost as harnessEstimateCost } from '../harness/pricing'; +import type { HarnessRunContext } from '../harness/harness'; // Quality validation, cost tracking, and routing - optional import let QualityValidator: any; @@ -110,6 +112,29 @@ export class CascadeChatModel extends BaseChatModel { private domainVerifiers: Map = new Map(); private domainVerifierGetters: Map Promise> = new Map(); + // Harness context (set by agent node) + private harnessCtx: HarnessRunContext | null = null; + + setHarnessContext(ctx: HarnessRunContext | null): void { + this.harnessCtx = ctx; + } + + private recordHarnessCall(message: BaseMessage, model: BaseChatModel, elapsedMs: 
number): void { + if (!this.harnessCtx) return; + const responseMetadata = (message as any).response_metadata || {}; + const tokenUsage = responseMetadata.tokenUsage || responseMetadata.usage || {}; + const inputTokens = tokenUsage.promptTokens || tokenUsage.prompt_tokens || 0; + const outputTokens = tokenUsage.completionTokens || tokenUsage.completion_tokens || 0; + const modelName = (model as any).modelName || (model as any).model || 'unknown'; + this.harnessCtx.recordCall({ + model: modelName, + inputTokens, + outputTokens, + toolCallCount: 0, + elapsedMs, + }); + } + constructor( drafterModelGetter: () => Promise, verifierModelGetter: () => Promise, @@ -257,6 +282,7 @@ export class CascadeChatModel extends BaseChatModel { const latency = Date.now() - start; const verifierCost = await this.calculateMessageCost(verifierMessage, verifierModel); + this.recordHarnessCall(verifierMessage, verifierModel, latency); const costBreakdown = { drafter: 0, verifier: verifierCost, @@ -584,37 +610,8 @@ export class CascadeChatModel extends BaseChatModel { } } - // Fallback to rough estimates based on model name - const estimatesPerMillion: Record = { - 'gpt-4o-mini': { input: 0.15, output: 0.6 }, - 'gpt-4o': { input: 2.5, output: 10.0 }, - 'gpt-5-mini': { input: 0.20, output: 0.80 }, - 'gpt-4-turbo': { input: 10.0, output: 30.0 }, - 'gpt-4': { input: 30.0, output: 60.0 }, - 'gpt-3.5-turbo': { input: 0.5, output: 1.5 }, - 'claude-3-5-haiku': { input: 1.0, output: 5.0 }, - 'claude-haiku-4-5': { input: 1.0, output: 5.0 }, - 'claude-3-5-sonnet': { input: 3.0, output: 15.0 }, - 'claude-sonnet-4-5': { input: 3.0, output: 15.0 }, - 'claude-sonnet-4': { input: 3.0, output: 15.0 }, - 'claude-opus-4-5': { input: 5.0, output: 25.0 }, - 'claude-3-haiku': { input: 0.25, output: 1.25 }, - default: { input: 1.0, output: 2.0 }, - }; - - let estimate = estimatesPerMillion.default; - for (const [key, value] of Object.entries(estimatesPerMillion)) { - if (modelName.includes(key)) { - estimate = 
value; - break; - } - } - - const cost = - (inputTokens / 1_000_000) * estimate.input + - (outputTokens / 1_000_000) * estimate.output; - - return cost; + // Use shared harness pricing (fuzzy model resolution, 18 models) + return harnessEstimateCost(modelName, inputTokens, outputTokens); } /** @@ -711,6 +708,7 @@ export class CascadeChatModel extends BaseChatModel { this.verifierCount++; const verifierCost = await this.calculateMessageCost(verifierMessage, verifierModel); + this.recordHarnessCall(verifierMessage, verifierModel, verifierLatency); const costBreakdown = { drafter: 0, verifier: verifierCost, @@ -772,6 +770,7 @@ export class CascadeChatModel extends BaseChatModel { const drafterStartTime = Date.now(); const drafterMessage = await modelToUse.invoke(messages, options); const drafterLatency = Date.now() - drafterStartTime; + this.recordHarnessCall(drafterMessage, modelToUse, drafterLatency); if (domainModel && detectedDomain) { this.domainCounts.set(detectedDomain, (this.domainCounts.get(detectedDomain) || 0) + 1); @@ -798,6 +797,7 @@ export class CascadeChatModel extends BaseChatModel { const verifierStartTime = Date.now(); const verifierMessage = await verifierModel.invoke(messages, options); const verifierLatency = Date.now() - verifierStartTime; + this.recordHarnessCall(verifierMessage, verifierModel, verifierLatency); this.verifierCount++; @@ -1060,6 +1060,7 @@ export class CascadeChatModel extends BaseChatModel { const verifierInfo = this.getModelInfo(verifierModel); const verifierMessage = await verifierModel.invoke(messages, options); const verifierLatency = Date.now() - verifierStartTime; + this.recordHarnessCall(verifierMessage, verifierModel, verifierLatency); this.verifierCount++; @@ -1136,7 +1137,9 @@ export class CascadeChatModel extends BaseChatModel { const verifierModel = await this.getVerifierModel(); const verifierInfo = this.getModelInfo(verifierModel); + const fallbackStart = Date.now(); const verifierMessage = await 
verifierModel.invoke(messages, options); + this.recordHarnessCall(verifierMessage, verifierModel, Date.now() - fallbackStart); this.verifierCount++; const verifierCost = await this.calculateMessageCost(verifierMessage, verifierModel); diff --git a/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts b/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts index d539d5b7..e93f7b23 100644 --- a/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts +++ b/packages/integrations/n8n/nodes/LmChatCascadeFlow/cascade-metadata.ts @@ -1,4 +1,5 @@ import type { DomainType } from './config'; +import type { HarnessSummary } from '../harness'; export interface CostBreakdown { drafter: number; @@ -12,12 +13,15 @@ export interface SavingsBreakdown { percent: number; } +export interface HarnessSummaryOutput extends HarnessSummary {} + export interface CascadeFlowMetadata { model_used: string; domain: DomainType | null; confidence?: number; costs: CostBreakdown; savings: SavingsBreakdown; + harness?: HarnessSummaryOutput | null; } export const calculateSavings = ( diff --git a/packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts b/packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts new file mode 100644 index 00000000..5c003e42 --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/__tests__/harness.test.ts @@ -0,0 +1,368 @@ +import { describe, expect, it } from 'vitest'; + +import { + PRICING_USD_PER_M, + DEFAULT_PRICING_USD_PER_M, + ENERGY_COEFFICIENTS, + DEFAULT_ENERGY_COEFFICIENT, + ENERGY_OUTPUT_WEIGHT, + resolvePricingKey, + estimateCost, + estimateEnergy, + modelTotalPrice, +} from '../pricing'; + +import { + HarnessRunContext, + COMPLIANCE_MODEL_ALLOWLISTS, + QUALITY_PRIORS, + LATENCY_PRIORS, + normalizeWeights, + type HarnessConfig, +} from '../harness'; + +// --------------------------------------------------------------------------- +// Pricing data fidelity +// 
--------------------------------------------------------------------------- + +describe('pricing data', () => { + it('has 18 models in PRICING_USD_PER_M', () => { + expect(Object.keys(PRICING_USD_PER_M)).toHaveLength(18); + }); + + it('matches Python values for gpt-4o', () => { + expect(PRICING_USD_PER_M['gpt-4o']).toEqual([2.50, 10.00]); + }); + + it('matches Python values for gpt-4o-mini', () => { + expect(PRICING_USD_PER_M['gpt-4o-mini']).toEqual([0.15, 0.60]); + }); + + it('matches Python values for claude-sonnet-4', () => { + expect(PRICING_USD_PER_M['claude-sonnet-4']).toEqual([3.00, 15.00]); + }); + + it('matches Python values for gemini-2.5-flash', () => { + expect(PRICING_USD_PER_M['gemini-2.5-flash']).toEqual([0.15, 0.60]); + }); + + it('has correct default pricing', () => { + expect(DEFAULT_PRICING_USD_PER_M).toEqual([2.50, 10.00]); + }); + + it('has 18 models in ENERGY_COEFFICIENTS', () => { + expect(Object.keys(ENERGY_COEFFICIENTS)).toHaveLength(18); + }); + + it('has correct energy defaults', () => { + expect(DEFAULT_ENERGY_COEFFICIENT).toBe(1.0); + expect(ENERGY_OUTPUT_WEIGHT).toBe(1.5); + }); +}); + +// --------------------------------------------------------------------------- +// estimateCost / estimateEnergy +// --------------------------------------------------------------------------- + +describe('estimateCost', () => { + it('calculates gpt-4o cost correctly (1000 in, 500 out = $0.0075)', () => { + const cost = estimateCost('gpt-4o', 1000, 500); + expect(cost).toBeCloseTo(0.0075, 6); + }); + + it('calculates gpt-4o-mini cost correctly', () => { + const cost = estimateCost('gpt-4o-mini', 1_000_000, 1_000_000); + expect(cost).toBeCloseTo(0.15 + 0.60, 6); + }); + + it('uses default pricing for unknown models', () => { + const cost = estimateCost('unknown-model', 1_000_000, 1_000_000); + expect(cost).toBeCloseTo(2.50 + 10.00, 6); + }); +}); + +describe('estimateEnergy', () => { + it('calculates gpt-4o energy correctly (100 in, 50 out)', () => { + 
// coeff=1.0, energy = 1.0 * (100 + 50 * 1.5) = 175.0 + const energy = estimateEnergy('gpt-4o', 100, 50); + expect(energy).toBeCloseTo(175.0, 4); + }); + + it('uses default coefficient for unknown models', () => { + // coeff=1.0, energy = 1.0 * (100 + 50 * 1.5) = 175.0 + const energy = estimateEnergy('unknown-model', 100, 50); + expect(energy).toBeCloseTo(175.0, 4); + }); + + it('uses correct coefficient for gpt-4o-mini', () => { + // coeff=0.3, energy = 0.3 * (100 + 50 * 1.5) = 52.5 + const energy = estimateEnergy('gpt-4o-mini', 100, 50); + expect(energy).toBeCloseTo(52.5, 4); + }); +}); + +describe('modelTotalPrice', () => { + it('returns input + output for gpt-4o', () => { + expect(modelTotalPrice('gpt-4o')).toBeCloseTo(12.50, 6); + }); + + it('returns default for unknown model', () => { + expect(modelTotalPrice('unknown')).toBeCloseTo(12.50, 6); + }); +}); + +// --------------------------------------------------------------------------- +// Fuzzy model resolution +// --------------------------------------------------------------------------- + +describe('resolvePricingKey', () => { + it('exact match', () => { + expect(resolvePricingKey('gpt-4o')).toBe('gpt-4o'); + }); + + it('strips version suffix (-20250120)', () => { + expect(resolvePricingKey('gpt-4o-20250120')).toBe('gpt-4o'); + }); + + it('strips -preview suffix', () => { + expect(resolvePricingKey('gpt-4o-preview')).toBe('gpt-4o'); + }); + + it('strips -latest suffix', () => { + expect(resolvePricingKey('gpt-4o-latest')).toBe('gpt-4o'); + }); + + it('longest-prefix match (gemini-2.5-flash-8b → gemini-2.5-flash)', () => { + expect(resolvePricingKey('gemini-2.5-flash-8b')).toBe('gemini-2.5-flash'); + }); + + it('returns null for completely unknown model', () => { + expect(resolvePricingKey('totally-unknown-model')).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// HarnessRunContext — evaluatePreCall +// 
---------------------------------------------------------------------------
+
+function makeConfig(overrides: Partial<HarnessConfig> = {}): HarnessConfig {
+  return {
+    mode: 'enforce',
+    budgetMax: null,
+    toolCallsMax: null,
+    latencyMaxMs: null,
+    energyMax: null,
+    compliance: null,
+    kpiWeights: {},
+    ...overrides,
+  };
+}
+
+describe('evaluatePreCall', () => {
+  it('returns allow when no limits set', () => {
+    const ctx = new HarnessRunContext(makeConfig());
+    const decision = ctx.evaluatePreCall('gpt-4o', false);
+    expect(decision.action).toBe('allow');
+  });
+
+  it('returns stop when budget exhausted', () => {
+    const ctx = new HarnessRunContext(makeConfig({ budgetMax: 0.01 }));
+    ctx.cost = 0.01; // exhaust budget
+    const decision = ctx.evaluatePreCall('gpt-4o', false);
+    expect(decision.action).toBe('stop');
+    expect(decision.reason).toBe('budget_exceeded');
+  });
+
+  it('returns deny_tool when tool cap reached', () => {
+    const ctx = new HarnessRunContext(makeConfig({ toolCallsMax: 3 }));
+    ctx.toolCalls = 3;
+    const decision = ctx.evaluatePreCall('gpt-4o', true);
+    expect(decision.action).toBe('deny_tool');
+    expect(decision.reason).toBe('max_tool_calls_reached');
+  });
+
+  it('returns stop for compliance violation (non-compliant model)', () => {
+    const ctx = new HarnessRunContext(makeConfig({ compliance: 'gdpr' }));
+    const decision = ctx.evaluatePreCall('claude-sonnet-4', false);
+    expect(decision.action).toBe('stop');
+    expect(decision.reason).toBe('compliance_no_approved_model');
+  });
+
+  it('allows compliant model under GDPR', () => {
+    const ctx = new HarnessRunContext(makeConfig({ compliance: 'gdpr' }));
+    const decision = ctx.evaluatePreCall('gpt-4o', false);
+    expect(decision.action).toBe('allow');
+  });
+
+  it('returns stop when latency cap exceeded', () => {
+    const ctx = new HarnessRunContext(makeConfig({ latencyMaxMs: 1000 }));
+    ctx.latencyUsedMs = 1000;
+    const decision = ctx.evaluatePreCall('gpt-3.5-turbo', false);
+    // gpt-3.5-turbo is already the
fastest → can't switch → stop + expect(decision.action).toBe('stop'); + expect(decision.reason).toBe('latency_limit_exceeded'); + }); + + it('returns stop when energy cap exceeded', () => { + const ctx = new HarnessRunContext(makeConfig({ energyMax: 100 })); + ctx.energyUsed = 100; + const decision = ctx.evaluatePreCall('gpt-3.5-turbo', false); + // gpt-3.5-turbo is already lowest energy → can't switch → stop + expect(decision.action).toBe('stop'); + expect(decision.reason).toBe('energy_limit_exceeded'); + }); + + it('returns switch_model observation for budget pressure', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 1.0 })); + ctx.cost = 0.85; // 85% spent, < 20% remaining + ctx.budgetRemaining = 0.15; + const decision = ctx.evaluatePreCall('gpt-4o', false); + // Budget pressure suggests cheaper model + expect(decision.action).toBe('switch_model'); + expect(decision.reason).toBe('budget_pressure'); + }); + + it('returns switch_model observation for KPI optimization', () => { + const ctx = new HarnessRunContext(makeConfig({ + kpiWeights: { quality: 0, cost: 1, latency: 0, energy: 0 }, + })); + // gpt-4 is very expensive, KPI weights purely on cost → should suggest cheaper + const decision = ctx.evaluatePreCall('gpt-4', false); + expect(decision.action).toBe('switch_model'); + expect(decision.reason).toBe('kpi_weight_optimization'); + }); +}); + +// --------------------------------------------------------------------------- +// Budget tracking across multiple recordCall invocations +// --------------------------------------------------------------------------- + +describe('recordCall and budget tracking', () => { + it('accumulates cost across calls', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 0.10 })); + ctx.recordCall({ model: 'gpt-4o-mini', inputTokens: 100, outputTokens: 50, toolCallCount: 0, elapsedMs: 50 }); + expect(ctx.cost).toBeGreaterThan(0); + expect(ctx.stepCount).toBe(1); + 
expect(ctx.budgetRemaining).toBeLessThan(0.10); + + ctx.recordCall({ model: 'gpt-4o-mini', inputTokens: 200, outputTokens: 100, toolCallCount: 1, elapsedMs: 60 }); + expect(ctx.stepCount).toBe(2); + expect(ctx.toolCalls).toBe(1); + expect(ctx.latencyUsedMs).toBe(110); + }); + + it('detects budget exhaustion', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 0.0001 })); + ctx.recordCall({ model: 'gpt-4o', inputTokens: 10000, outputTokens: 5000, toolCallCount: 0, elapsedMs: 100 }); + expect(ctx.isBudgetExhausted()).toBe(true); + }); + + it('detects tool cap reached', () => { + const ctx = new HarnessRunContext(makeConfig({ toolCallsMax: 2 })); + ctx.toolCalls = 2; + expect(ctx.isToolCapReached()).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Observe vs enforce mode behavior +// --------------------------------------------------------------------------- + +describe('observe vs enforce mode', () => { + it('observe mode evaluatePreCall still returns decisions', () => { + const ctx = new HarnessRunContext(makeConfig({ mode: 'observe', budgetMax: 0.01 })); + ctx.cost = 0.01; + const decision = ctx.evaluatePreCall('gpt-4o', false); + // Decision is evaluated regardless of mode + expect(decision.action).toBe('stop'); + }); + + it('off mode has no context created (by design)', () => { + // In the actual agent node, harnessCtx is null when mode=off + // This test validates that a context with mode=off still works + const ctx = new HarnessRunContext(makeConfig({ mode: 'off' })); + const decision = ctx.evaluatePreCall('gpt-4o', false); + expect(decision.action).toBe('allow'); + expect(decision.reason).toBe('off'); + }); +}); + +// --------------------------------------------------------------------------- +// Compliance allowlists +// --------------------------------------------------------------------------- + +describe('compliance allowlists', () => { + it('GDPR allows gpt-4o, gpt-4o-mini, 
gpt-3.5-turbo', () => { + const allowlist = COMPLIANCE_MODEL_ALLOWLISTS['gdpr']; + expect(allowlist.has('gpt-4o')).toBe(true); + expect(allowlist.has('gpt-4o-mini')).toBe(true); + expect(allowlist.has('gpt-3.5-turbo')).toBe(true); + expect(allowlist.has('claude-sonnet-4')).toBe(false); + }); + + it('strict allows only gpt-4o', () => { + const allowlist = COMPLIANCE_MODEL_ALLOWLISTS['strict']; + expect(allowlist.size).toBe(1); + expect(allowlist.has('gpt-4o')).toBe(true); + }); + + it('strict mode denies tools even for compliant model', () => { + const ctx = new HarnessRunContext(makeConfig({ compliance: 'strict' })); + const decision = ctx.evaluatePreCall('gpt-4o', true); + expect(decision.action).toBe('deny_tool'); + expect(decision.reason).toBe('compliance_tool_restriction'); + }); +}); + +// --------------------------------------------------------------------------- +// KPI weight normalization +// --------------------------------------------------------------------------- + +describe('normalizeWeights', () => { + it('normalizes to sum=1', () => { + const result = normalizeWeights({ quality: 0.4, cost: 0.3, latency: 0.2, energy: 0.1 }); + const sum = Object.values(result).reduce((a, b) => a + b, 0); + expect(sum).toBeCloseTo(1.0, 6); + }); + + it('filters out zero and negative values', () => { + const result = normalizeWeights({ quality: 1, cost: 0, latency: -1, energy: 1 }); + expect(result.cost).toBeUndefined(); + expect(result.latency).toBeUndefined(); + expect(result.quality).toBeCloseTo(0.5, 6); + expect(result.energy).toBeCloseTo(0.5, 6); + }); + + it('returns empty for all-zero weights', () => { + const result = normalizeWeights({ quality: 0, cost: 0, latency: 0, energy: 0 }); + expect(Object.keys(result)).toHaveLength(0); + }); +}); + +// --------------------------------------------------------------------------- +// summary() structure +// --------------------------------------------------------------------------- + +describe('summary()', () => { + 
it('returns correct structure', () => { + const ctx = new HarnessRunContext(makeConfig({ budgetMax: 1.0 })); + ctx.recordCall({ model: 'gpt-4o-mini', inputTokens: 100, outputTokens: 50, toolCallCount: 0, elapsedMs: 42 }); + + const s = ctx.summary(); + expect(s.runId).toBeTruthy(); + expect(s.mode).toBe('enforce'); + expect(s.stepCount).toBe(1); + expect(s.toolCalls).toBe(0); + expect(s.cost).toBeGreaterThan(0); + expect(s.latencyUsedMs).toBe(42); + expect(s.energyUsed).toBeGreaterThan(0); + expect(s.budgetMax).toBe(1.0); + expect(s.budgetRemaining).toBeLessThan(1.0); + expect(s.lastAction).toBe('allow'); + expect(s.durationMs).toBeGreaterThanOrEqual(0); + expect(Array.isArray(s.trace)).toBe(true); + expect(s.trace).toHaveLength(1); + expect(s.trace[0].action).toBe('allow'); + expect(s.trace[0].budgetState.max).toBe(1.0); + }); +}); diff --git a/packages/integrations/n8n/nodes/harness/harness.ts b/packages/integrations/n8n/nodes/harness/harness.ts new file mode 100644 index 00000000..ab3943d5 --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/harness.ts @@ -0,0 +1,444 @@ +/** + * HarnessRunContext — multi-dimensional decision engine for n8n (TypeScript port). + * + * Ported from cascadeflow/harness/api.py (HarnessRunContext) and + * cascadeflow/harness/instrument.py (pre-call decision logic, compliance, + * quality/latency priors, KPI scoring). + * + * Key n8n constraint: models are graph connections (sub-nodes), not string + * parameters. The harness cannot switch models at runtime. Only `stop` and + * `deny_tool` actions have enforcement effects. `switch_model` decisions are + * recorded in the trace as observations. 
+ */
+
+import {
+  ENERGY_COEFFICIENTS,
+  DEFAULT_ENERGY_COEFFICIENT,
+  estimateCost,
+  estimateEnergy,
+  modelTotalPrice,
+  PRICING_USD_PER_M,
+} from './pricing';
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+export type HarnessMode = 'off' | 'observe' | 'enforce';
+
+export interface KpiWeights {
+  quality?: number;
+  cost?: number;
+  latency?: number;
+  energy?: number;
+}
+
+export interface HarnessConfig {
+  mode: HarnessMode;
+  budgetMax: number | null;
+  toolCallsMax: number | null;
+  latencyMaxMs: number | null;
+  energyMax: number | null;
+  compliance: string | null;
+  kpiWeights: KpiWeights;
+}
+
+export interface PreCallDecision {
+  action: 'allow' | 'stop' | 'switch_model' | 'deny_tool';
+  reason: string;
+  targetModel: string;
+}
+
+export interface HarnessTraceEntry {
+  action: string;
+  reason: string;
+  model: string | null;
+  step: number;
+  timestampMs: number;
+  costTotal: number;
+  budgetState: { max: number | null; remaining: number | null };
+  applied: boolean;
+  decisionMode: string;
+}
+
+export interface HarnessSummary {
+  runId: string;
+  mode: HarnessMode;
+  stepCount: number;
+  toolCalls: number;
+  cost: number;
+  latencyUsedMs: number;
+  energyUsed: number;
+  budgetMax: number | null;
+  budgetRemaining: number | null;
+  lastAction: string;
+  durationMs: number;
+  trace: HarnessTraceEntry[];
+}
+
+export interface RecordCallParams {
+  model: string;
+  inputTokens: number;
+  outputTokens: number;
+  toolCallCount: number;
+  elapsedMs: number;
+  decision?: PreCallDecision;
+}
+
+// ---------------------------------------------------------------------------
+// Compliance allowlists (from instrument.py lines 107-112)
+// ---------------------------------------------------------------------------
+
+const COMPLIANCE_MODEL_ALLOWLISTS: Record<string, Set<string>> = {
+  gdpr: new Set(['gpt-4o', 'gpt-4o-mini', 'gpt-3.5-turbo']),
+  hipaa:
new Set(['gpt-4o', 'gpt-4o-mini']),
+  pci: new Set(['gpt-4o-mini', 'gpt-3.5-turbo']),
+  strict: new Set(['gpt-4o']),
+};
+
+// ---------------------------------------------------------------------------
+// Quality & latency priors for KPI scoring (from instrument.py lines 74-95)
+// ---------------------------------------------------------------------------
+
+const QUALITY_PRIORS: Record<string, number> = {
+  'gpt-4o': 0.90,
+  'gpt-4o-mini': 0.75,
+  'gpt-5-mini': 0.86,
+  'gpt-4-turbo': 0.88,
+  'gpt-4': 0.87,
+  'gpt-3.5-turbo': 0.65,
+  'o1': 0.95,
+  'o1-mini': 0.82,
+  'o3-mini': 0.80,
+};
+
+const LATENCY_PRIORS: Record<string, number> = {
+  'gpt-4o': 0.72,
+  'gpt-4o-mini': 0.93,
+  'gpt-5-mini': 0.84,
+  'gpt-4-turbo': 0.66,
+  'gpt-4': 0.52,
+  'gpt-3.5-turbo': 1.00,
+  'o1': 0.40,
+  'o1-mini': 0.60,
+  'o3-mini': 0.78,
+};
+
+// Pre-computed model cost/energy bounds for utility functions.
+const MODEL_POOL = Object.keys(PRICING_USD_PER_M);
+const MODEL_TOTAL_COSTS = new Map<string, number>(MODEL_POOL.map(m => [m, modelTotalPrice(m)]));
+const MIN_TOTAL_COST = Math.min(...MODEL_TOTAL_COSTS.values());
+const MAX_TOTAL_COST = Math.max(...MODEL_TOTAL_COSTS.values());
+
+const MODEL_ENERGY_COEFFS = new Map<string, number>(
+  MODEL_POOL.map(m => [m, ENERGY_COEFFICIENTS[m] ??
DEFAULT_ENERGY_COEFFICIENT]),
+);
+const MIN_ENERGY_COEFF = Math.min(...MODEL_ENERGY_COEFFS.values());
+const MAX_ENERGY_COEFF = Math.max(...MODEL_ENERGY_COEFFS.values());
+
+// ---------------------------------------------------------------------------
+// KPI scoring helpers (from instrument.py lines 234-267)
+// ---------------------------------------------------------------------------
+
+function normalizeWeights(weights: KpiWeights): Record<string, number> {
+  const raw: Record<string, number> = {};
+  for (const [key, val] of Object.entries(weights)) {
+    if (['cost', 'quality', 'latency', 'energy'].includes(key) && typeof val === 'number' && val > 0) {
+      raw[key] = val;
+    }
+  }
+  const total = Object.values(raw).reduce((a, b) => a + b, 0);
+  if (total <= 0) return {};
+  const normalized: Record<string, number> = {};
+  for (const [key, val] of Object.entries(raw)) {
+    normalized[key] = val / total;
+  }
+  return normalized;
+}
+
+function costUtility(model: string): number {
+  const modelCost = MODEL_TOTAL_COSTS.get(model) ?? modelTotalPrice(model);
+  if (MAX_TOTAL_COST === MIN_TOTAL_COST) return 1.0;
+  return (MAX_TOTAL_COST - modelCost) / (MAX_TOTAL_COST - MIN_TOTAL_COST);
+}
+
+function energyUtility(model: string): number {
+  const coeff = ENERGY_COEFFICIENTS[model] ?? DEFAULT_ENERGY_COEFFICIENT;
+  if (MAX_ENERGY_COEFF === MIN_ENERGY_COEFF) return 1.0;
+  return (MAX_ENERGY_COEFF - coeff) / (MAX_ENERGY_COEFF - MIN_ENERGY_COEFF);
+}
+
+function kpiScoreWithNormalized(model: string, normalized: Record<string, number>): number {
+  if (Object.keys(normalized).length === 0) return 0.0;
+  const quality = QUALITY_PRIORS[model] ?? 0.7;
+  const latency = LATENCY_PRIORS[model] ?? 0.7;
+  const cost = costUtility(model);
+  const energy = energyUtility(model);
+  return (
+    (normalized.quality ?? 0) * quality +
+    (normalized.latency ?? 0) * latency +
+    (normalized.cost ?? 0) * cost +
+    (normalized.energy ??
0) * energy + ); +} + +function selectKpiWeightedModel(currentModel: string, weights: KpiWeights): string { + const normalized = normalizeWeights(weights); + if (Object.keys(normalized).length === 0) return currentModel; + let bestModel = currentModel; + let bestScore = kpiScoreWithNormalized(currentModel, normalized); + for (const candidate of MODEL_POOL) { + const score = kpiScoreWithNormalized(candidate, normalized); + if (score > bestScore) { + bestModel = candidate; + bestScore = score; + } + } + return bestModel; +} + +// Cheapest/fastest/lowest-energy helpers +function selectCheaperModel(currentModel: string): string { + let cheapest = currentModel; + let cheapestCost = MODEL_TOTAL_COSTS.get(currentModel) ?? modelTotalPrice(currentModel); + for (const [m, c] of MODEL_TOTAL_COSTS) { + if (c < cheapestCost) { + cheapest = m; + cheapestCost = c; + } + } + return cheapest; +} + +function selectFasterModel(currentModel: string): string { + const currentLatency = LATENCY_PRIORS[currentModel] ?? 0.7; + let best = currentModel; + let bestLatency = currentLatency; + for (const [m, lat] of Object.entries(LATENCY_PRIORS)) { + if (lat > bestLatency) { + best = m; + bestLatency = lat; + } + } + return best; +} + +function selectLowerEnergyModel(currentModel: string): string { + const currentCoeff = ENERGY_COEFFICIENTS[currentModel] ?? DEFAULT_ENERGY_COEFFICIENT; + let best = currentModel; + let bestCoeff = currentCoeff; + for (const [m, c] of MODEL_ENERGY_COEFFS) { + if (c < bestCoeff) { + best = m; + bestCoeff = c; + } + } + return best; +} + +// --------------------------------------------------------------------------- +// HarnessRunContext +// --------------------------------------------------------------------------- + +const MAX_TRACE_ENTRIES = 1000; + +/** Coerce NaN, Infinity, or negative values to null (unlimited). 
*/ +function sanitizeNumericParam(value: number | null): number | null { + if (value === null || value === undefined) return null; + if (!Number.isFinite(value) || value < 0) return null; + return value; +} + +let runIdCounter = 0; + +function generateRunId(): string { + runIdCounter += 1; + const ts = Date.now().toString(36); + const counter = runIdCounter.toString(36); + return `${ts}${counter}`.slice(-8); +} + +export class HarnessRunContext { + readonly runId: string; + readonly config: HarnessConfig; + + stepCount = 0; + toolCalls = 0; + cost = 0; + latencyUsedMs = 0; + energyUsed = 0; + budgetRemaining: number | null; + lastAction = 'allow'; + + private startedAt: number; + private trace: HarnessTraceEntry[] = []; + + constructor(config: HarnessConfig) { + this.runId = generateRunId(); + this.config = { + ...config, + budgetMax: sanitizeNumericParam(config.budgetMax), + toolCallsMax: sanitizeNumericParam(config.toolCallsMax), + latencyMaxMs: sanitizeNumericParam(config.latencyMaxMs), + energyMax: sanitizeNumericParam(config.energyMax), + }; + this.budgetRemaining = this.config.budgetMax; + this.startedAt = Date.now(); + } + + // ----------------------------------------------------------------------- + // Pre-call decision cascade (ported from instrument.py _evaluate_pre_call_decision) + // ----------------------------------------------------------------------- + + evaluatePreCall(model: string, hasTools: boolean): PreCallDecision { + const cfg = this.config; + + // 1. Budget exhausted + if (cfg.budgetMax !== null && this.cost >= cfg.budgetMax) { + return { action: 'stop', reason: 'budget_exceeded', targetModel: model }; + } + + // 2. Tool call cap + if (hasTools && cfg.toolCallsMax !== null && this.toolCalls >= cfg.toolCallsMax) { + return { action: 'deny_tool', reason: 'max_tool_calls_reached', targetModel: model }; + } + + // 3. 
Compliance + if (cfg.compliance) { + const allowlist = COMPLIANCE_MODEL_ALLOWLISTS[cfg.compliance.trim().toLowerCase()]; + if (allowlist) { + if (!allowlist.has(model)) { + // Can't switch models in n8n — stop if no compliant model possible + return { action: 'stop', reason: 'compliance_no_approved_model', targetModel: model }; + } + if (cfg.compliance.trim().toLowerCase() === 'strict' && hasTools) { + return { action: 'deny_tool', reason: 'compliance_tool_restriction', targetModel: model }; + } + } + } + + // 4. Latency cap + if (cfg.latencyMaxMs !== null && this.latencyUsedMs >= cfg.latencyMaxMs) { + const faster = selectFasterModel(model); + if (faster !== model) { + return { action: 'switch_model', reason: 'latency_limit_exceeded', targetModel: faster }; + } + return { action: 'stop', reason: 'latency_limit_exceeded', targetModel: model }; + } + + // 5. Energy cap + if (cfg.energyMax !== null && this.energyUsed >= cfg.energyMax) { + const lower = selectLowerEnergyModel(model); + if (lower !== model) { + return { action: 'switch_model', reason: 'energy_limit_exceeded', targetModel: lower }; + } + return { action: 'stop', reason: 'energy_limit_exceeded', targetModel: model }; + } + + // 6. Budget pressure (<20% remaining) — observation only in n8n + if ( + cfg.budgetMax !== null && + cfg.budgetMax > 0 && + this.budgetRemaining !== null && + this.budgetRemaining / cfg.budgetMax < 0.2 + ) { + const cheaper = selectCheaperModel(model); + if (cheaper !== model) { + return { action: 'switch_model', reason: 'budget_pressure', targetModel: cheaper }; + } + } + + // 7. KPI-weighted — observation only in n8n + const kw = cfg.kpiWeights; + if (kw && Object.values(kw).some(v => typeof v === 'number' && v > 0)) { + const weighted = selectKpiWeightedModel(model, kw); + if (weighted !== model) { + return { action: 'switch_model', reason: 'kpi_weight_optimization', targetModel: weighted }; + } + } + + // 8. 
Default: allow + return { action: 'allow', reason: cfg.mode, targetModel: model }; + } + + // ----------------------------------------------------------------------- + // Record a completed call + // ----------------------------------------------------------------------- + + recordCall(params: RecordCallParams): void { + const { model, inputTokens, outputTokens, toolCallCount, elapsedMs, decision } = params; + + const callCost = estimateCost(model, inputTokens, outputTokens); + const energy = estimateEnergy(model, inputTokens, outputTokens); + + this.cost += callCost; + this.stepCount += 1; + this.latencyUsedMs += elapsedMs; + this.energyUsed += energy; + this.toolCalls += toolCallCount; + + if (this.config.budgetMax !== null) { + this.budgetRemaining = this.config.budgetMax - this.cost; + } + + const action = decision?.action ?? 'allow'; + const reason = decision?.reason ?? this.config.mode; + const applied = action === 'allow' || (this.config.mode === 'enforce' && (action === 'stop' || action === 'deny_tool')); + + this.lastAction = action; + + this.trace.push({ + action, + reason, + model, + step: this.stepCount, + timestampMs: Date.now(), + costTotal: this.cost, + budgetState: { + max: this.config.budgetMax, + remaining: this.budgetRemaining, + }, + applied, + decisionMode: this.config.mode, + }); + if (this.trace.length > MAX_TRACE_ENTRIES) { + this.trace = this.trace.slice(-MAX_TRACE_ENTRIES); + } + } + + // ----------------------------------------------------------------------- + // Quick checks for agent loop + // ----------------------------------------------------------------------- + + isBudgetExhausted(): boolean { + return this.config.budgetMax !== null && this.cost >= this.config.budgetMax; + } + + isToolCapReached(): boolean { + return this.config.toolCallsMax !== null && this.toolCalls >= this.config.toolCallsMax; + } + + // ----------------------------------------------------------------------- + // Summary + // 
----------------------------------------------------------------------- + + summary(): HarnessSummary { + return { + runId: this.runId, + mode: this.config.mode, + stepCount: this.stepCount, + toolCalls: this.toolCalls, + cost: this.cost, + latencyUsedMs: this.latencyUsedMs, + energyUsed: this.energyUsed, + budgetMax: this.config.budgetMax, + budgetRemaining: this.budgetRemaining, + lastAction: this.lastAction, + durationMs: Date.now() - this.startedAt, + trace: [...this.trace], + }; + } +} + +// Re-export for external test access +export { COMPLIANCE_MODEL_ALLOWLISTS, QUALITY_PRIORS, LATENCY_PRIORS, normalizeWeights }; diff --git a/packages/integrations/n8n/nodes/harness/index.ts b/packages/integrations/n8n/nodes/harness/index.ts new file mode 100644 index 00000000..663f93b3 --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/index.ts @@ -0,0 +1,22 @@ +export { + PRICING_USD_PER_M, + DEFAULT_PRICING_USD_PER_M, + ENERGY_COEFFICIENTS, + DEFAULT_ENERGY_COEFFICIENT, + ENERGY_OUTPUT_WEIGHT, + resolvePricingKey, + estimateCost, + estimateEnergy, + modelTotalPrice, +} from './pricing'; + +export { + type HarnessMode, + type KpiWeights, + type HarnessConfig, + type PreCallDecision, + type HarnessTraceEntry, + type HarnessSummary, + type RecordCallParams, + HarnessRunContext, +} from './harness'; diff --git a/packages/integrations/n8n/nodes/harness/pricing.ts b/packages/integrations/n8n/nodes/harness/pricing.ts new file mode 100644 index 00000000..fd13f43a --- /dev/null +++ b/packages/integrations/n8n/nodes/harness/pricing.ts @@ -0,0 +1,135 @@ +/** + * Shared harness pricing and energy profiles (TypeScript port). + * + * Ported from cascadeflow/harness/pricing.py — single source of truth for + * cost/energy estimation in the n8n integration. + */ + +// USD per 1M tokens [input, output]. 
+export const PRICING_USD_PER_M: Record<string, [number, number]> = {
+  // OpenAI
+  'gpt-4o': [2.50, 10.00],
+  'gpt-4o-mini': [0.15, 0.60],
+  'gpt-5': [1.25, 10.00],
+  'gpt-5-mini': [0.20, 0.80],
+  'gpt-4-turbo': [10.00, 30.00],
+  'gpt-4': [30.00, 60.00],
+  'gpt-3.5-turbo': [0.50, 1.50],
+  'o1': [15.00, 60.00],
+  'o1-mini': [3.00, 12.00],
+  'o3-mini': [1.10, 4.40],
+  // Anthropic
+  'claude-sonnet-4': [3.00, 15.00],
+  'claude-haiku-3.5': [1.00, 5.00],
+  'claude-opus-4.5': [5.00, 25.00],
+  // Google Gemini
+  'gemini-2.5-flash': [0.15, 0.60],
+  'gemini-2.5-pro': [1.25, 10.00],
+  'gemini-2.0-flash': [0.10, 0.40],
+  'gemini-1.5-flash': [0.075, 0.30],
+  'gemini-1.5-pro': [1.25, 5.00],
+};
+
+export const DEFAULT_PRICING_USD_PER_M: [number, number] = [2.50, 10.00];
+
+// Deterministic proxy coefficients for energy tracking.
+export const ENERGY_COEFFICIENTS: Record<string, number> = {
+  // OpenAI
+  'gpt-4o': 1.0,
+  'gpt-4o-mini': 0.3,
+  'gpt-5': 1.2,
+  'gpt-5-mini': 0.35,
+  'gpt-4-turbo': 1.5,
+  'gpt-4': 1.5,
+  'gpt-3.5-turbo': 0.2,
+  'o1': 2.0,
+  'o1-mini': 0.8,
+  'o3-mini': 0.5,
+  // Anthropic
+  'claude-sonnet-4': 1.0,
+  'claude-haiku-3.5': 0.3,
+  'claude-opus-4.5': 1.8,
+  // Google Gemini
+  'gemini-2.5-flash': 0.3,
+  'gemini-2.5-pro': 1.2,
+  'gemini-2.0-flash': 0.25,
+  'gemini-1.5-flash': 0.2,
+  'gemini-1.5-pro': 1.0,
+};
+
+export const DEFAULT_ENERGY_COEFFICIENT = 1.0;
+export const ENERGY_OUTPUT_WEIGHT = 1.5;
+
+// ---------------------------------------------------------------------------
+// Fuzzy model-name resolution
+// ---------------------------------------------------------------------------
+
+// Strips version/preview/date suffixes.
+// Matches: -preview, -preview-05-20, -20250120, -latest, -exp-0827, -it
+const VERSION_SUFFIX_RE = /(-preview(?:-\d{2,4}-\d{2})?|-\d{8,}|-latest|-exp(?:-\d+)?|-it)$/;
+
+// Cache for resolved model → pricing key lookups.
+const pricingKeyCache = new Map(); + +export function resolvePricingKey(model: string): string | null { + const cached = pricingKeyCache.get(model); + if (cached !== undefined) return cached; + + // Exact match + if (model in PRICING_USD_PER_M) { + pricingKeyCache.set(model, model); + return model; + } + + // Strip version suffixes and retry + const stripped = model.replace(VERSION_SUFFIX_RE, ''); + if (stripped !== model && stripped in PRICING_USD_PER_M) { + pricingKeyCache.set(model, stripped); + return stripped; + } + + // Longest-prefix match (e.g. "gemini-2.5-flash-8b" → "gemini-2.5-flash") + let best: string | null = null; + let bestLen = 0; + for (const known of Object.keys(PRICING_USD_PER_M)) { + if (model.startsWith(known) && known.length > bestLen) { + best = known; + bestLen = known.length; + } + } + if (best !== null) { + pricingKeyCache.set(model, best); + return best; + } + + pricingKeyCache.set(model, null); + return null; +} + +// --------------------------------------------------------------------------- +// Public estimation helpers +// --------------------------------------------------------------------------- + +export function estimateCost(model: string, inputTokens: number, outputTokens: number): number { + const key = resolvePricingKey(model); + const [inPrice, outPrice] = key !== null + ? (PRICING_USD_PER_M[key] ?? DEFAULT_PRICING_USD_PER_M) + : DEFAULT_PRICING_USD_PER_M; + return (inputTokens / 1_000_000) * inPrice + (outputTokens / 1_000_000) * outPrice; +} + +export function estimateEnergy(model: string, inputTokens: number, outputTokens: number): number { + const key = resolvePricingKey(model); + const coeff = key !== null + ? (ENERGY_COEFFICIENTS[key] ?? DEFAULT_ENERGY_COEFFICIENT) + : DEFAULT_ENERGY_COEFFICIENT; + return coeff * (inputTokens + outputTokens * ENERGY_OUTPUT_WEIGHT); +} + +export function modelTotalPrice(model: string): number { + const key = resolvePricingKey(model); + const [inPrice, outPrice] = key !== null + ? 
(PRICING_USD_PER_M[key] ?? DEFAULT_PRICING_USD_PER_M) + : DEFAULT_PRICING_USD_PER_M; + return inPrice + outPrice; +} diff --git a/pyproject.toml b/pyproject.toml index eaadb6b7..bc7c7072 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "cascadeflow" version = "1.0.0" -description = "Smart AI model cascading for cost optimization - Save 40-85% on LLM costs with 2-6x faster responses. Available for Python and TypeScript/JavaScript." +description = "Agent runtime intelligence layer — optimize cost, latency, quality, budget, compliance, and energy across AI agent workflows." readme = "README.md" requires-python = ">=3.9" license = "MIT" @@ -32,9 +32,17 @@ keywords = [ "javascript", "browser", "edge-functions", + "agent-intelligence", + "runtime-optimization", + "budget-enforcement", + "compliance", + "harness", + "agent-runtime", + "kpi", + "energy-tracking", ] classifiers = [ - "Development Status :: 4 - Beta", + "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", @@ -93,7 +101,7 @@ semantic = [ openclaw = ["fastembed>=0.7.0"] # CrewAI harness integration (opt-in) -crewai = ["crewai>=1.5.0"] +crewai = ["crewai>=1.5.0; python_version >= '3.10'"] # OpenAI Agents SDK integration (opt-in) openai-agents = [ @@ -101,6 +109,9 @@ openai-agents = [ "openai-agents>=0.9.0; python_version >= '3.10'", ] +# Google ADK integration (opt-in, requires Python 3.10+) +google-adk = ["google-adk>=1.0.0; python_version >= '3.10'"] + # Development tools (includes rich for terminal output) dev = [ "pytest>=7.4.0", @@ -138,7 +149,7 @@ all = [ [project.urls] Homepage = "https://lemony.ai" -Documentation = "https://github.com/lemony-ai/cascadeflow" +Documentation = "https://docs.cascadeflow.dev" Repository = "https://github.com/lemony-ai/cascadeflow" "Bug Tracker" = "https://github.com/lemony-ai/cascadeflow/issues" 
Changelog = "https://github.com/lemony-ai/cascadeflow/releases" diff --git a/tests/benchmarks/bfcl/agentic_benchmark.py b/tests/benchmarks/bfcl/agentic_benchmark.py index 1386cb60..2b450e68 100644 --- a/tests/benchmarks/bfcl/agentic_benchmark.py +++ b/tests/benchmarks/bfcl/agentic_benchmark.py @@ -61,6 +61,7 @@ class AgenticResult: correct: bool draft_accepted: bool cost: float + baseline_cost: float latency_ms: float draft_accepted_turns: int = 0 draft_acceptance_rate: float = 0.0 @@ -761,6 +762,23 @@ def _format_tools_desc(self, tools: list[dict[str, Any]]) -> str: lines.append(f"- {name}: {description} (params: {param_names})") return "\n".join(lines) + @staticmethod + def _extract_baseline_cost(result: Any) -> float: + """Extract baseline cost for a call from cascade metadata. + + ``cost_saved`` is defined relative to a verifier-only baseline. + """ + total_cost = float(getattr(result, "total_cost", 0.0) or 0.0) + metadata = getattr(result, "metadata", {}) or {} + raw_saved = metadata.get("cost_saved", 0.0) or 0.0 + try: + cost_saved = float(raw_saved) + except (TypeError, ValueError): + cost_saved = 0.0 + + baseline_cost = total_cost + cost_saved + return baseline_cost if baseline_cost > 0 else total_cost + def _extract_parameters(self, response: str) -> list[dict[str, Any]]: """Extract JSON parameter blocks from a tool response.""" parameters = [] @@ -939,6 +957,7 @@ async def run_single_turn(self, task: dict) -> AgenticResult: draft_accepted_turns=1 if draft_accepted else 0, draft_acceptance_rate=1.0 if draft_accepted else 0.0, cost=result.total_cost, + baseline_cost=self._extract_baseline_cost(result), latency_ms=latency_ms, turns_completed=1, tools_called=tools_called, @@ -952,6 +971,7 @@ async def run_single_turn(self, task: dict) -> AgenticResult: correct=False, draft_accepted=False, cost=0.0, + baseline_cost=0.0, latency_ms=latency_ms, error=str(e), ) @@ -976,6 +996,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: start_time = 
time.time() total_cost = 0.0 + total_baseline_cost = 0.0 all_tools_called = [] turns_completed = 0 state_maintained = True @@ -1011,6 +1032,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: result = await agent.run(prompt, max_tokens=500) total_cost += result.total_cost + total_baseline_cost += self._extract_baseline_cost(result) tools_in_turn = self._extract_tool_calls(result.content) params_in_turn = self._extract_parameters(result.content) @@ -1057,6 +1079,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: draft_accepted_turns=draft_accepted_turns, draft_acceptance_rate=draft_acceptance_rate, cost=total_cost, + baseline_cost=total_baseline_cost if total_baseline_cost > 0 else total_cost, latency_ms=latency_ms, turns_completed=turns_completed, tools_called=all_tools_called, @@ -1072,6 +1095,7 @@ async def run_multi_turn(self, task: dict) -> AgenticResult: draft_accepted_turns=draft_accepted_turns, draft_acceptance_rate=0.0, cost=total_cost, + baseline_cost=total_baseline_cost if total_baseline_cost > 0 else total_cost, latency_ms=latency_ms, turns_completed=turns_completed, error=str(e), @@ -1127,6 +1151,13 @@ def _calculate_metrics(self) -> dict: draft_accepted_turns = sum(r.draft_accepted_turns for r in self.results) dependency_handled = sum(1 for r in self.results if r.dependency_handled) total_cost = sum(r.cost for r in self.results) + total_baseline_cost = sum( + r.baseline_cost if r.baseline_cost > 0 else r.cost for r in self.results + ) + total_savings = total_baseline_cost - total_cost + cost_reduction_pct = ( + (total_savings / total_baseline_cost) * 100 if total_baseline_cost > 0 else 0.0 + ) total_turns = sum(r.turns_completed for r in self.results) # Group by task type @@ -1172,6 +1203,9 @@ def _calculate_metrics(self) -> dict: "draft_acceptance_by_task": draft_accepted / total if total > 0 else 0, "dependency_handling": dependency_rate, "total_cost": total_cost, + "baseline_cost": total_baseline_cost, + "total_savings": 
total_savings, + "cost_reduction_pct": cost_reduction_pct, "by_type": by_type, # Natural vs Explicit comparison "natural_language": { @@ -1198,6 +1232,8 @@ def _calculate_metrics(self) -> dict: print(f" Draft Acceptance: {draft_rate:.1%} (by turn)") print(f" Dependency Handling: {dependency_rate:.1%}") print(f" Total Cost: ${total_cost:.4f}") + print(f" Baseline Cost: ${total_baseline_cost:.4f}") + print(f" Cost Reduction: {cost_reduction_pct:.1f}%") # Natural vs Explicit comparison (key insight) print("\n" + "-" * 70) @@ -1287,6 +1323,7 @@ async def main(): "correct": r.correct, "draft_accepted": r.draft_accepted, "cost": r.cost, + "baseline_cost": r.baseline_cost, "latency_ms": r.latency_ms, "turns_completed": r.turns_completed, "tools_called": r.tools_called, diff --git a/tests/benchmarks/run_all.py b/tests/benchmarks/run_all.py index 739c0342..9c4a3f93 100644 --- a/tests/benchmarks/run_all.py +++ b/tests/benchmarks/run_all.py @@ -322,6 +322,10 @@ def generate_comparison_table(results: dict[str, Any]) -> str: ) table += f"- **Dependency Handling:** {agentic_summary.get('dependency_handling', 0) * 100:.1f}%\n" table += f"- **Total Cost:** ${agentic_summary.get('total_cost', 0):.6f}\n" + if "baseline_cost" in agentic_summary: + table += f"- **Baseline Cost:** ${agentic_summary.get('baseline_cost', 0):.6f}\n" + if "cost_reduction_pct" in agentic_summary: + table += f"- **Cost Reduction:** {agentic_summary.get('cost_reduction_pct', 0):.1f}%\n" natural = agentic_summary.get("natural_language", {}) explicit = agentic_summary.get("explicit_steps", {}) diff --git a/tests/test_google_adk_integration.py b/tests/test_google_adk_integration.py new file mode 100644 index 00000000..688e39c4 --- /dev/null +++ b/tests/test_google_adk_integration.py @@ -0,0 +1,738 @@ +"""Tests for cascadeflow.integrations.google_adk harness integration. 
+ +google-adk is not installed in test environments, so we use fake ADK types +and test the integration logic directly against HarnessRunContext. +""" + +from __future__ import annotations + +import time +from unittest.mock import patch + +import pytest + +from cascadeflow.harness import init, reset, run + +# Import the module directly — it does not require google-adk at import time +# (GOOGLE_ADK_AVAILABLE will be False, but all functions/classes are still defined). +import cascadeflow.integrations.google_adk as adk_mod + + +# --------------------------------------------------------------------------- +# Fake ADK types +# --------------------------------------------------------------------------- + + +class FakeUsageMetadata: + """Stand-in for google.genai.types.GenerateContentResponseUsageMetadata.""" + + def __init__( + self, + prompt_token_count: int = 0, + candidates_token_count: int = 0, + ): + self.prompt_token_count = prompt_token_count + self.candidates_token_count = candidates_token_count + + +class FakePart: + """Stand-in for google.genai.types.Part.""" + + def __init__(self, *, text: str | None = None, function_call: object | None = None): + self.text = text + self.function_call = function_call + + +class FakeContent: + """Stand-in for google.genai.types.Content.""" + + def __init__(self, parts: list | None = None): + self.parts = parts or [] + + +class FakeLlmResponse: + """Stand-in for google.adk.models.LlmResponse.""" + + def __init__( + self, + *, + content: FakeContent | None = None, + usage_metadata: FakeUsageMetadata | None = None, + ): + self.content = content + self.usage_metadata = usage_metadata + + +class FakeLlmRequest: + """Stand-in for google.adk.models.LlmRequest.""" + + def __init__(self, model: str = "gemini-2.5-flash"): + self.model = model + + +class FakeCallbackContext: + """Stand-in for google.adk.agents.CallbackContext.""" + + def __init__( + self, + invocation_id: str = "inv-001", + agent_name: str = "test-agent", + ): + 
self.invocation_id = invocation_id + self.agent_name = agent_name + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _reset_adk_state(): + """Reset harness and ADK module state before every test.""" + reset() + adk_mod._config = adk_mod.GoogleADKHarnessConfig() + adk_mod._plugin_instance = None + adk_mod._enabled = False + + +# --------------------------------------------------------------------------- +# _normalize_model_name +# --------------------------------------------------------------------------- + + +class TestNormalizeModelName: + def test_plain_model(self): + assert adk_mod._normalize_model_name("gemini-2.5-flash") == "gemini-2.5-flash" + + def test_strips_provider_prefix(self): + assert adk_mod._normalize_model_name("openai/gpt-4o") == "gpt-4o" + + def test_strips_models_prefix(self): + assert adk_mod._normalize_model_name("models/gemini-2.5-flash") == "gemini-2.5-flash" + + def test_strips_litellm_prefix(self): + assert adk_mod._normalize_model_name("vertex_ai/gemini-2.5-pro") == "gemini-2.5-pro" + + def test_no_slash_passthrough(self): + assert adk_mod._normalize_model_name("gpt-4o-mini") == "gpt-4o-mini" + + +# --------------------------------------------------------------------------- +# _count_function_calls +# --------------------------------------------------------------------------- + + +class TestCountFunctionCalls: + def test_no_content(self): + assert adk_mod._count_function_calls(None) == 0 + + def test_no_parts(self): + content = FakeContent(parts=[]) + assert adk_mod._count_function_calls(content) == 0 + + def test_text_only(self): + content = FakeContent(parts=[FakePart(text="hello")]) + assert adk_mod._count_function_calls(content) == 0 + + def test_counts_function_calls(self): + content = FakeContent( + parts=[ + FakePart(text="thinking..."), + FakePart(function_call={"name": 
"search", "args": {}}), + FakePart(function_call={"name": "calculate", "args": {}}), + ] + ) + assert adk_mod._count_function_calls(content) == 2 + + +# --------------------------------------------------------------------------- +# Cost / energy estimation (via shared pricing) +# --------------------------------------------------------------------------- + + +class TestEstimation: + def test_estimate_cost_known_model(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.15 + 0.60) + + def test_estimate_cost_unknown_model_uses_default(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("unknown-model", 1_000_000, 0) + assert cost == pytest.approx(2.50) + + def test_estimate_energy_known_model(self): + from cascadeflow.harness.pricing import estimate_energy + + energy = estimate_energy("gemini-2.5-flash", 100, 100) + # coeff=0.3, output_weight=1.5 + assert energy == pytest.approx(0.3 * (100 + 100 * 1.5)) + + def test_estimate_energy_unknown_model(self): + from cascadeflow.harness.pricing import estimate_energy + + energy = estimate_energy("unknown-model", 100, 100) + # default coeff=1.0 + assert energy == pytest.approx(1.0 * (100 + 100 * 1.5)) + + +# --------------------------------------------------------------------------- +# before_model_callback +# --------------------------------------------------------------------------- + + +class TestBeforeModelCallback: + @pytest.fixture + def plugin(self): + return adk_mod.CascadeFlowADKPlugin() + + async def test_no_run_context_returns_none(self, plugin): + ctx = FakeCallbackContext() + req = FakeLlmRequest() + result = await plugin.before_model_callback(ctx, req) + assert result is None + + async def test_observe_mode_allows_over_budget(self, plugin): + init(mode="observe", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + result = await 
plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) + assert result is None # observe never blocks + + async def test_enforce_blocks_when_budget_exhausted(self, plugin): + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.001 + result = await plugin.before_model_callback( + FakeCallbackContext(), FakeLlmRequest("gemini-2.5-flash") + ) + assert result is not None # short-circuit response + assert run_ctx.last_action == "stop" + trace = run_ctx.trace() + assert trace[-1]["reason"] == "budget_exhausted" + + async def test_enforce_blocked_call_does_not_leak_state(self, plugin): + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.001 + cb_ctx = FakeCallbackContext() + await plugin.before_model_callback(cb_ctx, FakeLlmRequest()) + key = plugin._callback_key(cb_ctx) + assert key not in plugin._call_start_times + assert key not in plugin._call_models + + async def test_enforce_allows_under_budget(self, plugin): + init(mode="enforce", budget=1.0) + with run(budget=1.0) as run_ctx: + run_ctx.cost = 0.5 + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) + assert result is None + + async def test_records_start_time_and_model(self, plugin): + init(mode="observe") + with run(): + cb_ctx = FakeCallbackContext() + await plugin.before_model_callback(cb_ctx, FakeLlmRequest("gpt-4o")) + key = plugin._callback_key(cb_ctx) + assert key in plugin._call_start_times + assert plugin._call_models[key] == "gpt-4o" + + async def test_normalizes_model_name(self, plugin): + init(mode="observe") + with run(): + cb_ctx = FakeCallbackContext() + await plugin.before_model_callback(cb_ctx, FakeLlmRequest("openai/gpt-4o")) + key = plugin._callback_key(cb_ctx) + assert plugin._call_models[key] == "gpt-4o" + + async def test_budget_gate_disabled_in_config(self): + plugin = adk_mod.CascadeFlowADKPlugin( + config=adk_mod.GoogleADKHarnessConfig(enable_budget_gate=False) + 
) + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) + assert result is None # gate disabled + + async def test_fail_open_swallows_errors(self, plugin): + init(mode="enforce") + with run(): + with patch( + "cascadeflow.integrations.google_adk.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) + assert result is None + + +# --------------------------------------------------------------------------- +# after_model_callback +# --------------------------------------------------------------------------- + + +class TestAfterModelCallback: + @pytest.fixture + def plugin(self): + return adk_mod.CascadeFlowADKPlugin() + + async def test_no_run_context_returns_none(self, plugin): + result = await plugin.after_model_callback( + FakeCallbackContext(), + FakeLlmResponse(), + ) + assert result is None + + async def test_updates_run_metrics_with_usage_metadata(self, plugin): + init(mode="observe") + with run(budget=1.0) as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_start_times[key] = time.monotonic() - 0.1 + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata( + prompt_token_count=100, + candidates_token_count=50, + ), + content=FakeContent(parts=[FakePart(text="done")]), + ) + await plugin.after_model_callback(cb_ctx, response) + + assert run_ctx.step_count == 1 + assert run_ctx.cost > 0 + assert run_ctx.energy_used > 0 + assert run_ctx.latency_used_ms > 0 + assert run_ctx.model_used == "gemini-2.5-flash" + assert run_ctx.last_action == "allow" + + async def test_fallback_token_estimation(self, plugin): + """When usage_metadata is missing, estimate from content text.""" + init(mode="observe") + with run() as run_ctx: + cb_ctx = 
FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + content=FakeContent(parts=[FakePart(text="x" * 400)]), + ) + await plugin.after_model_callback(cb_ctx, response) + + assert run_ctx.cost > 0 + assert run_ctx.step_count == 1 + + async def test_counts_tool_calls(self, plugin): + init(mode="observe") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + content=FakeContent( + parts=[ + FakePart(function_call={"name": "search"}), + FakePart(function_call={"name": "calc"}), + ] + ), + ) + await plugin.after_model_callback(cb_ctx, response) + assert run_ctx.tool_calls == 2 + + async def test_updates_budget_remaining(self, plugin): + init(mode="enforce", budget=1.0) + with run(budget=1.0) as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + ) + await plugin.after_model_callback(cb_ctx, response) + assert run_ctx.budget_remaining is not None + assert run_ctx.budget_remaining == pytest.approx(1.0 - run_ctx.cost) + + async def test_trace_records_mode(self, plugin): + init(mode="enforce") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gpt-4o" + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(10, 10), + ) + await plugin.after_model_callback(cb_ctx, response) + trace = run_ctx.trace() + assert len(trace) == 1 + assert trace[0]["reason"] == "enforce" + assert trace[0]["model"] == "gpt-4o" + + async def test_no_start_time_records_zero_latency(self, plugin): + init(mode="observe") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + 
plugin._call_models[key] = "gpt-4o" + # Don't set start time + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(10, 10), + ) + await plugin.after_model_callback(cb_ctx, response) + assert run_ctx.latency_used_ms == 0.0 + + async def test_fallback_key_tracks_across_distinct_context_objects(self, plugin): + """ADK runtimes may pass different callback_context objects per phase.""" + init(mode="observe") + with run() as run_ctx: + before_ctx = FakeCallbackContext(invocation_id="inv-x", agent_name="agent-a") + after_ctx = FakeCallbackContext(invocation_id="inv-x", agent_name="agent-a") + await plugin.before_model_callback(before_ctx, FakeLlmRequest("gemini-2.5-flash")) + + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 50), + ) + await plugin.after_model_callback(after_ctx, response) + assert run_ctx.model_used == "gemini-2.5-flash" + assert run_ctx.latency_used_ms >= 0.0 + + async def test_fail_open_swallows_errors(self, plugin): + init(mode="observe") + with run(): + with patch( + "cascadeflow.integrations.google_adk.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = await plugin.after_model_callback( + FakeCallbackContext(), + FakeLlmResponse(), + ) + assert result is None + + +# --------------------------------------------------------------------------- +# on_model_error_callback +# --------------------------------------------------------------------------- + + +class TestOnModelErrorCallback: + @pytest.fixture + def plugin(self): + return adk_mod.CascadeFlowADKPlugin() + + async def test_records_error_in_trace(self, plugin): + init(mode="observe") + with run() as run_ctx: + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + plugin._call_start_times[key] = time.monotonic() + + await plugin.on_model_error_callback(cb_ctx, ValueError("bad input")) + + trace = run_ctx.trace() + assert len(trace) == 1 + assert trace[0]["action"] == "error" + assert 
"ValueError" in trace[0]["reason"] + assert trace[0]["model"] == "gemini-2.5-flash" + + async def test_cleans_up_timing_state(self, plugin): + init(mode="observe") + with run(): + cb_ctx = FakeCallbackContext() + key = plugin._callback_key(cb_ctx) + plugin._call_models[key] = "gemini-2.5-flash" + plugin._call_start_times[key] = time.monotonic() + + await plugin.on_model_error_callback(cb_ctx, RuntimeError("oops")) + + assert key not in plugin._call_models + assert key not in plugin._call_start_times + + async def test_fail_open_swallows_errors(self, plugin): + init(mode="observe") + with run(): + with patch( + "cascadeflow.integrations.google_adk.get_current_run", + side_effect=RuntimeError("boom"), + ): + result = await plugin.on_model_error_callback( + FakeCallbackContext(), + ValueError("test"), + ) + assert result is None + + +# --------------------------------------------------------------------------- +# enable / disable lifecycle +# --------------------------------------------------------------------------- + + +class TestEnableDisable: + def test_enable_returns_plugin_instance(self): + plugin = adk_mod.enable() + assert isinstance(plugin, adk_mod.CascadeFlowADKPlugin) + assert plugin.name == "cascadeflow_harness" + assert adk_mod.is_enabled() + + def test_enable_is_idempotent(self): + p1 = adk_mod.enable() + p2 = adk_mod.enable() + assert p1 is p2 # same instance + + def test_enable_applies_config(self): + config = adk_mod.GoogleADKHarnessConfig(fail_open=False, enable_budget_gate=False) + plugin = adk_mod.enable(config=config) + assert plugin._config.fail_open is False + assert plugin._config.enable_budget_gate is False + + def test_disable_deactivates_plugin(self): + plugin = adk_mod.enable() + assert plugin._active is True + adk_mod.disable() + assert not adk_mod.is_enabled() + assert plugin._active is False + + def test_disable_when_not_enabled_is_safe(self): + adk_mod.disable() # should not raise + assert not adk_mod.is_enabled() + + +# 
--------------------------------------------------------------------------- +# Public API helpers +# --------------------------------------------------------------------------- + + +class TestPublicAPI: + def test_is_available_reflects_module_flag(self): + assert adk_mod.is_available() == adk_mod.GOOGLE_ADK_AVAILABLE + + def test_is_enabled_default_false(self): + assert adk_mod.is_enabled() is False + + def test_get_config_returns_copy(self): + cfg = adk_mod.get_config() + assert isinstance(cfg, adk_mod.GoogleADKHarnessConfig) + assert cfg.fail_open is True + assert cfg.enable_budget_gate is True + # Modifying the copy doesn't affect module state + cfg.fail_open = False + assert adk_mod.get_config().fail_open is True + + +# --------------------------------------------------------------------------- +# GoogleADKHarnessConfig +# --------------------------------------------------------------------------- + + +class TestConfig: + def test_defaults(self): + cfg = adk_mod.GoogleADKHarnessConfig() + assert cfg.fail_open is True + assert cfg.enable_budget_gate is True + + def test_custom_values(self): + cfg = adk_mod.GoogleADKHarnessConfig(fail_open=False, enable_budget_gate=False) + assert cfg.fail_open is False + assert cfg.enable_budget_gate is False + + +# --------------------------------------------------------------------------- +# Plugin deactivate +# --------------------------------------------------------------------------- + + +class TestDeactivate: + async def test_deactivated_plugin_skips_callbacks(self): + plugin = adk_mod.CascadeFlowADKPlugin() + plugin.deactivate() + + init(mode="enforce", budget=0.001) + with run(budget=0.001) as run_ctx: + run_ctx.cost = 0.002 + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) + assert result is None # no-op, not blocked + + async def test_deactivate_clears_state(self): + plugin = adk_mod.CascadeFlowADKPlugin() + plugin._call_start_times[12345] = 1.0 + plugin._call_models[12345] = "test" 
+ plugin.deactivate() + assert len(plugin._call_start_times) == 0 + assert len(plugin._call_models) == 0 + + +# --------------------------------------------------------------------------- +# _extract_tokens +# --------------------------------------------------------------------------- + + +class TestExtractTokens: + def test_from_usage_metadata(self): + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(100, 200), + ) + assert adk_mod.CascadeFlowADKPlugin._extract_tokens(response) == (100, 200) + + def test_zero_usage_falls_back_to_content(self): + response = FakeLlmResponse( + usage_metadata=FakeUsageMetadata(0, 0), + content=FakeContent(parts=[FakePart(text="x" * 80)]), + ) + inp, out = adk_mod.CascadeFlowADKPlugin._extract_tokens(response) + assert inp == 0 + assert out == 20 # 80 / 4 + + def test_no_usage_no_content(self): + response = FakeLlmResponse() + assert adk_mod.CascadeFlowADKPlugin._extract_tokens(response) == (0, 0) + + def test_content_with_no_text(self): + response = FakeLlmResponse( + content=FakeContent(parts=[FakePart(function_call={"name": "f"})]), + ) + inp, out = adk_mod.CascadeFlowADKPlugin._extract_tokens(response) + assert inp == 0 + assert out == 1 # max(0//4, 1) + + +class TestCallbackKeyCollision: + """Verify _callback_key uses id() for per-object uniqueness.""" + + def test_distinct_keys_for_different_objects(self): + """Two distinct context objects always produce distinct keys.""" + ctx_a = FakeCallbackContext(invocation_id="inv-1", agent_name="agent-a") + ctx_b = FakeCallbackContext(invocation_id="inv-1", agent_name="agent-a") + key_a = adk_mod.CascadeFlowADKPlugin._callback_key(ctx_a) + key_b = adk_mod.CascadeFlowADKPlugin._callback_key(ctx_b) + assert key_a != key_b, "Same IDs on different objects must produce distinct keys" + + def test_key_stable_for_same_object(self): + """Same context object always produces the same key.""" + ctx = FakeCallbackContext() + key1 = adk_mod.CascadeFlowADKPlugin._callback_key(ctx) + key2 
= adk_mod.CascadeFlowADKPlugin._callback_key(ctx) + assert key1 == key2 + + def test_key_is_int(self): + """Key type is int (object id).""" + ctx = FakeCallbackContext() + assert isinstance(adk_mod.CascadeFlowADKPlugin._callback_key(ctx), int) + + @pytest.mark.asyncio + async def test_concurrent_same_ids_track_independently(self): + """Two concurrent calls with same invocation_id+agent_name don't corrupt.""" + init(mode="observe") + with run(budget=1.0) as harness_ctx: + plugin = adk_mod.CascadeFlowADKPlugin() + # Same IDs — previously would collide + ctx_a = FakeCallbackContext(invocation_id="inv-1", agent_name="agent") + ctx_b = FakeCallbackContext(invocation_id="inv-1", agent_name="agent") + + req_a = FakeLlmRequest(model="gpt-4o") + req_b = FakeLlmRequest(model="gpt-4o-mini") + + await plugin.before_model_callback(ctx_a, req_a) + await plugin.before_model_callback(ctx_b, req_b) + + resp_b = FakeLlmResponse(usage_metadata=FakeUsageMetadata(50, 25)) + resp_a = FakeLlmResponse(usage_metadata=FakeUsageMetadata(100, 50)) + await plugin.after_model_callback(ctx_b, resp_b) + await plugin.after_model_callback(ctx_a, resp_a) + + assert harness_ctx.step_count == 2 + assert len(plugin._call_start_times) == 0 + assert len(plugin._call_models) == 0 + + +# --------------------------------------------------------------------------- +# Off-mode behavior +# --------------------------------------------------------------------------- + + +class TestOffMode: + """mode='off' must not track metrics or update run context.""" + + @pytest.mark.asyncio + async def test_off_mode_before_callback_returns_none(self): + init(mode="off") + plugin = adk_mod.CascadeFlowADKPlugin() + with run() as run_ctx: + result = await plugin.before_model_callback(FakeCallbackContext(), FakeLlmRequest()) + assert result is None + assert len(plugin._call_start_times) == 0 + + @pytest.mark.asyncio + async def test_off_mode_after_callback_does_not_track(self): + init(mode="off") + plugin = 
adk_mod.CascadeFlowADKPlugin() + with run() as run_ctx: + await plugin.after_model_callback( + FakeCallbackContext(), + FakeLlmResponse(usage_metadata=FakeUsageMetadata(1000, 500)), + ) + assert run_ctx.step_count == 0 + assert run_ctx.cost == 0.0 + assert run_ctx.energy_used == 0.0 + assert len(run_ctx.trace()) == 0 + + +# --------------------------------------------------------------------------- +# Versioned model name resolution +# --------------------------------------------------------------------------- + + +class TestVersionedModelPricing: + """Versioned model IDs must resolve to correct pricing, not default.""" + + def test_versioned_gemini_flash(self): + from cascadeflow.harness.pricing import estimate_cost + + # Should resolve to gemini-2.5-flash pricing ($0.15/$0.60) + cost = estimate_cost("gemini-2.5-flash-preview-05-20", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + def test_versioned_gemini_pro(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-pro-preview-05-06", 1_000_000, 1_000_000) + assert cost == pytest.approx(11.25, abs=0.01) + + def test_dated_model_suffix(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash-20250120", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + def test_latest_suffix(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash-latest", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + def test_unknown_model_still_uses_default(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("totally-unknown-model", 1_000_000, 0) + assert cost == pytest.approx(2.50) + + def test_exact_match_still_works(self): + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) + + 
def test_prefix_match_variant(self): + """A variant like gemini-2.5-flash-8b matches the base model.""" + from cascadeflow.harness.pricing import estimate_cost + + cost = estimate_cost("gemini-2.5-flash-8b", 1_000_000, 1_000_000) + assert cost == pytest.approx(0.75, abs=0.01) diff --git a/tests/test_harness_api.py b/tests/test_harness_api.py index 5669e845..f4e7f9cd 100644 --- a/tests/test_harness_api.py +++ b/tests/test_harness_api.py @@ -5,6 +5,7 @@ import cascadeflow import cascadeflow.harness.api as harness_api from cascadeflow.harness import agent, get_current_run, get_harness_config, init, reset, run +from cascadeflow.telemetry.callbacks import CallbackEvent, CallbackManager def setup_function() -> None: @@ -154,6 +155,17 @@ def fn(x: int) -> int: assert policy["compliance"] == "gdpr" +def test_agent_decorator_preserves_function_metadata(): + @agent(budget=0.5) + def fn(x: int) -> int: + """sample doc""" + return x + + assert fn.__name__ == "fn" + assert fn.__doc__ == "sample doc" + assert fn.__annotations__ == {"x": int, "return": int} + + @pytest.mark.asyncio async def test_agent_decorator_keeps_async_behavior_and_attaches_metadata(): @agent(budget=0.4, kpi_weights={"cost": 1.0}) @@ -172,6 +184,8 @@ def test_top_level_exports_exist(): assert callable(cascadeflow.run) assert callable(cascadeflow.harness_agent) assert hasattr(cascadeflow.agent, "PROVIDER_REGISTRY") + assert callable(cascadeflow.get_harness_callback_manager) + assert callable(cascadeflow.set_harness_callback_manager) report = cascadeflow.init(mode="off") assert report.mode == "off" @@ -183,6 +197,8 @@ def test_run_record_and_trace_copy(): trace_b = ctx.trace() assert trace_a == trace_b assert trace_a[0]["action"] == "switch_model" + assert "budget_state" in trace_a[0] + assert trace_a[0]["budget_state"]["max"] == 1.0 trace_a.append({"action": "mutated"}) assert len(ctx.trace()) == 1 @@ -205,6 +221,12 @@ def test_init_reads_from_env(monkeypatch): assert report.config_sources["budget"] == "env" 
+def test_init_rejects_oversized_env_json(monkeypatch): + monkeypatch.setenv("CASCADEFLOW_HARNESS_KPI_TARGETS", "x" * 5000) + with pytest.raises(ValueError, match="JSON config exceeds"): + init() + + def test_init_reads_from_config_file(tmp_path, monkeypatch): config = tmp_path / "cascadeflow.json" config.write_text( @@ -327,3 +349,188 @@ def test_init_reports_openai_instrumented_when_patch_succeeds(monkeypatch): monkeypatch.setattr(instrument, "patch_openai", lambda: True) report = init(mode="observe") assert report.instrumented == ["openai"] + + +def test_init_reports_anthropic_instrumented_when_patch_succeeds(monkeypatch): + monkeypatch.setattr( + harness_api, + "find_spec", + lambda name: object() if name == "anthropic" else None, + ) + + import cascadeflow.harness.instrument as instrument + + monkeypatch.setattr(instrument, "patch_anthropic", lambda: True) + report = init(mode="observe") + assert report.instrumented == ["anthropic"] + + +def test_init_reports_anthropic_detected_not_instrumented_on_patch_failure(monkeypatch): + monkeypatch.setattr( + harness_api, + "find_spec", + lambda name: object() if name == "anthropic" else None, + ) + + import cascadeflow.harness.instrument as instrument + + monkeypatch.setattr(instrument, "patch_anthropic", lambda: False) + report = init(mode="observe") + assert report.instrumented == [] + assert report.detected_but_not_instrumented == ["anthropic"] + + +def test_run_summary_populates_on_context_exit(): + init(mode="observe") + with run(budget=1.5) as ctx: + ctx.step_count = 2 + ctx.tool_calls = 1 + ctx.cost = 0.42 + ctx.latency_used_ms = 123.0 + ctx.energy_used = 33.0 + ctx.budget_remaining = 1.08 + ctx.last_action = "allow" + ctx.model_used = "gpt-4o-mini" + + summary = ctx.summary() + assert summary["run_id"] == ctx.run_id + assert summary["step_count"] == 2 + assert summary["budget_remaining"] == pytest.approx(1.08) + assert summary["duration_ms"] is not None + assert summary["duration_ms"] >= 0.0 + assert 
ctx.duration_ms is not None + assert ctx.duration_ms >= 0.0 + + +def test_run_context_logs_summary(caplog): + init(mode="observe") + with caplog.at_level("INFO", logger="cascadeflow.harness"): + with run(budget=1.0) as ctx: + ctx.step_count = 1 + ctx.cost = 0.01 + ctx.model_used = "gpt-4o-mini" + + assert any("harness run summary" in rec.message for rec in caplog.records) + + +def test_record_emits_cascade_decision_callback(): + manager = CallbackManager() + received = [] + + def _on_decision(data): + received.append(data) + + manager.register(CallbackEvent.CASCADE_DECISION, _on_decision) + report = init(mode="observe", callback_manager=manager) + assert report.config_sources["callback_manager"] == "code" + + with run(budget=1.0) as ctx: + ctx.step_count = 1 + ctx.record(action="switch_model", reason="budget_pressure", model="gpt-4o-mini") + + assert len(received) == 1 + event = received[0] + assert event.event == CallbackEvent.CASCADE_DECISION + assert event.query == "[harness]" + assert event.workflow == "harness" + assert event.data["action"] == "switch_model" + assert event.data["run_id"] == ctx.run_id + + +def test_record_sanitizes_trace_values(): + ctx = run() + ctx.record( + action="allow\nnewline", + reason="a" * 400, + model="model\r\nname", + ) + entry = ctx.trace()[0] + assert "\n" not in entry["action"] + assert "\r" not in entry["model"] + assert len(entry["reason"]) <= 160 + + +def test_record_sanitizes_non_printable_values(): + ctx = run() + ctx.record(action="allow\x00", reason="ok\x1f", model="gpt-4o-mini\x07") + entry = ctx.trace()[0] + assert "\x00" not in entry["action"] + assert "\x1f" not in entry["reason"] + assert "\x07" not in entry["model"] + + +def test_record_without_callback_manager_is_noop(): + init(mode="observe") + with run(budget=1.0) as ctx: + ctx.record(action="allow", reason="test", model="gpt-4o-mini") + assert len(ctx.trace()) == 1 + + +def test_record_empty_action_warns_and_defaults(caplog): + init(mode="observe") + with 
caplog.at_level("WARNING", logger="cascadeflow.harness"): + with run(budget=1.0) as ctx: + ctx.record(action="", reason="test", model="gpt-4o-mini") + entry = ctx.trace()[0] + assert entry["action"] == "allow" + assert any("empty action" in rec.message for rec in caplog.records) + + +def test_init_rejects_negative_budget(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", budget=-1.0) + + +def test_init_rejects_negative_max_tool_calls(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", max_tool_calls=-1) + + +def test_init_rejects_negative_max_latency(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", max_latency_ms=-100.0) + + +def test_init_rejects_negative_max_energy(): + with pytest.raises(ValueError, match="non-negative"): + init(mode="observe", max_energy=-0.5) + + +def test_init_rejects_invalid_compliance(): + with pytest.raises(ValueError, match="compliance"): + init(mode="observe", compliance="invalid_mode") + + +def test_run_rejects_negative_budget(): + init(mode="observe") + with pytest.raises(ValueError, match="non-negative"): + run(budget=-0.5) + + +def test_run_rejects_invalid_compliance(): + init(mode="observe") + with pytest.raises(ValueError, match="compliance"): + run(compliance="foobar") + + +def test_init_accepts_zero_budget(): + report = init(mode="observe", budget=0.0) + cfg = get_harness_config() + assert cfg.budget == 0.0 + + +def test_init_accepts_valid_compliance(): + for value in ("gdpr", "hipaa", "pci", "strict"): + reset() + report = init(mode="observe", compliance=value) + cfg = get_harness_config() + assert cfg.compliance == value + + +def test_trace_rotation_limits_entries(): + init(mode="observe") + with run(budget=100.0) as ctx: + for i in range(1050): + ctx.record(action="allow", reason="test", model="gpt-4o-mini") + trace = ctx.trace() + assert len(trace) <= 1000 diff --git a/tests/test_harness_instrument.py 
b/tests/test_harness_instrument.py index 75368522..a46cf8a6 100644 --- a/tests/test_harness_instrument.py +++ b/tests/test_harness_instrument.py @@ -1,7 +1,8 @@ -"""Tests for cascadeflow.harness.instrument — OpenAI auto-instrumentation.""" +"""Tests for cascadeflow.harness.instrument — OpenAI + Anthropic auto-instrumentation.""" from __future__ import annotations +from importlib.util import find_spec import time from typing import Optional from unittest.mock import AsyncMock, MagicMock @@ -12,14 +13,24 @@ from cascadeflow.harness import init, reset, run from cascadeflow.harness.instrument import ( + _InstrumentedAnthropicAsyncStream, + _InstrumentedAnthropicStream, _InstrumentedAsyncStream, _InstrumentedStream, + _count_tool_calls_in_anthropic_response, _estimate_cost, _estimate_energy, + _extract_anthropic_usage, + _make_patched_anthropic_async_create, + _make_patched_anthropic_create, _make_patched_async_create, _make_patched_create, + is_anthropic_patched, + is_openai_patched, is_patched, + patch_anthropic, patch_openai, + unpatch_anthropic, unpatch_openai, ) @@ -87,19 +98,19 @@ def _mock_stream_chunk( class TestPatchLifecycle: def test_patch_and_unpatch(self) -> None: - assert not is_patched() + assert not is_openai_patched() result = patch_openai() assert result is True - assert is_patched() + assert is_openai_patched() unpatch_openai() - assert not is_patched() + assert not is_openai_patched() def test_idempotent_patching(self) -> None: patch_openai() patch_openai() - assert is_patched() + assert is_openai_patched() unpatch_openai() - assert not is_patched() + assert not is_openai_patched() def test_unpatch_without_prior_patch(self) -> None: unpatch_openai() # should not raise @@ -107,12 +118,12 @@ def test_unpatch_without_prior_patch(self) -> None: def test_init_observe_patches(self) -> None: report = init(mode="observe") assert "openai" in report.instrumented - assert is_patched() + assert is_openai_patched() def test_init_enforce_patches(self) -> None: 
report = init(mode="enforce") assert "openai" in report.instrumented - assert is_patched() + assert is_openai_patched() def test_init_off_does_not_patch(self) -> None: init(mode="off") @@ -120,7 +131,7 @@ def test_init_off_does_not_patch(self) -> None: def test_reset_unpatches(self) -> None: init(mode="observe") - assert is_patched() + assert is_openai_patched() reset() assert not is_patched() @@ -133,6 +144,27 @@ def test_class_method_actually_replaced(self) -> None: unpatch_openai() assert Completions.create is original + def test_patch_and_unpatch_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + assert not is_anthropic_patched() + result = patch_anthropic() + assert result is True + assert is_anthropic_patched() + unpatch_anthropic() + assert not is_anthropic_patched() + + def test_anthropic_class_method_actually_replaced(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + from anthropic.resources.messages import Messages + + original = Messages.create + patch_anthropic() + assert Messages.create is not original + unpatch_anthropic() + assert Messages.create is original + # --------------------------------------------------------------------------- # Sync wrapper @@ -402,6 +434,31 @@ def test_stream_finalize_is_idempotent(self) -> None: assert ctx.step_count == 1 # Should not double-count + def test_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + chunk1 = _mock_stream_chunk("data", usage=_mock_usage(100, 50)) + + class _FailingStream: + def __init__(self) -> None: + self._done = False + + def __iter__(self): + return self + + def __next__(self): + if not self._done: + self._done = True + return chunk1 + raise RuntimeError("stream failed") + + with run(budget=1.0) as ctx: + wrapped = _InstrumentedStream(_FailingStream(), ctx, "gpt-4o-mini", time.monotonic()) + with pytest.raises(RuntimeError, match="stream failed"): + 
list(wrapped) + + assert ctx.step_count == 1 + assert ctx.cost > 0 + def test_stream_wrapper_via_patched_create(self) -> None: """Verify that stream=True in the wrapper returns an _InstrumentedStream.""" init(mode="observe") @@ -464,6 +521,26 @@ async def _async_iter(): assert ctx.step_count == 1 + @pytest.mark.asyncio + async def test_async_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + chunk1 = _mock_stream_chunk("data", usage=_mock_usage(100, 50)) + + async def _failing_iter(): + yield chunk1 + raise RuntimeError("async stream failed") + + async with run(budget=1.0) as ctx: + wrapped = _InstrumentedAsyncStream( + _failing_iter(), ctx, "gpt-4o-mini", time.monotonic() + ) + with pytest.raises(RuntimeError, match="async stream failed"): + async for _ in wrapped: + pass + + assert ctx.step_count == 1 + assert ctx.cost > 0 + # --------------------------------------------------------------------------- # Cost and energy estimation @@ -941,3 +1018,487 @@ def test_non_stream_does_not_inject_stream_options(self) -> None: call_kwargs = original.call_args[1] assert "stream_options" not in call_kwargs + + +# =========================================================================== +# Anthropic instrumentation tests +# =========================================================================== + + +def _mock_anthropic_usage( + input_tokens: Optional[int] = 100, + output_tokens: Optional[int] = 50, +) -> MagicMock: + u = MagicMock() + u.input_tokens = input_tokens + u.output_tokens = output_tokens + return u + + +def _mock_anthropic_response( + input_tokens: int = 100, + output_tokens: int = 50, + content: Optional[list] = None, +) -> MagicMock: + resp = MagicMock() + resp.usage = _mock_anthropic_usage(input_tokens, output_tokens) + resp.content = content or [] + return resp + + +def _mock_tool_use_block() -> MagicMock: + block = MagicMock() + block.type = "tool_use" + return block + + +def _mock_text_block() -> MagicMock: + block = MagicMock() 
+ block.type = "text" + return block + + +def _mock_anthropic_message_start_event( + input_tokens: int = 100, + output_tokens: int = 0, +) -> MagicMock: + event = MagicMock() + event.type = "message_start" + event.message = MagicMock() + event.message.usage = _mock_anthropic_usage(input_tokens, output_tokens) + return event + + +def _mock_anthropic_message_delta_event( + output_tokens: int = 50, +) -> MagicMock: + event = MagicMock() + event.type = "message_delta" + event.usage = _mock_anthropic_usage(None, output_tokens) + return event + + +def _mock_anthropic_content_block_start_event( + block_type: str = "tool_use", +) -> MagicMock: + event = MagicMock() + event.type = "content_block_start" + event.content_block = MagicMock() + event.content_block.type = block_type + return event + + +def _mock_anthropic_message_stop_event() -> MagicMock: + event = MagicMock() + event.type = "message_stop" + event.usage = None + return event + + +# --------------------------------------------------------------------------- +# Anthropic usage extraction +# --------------------------------------------------------------------------- + + +class TestAnthropicUsageExtraction: + def test_extract_usage(self) -> None: + resp = _mock_anthropic_response(input_tokens=200, output_tokens=100) + inp, out = _extract_anthropic_usage(resp) + assert inp == 200 + assert out == 100 + + def test_extract_usage_none(self) -> None: + resp = MagicMock() + resp.usage = None + inp, out = _extract_anthropic_usage(resp) + assert inp == 0 + assert out == 0 + + +# --------------------------------------------------------------------------- +# Anthropic tool call counting +# --------------------------------------------------------------------------- + + +class TestAnthropicToolCallCounting: + def test_counts_tool_use_blocks(self) -> None: + resp = _mock_anthropic_response( + content=[_mock_text_block(), _mock_tool_use_block(), _mock_tool_use_block()] + ) + assert _count_tool_calls_in_anthropic_response(resp) == 
2 + + def test_no_content(self) -> None: + resp = MagicMock() + resp.content = None + assert _count_tool_calls_in_anthropic_response(resp) == 0 + + def test_empty_content(self) -> None: + resp = _mock_anthropic_response(content=[]) + assert _count_tool_calls_in_anthropic_response(resp) == 0 + + def test_text_only(self) -> None: + resp = _mock_anthropic_response(content=[_mock_text_block()]) + assert _count_tool_calls_in_anthropic_response(resp) == 0 + + +# --------------------------------------------------------------------------- +# Anthropic sync wrapper +# --------------------------------------------------------------------------- + + +class TestAnthropicSyncWrapper: + def test_observe_passes_through_response(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + original.assert_called_once() + + def test_observe_tracks_cost(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + # claude-sonnet-4: $3.00/1M in + $15.00/1M out = $18.00 + assert ctx.cost == pytest.approx(18.0, abs=0.01) + + def test_observe_tracks_step_count(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.step_count == 2 + + def test_observe_tracks_tool_calls(self) -> None: + init(mode="observe") + mock_resp = 
_mock_anthropic_response( + content=[_mock_tool_use_block(), _mock_tool_use_block()] + ) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.tool_calls == 2 + + def test_observe_tracks_energy(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1000, output_tokens=500) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + # claude-sonnet-4 uses default coefficient=1.0, output_weight=1.5 + # energy = 1.0 * (1000 + 500 * 1.5) = 1750.0 + assert ctx.energy_used == pytest.approx(1750.0) + + def test_observe_tracks_latency(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.latency_used_ms > 0 + + def test_budget_remaining_decreases(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.budget_remaining is not None + assert ctx.budget_remaining == pytest.approx(100.0 - 18.0, abs=0.01) + + def test_trace_records_model_and_mode(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + trace = ctx.trace() + assert len(trace) == 1 + assert trace[0]["action"] == "allow" + assert 
trace[0]["reason"] == "observe" + assert trace[0]["model"] == "claude-sonnet-4" + + def test_off_mode_passthrough_no_tracking(self) -> None: + init(mode="off") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run() as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + assert ctx.cost == 0.0 + assert ctx.step_count == 0 + + def test_no_run_scope_returns_response(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + result = wrapper(MagicMock(), model="claude-sonnet-4") + assert result is mock_resp + + def test_stream_tracks_usage_and_tool_calls(self) -> None: + init(mode="observe") + mock_stream = iter( + [ + _mock_anthropic_message_start_event(input_tokens=1_000_000), + _mock_anthropic_content_block_start_event("tool_use"), + _mock_anthropic_message_delta_event(output_tokens=1_000_000), + _mock_anthropic_message_stop_event(), + ] + ) + original = MagicMock(return_value=mock_stream) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicStream) + list(result) + + assert ctx.cost == pytest.approx(18.0, abs=0.01) + assert ctx.step_count == 1 + assert ctx.tool_calls == 1 + + def test_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + + class _FailingAnthropicStream: + def __init__(self) -> None: + self._done = False + + def __iter__(self): + return self + + def __next__(self): + if not self._done: + self._done = True + return _mock_anthropic_message_start_event(input_tokens=1_000_000) + raise RuntimeError("anthropic stream failed") + + original = MagicMock(return_value=_FailingAnthropicStream()) + wrapper = 
_make_patched_anthropic_create(original) + + with run(budget=1.0) as ctx: + result = wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicStream) + with pytest.raises(RuntimeError, match="anthropic stream failed"): + list(result) + + assert ctx.step_count == 1 + assert ctx.cost > 0 + + def test_multiple_calls_accumulate(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.cost == pytest.approx(36.0, abs=0.01) + assert ctx.step_count == 2 + + +# --------------------------------------------------------------------------- +# Anthropic async wrapper +# --------------------------------------------------------------------------- + + +class TestAnthropicAsyncWrapper: + async def test_observe_passes_through_response(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response() + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + + async def test_observe_tracks_cost(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=100.0) as ctx: + await wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.cost == pytest.approx(18.0, abs=0.01) + assert ctx.step_count == 1 + + async def test_off_mode_passthrough(self) -> None: + init(mode="off") + mock_resp = _mock_anthropic_response() + original = 
AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run() as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4") + + assert result is mock_resp + assert ctx.cost == 0.0 + + async def test_stream_tracks_usage_and_tool_calls(self) -> None: + init(mode="observe") + + async def _event_stream(): + yield _mock_anthropic_message_start_event(input_tokens=1_000_000) + yield _mock_anthropic_content_block_start_event("tool_use") + yield _mock_anthropic_message_delta_event(output_tokens=1_000_000) + yield _mock_anthropic_message_stop_event() + + original = AsyncMock(return_value=_event_stream()) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicAsyncStream) + async for _ in result: + pass + + assert ctx.cost == pytest.approx(18.0, abs=0.01) + assert ctx.step_count == 1 + assert ctx.tool_calls == 1 + + async def test_stream_finalizes_on_iteration_error(self) -> None: + init(mode="observe") + + async def _failing_event_stream(): + yield _mock_anthropic_message_start_event(input_tokens=1_000_000) + raise RuntimeError("anthropic async stream failed") + + original = AsyncMock(return_value=_failing_event_stream()) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=1.0) as ctx: + result = await wrapper(MagicMock(), model="claude-sonnet-4", stream=True) + assert isinstance(result, _InstrumentedAnthropicAsyncStream) + with pytest.raises(RuntimeError, match="anthropic async stream failed"): + async for _ in result: + pass + + assert ctx.step_count == 1 + assert ctx.cost > 0 + + +# --------------------------------------------------------------------------- +# Anthropic enforce mode +# --------------------------------------------------------------------------- + + +class TestAnthropicEnforceMode: + def 
test_enforce_trace_records_enforce_reason(self) -> None: + init(mode="enforce") + mock_resp = _mock_anthropic_response() + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=100.0) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + + trace = ctx.trace() + assert trace[0]["reason"] == "enforce" + + def test_enforce_raises_on_budget_exhausted(self) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError + + init(mode="enforce") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=0.001) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + with pytest.raises(BudgetExceededError): + wrapper(MagicMock(), model="claude-sonnet-4") + + def test_observe_does_not_raise_on_budget_exhausted(self) -> None: + init(mode="observe") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = MagicMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_create(original) + + with run(budget=0.001) as ctx: + wrapper(MagicMock(), model="claude-sonnet-4") + wrapper(MagicMock(), model="claude-sonnet-4") + + assert ctx.cost > ctx.budget_max + + async def test_async_enforce_raises_on_budget_exhausted(self) -> None: + from cascadeflow.schema.exceptions import BudgetExceededError + + init(mode="enforce") + mock_resp = _mock_anthropic_response(input_tokens=1_000_000, output_tokens=1_000_000) + original = AsyncMock(return_value=mock_resp) + wrapper = _make_patched_anthropic_async_create(original) + + async with run(budget=0.001) as ctx: + await wrapper(MagicMock(), model="claude-sonnet-4") + with pytest.raises(BudgetExceededError): + await wrapper(MagicMock(), model="claude-sonnet-4") + + +# --------------------------------------------------------------------------- +# Anthropic init() integration 
+# --------------------------------------------------------------------------- + + +class TestAnthropicInitIntegration: + def test_init_observe_patches_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + report = init(mode="observe") + assert "anthropic" in report.instrumented + assert is_anthropic_patched() + + def test_init_off_unpatches_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + init(mode="observe") + assert is_anthropic_patched() + init(mode="off") + assert not is_anthropic_patched() + + def test_reset_unpatches_anthropic(self) -> None: + if find_spec("anthropic") is None: + pytest.skip("anthropic package not available") + init(mode="observe") + assert is_anthropic_patched() + reset() + assert not is_anthropic_patched()