From ee04f9f1a17280ab9dbdeb58008a8ee47704f42b Mon Sep 17 00:00:00 2001 From: Manoj Prabhakar Paidiparthy Date: Sat, 20 Jun 2026 19:29:58 -0700 Subject: [PATCH 1/2] chore(release): bump engine to 0.8.0 Captures the chat-completions HTTP face (#65) and model-routing matrix integration (#64). New `amplifier-agent serve chat-completions` exposes amplifier-agent as an OpenAI-compatible HTTP service for embedding in third-party tools (opencode, custom UIs). New `amplifier-agent auth` subcommand persists provider credentials to ~/.amplifier-agent/credentials.json so users can configure once and have every invocation pick them up. Wire protocol unchanged at 0.3.0; no wrapper bump required. TypeScript wrapper stays at 0.7.0, Python wrapper stays at 0.3.0. See CHANGELOG.md for full details. Generated with [Amplifier](https://github.com/microsoft/amplifier) Co-Authored-By: Amplifier <240397093+microsoft-amplifier@users.noreply.github.com> --- CHANGELOG.md | 43 +++++++++++++++++++++++++++++++++++++++++++ README.md | 2 +- pyproject.toml | 2 +- uv.lock | 2 +- 4 files changed, 46 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b17a9b3..41ad1c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,49 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.8.0] — 2026-06-20 + +Adds an OpenAI-compatible chat-completions HTTP face for embedding amplifier-agent in third-party tools (opencode and similar), a persistent `auth` subcommand for provider credentials, and integrates the model-routing matrix for per-provider model selection. Existing JSON-RPC wire protocol unchanged — no wrapper bump required. + +### Added + +- **OpenAI-compatible chat-completions HTTP face** (`amplifier-agent serve chat-completions`). Exposes `/v1/models` and `/v1/chat/completions` over HTTP with bearer-token auth (`Authorization: Bearer ...`). 
Streams responses, returns OpenAI-shape envelopes, and supports multi-provider routing: the model field on each request is resolved through the served-models registry to the upstream provider, so a single server can serve Anthropic, OpenAI, Azure, and Ollama models from one endpoint. Enables direct integration with opencode (via the separate [`amplifier-app-opencode`](https://github.com/microsoft/amplifier-app-opencode) wrapper) and any other OpenAI-compatible client. + +- **`amplifier-agent auth` subcommand** for persistent provider credentials. Stores at `~/.amplifier-agent/credentials.json` (mode `0600`) via the `set / list / remove / status / clear` actions. Resolution chain is **env-first**: shell env vars (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, …) always win over the file, so existing shell-rc workflows are unchanged. The file lets users configure credentials once and have every subsequent invocation pick them up automatically — including the HTTP server, the `models list` command, and wrappers like `amplifier-opencode`. UX matches `claude login` / `gh auth login` / `aws configure` without the OAuth ceremony. + +- **Host-tool delegation** over the chat-completions wire face. Tools declared by the host (in `host_config.json` under `host_tools`) are surfaced to the model with stub schemas; when the model invokes one, amplifier-agent emits a signal tool_call back to the client (carrying the same `chunk_id`), the client executes the tool host-side, and the result is returned for the model to continue. Lets the host own filesystem, shell, browser, or any custom tool without amplifier-agent having to bundle it. + +- **Model routing matrix integration** (#64). The routing matrix can declare per-role provider/model preferences; amplifier-agent resolves the right provider per turn based on the matrix. Used by the new HTTP face for cross-provider model dispatch. + +- **`X-Client-Session-Id` request header** for workspace correlation. 
Wrappers pass their own session ID; the server uses it as the workspace name when writing transcript logs, so client-side and server-side session bookkeeping stay aligned. + +### Changed + +- **Lifespan provider initialization** now iterates `KNOWN_PROVIDERS` and registers every provider whose module is installed AND whose credentials are present. Previously the chat-completions face hardcoded `inject_provider("anthropic")` at lifespan; injection is now per-request based on the model the client picks. Boot log surfaces a line per skipped provider (`"Skipping provider 'openai' -- module not installed"`, `"Skipping provider 'ollama' -- no credentials in env"`) so it's clear what amplifier-agent thinks it can serve. + +- **`/v1/models` response** surfaces a `_provider` tag per model so OpenAI-compatible clients can see which provider serves each entry. Standard clients ignore the non-standard field per the OpenAI spec; aware clients can use it for routing decisions or display. + +- **Usage-counter telemetry** in chat-completions responses now correctly reflects the provider that actually served the turn (was previously misattributed when routing across providers). + +- **Bundle preparation pipeline** refactored to support the new HTTP face cleanly — same cache key semantics, no migration required. + +### Internal + +- `_resolve_env_credential` in `provider_sources.py` extended to chain env → `credentials.json` → empty. Lazy-imports the file reader from `admin/auth.py` to avoid a module-load cycle. +- New `admin/auth.py` (~330 lines) implements the `auth` subcommand surface: atomic JSON write (mode 0600), parent dir mode 0700, versioned envelope `{version: 1, providers: {...}}`, schema-tolerant load that round-trips unknown providers/fields. 
+- `routes/chat_completions.py` looks up the requested model in `app.state.served_models_registry` and passes the resolved `provider_id` + `upstream_model` through to `_session_runner.run_chat_turn` for per-request `inject_provider` under the existing `_create_session_lock` (save-restore pattern). +- `routes/models.py` surfaces `_provider` in the `/v1/models` response. + +### Wire protocol + +- Existing JSON-RPC wire face unchanged at `0.3.0`. **No wrapper bump required.** TypeScript wrapper stays at `0.7.0`, Python wrapper stays at `0.3.0`. +- New OpenAI-compatible chat-completions face is a separate wire — independent versioning is not currently surfaced; the schema is the OpenAI chat-completions subset documented in `README.md`. + +### Migration + +- No breaking changes for users on the JSON-RPC wire (existing `run`, `serve`, `models list`, etc. unchanged). +- New users / new integrations: prefer the chat-completions face for OpenAI-compatible clients; prefer `amplifier-agent auth set` over shell-rc exports for the "set once, works everywhere" UX. Both env vars and the file still work side-by-side. + ## [0.7.0] — 2026-06-17 Built-in bundle replaced with vendored behavioral-anchor. Agent set, tool roster, and bundle name all change. Wire protocol unchanged — no wrapper bump required. diff --git a/README.md b/README.md index 70db3ff..57c26d6 100644 --- a/README.md +++ b/README.md @@ -349,7 +349,7 @@ The TypeScript and Python wrapper SDKs ([`wrappers/typescript/`](wrappers/typesc ## Status -Current versions: engine `0.6.0`, TypeScript wrapper `amplifier-agent-ts@0.7.0`, Python wrapper `amplifier-agent-py@0.3.0` (see [`wrappers/python-py/`](wrappers/python-py/)). Wire protocol: `0.3.0`. +Current versions: engine `0.8.0`, TypeScript wrapper `amplifier-agent-ts@0.7.0`, Python wrapper `amplifier-agent-py@0.3.0` (see [`wrappers/python-py/`](wrappers/python-py/)). Wire protocol: `0.3.0`. 
**Shipped:** diff --git a/pyproject.toml b/pyproject.toml index 55a11a8..335085b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = 'amplifier-agent' -version = '0.7.0' +version = '0.8.0' requires-python = '>=3.12' license = 'MIT' dependencies = [ diff --git a/uv.lock b/uv.lock index a023cb1..233d99c 100644 --- a/uv.lock +++ b/uv.lock @@ -10,7 +10,7 @@ members = [ [[package]] name = "amplifier-agent" -version = "0.7.0" +version = "0.8.0" source = { editable = "." } dependencies = [ { name = "amplifier-foundation" }, From bccb5f381f08d798382bfd7b409e450cadcf62c2 Mon Sep 17 00:00:00 2001 From: Manoj Prabhakar Paidiparthy Date: Sun, 21 Jun 2026 02:50:44 -0700 Subject: [PATCH 2/2] feat(http): per-turn cost_usd in /v1/chat/completions usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provider modules already compute cost_usd per turn and emit it on the JSON-RPC NDJSON wire via hook_streaming (amplifier_agent_lib/bundle/hook_streaming.py). The chat-completions HTTP face was throwing this away — extracting only token counts from kernel usage events. Now it lifts cost_usd through too, accumulating across sub-turns (a single user turn can drive multiple LLM calls for tool-call rounds) and emitting the total in the OpenAI usage envelope as the non-standard `cost_usd` field. Real $$ from the provider's own pricing, surfaced on the SSE response. Implementation -------------- - `_event_translator.extract_usage`: widened return type from `dict[str, int]` to `dict[str, Any]`; reads `event['cost']` (set by hook_streaming from kernel `cost_usd`) and stamps it on the result as `cost_usd: str(...)`. - `_wire._build_usage_block / stop_chunk / tool_calls_stop_chunk`: new `cost_usd: str | None = None` parameter; surfaced on the usage block when set. 
- `routes/chat_completions`: accumulates `usage_cost: Decimal | None` across all usage events in the turn (preserves precision), serializes to str on emission, passes through to the terminal chunk helpers. cost_usd is a string (Decimal precision) and is omitted entirely when no provider emitted it (older provider modules, third-party endpoints without cost telemetry, etc.) — standard OpenAI clients ignore the non-standard field. Verified end-to-end against the running server: a one-word reply with claude-haiku-4-5 produces `cost_usd: '0.0118625'` in the terminal chunk's usage block. This is a wire translation — it leverages cost telemetry that already flows on the NDJSON wire. No new pricing catalog or provider-side plumbing required. 🤖 Generated with [Amplifier](https://github.com/microsoft/amplifier) Co-Authored-By: Amplifier <240397093+microsoft-amplifier@users.noreply.github.com> --- src/amplifier_agent_http/_event_translator.py | 19 +++++++++++++--- src/amplifier_agent_http/_wire.py | 22 +++++++++++++++++++ .../routes/chat_completions.py | 22 +++++++++++++++++++ 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/src/amplifier_agent_http/_event_translator.py b/src/amplifier_agent_http/_event_translator.py index f4d91e7..9a21db5 100644 --- a/src/amplifier_agent_http/_event_translator.py +++ b/src/amplifier_agent_http/_event_translator.py @@ -134,8 +134,8 @@ def translate_event( return None -def extract_usage(event: DisplayEvent) -> dict[str, int] | None: - """If the event is a usage event, extract token counts in OpenAI shape. +def extract_usage(event: DisplayEvent) -> dict[str, Any] | None: + """If the event is a usage event, extract token counts + cost in OpenAI shape. Returns ``None`` for non-usage events so the caller can use a simple accumulator: ``if (u := extract_usage(ev)): usage_block = u``. 
@@ -184,7 +184,7 @@ def _to_int(value: Any) -> int: output = _to_int(event.get("outputTokens")) prompt_total = new_input + cache_read + cache_write - return { + result: dict[str, Any] = { "prompt_tokens": prompt_total, "completion_tokens": output, "total_tokens": prompt_total + output, @@ -193,3 +193,16 @@ def _to_int(value: Any) -> int: # ``cacheReadTokens`` onto it. "cached_tokens": cache_read, } + + # Provider modules stamp the per-turn USD cost on the kernel usage + # event as ``cost_usd``; hook_streaming.py forwards it here as ``cost`` + # (a Decimal-as-string). Pass it on so the chat-completions route can + # accumulate across sub-turns and surface a ``cost_usd`` field in the + # OpenAI usage envelope -- a non-standard extension (OpenAI's spec is + # silent on cost). Standard clients ignore unknown fields; cost-aware + # clients (like opencode) can render the real per-turn dollar value + # rather than computing it from per-million catalog rates. + cost_raw = event.get("cost") + if cost_raw is not None: + result["cost_usd"] = str(cost_raw) + return result diff --git a/src/amplifier_agent_http/_wire.py b/src/amplifier_agent_http/_wire.py index be98608..3734b28 100644 --- a/src/amplifier_agent_http/_wire.py +++ b/src/amplifier_agent_http/_wire.py @@ -136,6 +136,7 @@ def _build_usage_block( prompt_tokens: int, completion_tokens: int, cached_tokens: int = 0, + cost_usd: str | None = None, ) -> dict[str, Any]: """Assemble the OpenAI usage block, with prompt_tokens_details when relevant. @@ -149,6 +150,14 @@ def _build_usage_block( Always include the details object when there's usage at all -- clients that don't understand it ignore it; clients that do get accurate cache visibility. + + ``cost_usd`` is a non-standard amplifier-agent extension: the actual + dollar cost the provider module computed for this turn, surfaced as a + string to preserve Decimal precision on the wire. 
Standard OpenAI + clients ignore unknown usage fields; cost-aware clients can render + the real per-turn dollar value rather than computing it themselves + from per-million catalog rates. Omitted when ``None`` (e.g. providers + that don't emit ``cost_usd`` in their llm:response events). """ usage: dict[str, Any] = { "prompt_tokens": prompt_tokens, @@ -157,6 +166,8 @@ def _build_usage_block( } if prompt_tokens or completion_tokens: usage["prompt_tokens_details"] = {"cached_tokens": cached_tokens} + if cost_usd is not None: + usage["cost_usd"] = cost_usd return usage @@ -167,6 +178,7 @@ def stop_chunk( prompt_tokens: int = 0, completion_tokens: int = 0, cached_tokens: int = 0, + cost_usd: str | None = None, include_usage: bool = True, ) -> dict[str, Any]: """Final chunk -- empty delta, finish_reason: stop, optional usage block. @@ -179,6 +191,10 @@ def stop_chunk( per the OpenAI usage-block extension. Pass the Anthropic ``cache_read_input_tokens`` count here so cost tracking on the consumer side reflects the actual cache hit rate. + + ``cost_usd`` is the actual dollar cost provider modules computed for + this turn -- surfaced as a string (Decimal precision) on + ``usage.cost_usd`` (non-standard extension; standard clients ignore). """ chunk = _base_chunk(chunk_id, model) chunk["choices"] = [{"index": 0, "delta": {}, "finish_reason": "stop"}] @@ -187,6 +203,7 @@ def stop_chunk( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, cached_tokens=cached_tokens, + cost_usd=cost_usd, ) return chunk @@ -243,6 +260,7 @@ def tool_calls_stop_chunk( prompt_tokens: int = 0, completion_tokens: int = 0, cached_tokens: int = 0, + cost_usd: str | None = None, include_usage: bool = True, ) -> dict[str, Any]: """Terminal chunk for a turn that ends with host-delegated tool calls. 
@@ -255,6 +273,9 @@ def tool_calls_stop_chunk( ``cached_tokens`` is surfaced under ``usage.prompt_tokens_details.cached_tokens`` in the same shape ``stop_chunk`` uses -- pass through the Anthropic ``cache_read_input_tokens`` count for accurate consumer-side cost tracking. + + ``cost_usd`` -- non-standard amplifier-agent extension -- carries the + actual dollar cost the provider module computed for this turn. """ chunk = _base_chunk(chunk_id, model) chunk["choices"] = [{"index": 0, "delta": {}, "finish_reason": "tool_calls"}] @@ -263,6 +284,7 @@ def tool_calls_stop_chunk( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, cached_tokens=cached_tokens, + cost_usd=cost_usd, ) return chunk diff --git a/src/amplifier_agent_http/routes/chat_completions.py b/src/amplifier_agent_http/routes/chat_completions.py index 10caae3..e746131 100644 --- a/src/amplifier_agent_http/routes/chat_completions.py +++ b/src/amplifier_agent_http/routes/chat_completions.py @@ -25,6 +25,7 @@ import json import logging from collections.abc import AsyncGenerator +from decimal import Decimal, InvalidOperation from typing import Any from fastapi import APIRouter, Depends, HTTPException, Request, status @@ -327,6 +328,14 @@ async def _stream_chat_completion( usage_prompt: int = 0 usage_completion: int = 0 usage_cached: int = 0 + # Accumulated dollar cost across all kernel usage events in this turn. + # Provider modules stamp ``cost_usd`` (Decimal-as-string) on each + # ``llm:response`` event; hook_streaming forwards it on the wire and + # ``extract_usage()`` lifts it for us. We sum across sub-calls so the + # terminal chunk's ``usage.cost_usd`` reflects the FULL turn cost, not + # just the final sub-call. Kept as Decimal during accumulation to + # preserve monetary precision, serialized to str at emission. + usage_cost: Decimal | None = None # Track unknown event types so we log each once per request (cheap). 
seen_unknown: set[str] = set() @@ -386,6 +395,16 @@ async def _signal_done() -> None: usage_prompt += u.get("prompt_tokens", 0) usage_completion += u.get("completion_tokens", 0) usage_cached += u.get("cached_tokens", 0) + cost_str = u.get("cost_usd") + if cost_str is not None: + try: + usage_cost = (usage_cost or Decimal("0")) + Decimal(str(cost_str)) + except (InvalidOperation, ValueError): + # Provider emitted a non-numeric cost — skip rather + # than break the turn. Real providers always emit + # well-formed Decimals; this guards against bad + # third-party providers in case anyone adds one. + pass continue # Translate other event types into a chunk dict, or skip. chunk = translate_event(event, chunk_id, model_id, seen_unknown) @@ -442,6 +461,7 @@ async def _signal_done() -> None: # - "tool_calls" when a HostToolYield escaped: the client reads this and # runs the tool host-side, then re-POSTs. # - "stop" for the normal end-of-turn path (with or without text). + cost_str_final: str | None = str(usage_cost) if usage_cost is not None else None if finish_reason_tool_calls: yield sse_data( tool_calls_stop_chunk( @@ -450,6 +470,7 @@ async def _signal_done() -> None: prompt_tokens=usage_prompt, completion_tokens=usage_completion, cached_tokens=usage_cached, + cost_usd=cost_str_final, include_usage=True, ) ) @@ -461,6 +482,7 @@ async def _signal_done() -> None: prompt_tokens=usage_prompt, completion_tokens=usage_completion, cached_tokens=usage_cached, + cost_usd=cost_str_final, include_usage=True, ) )