From 818d8cfbc8259b6ba77230d24b1a5d385145576b Mon Sep 17 00:00:00 2001
From: Charlie Doern <cdoern@redhat.com>
Date: Fri, 27 Mar 2026 16:24:44 -0400
Subject: [PATCH 1/9] feat(messages): add Anthropic Messages API protocol,
 models, and routes

Add the API layer for the Anthropic Messages API (/v1/messages). This
includes the Messages protocol definition, Pydantic models for all
Anthropic request/response types (content blocks, streaming events,
tool use, thinking), and FastAPI routes with Anthropic-specific SSE
streaming format. Also registers the "messages" logging category and
adds Api.messages to the Api enum.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Charlie Doern <cdoern@redhat.com>
---
 src/llama_stack/log.py                        |   1 +
 src/llama_stack_api/__init__.py               |  37 +++
 src/llama_stack_api/datatypes.py              |   2 +
 src/llama_stack_api/messages/__init__.py      |  66 +++++
 src/llama_stack_api/messages/api.py           |  31 ++
 .../messages/fastapi_routes.py                | 199 +++++++++++++
 src/llama_stack_api/messages/models.py        | 275 ++++++++++++++++++
 src/llama_stack_api/pyproject.toml            |   1 +
 8 files changed, 612 insertions(+)
 create mode 100644 src/llama_stack_api/messages/__init__.py
 create mode 100644 src/llama_stack_api/messages/api.py
 create mode 100644 src/llama_stack_api/messages/fastapi_routes.py
 create mode 100644 src/llama_stack_api/messages/models.py

diff --git a/src/llama_stack/log.py b/src/llama_stack/log.py
index 54e2afe348..7c37bd1b82 100644
--- a/src/llama_stack/log.py
+++ b/src/llama_stack/log.py
@@ -56,6 +56,7 @@ class LoggingConfig(BaseModel):
     "tests",
     "telemetry",
     "connectors",
+    "messages",
 ]
 UNCATEGORIZED = "uncategorized"
 
diff --git a/src/llama_stack_api/__init__.py b/src/llama_stack_api/__init__.py
index 90c209598f..04d814dd9b 100644
--- a/src/llama_stack_api/__init__.py
+++ b/src/llama_stack_api/__init__.py
@@ -325,6 +325,25 @@
     UserMessage,
 )
 from .inspect_api import Inspect
+from .messages import (
+    Messages,
+    AnthropicContentBlock,
+    AnthropicCountTokensRequest,
+    AnthropicCountTokensResponse,
+    AnthropicCreateMessageRequest,
+    AnthropicErrorResponse,
+    AnthropicImageBlock,
+    AnthropicImageSource,
+    AnthropicMessage,
+    AnthropicMessageResponse,
+    AnthropicTextBlock,
+    AnthropicThinkingBlock,
+    AnthropicThinkingConfig,
+    AnthropicToolDef,
+    AnthropicToolResultBlock,
+    AnthropicToolUseBlock,
+    AnthropicUsage,
+)
 from .models import (
     CommonModelFields,
     GetModelRequest,
@@ -1122,6 +1141,24 @@
     "ViolationLevel",
     "WebSearchToolTypes",
     "WeightedRanker",
+    # Messages API
+    "Messages",
+    "AnthropicContentBlock",
+    "AnthropicCountTokensRequest",
+    "AnthropicCountTokensResponse",
+    "AnthropicCreateMessageRequest",
+    "AnthropicErrorResponse",
+    "AnthropicImageBlock",
+    "AnthropicImageSource",
+    "AnthropicMessage",
+    "AnthropicMessageResponse",
+    "AnthropicTextBlock",
+    "AnthropicThinkingBlock",
+    "AnthropicThinkingConfig",
+    "AnthropicToolDef",
+    "AnthropicToolResultBlock",
+    "AnthropicToolUseBlock",
+    "AnthropicUsage",
     # Validators
     "validate_embeddings_input_is_text",
     # helpers
diff --git a/src/llama_stack_api/datatypes.py b/src/llama_stack_api/datatypes.py
index 95b3a0983c..900529bac2 100644
--- a/src/llama_stack_api/datatypes.py
+++ b/src/llama_stack_api/datatypes.py
@@ -115,6 +115,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     :cvar file_processors: File parsing and processing operations
     :cvar prompts: Prompt versions and management
     :cvar connectors: External connector management (e.g., MCP servers)
+    :cvar messages: Anthropic Messages API compatibility layer
     :cvar inspect: Built-in system inspection and introspection
     """
 
@@ -141,6 +142,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     prompts = "prompts"
     conversations = "conversations"
     connectors = "connectors"
+    messages = "messages"
 
     # built-in API
     inspect = "inspect"
diff --git a/src/llama_stack_api/messages/__init__.py b/src/llama_stack_api/messages/__init__.py
new file mode 100644
index 0000000000..e6ffc09f53
--- /dev/null
+++ b/src/llama_stack_api/messages/__init__.py
@@ -0,0 +1,66 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""Messages API protocol and models.
+
+This module contains the Messages protocol definition for the Anthropic Messages API.
+Pydantic models are defined in llama_stack_api.messages.models.
+The FastAPI router is defined in llama_stack_api.messages.fastapi_routes.
+"""
+
+from . import fastapi_routes
+from .api import Messages
+from .models import (
+    AnthropicContentBlock,
+    AnthropicCountTokensRequest,
+    AnthropicCountTokensResponse,
+    AnthropicCreateMessageRequest,
+    AnthropicErrorResponse,
+    AnthropicImageBlock,
+    AnthropicImageSource,
+    AnthropicMessage,
+    AnthropicMessageResponse,
+    AnthropicTextBlock,
+    AnthropicThinkingBlock,
+    AnthropicThinkingConfig,
+    AnthropicToolDef,
+    AnthropicToolResultBlock,
+    AnthropicToolUseBlock,
+    AnthropicUsage,
+    ContentBlockDeltaEvent,
+    ContentBlockStartEvent,
+    ContentBlockStopEvent,
+    MessageDeltaEvent,
+    MessageStartEvent,
+    MessageStopEvent,
+)
+
+__all__ = [
+    "Messages",
+    "AnthropicContentBlock",
+    "AnthropicCountTokensRequest",
+    "AnthropicCountTokensResponse",
+    "AnthropicCreateMessageRequest",
+    "AnthropicErrorResponse",
+    "AnthropicImageBlock",
+    "AnthropicImageSource",
+    "AnthropicMessage",
+    "AnthropicMessageResponse",
+    "AnthropicTextBlock",
+    "AnthropicThinkingBlock",
+    "AnthropicThinkingConfig",
+    "AnthropicToolDef",
+    "AnthropicToolResultBlock",
+    "AnthropicToolUseBlock",
+    "AnthropicUsage",
+    "ContentBlockDeltaEvent",
+    "ContentBlockStartEvent",
+    "ContentBlockStopEvent",
+    "MessageDeltaEvent",
+    "MessageStartEvent",
+    "MessageStopEvent",
+    "fastapi_routes",
+]
diff --git a/src/llama_stack_api/messages/api.py b/src/llama_stack_api/messages/api.py
new file mode 100644
index 0000000000..3b42e684c6
--- /dev/null
+++ b/src/llama_stack_api/messages/api.py
@@ -0,0 +1,31 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from collections.abc import AsyncIterator
+from typing import Protocol, runtime_checkable
+
+from .models import (
+    AnthropicCountTokensRequest,
+    AnthropicCountTokensResponse,
+    AnthropicCreateMessageRequest,
+    AnthropicMessageResponse,
+    AnthropicStreamEvent,
+)
+
+
+@runtime_checkable
+class Messages(Protocol):
+    """Protocol for the Anthropic Messages API: message creation (optionally streamed) and token counting."""
+
+    async def create_message(
+        self,
+        request: AnthropicCreateMessageRequest,
+    ) -> AnthropicMessageResponse | AsyncIterator[AnthropicStreamEvent]: ...
+
+    async def count_message_tokens(
+        self,
+        request: AnthropicCountTokensRequest,
+    ) -> AnthropicCountTokensResponse: ...
diff --git a/src/llama_stack_api/messages/fastapi_routes.py b/src/llama_stack_api/messages/fastapi_routes.py
new file mode 100644
index 0000000000..1392a51a3b
--- /dev/null
+++ b/src/llama_stack_api/messages/fastapi_routes.py
@@ -0,0 +1,199 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""FastAPI router for the Anthropic Messages API.
+
+This module defines the FastAPI router for the /v1/messages endpoint,
+serving the Anthropic Messages API format.
+"""
+
+import asyncio
+import contextvars
+import json
+import logging  # allow-direct-logging
+from collections.abc import AsyncIterator
+from typing import Annotated, Any
+
+from fastapi import APIRouter, Body, HTTPException, Request, Response
+from fastapi.responses import JSONResponse, StreamingResponse
+from pydantic import BaseModel
+
+from llama_stack_api.router_utils import standard_responses
+from llama_stack_api.version import LLAMA_STACK_API_V1
+
+from .api import Messages
+from .models import (
+    AnthropicCountTokensRequest,
+    AnthropicCountTokensResponse,
+    AnthropicCreateMessageRequest,
+    AnthropicErrorResponse,
+    AnthropicMessageResponse,
+    _AnthropicErrorDetail,
+)
+
+logger = logging.LoggerAdapter(logging.getLogger(__name__), {"category": "messages"})
+
+# Anthropic API version we are compatible with
+_ANTHROPIC_VERSION = "2023-06-01"
+
+
+def _create_anthropic_sse_event(event_type: str, data: Any) -> str:
+    r"""Format one named SSE event: ``event: <type>\ndata: <json>\n\n``.
+
+    Pydantic models are serialized with model_dump_json(); anything else with json.dumps().
+    """
+    if isinstance(data, BaseModel):
+        data = data.model_dump_json()
+    else:
+        data = json.dumps(data)
+    return f"event: {event_type}\ndata: {data}\n\n"
+
+
+async def _anthropic_sse_generator(event_gen: AsyncIterator) -> AsyncIterator[str]:
+    """Convert an async generator of Anthropic stream events to SSE format.
+
+    On failure a final generic ``error`` event is sent; details are only logged.
+    """
+    try:
+        async for event in event_gen:
+            yield _create_anthropic_sse_event(getattr(event, "type", "unknown"), event)
+    except asyncio.CancelledError:
+        if hasattr(event_gen, "aclose"):
+            await event_gen.aclose()
+        raise
+    except Exception:
+        logger.exception("Error in Anthropic SSE generator")
+        detail = _AnthropicErrorDetail(type="api_error", message="Internal server error")
+        yield _create_anthropic_sse_event("error", AnthropicErrorResponse(error=detail))
+
+
+def _preserve_context_for_sse(event_gen):
+    """Preserve request context for SSE streaming.
+
+    StreamingResponse runs in a different task, losing request contextvars.
+    Each item of ``event_gen`` is fetched in a task created inside the context captured here.
+    """
+    context = contextvars.copy_context()  # snapshot taken when this function is called
+
+    async def wrapper():
+        try:
+            while True:
+                try:
+                    task = context.run(asyncio.create_task, event_gen.__anext__())  # created inside the snapshot
+                    item = await task
+                except StopAsyncIteration:
+                    break
+                yield item
+        except (asyncio.CancelledError, GeneratorExit):
+            if hasattr(event_gen, "aclose"):  # close the source generator before propagating
+                await event_gen.aclose()
+            raise
+
+    return wrapper()
+
+
+def _anthropic_error_response(status_code: int, message: str) -> JSONResponse:
+    """Build a JSONResponse carrying an Anthropic-style error payload.
+
+    Statuses without a dedicated Anthropic error type are reported as "api_error".
+    """
+    known_types = {
+        400: "invalid_request_error",
+        401: "authentication_error",
+        403: "permission_error",
+        404: "not_found_error",
+        429: "rate_limit_error",
+    }
+    detail = _AnthropicErrorDetail(type=known_types.get(status_code, "api_error"), message=message)
+    return JSONResponse(status_code=status_code, content=AnthropicErrorResponse(error=detail).model_dump())
+
+
+def create_router(impl: Messages) -> APIRouter:
+    """Create a FastAPI router for the Anthropic Messages API.
+
+    Both handlers convert exceptions raised by ``impl`` into Anthropic-format
+    error bodies and set the ``anthropic-version`` header on successful responses.
+
+    Args:
+        impl: The Messages implementation instance
+
+    Returns:
+        APIRouter configured for the Messages API
+    """
+    router = APIRouter(
+        prefix=f"/{LLAMA_STACK_API_V1}",
+        tags=["Messages"],
+        responses=standard_responses,
+    )
+
+    @router.post(
+        "/messages",
+        summary="Create a message.",
+        description="Create a message using the Anthropic Messages API format.",
+        status_code=200,
+        response_model=AnthropicMessageResponse,
+        responses={
+            200: {
+                "description": "An AnthropicMessageResponse or a stream of Anthropic SSE events.",
+                "content": {"text/event-stream": {}},
+            },
+        },
+    )
+    async def create_message(
+        raw_request: Request,
+        params: Annotated[AnthropicCreateMessageRequest, Body(...)],
+    ) -> Response:
+        try:
+            result = await impl.create_message(params)
+        except NotImplementedError as e:
+            return _anthropic_error_response(501, str(e))
+        except ValueError as e:
+            return _anthropic_error_response(400, str(e))
+        except HTTPException as e:
+            # detail is not guaranteed to be a string, but the error model requires one.
+            return _anthropic_error_response(e.status_code, str(e.detail))
+        except Exception:
+            logger.exception("Failed to create message")
+            return _anthropic_error_response(500, "Internal server error")
+
+        response_headers = {"anthropic-version": _ANTHROPIC_VERSION}
+
+        # An async iterator result is a stream of events: send it as SSE.
+        if isinstance(result, AsyncIterator):
+            return StreamingResponse(
+                _preserve_context_for_sse(_anthropic_sse_generator(result)),
+                media_type="text/event-stream",
+                headers=response_headers,
+            )
+
+        return JSONResponse(content=result.model_dump(exclude_none=True), headers=response_headers)
+
+    @router.post(
+        "/messages/count_tokens",
+        response_model=AnthropicCountTokensResponse,
+        summary="Count tokens in a message.",
+        description="Count the number of tokens in a message request.",
+        responses={200: {"description": "Token count for the request."}},
+    )
+    async def count_message_tokens(
+        params: Annotated[AnthropicCountTokensRequest, Body(...)],
+    ) -> Response:
+        # Same exception-to-status mapping as create_message.
+        try:
+            result = await impl.count_message_tokens(params)
+        except NotImplementedError as e:
+            return _anthropic_error_response(501, str(e))
+        except ValueError as e:
+            return _anthropic_error_response(400, str(e))
+        except HTTPException as e:
+            return _anthropic_error_response(e.status_code, str(e.detail))
+        except Exception:
+            logger.exception("Failed to count message tokens")
+            return _anthropic_error_response(500, "Internal server error")
+
+        return JSONResponse(content=result.model_dump(), headers={"anthropic-version": _ANTHROPIC_VERSION})
+
+    return router
diff --git a/src/llama_stack_api/messages/models.py b/src/llama_stack_api/messages/models.py
new file mode 100644
index 0000000000..d0841ec64d
--- /dev/null
+++ b/src/llama_stack_api/messages/models.py
@@ -0,0 +1,275 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""Pydantic models for the Anthropic Messages API.
+
+These models define the request and response shapes for the /v1/messages endpoint,
+following the Anthropic Messages API specification.
+"""
+
+from __future__ import annotations
+
+from typing import Annotated, Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+# -- Content blocks --
+
+
+class AnthropicTextBlock(BaseModel):
+    """A text content block."""
+
+    type: Literal["text"] = "text"
+    text: str
+
+
+class AnthropicImageSource(BaseModel):
+    """Source for an image content block."""
+
+    type: Literal["base64"] = "base64"
+    media_type: str = Field(..., description="MIME type of the image (e.g. image/png).")
+    data: str = Field(..., description="Base64-encoded image data.")
+
+
+class AnthropicImageBlock(BaseModel):
+    """An image content block."""
+
+    type: Literal["image"] = "image"
+    source: AnthropicImageSource
+
+
+class AnthropicToolUseBlock(BaseModel):
+    """A tool use content block in an assistant message."""
+
+    type: Literal["tool_use"] = "tool_use"
+    id: str = Field(..., description="Unique ID for this tool invocation.")
+    name: str = Field(..., description="Name of the tool being called.")
+    input: dict[str, Any] = Field(..., description="Tool input arguments.")
+
+
+class AnthropicToolResultBlock(BaseModel):
+    """A tool result content block in a user message."""
+
+    type: Literal["tool_result"] = "tool_result"
+    tool_use_id: str = Field(..., description="The ID of the tool_use block this result corresponds to.")
+    content: str | list[AnthropicTextBlock | AnthropicImageBlock] = Field(
+        default="",
+        description="The result content.",
+    )
+    is_error: bool | None = Field(default=None, description="Whether the tool call resulted in an error.")
+
+
+class AnthropicThinkingBlock(BaseModel):
+    """A thinking content block (extended thinking)."""
+
+    type: Literal["thinking"] = "thinking"
+    thinking: str = Field(..., description="The model's thinking text.")
+    signature: str | None = Field(default=None, description="Signature for the thinking block.")
+
+
+AnthropicContentBlock = Annotated[
+    AnthropicTextBlock | AnthropicImageBlock | AnthropicToolUseBlock | AnthropicToolResultBlock | AnthropicThinkingBlock,
+    Field(discriminator="type"),
+]
+
+# -- Messages --
+
+
+class AnthropicMessage(BaseModel):
+    """A message in the conversation."""
+
+    role: Literal["user", "assistant"]
+    content: str | list[AnthropicContentBlock] = Field(
+        ...,
+        description="Message content: a string for simple text, or a list of content blocks.",
+    )
+
+
+# -- Tool definitions --
+
+
+class AnthropicToolDef(BaseModel):
+    """Definition of a tool available to the model."""
+
+    name: str
+    description: str | None = None
+    input_schema: dict[str, Any] = Field(..., description="JSON Schema for the tool's input.")
+
+
+# -- Thinking config --
+
+
+class AnthropicThinkingConfig(BaseModel):
+    """Configuration for extended thinking."""
+
+    type: Literal["enabled", "disabled"] = "enabled"
+    budget_tokens: int | None = Field(default=None, ge=1, description="Maximum tokens for thinking.")
+
+
+# -- Request models --
+
+
+class AnthropicCreateMessageRequest(BaseModel):
+    """Request body for POST /v1/messages."""
+
+    model_config = ConfigDict(extra="allow")
+
+    model: str = Field(..., description="The model to use for generation.")
+    messages: list[AnthropicMessage] = Field(..., description="The messages in the conversation.")
+    max_tokens: int = Field(..., ge=1, description="The maximum number of tokens to generate.")
+    system: str | list[AnthropicTextBlock] | None = Field(
+        default=None,
+        description="System prompt. A string or list of text blocks.",
+    )
+    tools: list[AnthropicToolDef] | None = Field(default=None, description="Tools available to the model.")
+    tool_choice: Any | None = Field(
+        default=None,
+        description="How the model should select tools. One of: 'auto', 'any', 'none', or {type: 'tool', name: '...'}.",
+    )
+    stream: bool | None = Field(default=False, description="Whether to stream the response.")
+    temperature: float | None = Field(default=None, ge=0.0, le=1.0, description="Sampling temperature.")
+    top_p: float | None = Field(default=None, ge=0.0, le=1.0, description="Nucleus sampling parameter.")
+    top_k: int | None = Field(default=None, ge=1, description="Top-k sampling parameter.")
+    stop_sequences: list[str] | None = Field(default=None, description="Custom stop sequences.")
+    metadata: dict[str, str] | None = Field(default=None, description="Request metadata.")
+    thinking: AnthropicThinkingConfig | None = Field(default=None, description="Extended thinking configuration.")
+    service_tier: str | None = Field(default=None, description="Service tier to use.")
+
+
+class AnthropicCountTokensRequest(BaseModel):
+    """Request body for POST /v1/messages/count_tokens."""
+
+    model: str = Field(..., description="The model to use for token counting.")
+    messages: list[AnthropicMessage] = Field(..., description="The messages to count tokens for.")
+    system: str | list[AnthropicTextBlock] | None = Field(default=None, description="System prompt.")
+    tools: list[AnthropicToolDef] | None = Field(default=None, description="Tools to include in token count.")
+
+
+# -- Response models --
+
+
+class AnthropicUsage(BaseModel):
+    """Token usage statistics."""
+
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cache_creation_input_tokens: int | None = None
+    cache_read_input_tokens: int | None = None
+
+
+class AnthropicMessageResponse(BaseModel):
+    """Response from POST /v1/messages (non-streaming)."""
+
+    id: str = Field(..., description="Unique message ID (msg_ prefix).")
+    type: Literal["message"] = "message"
+    role: Literal["assistant"] = "assistant"
+    content: list[AnthropicContentBlock] = Field(..., description="Response content blocks.")
+    model: str
+    stop_reason: str | None = Field(
+        default=None,
+        description="Why the model stopped: end_turn, stop_sequence, tool_use, or max_tokens.",
+    )
+    stop_sequence: str | None = None
+    usage: AnthropicUsage = Field(default_factory=AnthropicUsage)
+
+
+class AnthropicCountTokensResponse(BaseModel):
+    """Response from POST /v1/messages/count_tokens."""
+
+    input_tokens: int
+
+
+# -- Streaming event models --
+
+
+class MessageStartEvent(BaseModel):
+    """First event in a streaming response."""
+
+    type: Literal["message_start"] = "message_start"
+    message: AnthropicMessageResponse
+
+
+class ContentBlockStartEvent(BaseModel):
+    """Signals the start of a new content block."""
+
+    type: Literal["content_block_start"] = "content_block_start"
+    index: int
+    content_block: AnthropicContentBlock
+
+
+class _TextDelta(BaseModel):
+    type: Literal["text_delta"] = "text_delta"
+    text: str
+
+
+class _InputJsonDelta(BaseModel):
+    type: Literal["input_json_delta"] = "input_json_delta"
+    partial_json: str
+
+
+class _ThinkingDelta(BaseModel):
+    type: Literal["thinking_delta"] = "thinking_delta"
+    thinking: str
+
+
+class ContentBlockDeltaEvent(BaseModel):
+    """A delta within a content block."""
+
+    type: Literal["content_block_delta"] = "content_block_delta"
+    index: int
+    delta: _TextDelta | _InputJsonDelta | _ThinkingDelta
+
+
+class ContentBlockStopEvent(BaseModel):
+    """Signals the end of a content block."""
+
+    type: Literal["content_block_stop"] = "content_block_stop"
+    index: int
+
+
+class _MessageDelta(BaseModel):
+    stop_reason: str | None = None
+    stop_sequence: str | None = None
+
+
+class MessageDeltaEvent(BaseModel):
+    """Final metadata update before the message ends."""
+
+    type: Literal["message_delta"] = "message_delta"
+    delta: _MessageDelta
+    usage: AnthropicUsage | None = None
+
+
+class MessageStopEvent(BaseModel):
+    """Final event in a streaming response."""
+
+    type: Literal["message_stop"] = "message_stop"
+
+
+AnthropicStreamEvent = (
+    MessageStartEvent
+    | ContentBlockStartEvent
+    | ContentBlockDeltaEvent
+    | ContentBlockStopEvent
+    | MessageDeltaEvent
+    | MessageStopEvent
+)
+
+
+# -- Error response --
+
+
+class _AnthropicErrorDetail(BaseModel):
+    type: str
+    message: str
+
+
+class AnthropicErrorResponse(BaseModel):
+    """Anthropic-format error response."""
+
+    type: Literal["error"] = "error"
+    error: _AnthropicErrorDetail
diff --git a/src/llama_stack_api/pyproject.toml b/src/llama_stack_api/pyproject.toml
index c2232f5a7f..c8e2f40b35 100644
--- a/src/llama_stack_api/pyproject.toml
+++ b/src/llama_stack_api/pyproject.toml
@@ -57,6 +57,7 @@ packages = [
     "llama_stack_api.inspect_api",
     "llama_stack_api.inference",
     "llama_stack_api.internal",
+    "llama_stack_api.messages",
     "llama_stack_api.models",
 
     "llama_stack_api.providers",

From 5cc9ecc93eca139ad3605af374d82a5a014cda22 Mon Sep 17 00:00:00 2001
From: Charlie Doern <cdoern@redhat.com>
Date: Fri, 27 Mar 2026 16:24:54 -0400
Subject: [PATCH 2/9] feat(messages): add inline::builtin provider with
 translation and native passthrough

Add the single BuiltinMessagesImpl provider that translates Anthropic
Messages format to/from OpenAI Chat Completions, delegating to the
inference API. For providers that natively support /v1/messages (e.g.
Ollama), requests are forwarded directly without translation. Also
registers the provider in the registry, wires the router in the server,
and adds Messages to the protocol map in the resolver.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Charlie Doern <cdoern@redhat.com>
---
 src/llama_stack/core/resolver.py              |   2 +
 .../providers/inline/messages/__init__.py     |  22 +
 .../providers/inline/messages/config.py       |  17 +
 .../providers/inline/messages/impl.py         | 558 ++++++++++++++++++
 .../providers/registry/messages.py            |  29 +
 5 files changed, 628 insertions(+)
 create mode 100644 src/llama_stack/providers/inline/messages/__init__.py
 create mode 100644 src/llama_stack/providers/inline/messages/config.py
 create mode 100644 src/llama_stack/providers/inline/messages/impl.py
 create mode 100644 src/llama_stack/providers/registry/messages.py

diff --git a/src/llama_stack/core/resolver.py b/src/llama_stack/core/resolver.py
index 69ef47942e..5b96b57730 100644
--- a/src/llama_stack/core/resolver.py
+++ b/src/llama_stack/core/resolver.py
@@ -41,6 +41,7 @@
     Inference,
     InferenceProvider,
     Inspect,
+    Messages,
     Models,
     ModelsProtocolPrivate,
     Prompts,
@@ -107,6 +108,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
         Api.conversations: Conversations,
         Api.file_processors: FileProcessors,
         Api.connectors: Connectors,
+        Api.messages: Messages,
     }
 
     if external_apis:
diff --git a/src/llama_stack/providers/inline/messages/__init__.py b/src/llama_stack/providers/inline/messages/__init__.py
new file mode 100644
index 0000000000..b292976c87
--- /dev/null
+++ b/src/llama_stack/providers/inline/messages/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.core.datatypes import Api
+
+from .config import MessagesConfig
+
+
+async def get_provider_impl(
+    config: MessagesConfig,
+    deps: dict[Api, Any],
+):
+    from .impl import BuiltinMessagesImpl  # imported here rather than at module import time
+
+    impl = BuiltinMessagesImpl(config, deps[Api.inference])  # only the inference dependency is used
+    await impl.initialize()
+    return impl
diff --git a/src/llama_stack/providers/inline/messages/config.py b/src/llama_stack/providers/inline/messages/config.py
new file mode 100644
index 0000000000..c17a040607
--- /dev/null
+++ b/src/llama_stack/providers/inline/messages/config.py
@@ -0,0 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from pydantic import BaseModel
+
+
+class MessagesConfig(BaseModel):
+    """Configuration for the built-in Anthropic Messages API adapter."""
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str = "") -> dict[str, Any]:
+        return {}
diff --git a/src/llama_stack/providers/inline/messages/impl.py b/src/llama_stack/providers/inline/messages/impl.py
new file mode 100644
index 0000000000..93f090ca53
--- /dev/null
+++ b/src/llama_stack/providers/inline/messages/impl.py
@@ -0,0 +1,558 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""Built-in Anthropic Messages API implementation.
+
+Translates Anthropic Messages format to/from OpenAI Chat Completions format,
+delegating to the inference API for actual model calls. When the underlying
+inference provider natively supports the Anthropic Messages API (e.g. Ollama),
+requests are forwarded directly without translation.
+"""
+
+from __future__ import annotations
+
+import json
+import uuid
+from collections.abc import AsyncIterator
+from typing import Any
+
+import httpx
+
+from llama_stack.log import get_logger
+from llama_stack_api import (
+    Inference,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionRequestWithExtraBody,
+)
+from llama_stack_api.messages import (
+    Messages,
+)
+from llama_stack_api.messages.models import (
+    AnthropicContentBlock,
+    AnthropicCountTokensRequest,
+    AnthropicCountTokensResponse,
+    AnthropicCreateMessageRequest,
+    AnthropicImageBlock,
+    AnthropicMessage,
+    AnthropicMessageResponse,
+    AnthropicStreamEvent,
+    AnthropicTextBlock,
+    AnthropicThinkingBlock,
+    AnthropicToolDef,
+    AnthropicToolResultBlock,
+    AnthropicToolUseBlock,
+    AnthropicUsage,
+    ContentBlockDeltaEvent,
+    ContentBlockStartEvent,
+    ContentBlockStopEvent,
+    MessageDeltaEvent,
+    MessageStartEvent,
+    MessageStopEvent,
+    _InputJsonDelta,
+    _MessageDelta,
+    _TextDelta,
+    _ThinkingDelta,
+)
+
+from .config import MessagesConfig
+
+logger = get_logger(name=__name__, category="messages")
+
+# Maps Anthropic stop_reason -> OpenAI finish_reason
+_STOP_REASON_TO_FINISH = {
+    "end_turn": "stop",
+    "stop_sequence": "stop",
+    "tool_use": "tool_calls",
+    "max_tokens": "length",
+}
+
+# Maps OpenAI finish_reason -> Anthropic stop_reason
+_FINISH_TO_STOP_REASON = {
+    "stop": "end_turn",
+    "tool_calls": "tool_use",
+    "length": "max_tokens",
+    "content_filter": "end_turn",
+}
+
+
+class BuiltinMessagesImpl(Messages):
+    """Anthropic Messages API adapter that translates to the inference API."""
+
+    def __init__(self, config: MessagesConfig, inference_api: Inference):
+        self.config = config
+        self.inference_api = inference_api
+
+    async def initialize(self) -> None:
+        pass
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def create_message(
+        self,
+        request: AnthropicCreateMessageRequest,
+    ) -> AnthropicMessageResponse | AsyncIterator[AnthropicStreamEvent]:
+        # Try native passthrough for providers that support /v1/messages directly
+        passthrough_url = await self._get_passthrough_url(request.model)
+        if passthrough_url:
+            return await self._passthrough_request(passthrough_url, request)
+
+        openai_params = self._anthropic_to_openai(request)
+
+        result = await self.inference_api.openai_chat_completion(openai_params)
+
+        if isinstance(result, AsyncIterator):
+            return self._stream_openai_to_anthropic(result, request.model)
+
+        return self._openai_to_anthropic(result, request.model)
+
+    async def count_message_tokens(
+        self,
+        request: AnthropicCountTokensRequest,
+    ) -> AnthropicCountTokensResponse:
+        raise NotImplementedError("Token counting is not yet implemented")
+
+    # -- Native passthrough for providers with /v1/messages support --
+
+    # Module paths of provider impls known to support /v1/messages natively
+    _NATIVE_MESSAGES_MODULES = {"llama_stack.providers.remote.inference.ollama"}
+
+    async def _get_passthrough_url(self, model: str) -> str | None:
+        """Resolve the base URL for native /v1/messages passthrough, if the provider qualifies.
+
+        Returns None (meaning "use translation") when nothing qualifies or the lookup fails.
+        """
+        router = self.inference_api
+        if not hasattr(router, "routing_table"):  # nothing to inspect without a routing table
+            return None
+
+        try:
+            obj = await router.routing_table.get_object_by_identifier("model", model)
+            if not obj:
+                return None
+
+            provider_impl = await router.routing_table.get_provider_impl(obj.identifier)
+            provider_module = type(provider_impl).__module__  # module path of the provider's class
+            is_native = any(provider_module.startswith(m) for m in self._NATIVE_MESSAGES_MODULES)
+
+            if is_native and hasattr(provider_impl, "get_base_url"):
+                base_url = str(provider_impl.get_base_url()).rstrip("/")
+                # Drop a trailing /v1: _passthrough_request appends /v1/messages to this base itself
+                if base_url.endswith("/v1"):
+                    base_url = base_url[:-3]
+                logger.info(f"Using native /v1/messages passthrough for model {model} via {base_url}")
+                return base_url
+        except Exception:  # best-effort: any lookup error falls through to translation
+            logger.debug(f"Failed to resolve passthrough for model {model}, falling back to translation")
+
+        return None
+
+    async def _passthrough_request(
+        self,
+        base_url: str,
+        request: AnthropicCreateMessageRequest,
+    ) -> AnthropicMessageResponse | AsyncIterator[AnthropicStreamEvent]:
+        """Forward the request to the provider's native /v1/messages endpoint (streaming if requested)."""
+        url = f"{base_url}/v1/messages"
+        # Prefer the provider_resource_id; fall back to the requested model name if it cannot be resolved
+        provider_model = request.model
+        router = self.inference_api
+        if hasattr(router, "routing_table"):
+            try:
+                obj = await router.routing_table.get_object_by_identifier("model", request.model)
+                if obj and obj.provider_resource_id:  # never overwrite the model with an empty id
+                    provider_model = obj.provider_resource_id
+            except Exception:  # best-effort lookup: keep going, but leave a trace of the failure
+                logger.debug(f"Model lookup failed for {request.model}; forwarding the name as-is", exc_info=True)
+
+        body = request.model_dump(exclude_none=True)
+        body["model"] = provider_model
+        headers = {
+            "content-type": "application/json",
+            "anthropic-version": "2023-06-01",
+            "x-api-key": "no-key-required",  # fixed placeholder; no caller credential is forwarded
+        }
+
+        if request.stream:
+            return self._passthrough_stream(url, headers, body)  # async generator, consumed by the caller
+
+        async with httpx.AsyncClient() as client:
+            resp = await client.post(url, json=body, headers=headers, timeout=300)
+            resp.raise_for_status()  # non-2xx responses surface as httpx.HTTPStatusError
+            return AnthropicMessageResponse(**resp.json())
+
+    async def _passthrough_stream(
+        self,
+        url: str,
+        headers: dict[str, str],
+        body: dict[str, Any],
+    ) -> AsyncIterator[AnthropicStreamEvent]:
+        """POST to the provider and yield its SSE stream as parsed Anthropic stream events."""
+        async with httpx.AsyncClient() as client:
+            async with client.stream("POST", url, json=body, headers=headers, timeout=300) as resp:
+                resp.raise_for_status()
+                event_type = None  # set by an "event: " line, consumed by the next "data: " line
+                async for line in resp.aiter_lines():
+                    line = line.strip()
+                    if line.startswith("event: "):
+                        event_type = line[7:]  # text after the "event: " prefix
+                    elif line.startswith("data: ") and event_type:  # data with no pending event is skipped
+                        data = json.loads(line[6:])  # text after the "data: " prefix
+                        event = self._parse_sse_event(event_type, data)
+                        if event:  # _parse_sse_event returns None for event types it does not handle
+                            yield event
+                        event_type = None
+
+    def _parse_sse_event(self, event_type: str, data: dict[str, Any]) -> AnthropicStreamEvent | None:
+        """Parse an Anthropic SSE event from its type and data."""
+        if event_type == "message_start":
+            return MessageStartEvent(message=AnthropicMessageResponse(**data["message"]))
+        if event_type == "content_block_start":
+            block_data = data["content_block"]
+            content_block: AnthropicTextBlock | AnthropicToolUseBlock | AnthropicThinkingBlock
+            block_type = block_data.get("type")
+            if block_type == "tool_use":
+                content_block = AnthropicToolUseBlock(**block_data)
+            elif block_type == "thinking":
+                content_block = AnthropicThinkingBlock(**block_data)
+            else:
+                content_block = AnthropicTextBlock(**block_data)
+            return ContentBlockStartEvent(index=data["index"], content_block=content_block)
+        if event_type == "content_block_delta":
+            delta_data = data["delta"]
+            delta_type = delta_data.get("type")
+            delta: _TextDelta | _InputJsonDelta | _ThinkingDelta
+            if delta_type == "text_delta":
+                delta = _TextDelta(text=delta_data["text"])
+            elif delta_type == "input_json_delta":
+                delta = _InputJsonDelta(partial_json=delta_data["partial_json"])
+            elif delta_type == "thinking_delta":
+                delta = _ThinkingDelta(thinking=delta_data["thinking"])
+            else:
+                return None
+            return ContentBlockDeltaEvent(index=data["index"], delta=delta)
+        if event_type == "content_block_stop":
+            return ContentBlockStopEvent(index=data["index"])
+        if event_type == "message_delta":
+            return MessageDeltaEvent(
+                delta=_MessageDelta(stop_reason=data["delta"].get("stop_reason")),
+                usage=AnthropicUsage(**data.get("usage", {})),
+            )
+        if event_type == "message_stop":
+            return MessageStopEvent()
+        return None
+
+    # -- Request translation --
+
+    def _anthropic_to_openai(
+        self, request: AnthropicCreateMessageRequest
+    ) -> OpenAIChatCompletionRequestWithExtraBody:
+        messages = self._convert_messages_to_openai(request.system, request.messages)
+        tools = self._convert_tools_to_openai(request.tools) if request.tools else None
+        tool_choice = self._convert_tool_choice_to_openai(request.tool_choice) if request.tool_choice else None
+
+        extra_body: dict[str, Any] = {}
+        if request.top_k is not None:
+            extra_body["top_k"] = request.top_k
+        if request.thinking is not None and request.thinking.type == "enabled":
+            extra_body["thinking"] = {
+                "type": "enabled",
+                "budget_tokens": request.thinking.budget_tokens,
+            }
+
+        params = OpenAIChatCompletionRequestWithExtraBody(
+            model=request.model,
+            messages=messages,  # type: ignore[arg-type]
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            stop=request.stop_sequences,
+            tools=tools,
+            tool_choice=tool_choice,
+            stream=request.stream or False,
+            service_tier=request.service_tier,  # type: ignore[arg-type]
+            **(extra_body or {}),
+        )
+        return params
+
+    def _convert_messages_to_openai(
+        self,
+        system: str | list[AnthropicTextBlock] | None,
+        messages: list[AnthropicMessage],
+    ) -> list[dict[str, Any]]:
+        openai_messages: list[dict[str, Any]] = []
+
+        if system is not None:
+            if isinstance(system, str):
+                system_text = system
+            else:
+                system_text = "\n".join(block.text for block in system)
+            openai_messages.append({"role": "system", "content": system_text})
+
+        for msg in messages:
+            openai_messages.extend(self._convert_single_message(msg))
+
+        return openai_messages
+
+    def _convert_single_message(self, msg: AnthropicMessage) -> list[dict[str, Any]]:
+        """Convert a single Anthropic message to one or more OpenAI messages.
+
+        A non-assistant message with block content is split: each tool_result block becomes a
+        "tool" message, and runs of text/image blocks become "user" messages with a list of parts.
+        """
+        if isinstance(msg.content, str):
+            return [{"role": msg.role, "content": msg.content}]
+
+        if msg.role == "assistant":
+            return [self._convert_assistant_message(msg.content)]
+
+        # User message: may contain text, image and/or tool_result blocks
+        result: list[dict[str, Any]] = []
+        text_parts: list[dict[str, Any]] = []
+
+        for block in msg.content:
+            if isinstance(block, AnthropicToolResultBlock):
+                # Flush accumulated parts first; content stays a list (a bare part dict is not valid content)
+                if text_parts:
+                    result.append({"role": "user", "content": text_parts})
+                    text_parts = []
+                # Tool results become separate tool messages
+                content = block.content
+                if isinstance(content, list):
+                    content = "\n".join(b.text for b in content if isinstance(b, AnthropicTextBlock))
+                result.append({
+                    "role": "tool",
+                    "tool_call_id": block.tool_use_id,
+                    "content": content,
+                })
+            elif isinstance(block, AnthropicTextBlock):
+                text_parts.append({"type": "text", "text": block.text})
+            elif isinstance(block, AnthropicImageBlock):
+                text_parts.append({
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:{block.source.media_type};base64,{block.source.data}",
+                    },
+                })
+
+        if text_parts:
+            result.append({"role": "user", "content": text_parts})
+
+        return result if result else [{"role": "user", "content": ""}]
+
+    def _convert_assistant_message(self, content: list[AnthropicContentBlock]) -> dict[str, Any]:
+        """Convert an assistant message with content blocks to OpenAI format."""
+        text_parts: list[str] = []
+        tool_calls: list[dict[str, Any]] = []
+
+        for block in content:
+            if isinstance(block, AnthropicTextBlock):
+                text_parts.append(block.text)
+            elif isinstance(block, AnthropicToolUseBlock):
+                tool_calls.append({
+                    "id": block.id,
+                    "type": "function",
+                    "function": {
+                        "name": block.name,
+                        "arguments": json.dumps(block.input),
+                    },
+                })
+
+        msg: dict[str, Any] = {"role": "assistant"}
+        if text_parts:
+            msg["content"] = "\n".join(text_parts)
+        if tool_calls:
+            msg["tool_calls"] = tool_calls
+
+        return msg
+
+    def _convert_tools_to_openai(self, tools: list[AnthropicToolDef]) -> list[dict[str, Any]]:
+        return [
+            {
+                "type": "function",
+                "function": {
+                    "name": tool.name,
+                    "description": tool.description or "",
+                    "parameters": tool.input_schema,
+                },
+            }
+            for tool in tools
+        ]
+
+    def _convert_tool_choice_to_openai(self, tool_choice: Any) -> Any:
+        if isinstance(tool_choice, str):
+            if tool_choice == "any":
+                return "required"
+            if tool_choice == "none":
+                return "none"
+            return "auto"
+
+        if isinstance(tool_choice, dict):
+            tc_type = tool_choice.get("type")
+            if tc_type == "tool":
+                return {"type": "function", "function": {"name": tool_choice["name"]}}
+            if tc_type == "any":
+                return "required"
+            if tc_type == "none":
+                return "none"
+            return "auto"
+
+        return "auto"
+
+    # -- Response translation --
+
+    def _openai_to_anthropic(
+        self, response: OpenAIChatCompletion, request_model: str
+    ) -> AnthropicMessageResponse:
+        content: list[AnthropicContentBlock] = []
+
+        if response.choices:
+            choice = response.choices[0]
+            message = choice.message
+
+            if message and message.content:
+                content.append(AnthropicTextBlock(text=message.content))
+
+            if message and message.tool_calls:
+                for tc in message.tool_calls:
+                    if not hasattr(tc, "function") or tc.function is None:
+                        continue
+                    try:
+                        tool_input = json.loads(tc.function.arguments) if tc.function.arguments else {}
+                    except json.JSONDecodeError:
+                        tool_input = {}
+
+                    content.append(AnthropicToolUseBlock(
+                        id=tc.id or f"toolu_{uuid.uuid4().hex[:24]}",
+                        name=tc.function.name or "",
+                        input=tool_input,
+                    ))
+
+            finish_reason = choice.finish_reason or "stop"
+            stop_reason = _FINISH_TO_STOP_REASON.get(finish_reason, "end_turn")
+        else:
+            stop_reason = "end_turn"
+
+        usage = AnthropicUsage()
+        if response.usage:
+            usage = AnthropicUsage(
+                input_tokens=response.usage.prompt_tokens or 0,
+                output_tokens=response.usage.completion_tokens or 0,
+            )
+
+        return AnthropicMessageResponse(
+            id=f"msg_{uuid.uuid4().hex[:24]}",
+            content=content,
+            model=request_model,
+            stop_reason=stop_reason,
+            usage=usage,
+        )
+
+    # -- Streaming translation --
+
+    async def _stream_openai_to_anthropic(
+        self,
+        openai_stream: AsyncIterator[OpenAIChatCompletionChunk],
+        request_model: str,
+    ) -> AsyncIterator[AnthropicStreamEvent]:
+        """Translate OpenAI streaming chunks into the Anthropic start/delta/stop event sequence."""
+
+        # Emit message_start (usage is zeroed here; real counts arrive in the final message_delta)
+        yield MessageStartEvent(
+            message=AnthropicMessageResponse(
+                id=f"msg_{uuid.uuid4().hex[:24]}",
+                content=[],
+                model=request_model,
+                stop_reason=None,
+                usage=AnthropicUsage(input_tokens=0, output_tokens=0),
+            ),
+        )
+
+        content_block_index = 0  # index of the open text block, or of the next block to start
+        in_text_block = False
+        in_tool_blocks: dict[int, bool] = {}  # tool_call_index -> started
+        tool_call_index_to_block_index: dict[int, int] = {}  # OpenAI tool-call index -> block index
+        output_tokens = 0
+        input_tokens = 0
+        stop_reason = "end_turn"  # used when no chunk carries a finish_reason
+
+        async for chunk in openai_stream:
+            if not chunk.choices:
+                # Usage-only chunk
+                if chunk.usage:
+                    input_tokens = chunk.usage.prompt_tokens or 0
+                    output_tokens = chunk.usage.completion_tokens or 0
+                continue
+
+            choice = chunk.choices[0]  # only the first choice is translated
+            delta = choice.delta
+
+            if delta and delta.content:
+                if not in_text_block:
+                    yield ContentBlockStartEvent(
+                        index=content_block_index,
+                        content_block=AnthropicTextBlock(text=""),
+                    )
+                    in_text_block = True
+
+                yield ContentBlockDeltaEvent(
+                    index=content_block_index,
+                    delta=_TextDelta(text=delta.content),
+                )
+
+            if delta and delta.tool_calls:
+                for tc_delta in delta.tool_calls:
+                    tc_idx = tc_delta.index if tc_delta.index is not None else 0
+
+                    if tc_idx not in in_tool_blocks:
+                        # Close text block if open
+                        if in_text_block:
+                            yield ContentBlockStopEvent(index=content_block_index)
+                            content_block_index += 1
+                            in_text_block = False
+
+                        # Start new tool_use block
+                        in_tool_blocks[tc_idx] = True
+                        tool_call_index_to_block_index[tc_idx] = content_block_index
+
+                        yield ContentBlockStartEvent(
+                            index=content_block_index,
+                            content_block=AnthropicToolUseBlock(
+                                id=tc_delta.id or f"toolu_{uuid.uuid4().hex[:24]}",
+                                name=tc_delta.function.name if tc_delta.function and tc_delta.function.name else "",
+                                input={},
+                            ),
+                        )
+                        content_block_index += 1
+
+                    if tc_delta.function and tc_delta.function.arguments:
+                        block_idx = tool_call_index_to_block_index[tc_idx]
+                        yield ContentBlockDeltaEvent(
+                            index=block_idx,
+                            delta=_InputJsonDelta(partial_json=tc_delta.function.arguments),
+                        )
+
+            if choice.finish_reason:
+                stop_reason = _FINISH_TO_STOP_REASON.get(choice.finish_reason, "end_turn")
+
+            if chunk.usage:
+                input_tokens = chunk.usage.prompt_tokens or 0
+                output_tokens = chunk.usage.completion_tokens or 0
+
+        # Close any open blocks (tool_use blocks are only closed here, once the stream has ended)
+        if in_text_block:
+            yield ContentBlockStopEvent(index=content_block_index)
+
+        for block_idx in tool_call_index_to_block_index.values():
+            yield ContentBlockStopEvent(index=block_idx)
+
+        # Final events
+        yield MessageDeltaEvent(
+            delta=_MessageDelta(stop_reason=stop_reason),
+            usage=AnthropicUsage(input_tokens=input_tokens, output_tokens=output_tokens),
+        )
+        yield MessageStopEvent()
diff --git a/src/llama_stack/providers/registry/messages.py b/src/llama_stack/providers/registry/messages.py
new file mode 100644
index 0000000000..0c88a6c219
--- /dev/null
+++ b/src/llama_stack/providers/registry/messages.py
@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+from llama_stack_api import (
+    Api,
+    InlineProviderSpec,
+    ProviderSpec,
+)
+
+
+def available_providers() -> list[ProviderSpec]:
+    """Return the list of available messages provider specifications."""
+    return [
+        InlineProviderSpec(
+            api=Api.messages,
+            provider_type="inline::builtin",
+            pip_packages=[],
+            module="llama_stack.providers.inline.messages",
+            config_class="llama_stack.providers.inline.messages.config.MessagesConfig",
+            api_dependencies=[
+                Api.inference,
+            ],
+            description="Anthropic Messages API adapter that translates to the inference API.",
+        ),
+    ]

From 9bdacf6ca298523b14c527d2eabae7f02e8cb999 Mon Sep 17 00:00:00 2001
From: Charlie Doern <cdoern@redhat.com>
Date: Fri, 27 Mar 2026 16:25:03 -0400
Subject: [PATCH 3/9] feat(messages): enable messages API in starter and
 ci-tests distributions

Add the messages provider (inline::builtin) to the starter distribution
template and regenerate configs for starter and ci-tests distributions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Charlie Doern <cdoern@redhat.com>
---
 src/llama_stack/distributions/ci-tests/config.yaml            | 4 ++++
 .../distributions/ci-tests/run-with-postgres-store.yaml       | 4 ++++
 src/llama_stack/distributions/starter/config.yaml             | 4 ++++
 .../distributions/starter/run-with-postgres-store.yaml        | 4 ++++
 src/llama_stack/distributions/starter/starter.py              | 1 +
 5 files changed, 17 insertions(+)

diff --git a/src/llama_stack/distributions/ci-tests/config.yaml b/src/llama_stack/distributions/ci-tests/config.yaml
index 7bcbb6eee9..b0b87923f3 100644
--- a/src/llama_stack/distributions/ci-tests/config.yaml
+++ b/src/llama_stack/distributions/ci-tests/config.yaml
@@ -7,6 +7,7 @@ apis:
 - file_processors
 - files
 - inference
+- messages
 - responses
 - safety
 - scoring
@@ -197,6 +198,9 @@ providers:
       excluded_categories: []
   - provider_id: code-scanner
     provider_type: inline::code-scanner
+  messages:
+  - provider_id: builtin
+    provider_type: inline::builtin
   responses:
   - provider_id: builtin
     provider_type: inline::builtin
diff --git a/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml b/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
index 553ed41118..a9a1e6e2cc 100644
--- a/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
@@ -7,6 +7,7 @@ apis:
 - file_processors
 - files
 - inference
+- messages
 - responses
 - safety
 - scoring
@@ -197,6 +198,9 @@ providers:
       excluded_categories: []
   - provider_id: code-scanner
     provider_type: inline::code-scanner
+  messages:
+  - provider_id: builtin
+    provider_type: inline::builtin
   responses:
   - provider_id: builtin
     provider_type: inline::builtin
diff --git a/src/llama_stack/distributions/starter/config.yaml b/src/llama_stack/distributions/starter/config.yaml
index fb9cf9ae84..ed5b862130 100644
--- a/src/llama_stack/distributions/starter/config.yaml
+++ b/src/llama_stack/distributions/starter/config.yaml
@@ -7,6 +7,7 @@ apis:
 - file_processors
 - files
 - inference
+- messages
 - responses
 - safety
 - scoring
@@ -191,6 +192,9 @@ providers:
       excluded_categories: []
   - provider_id: code-scanner
     provider_type: inline::code-scanner
+  messages:
+  - provider_id: builtin
+    provider_type: inline::builtin
   responses:
   - provider_id: builtin
     provider_type: inline::builtin
diff --git a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
index 52225576f5..c2a814bc62 100644
--- a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
@@ -7,6 +7,7 @@ apis:
 - file_processors
 - files
 - inference
+- messages
 - responses
 - safety
 - scoring
@@ -191,6 +192,9 @@ providers:
       excluded_categories: []
   - provider_id: code-scanner
     provider_type: inline::code-scanner
+  messages:
+  - provider_id: builtin
+    provider_type: inline::builtin
   responses:
   - provider_id: builtin
     provider_type: inline::builtin
diff --git a/src/llama_stack/distributions/starter/starter.py b/src/llama_stack/distributions/starter/starter.py
index c99ce08e49..61e969c01b 100644
--- a/src/llama_stack/distributions/starter/starter.py
+++ b/src/llama_stack/distributions/starter/starter.py
@@ -152,6 +152,7 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
             BuildProvider(provider_type="inline::llama-guard"),
             BuildProvider(provider_type="inline::code-scanner"),
         ],
+        "messages": [BuildProvider(provider_type="inline::builtin")],
         "responses": [BuildProvider(provider_type="inline::builtin")],
         "eval": [BuildProvider(provider_type="inline::builtin")],
         "datasetio": [

From 6c59405a03afbf31e8afde392639488063c59e0f Mon Sep 17 00:00:00 2001
From: Charlie Doern <cdoern@redhat.com>
Date: Fri, 27 Mar 2026 16:25:12 -0400
Subject: [PATCH 4/9] feat(messages): add unit tests and regenerate specs and
 docs

Add 17 unit tests covering request translation, response translation,
and streaming translation. Regenerate OpenAPI specs, provider docs, and
Stainless SDK config to include the new /v1/messages endpoints.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Charlie Doern <cdoern@redhat.com>
---
 client-sdks/stainless/openapi.yml             | 529 ++++++++++++++++++
 docs/docs/providers/messages/index.mdx        |  13 +
 .../providers/messages/inline_builtin.mdx     |  17 +
 docs/static/deprecated-llama-stack-spec.yaml  | 464 +++++++++++++++
 .../static/experimental-llama-stack-spec.yaml |   1 +
 docs/static/llama-stack-spec.yaml             | 529 ++++++++++++++++++
 docs/static/stainless-llama-stack-spec.yaml   | 529 ++++++++++++++++++
 .../providers/inline/messages/__init__.py     |   5 +
 .../providers/inline/messages/test_impl.py    | 353 ++++++++++++
 9 files changed, 2440 insertions(+)
 create mode 100644 docs/docs/providers/messages/index.mdx
 create mode 100644 docs/docs/providers/messages/inline_builtin.mdx
 create mode 100644 tests/unit/providers/inline/messages/__init__.py
 create mode 100644 tests/unit/providers/inline/messages/test_impl.py

diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml
index 06e361debc..f7761555fc 100644
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@@ -4357,6 +4357,71 @@ paths:
           description: Authorization token
           title: Authorization
         description: Authorization token
+  /v1/messages:
+    post:
+      responses:
+        '200':
+          description: An AnthropicMessageResponse or a stream of Anthropic SSE events.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/AnthropicMessageResponse'
+            text/event-stream: {}
+        '400':
+          description: Bad Request
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          description: Too Many Requests
+          $ref: '#/components/responses/TooManyRequests429'
+        '500':
+          description: Internal Server Error
+          $ref: '#/components/responses/InternalServerError500'
+        default:
+          description: Default Response
+          $ref: '#/components/responses/DefaultError'
+      tags:
+      - Messages
+      summary: Create a message.
+      description: Create a message using the Anthropic Messages API format.
+      operationId: create_message_v1_messages_post
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/AnthropicCreateMessageRequest'
+        required: true
+  /v1/messages/count_tokens:
+    post:
+      responses:
+        '200':
+          description: Token count for the request.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/AnthropicCountTokensResponse'
+        '400':
+          description: Bad Request
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          description: Too Many Requests
+          $ref: '#/components/responses/TooManyRequests429'
+        '500':
+          description: Internal Server Error
+          $ref: '#/components/responses/InternalServerError500'
+        default:
+          description: Default Response
+          $ref: '#/components/responses/DefaultError'
+      tags:
+      - Messages
+      summary: Count tokens in a message.
+      description: Count the number of tokens in a message request.
+      operationId: count_message_tokens_v1_messages_count_tokens_post
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/AnthropicCountTokensRequest'
+        required: true
 components:
   schemas:
     Error:
@@ -11707,6 +11772,469 @@ components:
           - type: 'null'
       title: AllowedToolsFilter
       description: Filter configuration for restricting which MCP tools can be used.
+    AnthropicCountTokensRequest:
+      properties:
+        model:
+          type: string
+          title: Model
+          description: The model to use for token counting.
+        messages:
+          items:
+            $ref: '#/components/schemas/AnthropicMessage'
+          type: array
+          title: Messages
+          description: The messages to count tokens for.
+        system:
+          anyOf:
+          - type: string
+          - items:
+              $ref: '#/components/schemas/AnthropicTextBlock'
+            type: array
+            title: list[AnthropicTextBlock]
+          - type: 'null'
+          title: string | list[AnthropicTextBlock]
+          description: System prompt.
+        tools:
+          anyOf:
+          - items:
+              $ref: '#/components/schemas/AnthropicToolDef'
+            type: array
+          - type: 'null'
+          description: Tools to include in token count.
+      required:
+      - model
+      - messages
+      title: AnthropicCountTokensRequest
+      description: Request body for POST /v1/messages/count_tokens.
+    AnthropicCountTokensResponse:
+      properties:
+        input_tokens:
+          type: integer
+          title: Input Tokens
+      required:
+      - input_tokens
+      title: AnthropicCountTokensResponse
+      description: Response from POST /v1/messages/count_tokens.
+    AnthropicCreateMessageRequest:
+      properties:
+        model:
+          type: string
+          title: Model
+          description: The model to use for generation.
+        messages:
+          items:
+            $ref: '#/components/schemas/AnthropicMessage'
+          type: array
+          title: Messages
+          description: The messages in the conversation.
+        max_tokens:
+          type: integer
+          minimum: 1.0
+          title: Max Tokens
+          description: The maximum number of tokens to generate.
+        system:
+          anyOf:
+          - type: string
+          - items:
+              $ref: '#/components/schemas/AnthropicTextBlock'
+            type: array
+            title: list[AnthropicTextBlock]
+          - type: 'null'
+          title: string | list[AnthropicTextBlock]
+          description: System prompt. A string or list of text blocks.
+        tools:
+          anyOf:
+          - items:
+              $ref: '#/components/schemas/AnthropicToolDef'
+            type: array
+          - type: 'null'
+          description: Tools available to the model.
+        tool_choice:
+          anyOf:
+          - {}
+          - type: 'null'
+          title: Tool Choice
+          description: "How the model should select tools. One of: 'auto', 'any', 'none', or {type: 'tool', name: '...'}."
+        stream:
+          anyOf:
+          - type: boolean
+          - type: 'null'
+          description: Whether to stream the response.
+          default: false
+        temperature:
+          anyOf:
+          - type: number
+            maximum: 1.0
+            minimum: 0.0
+          - type: 'null'
+          description: Sampling temperature.
+        top_p:
+          anyOf:
+          - type: number
+            maximum: 1.0
+            minimum: 0.0
+          - type: 'null'
+          description: Nucleus sampling parameter.
+        top_k:
+          anyOf:
+          - type: integer
+            minimum: 1.0
+          - type: 'null'
+          description: Top-k sampling parameter.
+        stop_sequences:
+          anyOf:
+          - items:
+              type: string
+            type: array
+          - type: 'null'
+          description: Custom stop sequences.
+        metadata:
+          anyOf:
+          - additionalProperties:
+              type: string
+            type: object
+          - type: 'null'
+          description: Request metadata.
+        thinking:
+          anyOf:
+          - $ref: '#/components/schemas/AnthropicThinkingConfig'
+            title: AnthropicThinkingConfig
+          - type: 'null'
+          description: Extended thinking configuration.
+          title: AnthropicThinkingConfig
+        service_tier:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: Service tier to use.
+      additionalProperties: true
+      required:
+      - model
+      - messages
+      - max_tokens
+      title: AnthropicCreateMessageRequest
+      description: Request body for POST /v1/messages.
+    AnthropicImageBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - image
+        source:
+          $ref: '#/components/schemas/AnthropicImageSource'
+      required:
+      - source
+      title: AnthropicImageBlock
+      description: An image content block.
+    AnthropicImageSource:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - base64
+        media_type:
+          type: string
+          title: Media Type
+          description: MIME type of the image (e.g. image/png).
+        data:
+          type: string
+          title: Data
+          description: Base64-encoded image data.
+      required:
+      - media_type
+      - data
+      title: AnthropicImageSource
+      description: Source for an image content block.
+    AnthropicMessage:
+      properties:
+        role:
+          type: string
+          enum:
+          - user
+          - assistant
+          title: Role
+        content:
+          anyOf:
+          - type: string
+          - items:
+              oneOf:
+              - $ref: '#/components/schemas/AnthropicTextBlock'
+                title: AnthropicTextBlock
+              - $ref: '#/components/schemas/AnthropicImageBlock'
+                title: AnthropicImageBlock
+              - $ref: '#/components/schemas/AnthropicToolUseBlock'
+                title: AnthropicToolUseBlock
+              - $ref: '#/components/schemas/AnthropicToolResultBlock-Input'
+                title: AnthropicToolResultBlock-Input
+              - $ref: '#/components/schemas/AnthropicThinkingBlock'
+                title: AnthropicThinkingBlock
+              discriminator:
+                propertyName: type
+                mapping:
+                  image: '#/components/schemas/AnthropicImageBlock'
+                  text: '#/components/schemas/AnthropicTextBlock'
+                  thinking: '#/components/schemas/AnthropicThinkingBlock'
+                  tool_result: '#/components/schemas/AnthropicToolResultBlock-Input'
+                  tool_use: '#/components/schemas/AnthropicToolUseBlock'
+              title: AnthropicTextBlock | ... (5 variants)
+            type: array
+            title: list[AnthropicTextBlock | AnthropicImageBlock | ...]
+          title: string | list[AnthropicTextBlock | AnthropicImageBlock | ...]
+          description: 'Message content: a string for simple text, or a list of content blocks.'
+      required:
+      - role
+      - content
+      title: AnthropicMessage
+      description: A message in the conversation.
+    AnthropicMessageResponse:
+      properties:
+        id:
+          type: string
+          title: Id
+          description: Unique message ID (msg_ prefix).
+        type:
+          type: string
+          title: Type
+          enum:
+          - message
+        role:
+          type: string
+          title: Role
+          enum:
+          - assistant
+        content:
+          items:
+            oneOf:
+            - $ref: '#/components/schemas/AnthropicTextBlock'
+              title: AnthropicTextBlock
+            - $ref: '#/components/schemas/AnthropicImageBlock'
+              title: AnthropicImageBlock
+            - $ref: '#/components/schemas/AnthropicToolUseBlock'
+              title: AnthropicToolUseBlock
+            - $ref: '#/components/schemas/AnthropicToolResultBlock-Output'
+              title: AnthropicToolResultBlock-Output
+            - $ref: '#/components/schemas/AnthropicThinkingBlock'
+              title: AnthropicThinkingBlock
+            discriminator:
+              propertyName: type
+              mapping:
+                image: '#/components/schemas/AnthropicImageBlock'
+                text: '#/components/schemas/AnthropicTextBlock'
+                thinking: '#/components/schemas/AnthropicThinkingBlock'
+                tool_result: '#/components/schemas/AnthropicToolResultBlock-Output'
+                tool_use: '#/components/schemas/AnthropicToolUseBlock'
+            title: AnthropicTextBlock | ... (5 variants)
+          type: array
+          title: Content
+          description: Response content blocks.
+        model:
+          type: string
+          title: Model
+        stop_reason:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: 'Why the model stopped: end_turn, stop_sequence, tool_use, or max_tokens.'
+        stop_sequence:
+          anyOf:
+          - type: string
+          - type: 'null'
+        usage:
+          $ref: '#/components/schemas/AnthropicUsage'
+      required:
+      - id
+      - content
+      - model
+      title: AnthropicMessageResponse
+      description: Response from POST /v1/messages (non-streaming).
+    AnthropicTextBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - text
+        text:
+          type: string
+          title: Text
+      required:
+      - text
+      title: AnthropicTextBlock
+      description: A text content block.
+    AnthropicThinkingBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - thinking
+        thinking:
+          type: string
+          title: Thinking
+          description: The model's thinking text.
+        signature:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: Signature for the thinking block.
+      required:
+      - thinking
+      title: AnthropicThinkingBlock
+      description: A thinking content block (extended thinking).
+    AnthropicThinkingConfig:
+      properties:
+        type:
+          type: string
+          enum:
+          - enabled
+          - disabled
+          title: Type
+          default: enabled
+        budget_tokens:
+          anyOf:
+          - type: integer
+            minimum: 1.0
+          - type: 'null'
+          description: Maximum tokens for thinking.
+      title: AnthropicThinkingConfig
+      description: Configuration for extended thinking.
+    AnthropicToolDef:
+      properties:
+        name:
+          type: string
+          title: Name
+        description:
+          anyOf:
+          - type: string
+          - type: 'null'
+        input_schema:
+          additionalProperties: true
+          type: object
+          title: Input Schema
+          description: JSON Schema for the tool's input.
+      required:
+      - name
+      - input_schema
+      title: AnthropicToolDef
+      description: Definition of a tool available to the model.
+    AnthropicToolResultBlock-Input:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - tool_result
+        tool_use_id:
+          type: string
+          title: Tool Use Id
+          description: The ID of the tool_use block this result corresponds to.
+        content:
+          anyOf:
+          - type: string
+          - items:
+              anyOf:
+              - $ref: '#/components/schemas/AnthropicTextBlock'
+                title: AnthropicTextBlock
+              - $ref: '#/components/schemas/AnthropicImageBlock'
+                title: AnthropicImageBlock
+              title: AnthropicTextBlock | AnthropicImageBlock
+            type: array
+            title: list[AnthropicTextBlock | AnthropicImageBlock]
+          title: string | list[AnthropicTextBlock | AnthropicImageBlock]
+          description: The result content.
+          default: ''
+        is_error:
+          anyOf:
+          - type: boolean
+          - type: 'null'
+          description: Whether the tool call resulted in an error.
+      required:
+      - tool_use_id
+      title: AnthropicToolResultBlock
+      description: A tool result content block in a user message.
+    AnthropicToolResultBlock-Output:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - tool_result
+        tool_use_id:
+          type: string
+          title: Tool Use Id
+          description: The ID of the tool_use block this result corresponds to.
+        content:
+          anyOf:
+          - type: string
+          - items:
+              anyOf:
+              - $ref: '#/components/schemas/AnthropicTextBlock'
+                title: AnthropicTextBlock
+              - $ref: '#/components/schemas/AnthropicImageBlock'
+                title: AnthropicImageBlock
+              title: AnthropicTextBlock | AnthropicImageBlock
+            type: array
+            title: list[AnthropicTextBlock | AnthropicImageBlock]
+          title: string | list[AnthropicTextBlock | AnthropicImageBlock]
+          description: The result content.
+          default: ''
+        is_error:
+          anyOf:
+          - type: boolean
+          - type: 'null'
+          description: Whether the tool call resulted in an error.
+      required:
+      - tool_use_id
+      title: AnthropicToolResultBlock
+      description: A tool result content block in a user message.
+    AnthropicToolUseBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - tool_use
+        id:
+          type: string
+          title: Id
+          description: Unique ID for this tool invocation.
+        name:
+          type: string
+          title: Name
+          description: Name of the tool being called.
+        input:
+          additionalProperties: true
+          type: object
+          title: Input
+          description: Tool input arguments.
+      required:
+      - id
+      - name
+      - input
+      title: AnthropicToolUseBlock
+      description: A tool use content block in an assistant message.
+    AnthropicUsage:
+      properties:
+        input_tokens:
+          type: integer
+          title: Input Tokens
+          default: 0
+        output_tokens:
+          type: integer
+          title: Output Tokens
+          default: 0
+        cache_creation_input_tokens:
+          anyOf:
+          - type: integer
+          - type: 'null'
+        cache_read_input_tokens:
+          anyOf:
+          - type: integer
+          - type: 'null'
+      title: AnthropicUsage
+      description: Token usage statistics.
     ApprovalFilter:
       properties:
         always:
@@ -13986,6 +14514,7 @@ components:
       - prompts
       - conversations
       - connectors
+      - messages
       - inspect
       - admin
       title: Api
diff --git a/docs/docs/providers/messages/index.mdx b/docs/docs/providers/messages/index.mdx
new file mode 100644
index 0000000000..7df084bc60
--- /dev/null
+++ b/docs/docs/providers/messages/index.mdx
@@ -0,0 +1,13 @@
+---
+description: "Protocol for the Anthropic Messages API."
+sidebar_label: Messages
+title: Messages
+---
+
+# Messages
+
+## Overview
+
+Protocol for the Anthropic Messages API.
+
+This section contains documentation for all available providers for the **messages** API.
diff --git a/docs/docs/providers/messages/inline_builtin.mdx b/docs/docs/providers/messages/inline_builtin.mdx
new file mode 100644
index 0000000000..9ed60b766b
--- /dev/null
+++ b/docs/docs/providers/messages/inline_builtin.mdx
@@ -0,0 +1,17 @@
+---
+description: "Anthropic Messages API adapter that translates to the inference API."
+sidebar_label: Builtin
+title: inline::builtin
+---
+
+# inline::builtin
+
+## Description
+
+Anthropic Messages API adapter that translates to the inference API.
+
+## Sample Configuration
+
+```yaml
+{}
+```
diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml
index 12914642cd..a6f1eb4b65 100644
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@@ -7618,6 +7618,469 @@ components:
           - type: 'null'
       title: AllowedToolsFilter
       description: Filter configuration for restricting which MCP tools can be used.
+    AnthropicCountTokensRequest:
+      properties:
+        model:
+          type: string
+          title: Model
+          description: The model to use for token counting.
+        messages:
+          items:
+            $ref: '#/components/schemas/AnthropicMessage'
+          type: array
+          title: Messages
+          description: The messages to count tokens for.
+        system:
+          anyOf:
+          - type: string
+          - items:
+              $ref: '#/components/schemas/AnthropicTextBlock'
+            type: array
+            title: list[AnthropicTextBlock]
+          - type: 'null'
+          title: string | list[AnthropicTextBlock]
+          description: System prompt.
+        tools:
+          anyOf:
+          - items:
+              $ref: '#/components/schemas/AnthropicToolDef'
+            type: array
+          - type: 'null'
+          description: Tools to include in token count.
+      required:
+      - model
+      - messages
+      title: AnthropicCountTokensRequest
+      description: Request body for POST /v1/messages/count_tokens.
+    AnthropicCountTokensResponse:
+      properties:
+        input_tokens:
+          type: integer
+          title: Input Tokens
+      required:
+      - input_tokens
+      title: AnthropicCountTokensResponse
+      description: Response from POST /v1/messages/count_tokens.
+    AnthropicCreateMessageRequest:
+      properties:
+        model:
+          type: string
+          title: Model
+          description: The model to use for generation.
+        messages:
+          items:
+            $ref: '#/components/schemas/AnthropicMessage'
+          type: array
+          title: Messages
+          description: The messages in the conversation.
+        max_tokens:
+          type: integer
+          minimum: 1.0
+          title: Max Tokens
+          description: The maximum number of tokens to generate.
+        system:
+          anyOf:
+          - type: string
+          - items:
+              $ref: '#/components/schemas/AnthropicTextBlock'
+            type: array
+            title: list[AnthropicTextBlock]
+          - type: 'null'
+          title: string | list[AnthropicTextBlock]
+          description: System prompt. A string or list of text blocks.
+        tools:
+          anyOf:
+          - items:
+              $ref: '#/components/schemas/AnthropicToolDef'
+            type: array
+          - type: 'null'
+          description: Tools available to the model.
+        tool_choice:
+          anyOf:
+          - {}
+          - type: 'null'
+          title: Tool Choice
+          description: "How the model should select tools. One of: 'auto', 'any', 'none', or {type: 'tool', name: '...'}."
+        stream:
+          anyOf:
+          - type: boolean
+          - type: 'null'
+          description: Whether to stream the response.
+          default: false
+        temperature:
+          anyOf:
+          - type: number
+            maximum: 1.0
+            minimum: 0.0
+          - type: 'null'
+          description: Sampling temperature.
+        top_p:
+          anyOf:
+          - type: number
+            maximum: 1.0
+            minimum: 0.0
+          - type: 'null'
+          description: Nucleus sampling parameter.
+        top_k:
+          anyOf:
+          - type: integer
+            minimum: 1.0
+          - type: 'null'
+          description: Top-k sampling parameter.
+        stop_sequences:
+          anyOf:
+          - items:
+              type: string
+            type: array
+          - type: 'null'
+          description: Custom stop sequences.
+        metadata:
+          anyOf:
+          - additionalProperties:
+              type: string
+            type: object
+          - type: 'null'
+          description: Request metadata.
+        thinking:
+          anyOf:
+          - $ref: '#/components/schemas/AnthropicThinkingConfig'
+            title: AnthropicThinkingConfig
+          - type: 'null'
+          description: Extended thinking configuration.
+          title: AnthropicThinkingConfig
+        service_tier:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: Service tier to use.
+      additionalProperties: true
+      required:
+      - model
+      - messages
+      - max_tokens
+      title: AnthropicCreateMessageRequest
+      description: Request body for POST /v1/messages.
+    AnthropicImageBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - image
+        source:
+          $ref: '#/components/schemas/AnthropicImageSource'
+      required:
+      - source
+      title: AnthropicImageBlock
+      description: An image content block.
+    AnthropicImageSource:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - base64
+        media_type:
+          type: string
+          title: Media Type
+          description: MIME type of the image (e.g. image/png).
+        data:
+          type: string
+          title: Data
+          description: Base64-encoded image data.
+      required:
+      - media_type
+      - data
+      title: AnthropicImageSource
+      description: Source for an image content block.
+    AnthropicMessage:
+      properties:
+        role:
+          type: string
+          enum:
+          - user
+          - assistant
+          title: Role
+        content:
+          anyOf:
+          - type: string
+          - items:
+              oneOf:
+              - $ref: '#/components/schemas/AnthropicTextBlock'
+                title: AnthropicTextBlock
+              - $ref: '#/components/schemas/AnthropicImageBlock'
+                title: AnthropicImageBlock
+              - $ref: '#/components/schemas/AnthropicToolUseBlock'
+                title: AnthropicToolUseBlock
+              - $ref: '#/components/schemas/AnthropicToolResultBlock-Input'
+                title: AnthropicToolResultBlock-Input
+              - $ref: '#/components/schemas/AnthropicThinkingBlock'
+                title: AnthropicThinkingBlock
+              discriminator:
+                propertyName: type
+                mapping:
+                  image: '#/components/schemas/AnthropicImageBlock'
+                  text: '#/components/schemas/AnthropicTextBlock'
+                  thinking: '#/components/schemas/AnthropicThinkingBlock'
+                  tool_result: '#/components/schemas/AnthropicToolResultBlock-Input'
+                  tool_use: '#/components/schemas/AnthropicToolUseBlock'
+              title: AnthropicTextBlock | ... (5 variants)
+            type: array
+            title: list[AnthropicTextBlock | AnthropicImageBlock | ...]
+          title: string | list[AnthropicTextBlock | AnthropicImageBlock | ...]
+          description: 'Message content: a string for simple text, or a list of content blocks.'
+      required:
+      - role
+      - content
+      title: AnthropicMessage
+      description: A message in the conversation.
+    AnthropicMessageResponse:
+      properties:
+        id:
+          type: string
+          title: Id
+          description: Unique message ID (msg_ prefix).
+        type:
+          type: string
+          title: Type
+          enum:
+          - message
+        role:
+          type: string
+          title: Role
+          enum:
+          - assistant
+        content:
+          items:
+            oneOf:
+            - $ref: '#/components/schemas/AnthropicTextBlock'
+              title: AnthropicTextBlock
+            - $ref: '#/components/schemas/AnthropicImageBlock'
+              title: AnthropicImageBlock
+            - $ref: '#/components/schemas/AnthropicToolUseBlock'
+              title: AnthropicToolUseBlock
+            - $ref: '#/components/schemas/AnthropicToolResultBlock-Output'
+              title: AnthropicToolResultBlock-Output
+            - $ref: '#/components/schemas/AnthropicThinkingBlock'
+              title: AnthropicThinkingBlock
+            discriminator:
+              propertyName: type
+              mapping:
+                image: '#/components/schemas/AnthropicImageBlock'
+                text: '#/components/schemas/AnthropicTextBlock'
+                thinking: '#/components/schemas/AnthropicThinkingBlock'
+                tool_result: '#/components/schemas/AnthropicToolResultBlock-Output'
+                tool_use: '#/components/schemas/AnthropicToolUseBlock'
+            title: AnthropicTextBlock | ... (5 variants)
+          type: array
+          title: Content
+          description: Response content blocks.
+        model:
+          type: string
+          title: Model
+        stop_reason:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: 'Why the model stopped: end_turn, stop_sequence, tool_use, or max_tokens.'
+        stop_sequence:
+          anyOf:
+          - type: string
+          - type: 'null'
+        usage:
+          $ref: '#/components/schemas/AnthropicUsage'
+      required:
+      - id
+      - content
+      - model
+      title: AnthropicMessageResponse
+      description: Response from POST /v1/messages (non-streaming).
+    AnthropicTextBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - text
+        text:
+          type: string
+          title: Text
+      required:
+      - text
+      title: AnthropicTextBlock
+      description: A text content block.
+    AnthropicThinkingBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - thinking
+        thinking:
+          type: string
+          title: Thinking
+          description: The model's thinking text.
+        signature:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: Signature for the thinking block.
+      required:
+      - thinking
+      title: AnthropicThinkingBlock
+      description: A thinking content block (extended thinking).
+    AnthropicThinkingConfig:
+      properties:
+        type:
+          type: string
+          enum:
+          - enabled
+          - disabled
+          title: Type
+          default: enabled
+        budget_tokens:
+          anyOf:
+          - type: integer
+            minimum: 1.0
+          - type: 'null'
+          description: Maximum tokens for thinking.
+      title: AnthropicThinkingConfig
+      description: Configuration for extended thinking.
+    AnthropicToolDef:
+      properties:
+        name:
+          type: string
+          title: Name
+        description:
+          anyOf:
+          - type: string
+          - type: 'null'
+        input_schema:
+          additionalProperties: true
+          type: object
+          title: Input Schema
+          description: JSON Schema for the tool's input.
+      required:
+      - name
+      - input_schema
+      title: AnthropicToolDef
+      description: Definition of a tool available to the model.
+    AnthropicToolResultBlock-Input:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - tool_result
+        tool_use_id:
+          type: string
+          title: Tool Use Id
+          description: The ID of the tool_use block this result corresponds to.
+        content:
+          anyOf:
+          - type: string
+          - items:
+              anyOf:
+              - $ref: '#/components/schemas/AnthropicTextBlock'
+                title: AnthropicTextBlock
+              - $ref: '#/components/schemas/AnthropicImageBlock'
+                title: AnthropicImageBlock
+              title: AnthropicTextBlock | AnthropicImageBlock
+            type: array
+            title: list[AnthropicTextBlock | AnthropicImageBlock]
+          title: string | list[AnthropicTextBlock | AnthropicImageBlock]
+          description: The result content.
+          default: ''
+        is_error:
+          anyOf:
+          - type: boolean
+          - type: 'null'
+          description: Whether the tool call resulted in an error.
+      required:
+      - tool_use_id
+      title: AnthropicToolResultBlock
+      description: A tool result content block in a user message.
+    AnthropicToolResultBlock-Output:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - tool_result
+        tool_use_id:
+          type: string
+          title: Tool Use Id
+          description: The ID of the tool_use block this result corresponds to.
+        content:
+          anyOf:
+          - type: string
+          - items:
+              anyOf:
+              - $ref: '#/components/schemas/AnthropicTextBlock'
+                title: AnthropicTextBlock
+              - $ref: '#/components/schemas/AnthropicImageBlock'
+                title: AnthropicImageBlock
+              title: AnthropicTextBlock | AnthropicImageBlock
+            type: array
+            title: list[AnthropicTextBlock | AnthropicImageBlock]
+          title: string | list[AnthropicTextBlock | AnthropicImageBlock]
+          description: The result content.
+          default: ''
+        is_error:
+          anyOf:
+          - type: boolean
+          - type: 'null'
+          description: Whether the tool call resulted in an error.
+      required:
+      - tool_use_id
+      title: AnthropicToolResultBlock
+      description: A tool result content block in a user message.
+    AnthropicToolUseBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - tool_use
+        id:
+          type: string
+          title: Id
+          description: Unique ID for this tool invocation.
+        name:
+          type: string
+          title: Name
+          description: Name of the tool being called.
+        input:
+          additionalProperties: true
+          type: object
+          title: Input
+          description: Tool input arguments.
+      required:
+      - id
+      - name
+      - input
+      title: AnthropicToolUseBlock
+      description: A tool use content block in an assistant message.
+    AnthropicUsage:
+      properties:
+        input_tokens:
+          type: integer
+          title: Input Tokens
+          default: 0
+        output_tokens:
+          type: integer
+          title: Output Tokens
+          default: 0
+        cache_creation_input_tokens:
+          anyOf:
+          - type: integer
+          - type: 'null'
+        cache_read_input_tokens:
+          anyOf:
+          - type: integer
+          - type: 'null'
+      title: AnthropicUsage
+      description: Token usage statistics.
     ApprovalFilter:
       properties:
         always:
@@ -9899,6 +10362,7 @@ components:
       - prompts
       - conversations
       - connectors
+      - messages
       - inspect
       - admin
       title: Api
diff --git a/docs/static/experimental-llama-stack-spec.yaml b/docs/static/experimental-llama-stack-spec.yaml
index 5e834f4308..0d3b517e83 100644
--- a/docs/static/experimental-llama-stack-spec.yaml
+++ b/docs/static/experimental-llama-stack-spec.yaml
@@ -10104,6 +10104,7 @@ components:
       - prompts
       - conversations
       - connectors
+      - messages
       - inspect
       - admin
       title: Api
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index 033219262a..35eb28e857 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -3305,6 +3305,71 @@ paths:
 
           response = client.responses.cancel("resp_abc123")
           print(response)
+  /v1/messages:
+    post:
+      responses:
+        '200':
+          description: An AnthropicMessageResponse or a stream of Anthropic SSE events.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/AnthropicMessageResponse'
+            text/event-stream: {}
+        '400':
+          description: Bad Request
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          description: Too Many Requests
+          $ref: '#/components/responses/TooManyRequests429'
+        '500':
+          description: Internal Server Error
+          $ref: '#/components/responses/InternalServerError500'
+        default:
+          description: Default Response
+          $ref: '#/components/responses/DefaultError'
+      tags:
+      - Messages
+      summary: Create a message.
+      description: Create a message using the Anthropic Messages API format.
+      operationId: create_message_v1_messages_post
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/AnthropicCreateMessageRequest'
+        required: true
+  /v1/messages/count_tokens:
+    post:
+      responses:
+        '200':
+          description: Token count for the request.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/AnthropicCountTokensResponse'
+        '400':
+          description: Bad Request
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          description: Too Many Requests
+          $ref: '#/components/responses/TooManyRequests429'
+        '500':
+          description: Internal Server Error
+          $ref: '#/components/responses/InternalServerError500'
+        default:
+          description: Default Response
+          $ref: '#/components/responses/DefaultError'
+      tags:
+      - Messages
+      summary: Count tokens in a message.
+      description: Count the number of tokens in a message request.
+      operationId: count_message_tokens_v1_messages_count_tokens_post
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/AnthropicCountTokensRequest'
+        required: true
 components:
   schemas:
     Error:
@@ -10655,6 +10720,469 @@ components:
           - type: 'null'
       title: AllowedToolsFilter
       description: Filter configuration for restricting which MCP tools can be used.
+    AnthropicCountTokensRequest:
+      properties:
+        model:
+          type: string
+          title: Model
+          description: The model to use for token counting.
+        messages:
+          items:
+            $ref: '#/components/schemas/AnthropicMessage'
+          type: array
+          title: Messages
+          description: The messages to count tokens for.
+        system:
+          anyOf:
+          - type: string
+          - items:
+              $ref: '#/components/schemas/AnthropicTextBlock'
+            type: array
+            title: list[AnthropicTextBlock]
+          - type: 'null'
+          title: string | list[AnthropicTextBlock]
+          description: System prompt.
+        tools:
+          anyOf:
+          - items:
+              $ref: '#/components/schemas/AnthropicToolDef'
+            type: array
+          - type: 'null'
+          description: Tools to include in token count.
+      required:
+      - model
+      - messages
+      title: AnthropicCountTokensRequest
+      description: Request body for POST /v1/messages/count_tokens.
+    AnthropicCountTokensResponse:
+      properties:
+        input_tokens:
+          type: integer
+          title: Input Tokens
+      required:
+      - input_tokens
+      title: AnthropicCountTokensResponse
+      description: Response from POST /v1/messages/count_tokens.
+    AnthropicCreateMessageRequest:
+      properties:
+        model:
+          type: string
+          title: Model
+          description: The model to use for generation.
+        messages:
+          items:
+            $ref: '#/components/schemas/AnthropicMessage'
+          type: array
+          title: Messages
+          description: The messages in the conversation.
+        max_tokens:
+          type: integer
+          minimum: 1.0
+          title: Max Tokens
+          description: The maximum number of tokens to generate.
+        system:
+          anyOf:
+          - type: string
+          - items:
+              $ref: '#/components/schemas/AnthropicTextBlock'
+            type: array
+            title: list[AnthropicTextBlock]
+          - type: 'null'
+          title: string | list[AnthropicTextBlock]
+          description: System prompt. A string or list of text blocks.
+        tools:
+          anyOf:
+          - items:
+              $ref: '#/components/schemas/AnthropicToolDef'
+            type: array
+          - type: 'null'
+          description: Tools available to the model.
+        tool_choice:
+          anyOf:
+          - {}
+          - type: 'null'
+          title: Tool Choice
+          description: "How the model should select tools. One of: 'auto', 'any', 'none', or {type: 'tool', name: '...'}."
+        stream:
+          anyOf:
+          - type: boolean
+          - type: 'null'
+          description: Whether to stream the response.
+          default: false
+        temperature:
+          anyOf:
+          - type: number
+            maximum: 1.0
+            minimum: 0.0
+          - type: 'null'
+          description: Sampling temperature.
+        top_p:
+          anyOf:
+          - type: number
+            maximum: 1.0
+            minimum: 0.0
+          - type: 'null'
+          description: Nucleus sampling parameter.
+        top_k:
+          anyOf:
+          - type: integer
+            minimum: 1.0
+          - type: 'null'
+          description: Top-k sampling parameter.
+        stop_sequences:
+          anyOf:
+          - items:
+              type: string
+            type: array
+          - type: 'null'
+          description: Custom stop sequences.
+        metadata:
+          anyOf:
+          - additionalProperties:
+              type: string
+            type: object
+          - type: 'null'
+          description: Request metadata.
+        thinking:
+          anyOf:
+          - $ref: '#/components/schemas/AnthropicThinkingConfig'
+            title: AnthropicThinkingConfig
+          - type: 'null'
+          description: Extended thinking configuration.
+          title: AnthropicThinkingConfig
+        service_tier:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: Service tier to use.
+      additionalProperties: true
+      required:
+      - model
+      - messages
+      - max_tokens
+      title: AnthropicCreateMessageRequest
+      description: Request body for POST /v1/messages.
+    AnthropicImageBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - image
+        source:
+          $ref: '#/components/schemas/AnthropicImageSource'
+      required:
+      - source
+      title: AnthropicImageBlock
+      description: An image content block.
+    AnthropicImageSource:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - base64
+        media_type:
+          type: string
+          title: Media Type
+          description: MIME type of the image (e.g. image/png).
+        data:
+          type: string
+          title: Data
+          description: Base64-encoded image data.
+      required:
+      - media_type
+      - data
+      title: AnthropicImageSource
+      description: Source for an image content block.
+    AnthropicMessage:
+      properties:
+        role:
+          type: string
+          enum:
+          - user
+          - assistant
+          title: Role
+        content:
+          anyOf:
+          - type: string
+          - items:
+              oneOf:
+              - $ref: '#/components/schemas/AnthropicTextBlock'
+                title: AnthropicTextBlock
+              - $ref: '#/components/schemas/AnthropicImageBlock'
+                title: AnthropicImageBlock
+              - $ref: '#/components/schemas/AnthropicToolUseBlock'
+                title: AnthropicToolUseBlock
+              - $ref: '#/components/schemas/AnthropicToolResultBlock-Input'
+                title: AnthropicToolResultBlock-Input
+              - $ref: '#/components/schemas/AnthropicThinkingBlock'
+                title: AnthropicThinkingBlock
+              discriminator:
+                propertyName: type
+                mapping:
+                  image: '#/components/schemas/AnthropicImageBlock'
+                  text: '#/components/schemas/AnthropicTextBlock'
+                  thinking: '#/components/schemas/AnthropicThinkingBlock'
+                  tool_result: '#/components/schemas/AnthropicToolResultBlock-Input'
+                  tool_use: '#/components/schemas/AnthropicToolUseBlock'
+              title: AnthropicTextBlock | ... (5 variants)
+            type: array
+            title: list[AnthropicTextBlock | AnthropicImageBlock | ...]
+          title: string | list[AnthropicTextBlock | AnthropicImageBlock | ...]
+          description: 'Message content: a string for simple text, or a list of content blocks.'
+      required:
+      - role
+      - content
+      title: AnthropicMessage
+      description: A message in the conversation.
+    AnthropicMessageResponse:
+      properties:
+        id:
+          type: string
+          title: Id
+          description: Unique message ID (msg_ prefix).
+        type:
+          type: string
+          title: Type
+          enum:
+          - message
+        role:
+          type: string
+          title: Role
+          enum:
+          - assistant
+        content:
+          items:
+            oneOf:
+            - $ref: '#/components/schemas/AnthropicTextBlock'
+              title: AnthropicTextBlock
+            - $ref: '#/components/schemas/AnthropicImageBlock'
+              title: AnthropicImageBlock
+            - $ref: '#/components/schemas/AnthropicToolUseBlock'
+              title: AnthropicToolUseBlock
+            - $ref: '#/components/schemas/AnthropicToolResultBlock-Output'
+              title: AnthropicToolResultBlock-Output
+            - $ref: '#/components/schemas/AnthropicThinkingBlock'
+              title: AnthropicThinkingBlock
+            discriminator:
+              propertyName: type
+              mapping:
+                image: '#/components/schemas/AnthropicImageBlock'
+                text: '#/components/schemas/AnthropicTextBlock'
+                thinking: '#/components/schemas/AnthropicThinkingBlock'
+                tool_result: '#/components/schemas/AnthropicToolResultBlock-Output'
+                tool_use: '#/components/schemas/AnthropicToolUseBlock'
+            title: AnthropicTextBlock | ... (5 variants)
+          type: array
+          title: Content
+          description: Response content blocks.
+        model:
+          type: string
+          title: Model
+        stop_reason:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: 'Why the model stopped: end_turn, stop_sequence, tool_use, or max_tokens.'
+        stop_sequence:
+          anyOf:
+          - type: string
+          - type: 'null'
+        usage:
+          $ref: '#/components/schemas/AnthropicUsage'
+      required:
+      - id
+      - content
+      - model
+      title: AnthropicMessageResponse
+      description: Response from POST /v1/messages (non-streaming).
+    AnthropicTextBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - text
+        text:
+          type: string
+          title: Text
+      required:
+      - text
+      title: AnthropicTextBlock
+      description: A text content block.
+    AnthropicThinkingBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - thinking
+        thinking:
+          type: string
+          title: Thinking
+          description: The model's thinking text.
+        signature:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: Signature for the thinking block.
+      required:
+      - thinking
+      title: AnthropicThinkingBlock
+      description: A thinking content block (extended thinking).
+    AnthropicThinkingConfig:
+      properties:
+        type:
+          type: string
+          enum:
+          - enabled
+          - disabled
+          title: Type
+          default: enabled
+        budget_tokens:
+          anyOf:
+          - type: integer
+            minimum: 1.0
+          - type: 'null'
+          description: Maximum tokens for thinking.
+      title: AnthropicThinkingConfig
+      description: Configuration for extended thinking.
+    AnthropicToolDef:
+      properties:
+        name:
+          type: string
+          title: Name
+        description:
+          anyOf:
+          - type: string
+          - type: 'null'
+        input_schema:
+          additionalProperties: true
+          type: object
+          title: Input Schema
+          description: JSON Schema for the tool's input.
+      required:
+      - name
+      - input_schema
+      title: AnthropicToolDef
+      description: Definition of a tool available to the model.
+    AnthropicToolResultBlock-Input:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - tool_result
+        tool_use_id:
+          type: string
+          title: Tool Use Id
+          description: The ID of the tool_use block this result corresponds to.
+        content:
+          anyOf:
+          - type: string
+          - items:
+              anyOf:
+              - $ref: '#/components/schemas/AnthropicTextBlock'
+                title: AnthropicTextBlock
+              - $ref: '#/components/schemas/AnthropicImageBlock'
+                title: AnthropicImageBlock
+              title: AnthropicTextBlock | AnthropicImageBlock
+            type: array
+            title: list[AnthropicTextBlock | AnthropicImageBlock]
+          title: string | list[AnthropicTextBlock | AnthropicImageBlock]
+          description: The result content.
+          default: ''
+        is_error:
+          anyOf:
+          - type: boolean
+          - type: 'null'
+          description: Whether the tool call resulted in an error.
+      required:
+      - tool_use_id
+      title: AnthropicToolResultBlock
+      description: A tool result content block in a user message.
+    AnthropicToolResultBlock-Output:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - tool_result
+        tool_use_id:
+          type: string
+          title: Tool Use Id
+          description: The ID of the tool_use block this result corresponds to.
+        content:
+          anyOf:
+          - type: string
+          - items:
+              anyOf:
+              - $ref: '#/components/schemas/AnthropicTextBlock'
+                title: AnthropicTextBlock
+              - $ref: '#/components/schemas/AnthropicImageBlock'
+                title: AnthropicImageBlock
+              title: AnthropicTextBlock | AnthropicImageBlock
+            type: array
+            title: list[AnthropicTextBlock | AnthropicImageBlock]
+          title: string | list[AnthropicTextBlock | AnthropicImageBlock]
+          description: The result content.
+          default: ''
+        is_error:
+          anyOf:
+          - type: boolean
+          - type: 'null'
+          description: Whether the tool call resulted in an error.
+      required:
+      - tool_use_id
+      title: AnthropicToolResultBlock
+      description: A tool result content block in a user message.
+    AnthropicToolUseBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - tool_use
+        id:
+          type: string
+          title: Id
+          description: Unique ID for this tool invocation.
+        name:
+          type: string
+          title: Name
+          description: Name of the tool being called.
+        input:
+          additionalProperties: true
+          type: object
+          title: Input
+          description: Tool input arguments.
+      required:
+      - id
+      - name
+      - input
+      title: AnthropicToolUseBlock
+      description: A tool use content block in an assistant message.
+    AnthropicUsage:
+      properties:
+        input_tokens:
+          type: integer
+          title: Input Tokens
+          default: 0
+        output_tokens:
+          type: integer
+          title: Output Tokens
+          default: 0
+        cache_creation_input_tokens:
+          anyOf:
+          - type: integer
+          - type: 'null'
+        cache_read_input_tokens:
+          anyOf:
+          - type: integer
+          - type: 'null'
+      title: AnthropicUsage
+      description: Token usage statistics.
     ApprovalFilter:
       properties:
         always:
@@ -12911,6 +13439,7 @@ components:
       - prompts
       - conversations
       - connectors
+      - messages
       - inspect
       - admin
       title: Api
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 06e361debc..f7761555fc 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -4357,6 +4357,71 @@ paths:
           description: Authorization token
           title: Authorization
         description: Authorization token
+  /v1/messages:
+    post:
+      responses:
+        '200':
+          description: An AnthropicMessageResponse or a stream of Anthropic SSE events.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/AnthropicMessageResponse'
+            text/event-stream: {}
+        '400':
+          description: Bad Request
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          description: Too Many Requests
+          $ref: '#/components/responses/TooManyRequests429'
+        '500':
+          description: Internal Server Error
+          $ref: '#/components/responses/InternalServerError500'
+        default:
+          description: Default Response
+          $ref: '#/components/responses/DefaultError'
+      tags:
+      - Messages
+      summary: Create a message.
+      description: Create a message using the Anthropic Messages API format.
+      operationId: create_message_v1_messages_post
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/AnthropicCreateMessageRequest'
+        required: true
+  /v1/messages/count_tokens:
+    post:
+      responses:
+        '200':
+          description: Token count for the request.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/AnthropicCountTokensResponse'
+        '400':
+          description: Bad Request
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          description: Too Many Requests
+          $ref: '#/components/responses/TooManyRequests429'
+        '500':
+          description: Internal Server Error
+          $ref: '#/components/responses/InternalServerError500'
+        default:
+          description: Default Response
+          $ref: '#/components/responses/DefaultError'
+      tags:
+      - Messages
+      summary: Count tokens in a message.
+      description: Count the number of tokens in a message request.
+      operationId: count_message_tokens_v1_messages_count_tokens_post
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/AnthropicCountTokensRequest'
+        required: true
 components:
   schemas:
     Error:
@@ -11707,6 +11772,469 @@ components:
           - type: 'null'
       title: AllowedToolsFilter
       description: Filter configuration for restricting which MCP tools can be used.
+    AnthropicCountTokensRequest:
+      properties:
+        model:
+          type: string
+          title: Model
+          description: The model to use for token counting.
+        messages:
+          items:
+            $ref: '#/components/schemas/AnthropicMessage'
+          type: array
+          title: Messages
+          description: The messages to count tokens for.
+        system:
+          anyOf:
+          - type: string
+          - items:
+              $ref: '#/components/schemas/AnthropicTextBlock'
+            type: array
+            title: list[AnthropicTextBlock]
+          - type: 'null'
+          title: string | list[AnthropicTextBlock]
+          description: System prompt.
+        tools:
+          anyOf:
+          - items:
+              $ref: '#/components/schemas/AnthropicToolDef'
+            type: array
+          - type: 'null'
+          description: Tools to include in token count.
+      required:
+      - model
+      - messages
+      title: AnthropicCountTokensRequest
+      description: Request body for POST /v1/messages/count_tokens.
+    AnthropicCountTokensResponse:
+      properties:
+        input_tokens:
+          type: integer
+          title: Input Tokens
+      required:
+      - input_tokens
+      title: AnthropicCountTokensResponse
+      description: Response from POST /v1/messages/count_tokens.
+    AnthropicCreateMessageRequest:
+      properties:
+        model:
+          type: string
+          title: Model
+          description: The model to use for generation.
+        messages:
+          items:
+            $ref: '#/components/schemas/AnthropicMessage'
+          type: array
+          title: Messages
+          description: The messages in the conversation.
+        max_tokens:
+          type: integer
+          minimum: 1.0
+          title: Max Tokens
+          description: The maximum number of tokens to generate.
+        system:
+          anyOf:
+          - type: string
+          - items:
+              $ref: '#/components/schemas/AnthropicTextBlock'
+            type: array
+            title: list[AnthropicTextBlock]
+          - type: 'null'
+          title: string | list[AnthropicTextBlock]
+          description: System prompt. A string or list of text blocks.
+        tools:
+          anyOf:
+          - items:
+              $ref: '#/components/schemas/AnthropicToolDef'
+            type: array
+          - type: 'null'
+          description: Tools available to the model.
+        tool_choice:
+          anyOf:
+          - {}
+          - type: 'null'
+          title: Tool Choice
+          description: "How the model should select tools. One of: 'auto', 'any', 'none', or {type: 'tool', name: '...'}."
+        stream:
+          anyOf:
+          - type: boolean
+          - type: 'null'
+          description: Whether to stream the response.
+          default: false
+        temperature:
+          anyOf:
+          - type: number
+            maximum: 1.0
+            minimum: 0.0
+          - type: 'null'
+          description: Sampling temperature.
+        top_p:
+          anyOf:
+          - type: number
+            maximum: 1.0
+            minimum: 0.0
+          - type: 'null'
+          description: Nucleus sampling parameter.
+        top_k:
+          anyOf:
+          - type: integer
+            minimum: 1.0
+          - type: 'null'
+          description: Top-k sampling parameter.
+        stop_sequences:
+          anyOf:
+          - items:
+              type: string
+            type: array
+          - type: 'null'
+          description: Custom stop sequences.
+        metadata:
+          anyOf:
+          - additionalProperties:
+              type: string
+            type: object
+          - type: 'null'
+          description: Request metadata.
+        thinking:
+          anyOf:
+          - $ref: '#/components/schemas/AnthropicThinkingConfig'
+            title: AnthropicThinkingConfig
+          - type: 'null'
+          description: Extended thinking configuration.
+          title: AnthropicThinkingConfig
+        service_tier:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: Service tier to use.
+      additionalProperties: true
+      required:
+      - model
+      - messages
+      - max_tokens
+      title: AnthropicCreateMessageRequest
+      description: Request body for POST /v1/messages.
+    AnthropicImageBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - image
+        source:
+          $ref: '#/components/schemas/AnthropicImageSource'
+      required:
+      - source
+      title: AnthropicImageBlock
+      description: An image content block.
+    AnthropicImageSource:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - base64
+        media_type:
+          type: string
+          title: Media Type
+          description: MIME type of the image (e.g. image/png).
+        data:
+          type: string
+          title: Data
+          description: Base64-encoded image data.
+      required:
+      - media_type
+      - data
+      title: AnthropicImageSource
+      description: Source for an image content block.
+    AnthropicMessage:
+      properties:
+        role:
+          type: string
+          enum:
+          - user
+          - assistant
+          title: Role
+        content:
+          anyOf:
+          - type: string
+          - items:
+              oneOf:
+              - $ref: '#/components/schemas/AnthropicTextBlock'
+                title: AnthropicTextBlock
+              - $ref: '#/components/schemas/AnthropicImageBlock'
+                title: AnthropicImageBlock
+              - $ref: '#/components/schemas/AnthropicToolUseBlock'
+                title: AnthropicToolUseBlock
+              - $ref: '#/components/schemas/AnthropicToolResultBlock-Input'
+                title: AnthropicToolResultBlock-Input
+              - $ref: '#/components/schemas/AnthropicThinkingBlock'
+                title: AnthropicThinkingBlock
+              discriminator:
+                propertyName: type
+                mapping:
+                  image: '#/components/schemas/AnthropicImageBlock'
+                  text: '#/components/schemas/AnthropicTextBlock'
+                  thinking: '#/components/schemas/AnthropicThinkingBlock'
+                  tool_result: '#/components/schemas/AnthropicToolResultBlock-Input'
+                  tool_use: '#/components/schemas/AnthropicToolUseBlock'
+              title: AnthropicTextBlock | ... (5 variants)
+            type: array
+            title: list[AnthropicTextBlock | AnthropicImageBlock | ...]
+          title: string | list[AnthropicTextBlock | AnthropicImageBlock | ...]
+          description: 'Message content: a string for simple text, or a list of content blocks.'
+      required:
+      - role
+      - content
+      title: AnthropicMessage
+      description: A message in the conversation.
+    AnthropicMessageResponse:
+      properties:
+        id:
+          type: string
+          title: Id
+          description: Unique message ID (msg_ prefix).
+        type:
+          type: string
+          title: Type
+          enum:
+          - message
+        role:
+          type: string
+          title: Role
+          enum:
+          - assistant
+        content:
+          items:
+            oneOf:
+            - $ref: '#/components/schemas/AnthropicTextBlock'
+              title: AnthropicTextBlock
+            - $ref: '#/components/schemas/AnthropicImageBlock'
+              title: AnthropicImageBlock
+            - $ref: '#/components/schemas/AnthropicToolUseBlock'
+              title: AnthropicToolUseBlock
+            - $ref: '#/components/schemas/AnthropicToolResultBlock-Output'
+              title: AnthropicToolResultBlock-Output
+            - $ref: '#/components/schemas/AnthropicThinkingBlock'
+              title: AnthropicThinkingBlock
+            discriminator:
+              propertyName: type
+              mapping:
+                image: '#/components/schemas/AnthropicImageBlock'
+                text: '#/components/schemas/AnthropicTextBlock'
+                thinking: '#/components/schemas/AnthropicThinkingBlock'
+                tool_result: '#/components/schemas/AnthropicToolResultBlock-Output'
+                tool_use: '#/components/schemas/AnthropicToolUseBlock'
+            title: AnthropicTextBlock | ... (5 variants)
+          type: array
+          title: Content
+          description: Response content blocks.
+        model:
+          type: string
+          title: Model
+        stop_reason:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: 'Why the model stopped: end_turn, stop_sequence, tool_use, or max_tokens.'
+        stop_sequence:
+          anyOf:
+          - type: string
+          - type: 'null'
+        usage:
+          $ref: '#/components/schemas/AnthropicUsage'
+      required:
+      - id
+      - content
+      - model
+      title: AnthropicMessageResponse
+      description: Response from POST /v1/messages (non-streaming).
+    AnthropicTextBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - text
+        text:
+          type: string
+          title: Text
+      required:
+      - text
+      title: AnthropicTextBlock
+      description: A text content block.
+    AnthropicThinkingBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - thinking
+        thinking:
+          type: string
+          title: Thinking
+          description: The model's thinking text.
+        signature:
+          anyOf:
+          - type: string
+          - type: 'null'
+          description: Signature for the thinking block.
+      required:
+      - thinking
+      title: AnthropicThinkingBlock
+      description: A thinking content block (extended thinking).
+    AnthropicThinkingConfig:
+      properties:
+        type:
+          type: string
+          enum:
+          - enabled
+          - disabled
+          title: Type
+          default: enabled
+        budget_tokens:
+          anyOf:
+          - type: integer
+            minimum: 1.0
+          - type: 'null'
+          description: Maximum tokens for thinking.
+      title: AnthropicThinkingConfig
+      description: Configuration for extended thinking.
+    AnthropicToolDef:
+      properties:
+        name:
+          type: string
+          title: Name
+        description:
+          anyOf:
+          - type: string
+          - type: 'null'
+        input_schema:
+          additionalProperties: true
+          type: object
+          title: Input Schema
+          description: JSON Schema for the tool's input.
+      required:
+      - name
+      - input_schema
+      title: AnthropicToolDef
+      description: Definition of a tool available to the model.
+    AnthropicToolResultBlock-Input:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - tool_result
+        tool_use_id:
+          type: string
+          title: Tool Use Id
+          description: The ID of the tool_use block this result corresponds to.
+        content:
+          anyOf:
+          - type: string
+          - items:
+              anyOf:
+              - $ref: '#/components/schemas/AnthropicTextBlock'
+                title: AnthropicTextBlock
+              - $ref: '#/components/schemas/AnthropicImageBlock'
+                title: AnthropicImageBlock
+              title: AnthropicTextBlock | AnthropicImageBlock
+            type: array
+            title: list[AnthropicTextBlock | AnthropicImageBlock]
+          title: string | list[AnthropicTextBlock | AnthropicImageBlock]
+          description: The result content.
+          default: ''
+        is_error:
+          anyOf:
+          - type: boolean
+          - type: 'null'
+          description: Whether the tool call resulted in an error.
+      required:
+      - tool_use_id
+      title: AnthropicToolResultBlock
+      description: A tool result content block in a user message.
+    AnthropicToolResultBlock-Output:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - tool_result
+        tool_use_id:
+          type: string
+          title: Tool Use Id
+          description: The ID of the tool_use block this result corresponds to.
+        content:
+          anyOf:
+          - type: string
+          - items:
+              anyOf:
+              - $ref: '#/components/schemas/AnthropicTextBlock'
+                title: AnthropicTextBlock
+              - $ref: '#/components/schemas/AnthropicImageBlock'
+                title: AnthropicImageBlock
+              title: AnthropicTextBlock | AnthropicImageBlock
+            type: array
+            title: list[AnthropicTextBlock | AnthropicImageBlock]
+          title: string | list[AnthropicTextBlock | AnthropicImageBlock]
+          description: The result content.
+          default: ''
+        is_error:
+          anyOf:
+          - type: boolean
+          - type: 'null'
+          description: Whether the tool call resulted in an error.
+      required:
+      - tool_use_id
+      title: AnthropicToolResultBlock
+      description: A tool result content block in a user message.
+    AnthropicToolUseBlock:
+      properties:
+        type:
+          type: string
+          title: Type
+          enum:
+          - tool_use
+        id:
+          type: string
+          title: Id
+          description: Unique ID for this tool invocation.
+        name:
+          type: string
+          title: Name
+          description: Name of the tool being called.
+        input:
+          additionalProperties: true
+          type: object
+          title: Input
+          description: Tool input arguments.
+      required:
+      - id
+      - name
+      - input
+      title: AnthropicToolUseBlock
+      description: A tool use content block in an assistant message.
+    AnthropicUsage:
+      properties:
+        input_tokens:
+          type: integer
+          title: Input Tokens
+          default: 0
+        output_tokens:
+          type: integer
+          title: Output Tokens
+          default: 0
+        cache_creation_input_tokens:
+          anyOf:
+          - type: integer
+          - type: 'null'
+        cache_read_input_tokens:
+          anyOf:
+          - type: integer
+          - type: 'null'
+      title: AnthropicUsage
+      description: Token usage statistics.
     ApprovalFilter:
       properties:
         always:
@@ -13986,6 +14514,7 @@ components:
       - prompts
       - conversations
       - connectors
+      - messages
       - inspect
       - admin
       title: Api
diff --git a/tests/unit/providers/inline/messages/__init__.py b/tests/unit/providers/inline/messages/__init__.py
new file mode 100644
index 0000000000..756f351d88
--- /dev/null
+++ b/tests/unit/providers/inline/messages/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/unit/providers/inline/messages/test_impl.py b/tests/unit/providers/inline/messages/test_impl.py
new file mode 100644
index 0000000000..6afc3df68b
--- /dev/null
+++ b/tests/unit/providers/inline/messages/test_impl.py
@@ -0,0 +1,353 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""Unit tests for the BuiltinMessagesImpl translation logic."""
+
+import json
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from llama_stack.providers.inline.messages.config import MessagesConfig
+from llama_stack.providers.inline.messages.impl import BuiltinMessagesImpl
+from llama_stack_api.messages.models import (
+    AnthropicCreateMessageRequest,
+    AnthropicImageBlock,
+    AnthropicImageSource,
+    AnthropicMessage,
+    AnthropicTextBlock,
+    AnthropicThinkingConfig,
+    AnthropicToolDef,
+    AnthropicToolResultBlock,
+    AnthropicToolUseBlock,
+)
+
+
+def _msg_to_dict(msg):
+    """Convert a Pydantic message model to dict for easy assertion."""
+    if hasattr(msg, "model_dump"):
+        return msg.model_dump(exclude_none=True)
+    return dict(msg)
+
+
+@pytest.fixture
+def impl():
+    mock_inference = AsyncMock()
+    return BuiltinMessagesImpl(config=MessagesConfig(), inference_api=mock_inference)
+
+
+class TestRequestTranslation:
+    def test_simple_text_message(self, impl):
+        request = AnthropicCreateMessageRequest(
+            model="claude-sonnet-4-20250514",
+            messages=[AnthropicMessage(role="user", content="Hello")],
+            max_tokens=100,
+        )
+        result = impl._anthropic_to_openai(request)
+
+        assert result.model == "claude-sonnet-4-20250514"
+        assert result.max_tokens == 100
+        assert len(result.messages) == 1
+        m = _msg_to_dict(result.messages[0])
+        assert m["role"] == "user"
+        assert m["content"] == "Hello"
+
+    def test_system_string(self, impl):
+        request = AnthropicCreateMessageRequest(
+            model="m",
+            messages=[AnthropicMessage(role="user", content="Hi")],
+            max_tokens=100,
+            system="You are helpful.",
+        )
+        result = impl._anthropic_to_openai(request)
+
+        m0 = _msg_to_dict(result.messages[0])
+        m1 = _msg_to_dict(result.messages[1])
+        assert m0["role"] == "system"
+        assert m0["content"] == "You are helpful."
+        assert m1["role"] == "user"
+
+    def test_system_text_blocks(self, impl):
+        request = AnthropicCreateMessageRequest(
+            model="m",
+            messages=[AnthropicMessage(role="user", content="Hi")],
+            max_tokens=100,
+            system=[
+                AnthropicTextBlock(text="Line 1."),
+                AnthropicTextBlock(text="Line 2."),
+            ],
+        )
+        result = impl._anthropic_to_openai(request)
+
+        m0 = _msg_to_dict(result.messages[0])
+        assert m0["role"] == "system"
+        assert m0["content"] == "Line 1.\nLine 2."
+
+    def test_tool_definitions(self, impl):
+        request = AnthropicCreateMessageRequest(
+            model="m",
+            messages=[AnthropicMessage(role="user", content="Hi")],
+            max_tokens=100,
+            tools=[
+                AnthropicToolDef(
+                    name="get_weather",
+                    description="Get weather",
+                    input_schema={"type": "object", "properties": {"location": {"type": "string"}}},
+                ),
+            ],
+        )
+        result = impl._anthropic_to_openai(request)
+
+        assert len(result.tools) == 1
+        tool = result.tools[0]
+        assert tool["type"] == "function"
+        assert tool["function"]["name"] == "get_weather"
+        assert tool["function"]["parameters"]["type"] == "object"
+
+    def test_tool_choice_any(self, impl):
+        assert impl._convert_tool_choice_to_openai("any") == "required"
+
+    def test_tool_choice_none(self, impl):
+        assert impl._convert_tool_choice_to_openai("none") == "none"
+
+    def test_tool_choice_auto(self, impl):
+        assert impl._convert_tool_choice_to_openai("auto") == "auto"
+
+    def test_tool_choice_specific(self, impl):
+        result = impl._convert_tool_choice_to_openai({"type": "tool", "name": "get_weather"})
+        assert result == {"type": "function", "function": {"name": "get_weather"}}
+
+    def test_stop_sequences(self, impl):
+        request = AnthropicCreateMessageRequest(
+            model="m",
+            messages=[AnthropicMessage(role="user", content="Hi")],
+            max_tokens=100,
+            stop_sequences=["STOP", "END"],
+        )
+        result = impl._anthropic_to_openai(request)
+        assert result.stop == ["STOP", "END"]
+
+    def test_tool_use_in_assistant_message(self, impl):
+        request = AnthropicCreateMessageRequest(
+            model="m",
+            messages=[
+                AnthropicMessage(
+                    role="assistant",
+                    content=[
+                        AnthropicTextBlock(text="Let me check the weather."),
+                        AnthropicToolUseBlock(
+                            id="toolu_123",
+                            name="get_weather",
+                            input={"location": "SF"},
+                        ),
+                    ],
+                ),
+            ],
+            max_tokens=100,
+        )
+        result = impl._anthropic_to_openai(request)
+
+        msg = _msg_to_dict(result.messages[0])
+        assert msg["role"] == "assistant"
+        assert msg["content"] == "Let me check the weather."
+        assert len(msg["tool_calls"]) == 1
+        assert msg["tool_calls"][0]["id"] == "toolu_123"
+        assert msg["tool_calls"][0]["function"]["name"] == "get_weather"
+        assert json.loads(msg["tool_calls"][0]["function"]["arguments"]) == {"location": "SF"}
+
+    def test_tool_result_in_user_message(self, impl):
+        request = AnthropicCreateMessageRequest(
+            model="m",
+            messages=[
+                AnthropicMessage(
+                    role="user",
+                    content=[
+                        AnthropicToolResultBlock(
+                            tool_use_id="toolu_123",
+                            content="72F and sunny",
+                        ),
+                    ],
+                ),
+            ],
+            max_tokens=100,
+        )
+        result = impl._anthropic_to_openai(request)
+
+        msg = _msg_to_dict(result.messages[0])
+        assert msg["role"] == "tool"
+        assert msg["tool_call_id"] == "toolu_123"
+        assert msg["content"] == "72F and sunny"
+
+    def test_top_k_passed_as_extra(self, impl):
+        request = AnthropicCreateMessageRequest(
+            model="m",
+            messages=[AnthropicMessage(role="user", content="Hi")],
+            max_tokens=100,
+            top_k=40,
+        )
+        result = impl._anthropic_to_openai(request)
+        assert result.model_extra.get("top_k") == 40
+
+
+class TestResponseTranslation:
+    def test_simple_text_response(self, impl):
+        openai_resp = MagicMock()
+        openai_resp.choices = [MagicMock()]
+        openai_resp.choices[0].message = MagicMock()
+        openai_resp.choices[0].message.content = "Hello!"
+        openai_resp.choices[0].message.tool_calls = None
+        openai_resp.choices[0].finish_reason = "stop"
+        openai_resp.usage = MagicMock()
+        openai_resp.usage.prompt_tokens = 10
+        openai_resp.usage.completion_tokens = 5
+
+        result = impl._openai_to_anthropic(openai_resp, "claude-sonnet-4-20250514")
+
+        assert result.id.startswith("msg_")
+        assert result.type == "message"
+        assert result.role == "assistant"
+        assert result.model == "claude-sonnet-4-20250514"
+        assert result.stop_reason == "end_turn"
+        assert len(result.content) == 1
+        assert result.content[0].type == "text"
+        assert result.content[0].text == "Hello!"
+        assert result.usage.input_tokens == 10
+        assert result.usage.output_tokens == 5
+
+    def test_tool_call_response(self, impl):
+        tc = MagicMock()
+        tc.id = "call_123"
+        tc.function.name = "get_weather"
+        tc.function.arguments = '{"location": "SF"}'
+
+        openai_resp = MagicMock()
+        openai_resp.choices = [MagicMock()]
+        openai_resp.choices[0].message = MagicMock()
+        openai_resp.choices[0].message.content = None
+        openai_resp.choices[0].message.tool_calls = [tc]
+        openai_resp.choices[0].finish_reason = "tool_calls"
+        openai_resp.usage = MagicMock()
+        openai_resp.usage.prompt_tokens = 20
+        openai_resp.usage.completion_tokens = 10
+
+        result = impl._openai_to_anthropic(openai_resp, "m")
+
+        assert result.stop_reason == "tool_use"
+        assert len(result.content) == 1
+        assert result.content[0].type == "tool_use"
+        assert result.content[0].name == "get_weather"
+        assert result.content[0].input == {"location": "SF"}
+
+    def test_length_stop_reason(self, impl):
+        openai_resp = MagicMock()
+        openai_resp.choices = [MagicMock()]
+        openai_resp.choices[0].message = MagicMock()
+        openai_resp.choices[0].message.content = "truncated"
+        openai_resp.choices[0].message.tool_calls = None
+        openai_resp.choices[0].finish_reason = "length"
+        openai_resp.usage = MagicMock()
+        openai_resp.usage.prompt_tokens = 5
+        openai_resp.usage.completion_tokens = 100
+
+        result = impl._openai_to_anthropic(openai_resp, "m")
+        assert result.stop_reason == "max_tokens"
+
+
+class TestStreamingTranslation:
+    @pytest.mark.asyncio
+    async def test_text_streaming(self, impl):
+        chunks = []
+
+        for i, text in enumerate(["Hello", " world", "!"]):
+            chunk = MagicMock()
+            chunk.choices = [MagicMock()]
+            chunk.choices[0].delta = MagicMock()
+            chunk.choices[0].delta.content = text
+            chunk.choices[0].delta.tool_calls = None
+            chunk.choices[0].finish_reason = "stop" if i == 2 else None
+            chunk.usage = None
+            chunks.append(chunk)
+
+        async def mock_stream():
+            for c in chunks:
+                yield c
+
+        events = []
+        async for event in impl._stream_openai_to_anthropic(mock_stream(), "m"):
+            events.append(event)
+
+        assert events[0].type == "message_start"
+        assert events[1].type == "content_block_start"
+        assert events[1].content_block.type == "text"
+        assert events[2].type == "content_block_delta"
+        assert events[2].delta.text == "Hello"
+        assert events[3].type == "content_block_delta"
+        assert events[3].delta.text == " world"
+        assert events[4].type == "content_block_delta"
+        assert events[4].delta.text == "!"
+        assert events[5].type == "content_block_stop"
+        assert events[6].type == "message_delta"
+        assert events[6].delta.stop_reason == "end_turn"
+        assert events[7].type == "message_stop"
+
+    @pytest.mark.asyncio
+    async def test_tool_call_streaming(self, impl):
+        chunks = []
+
+        # Tool call start
+        tc_delta = MagicMock()
+        tc_delta.index = 0
+        tc_delta.id = "call_abc"
+        tc_delta.function = MagicMock()
+        tc_delta.function.name = "search"
+        tc_delta.function.arguments = None
+        tc_delta.type = "function"
+
+        chunk1 = MagicMock()
+        chunk1.choices = [MagicMock()]
+        chunk1.choices[0].delta = MagicMock()
+        chunk1.choices[0].delta.content = None
+        chunk1.choices[0].delta.tool_calls = [tc_delta]
+        chunk1.choices[0].finish_reason = None
+        chunk1.usage = None
+        chunks.append(chunk1)
+
+        # Tool call arguments
+        tc_delta2 = MagicMock()
+        tc_delta2.index = 0
+        tc_delta2.id = None
+        tc_delta2.function = MagicMock()
+        tc_delta2.function.name = None
+        tc_delta2.function.arguments = '{"query": "test"}'
+
+        chunk2 = MagicMock()
+        chunk2.choices = [MagicMock()]
+        chunk2.choices[0].delta = MagicMock()
+        chunk2.choices[0].delta.content = None
+        chunk2.choices[0].delta.tool_calls = [tc_delta2]
+        chunk2.choices[0].finish_reason = "tool_calls"
+        chunk2.usage = None
+        chunks.append(chunk2)
+
+        async def mock_stream():
+            for c in chunks:
+                yield c
+
+        events = []
+        async for event in impl._stream_openai_to_anthropic(mock_stream(), "m"):
+            events.append(event)
+
+        assert events[0].type == "message_start"
+        tool_start = [e for e in events if e.type == "content_block_start" and hasattr(e.content_block, "name")]
+        assert len(tool_start) == 1
+        assert tool_start[0].content_block.name == "search"
+
+        json_deltas = [e for e in events if e.type == "content_block_delta" and hasattr(e.delta, "partial_json")]
+        assert len(json_deltas) == 1
+        assert json_deltas[0].delta.partial_json == '{"query": "test"}'
+
+        msg_delta = [e for e in events if e.type == "message_delta"]
+        assert msg_delta[0].delta.stop_reason == "tool_use"

From 55a2e7b6117f7bd9aa57b64ebc855b7a99bbc3d0 Mon Sep 17 00:00:00 2001
From: Charlie Doern <cdoern@redhat.com>
Date: Wed, 1 Apr 2026 10:46:42 -0400
Subject: [PATCH 5/9] feat(messages): add integration tests with record-replay
 support

Add a new messages integration test suite that exercises the Anthropic
Messages API (/v1/messages) end-to-end through the server. The suite
includes 13 tests covering non-streaming, streaming, system prompts,
multi-turn conversations, tool definitions, tool use round trips,
content block arrays, error handling, and response headers.

To enable replay mode (no live backend required), extend the api_recorder
to patch httpx.AsyncClient.post and httpx.AsyncClient.stream. This
captures the native Ollama passthrough requests the Messages provider
makes via raw httpx, following the same pattern used for aiohttp rerank
recording. Recordings are stored in tests/integration/messages/recordings/.

Also fix pre-commit violations: switch impl.py to structured logging,
rename an unused loop variable, and remove redundant @pytest.mark.asyncio
decorators from the unit tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Charlie Doern <cdoern@redhat.com>
---
 .../providers/inline/messages/impl.py         |  70 ++--
 src/llama_stack/testing/api_recorder.py       | 192 +++++++++-
 .../messages/fastapi_routes.py                |   4 +-
 src/llama_stack_api/messages/models.py        |   7 +-
 ...204585952f77bc5acb3ff7702bd7878d44b44.json |  27 ++
 tests/integration/messages/__init__.py        |   5 +
 tests/integration/messages/conftest.py        | 124 ++++++
 ...617d5c84b80daf77535a00ce1ab680308881a.json |  70 ++++
 ...f71748574ffa98ec52d4faf94b4d34818c4f8.json |  42 ++
 ...caa6c511cc52e6498b73e274cb4f01adf1e37.json |  44 +++
 ...e4762d47ef1768f98b2f1c7af55788e7fe6d8.json |  41 ++
 ...ab82aba7ac1225c040665b7bdb3b177eb8226.json |  46 +++
 ...bbf79e51f2731a07ecb332d5d2dd82810d2e5.json |  53 +++
 ...cd2e6b307f44fd2c7a4cf564e30347da87746.json |  45 +++
 ...610908f34be9193c4126635d1b5ab3b0522f7.json |  46 +++
 ...7ab0a8b2c99e4665d9eae5553fe9019ba7b32.json | 223 +++++++++++
 ...6933b84ab907ea20092e6d97c9d4a371bf8a5.json |  67 ++++
 ...32dcfc038b345d2a7be8da4b880a433d36a52.json | 208 ++++++++++
 ...bb84c3c8784ea1e1ac139e2f0623449dfa047.json |  86 +++++
 tests/integration/messages/test_messages.py   | 362 ++++++++++++++++++
 tests/integration/suites.py                   |   7 +-
 .../providers/inline/messages/test_impl.py    |   5 -
 22 files changed, 1730 insertions(+), 44 deletions(-)
 create mode 100644 tests/integration/common/recordings/cf0be7f9e2ebfc78903aa4ada30204585952f77bc5acb3ff7702bd7878d44b44.json
 create mode 100644 tests/integration/messages/__init__.py
 create mode 100644 tests/integration/messages/conftest.py
 create mode 100644 tests/integration/messages/recordings/0d76cd7b3dae3f44e2990645cb1617d5c84b80daf77535a00ce1ab680308881a.json
 create mode 100644 tests/integration/messages/recordings/1580d1d2e377b9161e55b7648e1f71748574ffa98ec52d4faf94b4d34818c4f8.json
 create mode 100644 tests/integration/messages/recordings/2703eb8f17f3914dd6991ecf387caa6c511cc52e6498b73e274cb4f01adf1e37.json
 create mode 100644 tests/integration/messages/recordings/2a5f7014ddf9a3d359fbf59a195e4762d47ef1768f98b2f1c7af55788e7fe6d8.json
 create mode 100644 tests/integration/messages/recordings/4ea18b99571d34f714cb4b9d818ab82aba7ac1225c040665b7bdb3b177eb8226.json
 create mode 100644 tests/integration/messages/recordings/52925d8df69e53718e5d4aab54fbbf79e51f2731a07ecb332d5d2dd82810d2e5.json
 create mode 100644 tests/integration/messages/recordings/715c164b66b51dc2180b05817b7cd2e6b307f44fd2c7a4cf564e30347da87746.json
 create mode 100644 tests/integration/messages/recordings/82cfd5545e24ca4b4415ba37460610908f34be9193c4126635d1b5ab3b0522f7.json
 create mode 100644 tests/integration/messages/recordings/93eb42f3bd69f005727cc3a161e7ab0a8b2c99e4665d9eae5553fe9019ba7b32.json
 create mode 100644 tests/integration/messages/recordings/d13b333401fa121280a3fb890a56933b84ab907ea20092e6d97c9d4a371bf8a5.json
 create mode 100644 tests/integration/messages/recordings/f22657bfd86db6348c0a0d0b17332dcfc038b345d2a7be8da4b880a433d36a52.json
 create mode 100644 tests/integration/messages/recordings/f55988509902a617d2f547a1518bb84c3c8784ea1e1ac139e2f0623449dfa047.json
 create mode 100644 tests/integration/messages/test_messages.py

diff --git a/src/llama_stack/providers/inline/messages/impl.py b/src/llama_stack/providers/inline/messages/impl.py
index 93f090ca53..093290295e 100644
--- a/src/llama_stack/providers/inline/messages/impl.py
+++ b/src/llama_stack/providers/inline/messages/impl.py
@@ -144,10 +144,10 @@ async def _get_passthrough_url(self, model: str) -> str | None:
                 # Ollama's /v1/messages sits at the root, not under /v1
                 if base_url.endswith("/v1"):
                     base_url = base_url[:-3]
-                logger.info(f"Using native /v1/messages passthrough for model {model} via {base_url}")
+                logger.info("Using native /v1/messages passthrough", model=model, base_url=base_url)
                 return base_url
         except Exception:
-            logger.debug(f"Failed to resolve passthrough for model {model}, falling back to translation")
+            logger.debug("Failed to resolve passthrough, falling back to translation", model=model)
 
         return None
 
@@ -248,9 +248,7 @@ def _parse_sse_event(self, event_type: str, data: dict[str, Any]) -> AnthropicSt
 
     # -- Request translation --
 
-    def _anthropic_to_openai(
-        self, request: AnthropicCreateMessageRequest
-    ) -> OpenAIChatCompletionRequestWithExtraBody:
+    def _anthropic_to_openai(self, request: AnthropicCreateMessageRequest) -> OpenAIChatCompletionRequestWithExtraBody:
         messages = self._convert_messages_to_openai(request.system, request.messages)
         tools = self._convert_tools_to_openai(request.tools) if request.tools else None
         tool_choice = self._convert_tool_choice_to_openai(request.tool_choice) if request.tool_choice else None
@@ -324,20 +322,24 @@ def _convert_single_message(self, msg: AnthropicMessage) -> list[dict[str, Any]]
                 content = block.content
                 if isinstance(content, list):
                     content = "\n".join(b.text for b in content if isinstance(b, AnthropicTextBlock))
-                result.append({
-                    "role": "tool",
-                    "tool_call_id": block.tool_use_id,
-                    "content": content,
-                })
+                result.append(
+                    {
+                        "role": "tool",
+                        "tool_call_id": block.tool_use_id,
+                        "content": content,
+                    }
+                )
             elif isinstance(block, AnthropicTextBlock):
                 text_parts.append({"type": "text", "text": block.text})
             elif isinstance(block, AnthropicImageBlock):
-                text_parts.append({
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:{block.source.media_type};base64,{block.source.data}",
-                    },
-                })
+                text_parts.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:{block.source.media_type};base64,{block.source.data}",
+                        },
+                    }
+                )
 
         if text_parts:
             result.append({"role": "user", "content": text_parts if len(text_parts) > 1 else text_parts[0]})
@@ -353,14 +355,16 @@ def _convert_assistant_message(self, content: list[AnthropicContentBlock]) -> di
             if isinstance(block, AnthropicTextBlock):
                 text_parts.append(block.text)
             elif isinstance(block, AnthropicToolUseBlock):
-                tool_calls.append({
-                    "id": block.id,
-                    "type": "function",
-                    "function": {
-                        "name": block.name,
-                        "arguments": json.dumps(block.input),
-                    },
-                })
+                tool_calls.append(
+                    {
+                        "id": block.id,
+                        "type": "function",
+                        "function": {
+                            "name": block.name,
+                            "arguments": json.dumps(block.input),
+                        },
+                    }
+                )
 
         msg: dict[str, Any] = {"role": "assistant"}
         if text_parts:
@@ -405,9 +409,7 @@ def _convert_tool_choice_to_openai(self, tool_choice: Any) -> Any:
 
     # -- Response translation --
 
-    def _openai_to_anthropic(
-        self, response: OpenAIChatCompletion, request_model: str
-    ) -> AnthropicMessageResponse:
+    def _openai_to_anthropic(self, response: OpenAIChatCompletion, request_model: str) -> AnthropicMessageResponse:
         content: list[AnthropicContentBlock] = []
 
         if response.choices:
@@ -426,11 +428,13 @@ def _openai_to_anthropic(
                     except json.JSONDecodeError:
                         tool_input = {}
 
-                    content.append(AnthropicToolUseBlock(
-                        id=tc.id or f"toolu_{uuid.uuid4().hex[:24]}",
-                        name=tc.function.name or "",
-                        input=tool_input,
-                    ))
+                    content.append(
+                        AnthropicToolUseBlock(
+                            id=tc.id or f"toolu_{uuid.uuid4().hex[:24]}",
+                            name=tc.function.name or "",
+                            input=tool_input,
+                        )
+                    )
 
             finish_reason = choice.finish_reason or "stop"
             stop_reason = _FINISH_TO_STOP_REASON.get(finish_reason, "end_turn")
@@ -547,7 +551,7 @@ async def _stream_openai_to_anthropic(
         if in_text_block:
             yield ContentBlockStopEvent(index=content_block_index)
 
-        for tc_idx, block_idx in tool_call_index_to_block_index.items():
+        for _tc_idx, block_idx in tool_call_index_to_block_index.items():
             yield ContentBlockStopEvent(index=block_idx)
 
         # Final events
diff --git a/src/llama_stack/testing/api_recorder.py b/src/llama_stack/testing/api_recorder.py
index aa28fc395c..0499d53039 100644
--- a/src/llama_stack/testing/api_recorder.py
+++ b/src/llama_stack/testing/api_recorder.py
@@ -881,6 +881,177 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
         raise AssertionError(f"Invalid mode: {_current_mode}")
 
 
+async def _patched_httpx_async_post(original_post, self, url, **kwargs):
+    """Patched version of httpx.AsyncClient.post for recording/replay of Messages API passthrough.
+
+    Intercepts requests to /v1/messages endpoints so the native Ollama passthrough
+    path can be recorded and replayed without a live backend.
+    """
+    global _current_mode, _current_storage
+
+    url_str = str(url)
+    is_messages = "/v1/messages" in url_str
+
+    if not is_messages or _current_mode == APIRecordingMode.LIVE or _current_storage is None:
+        return await original_post(self, url, **kwargs)
+
+    json_payload = kwargs.get("json", {})
+    request_hash = normalize_http_request(url_str, "POST", json_payload)
+
+    if _current_mode in (APIRecordingMode.REPLAY, APIRecordingMode.RECORD_IF_MISSING):
+        recording = _current_storage.find_recording(request_hash)
+        if recording:
+            import httpx as _httpx
+
+            body_bytes = json.dumps(recording["response"]["body"]).encode()
+            # Create a minimal request so raise_for_status() works on the mock response
+            mock_request = _httpx.Request("POST", url_str)
+            mock_response = _httpx.Response(
+                status_code=recording["response"].get("status", 200),
+                headers={"content-type": "application/json", "anthropic-version": "2023-06-01"},
+                content=body_bytes,
+                request=mock_request,
+            )
+            return mock_response
+        elif _current_mode == APIRecordingMode.REPLAY:
+            raise RuntimeError(
+                f"Recording not found for httpx POST {url_str}\n"
+                f"\n"
+                f"Run './scripts/integration-tests.sh --inference-mode record-if-missing' with required API keys to generate."
+            )
+
+    if _current_mode in (APIRecordingMode.RECORD, APIRecordingMode.RECORD_IF_MISSING):
+        response = await original_post(self, url, **kwargs)
+
+        request_data = {
+            "test_id": get_test_context(),
+            "url": url_str,
+            "method": "POST",
+            "payload": json_payload,
+        }
+        response_data = {
+            "status": response.status_code,
+            "body": response.json(),
+            "is_streaming": False,
+        }
+        _current_storage.store_recording(request_hash, request_data, response_data)
+        return response
+
+    raise AssertionError(f"Invalid mode: {_current_mode}")
+
+
+def _patched_httpx_async_stream(original_stream, self, method, url, **kwargs):
+    """Patched version of httpx.AsyncClient.stream for recording/replay of streaming Messages API passthrough.
+
+    Intercepts streaming requests to /v1/messages endpoints. Returns an async context manager
+    that either replays recorded SSE events or records live ones.
+    """
+    global _current_mode, _current_storage
+
+    url_str = str(url)
+    is_messages = "/v1/messages" in url_str
+
+    if not is_messages or _current_mode == APIRecordingMode.LIVE or _current_storage is None:
+        return original_stream(self, method, url, **kwargs)
+
+    json_payload = kwargs.get("json", {})
+    request_hash = normalize_http_request(url_str, "POST", json_payload)
+
+    class _ReplayStreamContext:
+        """Async context manager that replays recorded SSE events as a mock httpx response."""
+
+        def __init__(self, sse_lines: list[str]):
+            self._sse_lines = sse_lines
+
+        async def __aenter__(self):
+            import httpx as _httpx
+
+            class _MockStreamResponse:
+                def __init__(self, lines):
+                    self.status_code = 200
+                    self.headers = _httpx.Headers(
+                        {"content-type": "text/event-stream", "anthropic-version": "2023-06-01"}
+                    )
+                    self._lines = lines
+
+                def raise_for_status(self):
+                    pass
+
+                async def aiter_lines(self):
+                    for line in self._lines:
+                        yield line
+
+            return _MockStreamResponse(self._sse_lines)
+
+        async def __aexit__(self, *args):
+            pass
+
+    # Live responses are wrapped in _RecordingStreamResponse, which _RecordCtx (below) creates on enter
+
+    class _RecordingStreamResponse:
+        """Wraps a real httpx streaming response to capture SSE lines for recording."""
+
+        def __init__(self, response, url_str, json_payload, request_hash):
+            self._response = response
+            self._url = url_str
+            self._payload = json_payload
+            self._hash = request_hash
+            self._recorded_lines: list[str] = []
+            self.status_code = response.status_code
+            self.headers = response.headers
+
+        def raise_for_status(self):
+            self._response.raise_for_status()
+
+        async def aiter_lines(self):
+            async for line in self._response.aiter_lines():
+                self._recorded_lines.append(line)
+                yield line
+
+            # Reached only when the stream is fully consumed; an early exit by the caller stores nothing
+            request_data = {
+                "test_id": get_test_context(),
+                "url": self._url,
+                "method": "POST",
+                "payload": self._payload,
+            }
+            response_data = {
+                "body": self._recorded_lines,
+                "is_streaming": True,
+            }
+            if _current_storage:
+                _current_storage.store_recording(self._hash, request_data, response_data)
+
+    if _current_mode in (APIRecordingMode.REPLAY, APIRecordingMode.RECORD_IF_MISSING):
+        recording = _current_storage.find_recording(request_hash)
+        if recording:
+            return _ReplayStreamContext(recording["response"]["body"])
+        elif _current_mode == APIRecordingMode.REPLAY:
+            raise RuntimeError(
+                f"Recording not found for httpx stream POST {url_str}\n"
+                f"\n"
+                f"Run './scripts/integration-tests.sh --inference-mode record-if-missing' with required API keys to generate."
+            )
+
+    if _current_mode in (APIRecordingMode.RECORD, APIRecordingMode.RECORD_IF_MISSING):
+        # Bind the client to a local name; inside _RecordCtx, `self` is the context manager, not the client
+        httpx_client = self
+
+        class _RecordCtx:
+            async def __aenter__(self):
+                self._cm = original_stream(httpx_client, method, url, **kwargs)
+                resp = await self._cm.__aenter__()
+                self._wrapper = _RecordingStreamResponse(resp, url_str, json_payload, request_hash)
+                return self._wrapper
+
+            async def __aexit__(self, *args):
+                return await self._cm.__aexit__(*args)
+
+        return _RecordCtx()
+
+    raise AssertionError(f"Invalid mode: {_current_mode}")
+
+
 _cached_provider_metadata: dict[str, dict[str, str]] = {}
 
 
@@ -1118,6 +1289,7 @@ def patch_inference_clients():
     global _original_methods
 
     import aiohttp
+    import httpx
     from ollama import AsyncClient as OllamaAsyncClient
     from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
     from openai.resources.completions import AsyncCompletions
@@ -1128,7 +1300,7 @@ def patch_inference_clients():
     from llama_stack.providers.inline.file_processor.pypdf.adapter import PyPDFFileProcessorAdapter
     from llama_stack.providers.remote.tool_runtime.tavily_search.tavily_search import TavilySearchToolRuntimeImpl
 
-    # Store original methods for OpenAI, Ollama clients, tool runtimes, file processors, and aiohttp
+    # Store original methods for OpenAI, Ollama clients, tool runtimes, file processors, aiohttp, and httpx
     _original_methods = {
         "chat_completions_create": AsyncChatCompletions.create,
         "completions_create": AsyncCompletions.create,
@@ -1144,6 +1316,8 @@ def patch_inference_clients():
         "tavily_invoke_tool": TavilySearchToolRuntimeImpl.invoke_tool,
         "pypdf_process_file": PyPDFFileProcessorAdapter.process_file,
         "aiohttp_post": aiohttp.ClientSession.post,
+        "httpx_async_post": httpx.AsyncClient.post,
+        "httpx_async_stream": httpx.AsyncClient.stream,
     }
 
     # Create patched methods for OpenAI client
@@ -1249,6 +1423,17 @@ def patched_aiohttp_session_post(self, url, **kwargs):
     # Apply aiohttp patch
     aiohttp.ClientSession.post = patched_aiohttp_session_post
 
+    # Create patched methods for httpx AsyncClient (Messages API passthrough)
+    async def patched_httpx_async_post(self, url, **kwargs):
+        return await _patched_httpx_async_post(_original_methods["httpx_async_post"], self, url, **kwargs)
+
+    def patched_httpx_async_stream(self, method, url, **kwargs):
+        return _patched_httpx_async_stream(_original_methods["httpx_async_stream"], self, method, url, **kwargs)
+
+    # Apply httpx patches
+    httpx.AsyncClient.post = patched_httpx_async_post
+    httpx.AsyncClient.stream = patched_httpx_async_stream
+
 
 def unpatch_inference_clients():
     """Remove monkey patches and restore original OpenAI, Ollama client, tool runtime, and aiohttp methods."""
@@ -1259,6 +1444,7 @@ def unpatch_inference_clients():
 
     # Import here to avoid circular imports
     import aiohttp
+    import httpx
     from ollama import AsyncClient as OllamaAsyncClient
     from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
     from openai.resources.completions import AsyncCompletions
@@ -1293,6 +1479,10 @@ def unpatch_inference_clients():
     # Restore aiohttp method
     aiohttp.ClientSession.post = _original_methods["aiohttp_post"]
 
+    # Restore httpx methods
+    httpx.AsyncClient.post = _original_methods["httpx_async_post"]
+    httpx.AsyncClient.stream = _original_methods["httpx_async_stream"]
+
     _original_methods.clear()
 
 
diff --git a/src/llama_stack_api/messages/fastapi_routes.py b/src/llama_stack_api/messages/fastapi_routes.py
index 1392a51a3b..493c71f2a1 100644
--- a/src/llama_stack_api/messages/fastapi_routes.py
+++ b/src/llama_stack_api/messages/fastapi_routes.py
@@ -153,7 +153,7 @@ async def create_message(
             return _anthropic_error_response(400, str(e))
         except HTTPException as e:
             return _anthropic_error_response(e.status_code, e.detail)
-        except Exception as e:
+        except Exception:
             logger.exception("Failed to create message")
             return _anthropic_error_response(500, "Internal server error")
 
@@ -187,7 +187,7 @@ async def count_message_tokens(
             result = await impl.count_message_tokens(params)
         except NotImplementedError as e:
             return _anthropic_error_response(501, str(e))
-        except Exception as e:
+        except Exception:
             logger.exception("Failed to count message tokens")
             return _anthropic_error_response(500, "Internal server error")
 
diff --git a/src/llama_stack_api/messages/models.py b/src/llama_stack_api/messages/models.py
index d0841ec64d..81abe8c2f0 100644
--- a/src/llama_stack_api/messages/models.py
+++ b/src/llama_stack_api/messages/models.py
@@ -16,7 +16,6 @@
 
 from pydantic import BaseModel, ConfigDict, Field
 
-
 # -- Content blocks --
 
 
@@ -72,7 +71,11 @@ class AnthropicThinkingBlock(BaseModel):
 
 
 AnthropicContentBlock = Annotated[
-    AnthropicTextBlock | AnthropicImageBlock | AnthropicToolUseBlock | AnthropicToolResultBlock | AnthropicThinkingBlock,
+    AnthropicTextBlock
+    | AnthropicImageBlock
+    | AnthropicToolUseBlock
+    | AnthropicToolResultBlock
+    | AnthropicThinkingBlock,
     Field(discriminator="type"),
 ]
 
diff --git a/tests/integration/common/recordings/cf0be7f9e2ebfc78903aa4ada30204585952f77bc5acb3ff7702bd7878d44b44.json b/tests/integration/common/recordings/cf0be7f9e2ebfc78903aa4ada30204585952f77bc5acb3ff7702bd7878d44b44.json
new file mode 100644
index 0000000000..cfd287568c
--- /dev/null
+++ b/tests/integration/common/recordings/cf0be7f9e2ebfc78903aa4ada30204585952f77bc5acb3ff7702bd7878d44b44.json
@@ -0,0 +1,27 @@
+{
+  "test_id": null,
+  "request": {
+    "test_id": null,
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [],
+      "max_tokens": 64,
+      "stream": false
+    }
+  },
+  "response": {
+    "status": 400,
+    "body": {
+      "type": "error",
+      "error": {
+        "type": "invalid_request_error",
+        "message": "messages is required"
+      },
+      "request_id": "req_b662960dd608e745f612e3a2"
+    },
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/__init__.py b/tests/integration/messages/__init__.py
new file mode 100644
index 0000000000..756f351d88
--- /dev/null
+++ b/tests/integration/messages/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/integration/messages/conftest.py b/tests/integration/messages/conftest.py
new file mode 100644
index 0000000000..fae505dea4
--- /dev/null
+++ b/tests/integration/messages/conftest.py
@@ -0,0 +1,124 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import json
+import os
+from typing import Any
+
+import httpx
+import pytest
+
+from llama_stack.core.library_client import LlamaStackAsLibraryClient
+from llama_stack.core.testing_context import get_test_context
+
+# Import fixtures from common module to make them available in this test directory
+from tests.integration.fixtures.common import (  # noqa: F401
+    openai_client,
+    require_server,
+)
+
+
+def pytest_configure(config):
+    """Turn off the stderr log pipe so Rich logging cannot stall once the pipe buffer fills up."""
+    os.environ.update({"LLAMA_STACK_TEST_LOG_STDERR": "0"})
+
+
+@pytest.fixture(scope="session")
+def messages_base_url(llama_stack_client):
+    """Return the server base URL for Messages API tests; skip when running as a library client."""
+    if not isinstance(llama_stack_client, LlamaStackAsLibraryClient):
+        return llama_stack_client.base_url
+    pytest.skip("Messages API tests are not supported in library client mode")
+
+
+@pytest.fixture
+def messages_client(messages_base_url):
+    """Provide an httpx client configured for Anthropic Messages API calls."""
+    # The context manager closes the client on every exit path, including generator teardown.
+    with httpx.Client(base_url=messages_base_url, timeout=60.0) as client:
+        yield client
+
+
+def _build_messages_body(
+    *,
+    model: str,
+    messages: list[dict],
+    max_tokens: int = 256,
+    stream: bool = False,
+    system: str | None = None,
+    tools: list[dict] | None = None,
+    tool_choice: dict | str | None = None,
+    temperature: float | None = None,
+    stop_sequences: list[str] | None = None,
+) -> dict[str, Any]:
+    """Assemble the JSON body for a /v1/messages request, leaving out optional fields that are None."""
+    # Fields that are always sent.
+    required: dict[str, Any] = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "stream": stream,
+    }
+    # Fields that are sent only when given; order here is the resulting key order.
+    optional: dict[str, Any] = {
+        "system": system,
+        "tools": tools,
+        "tool_choice": tool_choice,
+        "temperature": temperature,
+        "stop_sequences": stop_sequences,
+    }
+    return required | {key: value for key, value in optional.items() if value is not None}
+
+
+def _build_headers() -> dict[str, str]:
+    """Return the Messages API request headers, tagged with the active test ID when one is set."""
+    headers = {"content-type": "application/json", "anthropic-version": "2023-06-01"}
+    active_test = get_test_context()
+    if not active_test:
+        return headers
+    # The test ID is sent as provider data, JSON-encoded under the "__test_id" key.
+    tagged = json.dumps({"__test_id": active_test})
+    headers["X-LlamaStack-Provider-Data"] = tagged
+    return headers
+
+
+def make_messages_request(
+    client: httpx.Client,
+    **kwargs: Any,
+) -> httpx.Response:
+    """POST to /v1/messages with a body built from ``kwargs`` and return the raw response."""
+    payload = _build_messages_body(**kwargs)
+    return client.post("/v1/messages", headers=_build_headers(), json=payload)
+
+
+def make_streaming_messages_request(
+    client: httpx.Client,
+    **kwargs: Any,
+) -> list[dict]:
+    """Make a streaming POST request to /v1/messages and return parsed SSE events.
+
+    Raises AssertionError (with the response body included) if the status is not 200.
+    """
+    kwargs["stream"] = True
+    body = _build_messages_body(**kwargs)
+    events: list[dict] = []
+    current_event_type: str | None = None
+
+    with client.stream("POST", "/v1/messages", headers=_build_headers(), json=body) as response:
+        if response.status_code != 200:
+            response.read()  # a streamed body must be read before .text is available
+            raise AssertionError(f"Expected 200, got {response.status_code}: {response.text}")
+        for line in response.iter_lines():
+            if line.startswith("event: "):
+                current_event_type = line.removeprefix("event: ")
+            elif line.startswith("data: "):
+                data = json.loads(line.removeprefix("data: "))
+                if current_event_type:
+                    data["_event_type"] = current_event_type
+                events.append(data)
+                current_event_type = None
+
+    return events
diff --git a/tests/integration/messages/recordings/0d76cd7b3dae3f44e2990645cb1617d5c84b80daf77535a00ce1ab680308881a.json b/tests/integration/messages/recordings/0d76cd7b3dae3f44e2990645cb1617d5c84b80daf77535a00ce1ab680308881a.json
new file mode 100644
index 0000000000..57fb2664aa
--- /dev/null
+++ b/tests/integration/messages/recordings/0d76cd7b3dae3f44e2990645cb1617d5c84b80daf77535a00ce1ab680308881a.json
@@ -0,0 +1,70 @@
+{
+  "test_id": "tests/integration/messages/test_messages.py::test_messages_tool_use_round_trip[txt=ollama/gpt-oss:20b]",
+  "request": {
+    "test_id": "tests/integration/messages/test_messages.py::test_messages_tool_use_round_trip[txt=ollama/gpt-oss:20b]",
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Use the calculator tool to compute 15 * 7."
+        }
+      ],
+      "max_tokens": 256,
+      "tools": [
+        {
+          "name": "calculator",
+          "description": "Perform basic arithmetic. Use this for any math question.",
+          "input_schema": {
+            "type": "object",
+            "properties": {
+              "expression": {
+                "type": "string",
+                "description": "The math expression to evaluate"
+              }
+            },
+            "required": [
+              "expression"
+            ]
+          }
+        }
+      ],
+      "tool_choice": {
+        "type": "any"
+      },
+      "stream": false
+    }
+  },
+  "response": {
+    "status": 200,
+    "body": {
+      "id": "msg_e0c2ea0f4c1131503d2bc7c8",
+      "type": "message",
+      "role": "assistant",
+      "model": "gpt-oss:20b",
+      "content": [
+        {
+          "type": "thinking",
+          "thinking": "The user wants 15 * 7. We'll use the calculator tool."
+        },
+        {
+          "type": "tool_use",
+          "id": "call_dg2g1ozz",
+          "name": "calculator",
+          "input": {
+            "expression": "15 * 7"
+          }
+        }
+      ],
+      "stop_reason": "tool_use",
+      "usage": {
+        "input_tokens": 144,
+        "output_tokens": 42
+      }
+    },
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/recordings/1580d1d2e377b9161e55b7648e1f71748574ffa98ec52d4faf94b4d34818c4f8.json b/tests/integration/messages/recordings/1580d1d2e377b9161e55b7648e1f71748574ffa98ec52d4faf94b4d34818c4f8.json
new file mode 100644
index 0000000000..881b06645d
--- /dev/null
+++ b/tests/integration/messages/recordings/1580d1d2e377b9161e55b7648e1f71748574ffa98ec52d4faf94b4d34818c4f8.json
@@ -0,0 +1,42 @@
+{
+  "test_id": "tests/integration/messages/test_messages.py::test_messages_non_streaming_with_temperature[txt=ollama/gpt-oss:20b]",
+  "request": {
+    "test_id": "tests/integration/messages/test_messages.py::test_messages_non_streaming_with_temperature[txt=ollama/gpt-oss:20b]",
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Say hello."
+        }
+      ],
+      "max_tokens": 32,
+      "stream": false,
+      "temperature": 0.0
+    }
+  },
+  "response": {
+    "status": 200,
+    "body": {
+      "id": "msg_737bf6e49fbfa7b81be479d0",
+      "type": "message",
+      "role": "assistant",
+      "model": "gpt-oss:20b",
+      "content": [
+        {
+          "type": "thinking",
+          "thinking": "The user says: \"Say hello.\" They want a greeting. So respond with a hello. Probably just \"Hello!\""
+        }
+      ],
+      "stop_reason": "max_tokens",
+      "usage": {
+        "input_tokens": 70,
+        "output_tokens": 32
+      }
+    },
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/recordings/2703eb8f17f3914dd6991ecf387caa6c511cc52e6498b73e274cb4f01adf1e37.json b/tests/integration/messages/recordings/2703eb8f17f3914dd6991ecf387caa6c511cc52e6498b73e274cb4f01adf1e37.json
new file mode 100644
index 0000000000..85af391053
--- /dev/null
+++ b/tests/integration/messages/recordings/2703eb8f17f3914dd6991ecf387caa6c511cc52e6498b73e274cb4f01adf1e37.json
@@ -0,0 +1,44 @@
+{
+  "test_id": "tests/integration/messages/test_messages.py::test_messages_non_streaming_with_stop_sequences[txt=ollama/gpt-oss:20b]",
+  "request": {
+    "test_id": "tests/integration/messages/test_messages.py::test_messages_non_streaming_with_stop_sequences[txt=ollama/gpt-oss:20b]",
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Count: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10"
+        }
+      ],
+      "max_tokens": 128,
+      "stream": false,
+      "stop_sequences": [
+        ","
+      ]
+    }
+  },
+  "response": {
+    "status": 200,
+    "body": {
+      "id": "msg_1dd93b64b7f0935bcaf69452",
+      "type": "message",
+      "role": "assistant",
+      "model": "gpt-oss:20b",
+      "content": [
+        {
+          "type": "thinking",
+          "thinking": "The user writes: \"Count: 1"
+        }
+      ],
+      "stop_reason": "end_turn",
+      "usage": {
+        "input_tokens": 98,
+        "output_tokens": 13
+      }
+    },
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/recordings/2a5f7014ddf9a3d359fbf59a195e4762d47ef1768f98b2f1c7af55788e7fe6d8.json b/tests/integration/messages/recordings/2a5f7014ddf9a3d359fbf59a195e4762d47ef1768f98b2f1c7af55788e7fe6d8.json
new file mode 100644
index 0000000000..bdf362ed3f
--- /dev/null
+++ b/tests/integration/messages/recordings/2a5f7014ddf9a3d359fbf59a195e4762d47ef1768f98b2f1c7af55788e7fe6d8.json
@@ -0,0 +1,41 @@
+{
+  "test_id": "tests/integration/messages/test_messages.py::test_messages_response_headers[txt=ollama/gpt-oss:20b]",
+  "request": {
+    "test_id": "tests/integration/messages/test_messages.py::test_messages_response_headers[txt=ollama/gpt-oss:20b]",
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Hi"
+        }
+      ],
+      "max_tokens": 16,
+      "stream": false
+    }
+  },
+  "response": {
+    "status": 200,
+    "body": {
+      "id": "msg_7c6163c764cbe22bc2a5c161",
+      "type": "message",
+      "role": "assistant",
+      "model": "gpt-oss:20b",
+      "content": [
+        {
+          "type": "thinking",
+          "thinking": "The user says \"Hi\". So it's a greeting. We respond"
+        }
+      ],
+      "stop_reason": "max_tokens",
+      "usage": {
+        "input_tokens": 68,
+        "output_tokens": 16
+      }
+    },
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/recordings/4ea18b99571d34f714cb4b9d818ab82aba7ac1225c040665b7bdb3b177eb8226.json b/tests/integration/messages/recordings/4ea18b99571d34f714cb4b9d818ab82aba7ac1225c040665b7bdb3b177eb8226.json
new file mode 100644
index 0000000000..b7aad865d7
--- /dev/null
+++ b/tests/integration/messages/recordings/4ea18b99571d34f714cb4b9d818ab82aba7ac1225c040665b7bdb3b177eb8226.json
@@ -0,0 +1,46 @@
+{
+  "test_id": "tests/integration/messages/test_messages.py::test_messages_content_block_array[txt=ollama/gpt-oss:20b]",
+  "request": {
+    "test_id": "tests/integration/messages/test_messages.py::test_messages_content_block_array[txt=ollama/gpt-oss:20b]",
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [
+        {
+          "role": "user",
+          "content": [
+            {
+              "type": "text",
+              "text": "What is 1+1? Reply with just the number."
+            }
+          ]
+        }
+      ],
+      "max_tokens": 32,
+      "stream": false
+    }
+  },
+  "response": {
+    "status": 200,
+    "body": {
+      "id": "msg_b69e5a95142b11a8251ceea8",
+      "type": "message",
+      "role": "assistant",
+      "model": "gpt-oss:20b",
+      "content": [
+        {
+          "type": "thinking",
+          "thinking": "The user asks: \"What is 1+1? Reply with just the number.\" So answer: 2. Just number, no explanation"
+        }
+      ],
+      "stop_reason": "max_tokens",
+      "usage": {
+        "input_tokens": 80,
+        "output_tokens": 32
+      }
+    },
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/recordings/52925d8df69e53718e5d4aab54fbbf79e51f2731a07ecb332d5d2dd82810d2e5.json b/tests/integration/messages/recordings/52925d8df69e53718e5d4aab54fbbf79e51f2731a07ecb332d5d2dd82810d2e5.json
new file mode 100644
index 0000000000..86b6827dd4
--- /dev/null
+++ b/tests/integration/messages/recordings/52925d8df69e53718e5d4aab54fbbf79e51f2731a07ecb332d5d2dd82810d2e5.json
@@ -0,0 +1,53 @@
+{
+  "test_id": "tests/integration/messages/test_messages.py::test_messages_non_streaming_multi_turn[txt=ollama/gpt-oss:20b]",
+  "request": {
+    "test_id": "tests/integration/messages/test_messages.py::test_messages_non_streaming_multi_turn[txt=ollama/gpt-oss:20b]",
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "My name is Alice."
+        },
+        {
+          "role": "assistant",
+          "content": "Hello Alice! Nice to meet you."
+        },
+        {
+          "role": "user",
+          "content": "What is my name?"
+        }
+      ],
+      "max_tokens": 64,
+      "stream": false
+    }
+  },
+  "response": {
+    "status": 200,
+    "body": {
+      "id": "msg_dd2c03762be4581c6fe619ca",
+      "type": "message",
+      "role": "assistant",
+      "model": "gpt-oss:20b",
+      "content": [
+        {
+          "type": "thinking",
+          "thinking": "User says name is Alice. The assistant previously answered that. Now user asks again \"What is my name?\" So we should respond: \"Your name is Alice.\" Simple."
+        },
+        {
+          "type": "text",
+          "text": "Your name is Alice."
+        }
+      ],
+      "stop_reason": "end_turn",
+      "usage": {
+        "input_tokens": 95,
+        "output_tokens": 50
+      }
+    },
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/recordings/715c164b66b51dc2180b05817b7cd2e6b307f44fd2c7a4cf564e30347da87746.json b/tests/integration/messages/recordings/715c164b66b51dc2180b05817b7cd2e6b307f44fd2c7a4cf564e30347da87746.json
new file mode 100644
index 0000000000..34af7ba7c6
--- /dev/null
+++ b/tests/integration/messages/recordings/715c164b66b51dc2180b05817b7cd2e6b307f44fd2c7a4cf564e30347da87746.json
@@ -0,0 +1,45 @@
+{
+  "test_id": "tests/integration/messages/test_messages.py::test_messages_non_streaming_basic[txt=ollama/gpt-oss:20b]",
+  "request": {
+    "test_id": "tests/integration/messages/test_messages.py::test_messages_non_streaming_basic[txt=ollama/gpt-oss:20b]",
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What is 2+2? Reply with just the number."
+        }
+      ],
+      "max_tokens": 64,
+      "stream": false
+    }
+  },
+  "response": {
+    "status": 200,
+    "body": {
+      "id": "msg_f4cc3f074c282e90a4b5251e",
+      "type": "message",
+      "role": "assistant",
+      "model": "gpt-oss:20b",
+      "content": [
+        {
+          "type": "thinking",
+          "thinking": "The user says: \"What is 2+2? Reply with just the number.\" So answer: 4."
+        },
+        {
+          "type": "text",
+          "text": "4"
+        }
+      ],
+      "stop_reason": "end_turn",
+      "usage": {
+        "input_tokens": 80,
+        "output_tokens": 35
+      }
+    },
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/recordings/82cfd5545e24ca4b4415ba37460610908f34be9193c4126635d1b5ab3b0522f7.json b/tests/integration/messages/recordings/82cfd5545e24ca4b4415ba37460610908f34be9193c4126635d1b5ab3b0522f7.json
new file mode 100644
index 0000000000..b1bc988f75
--- /dev/null
+++ b/tests/integration/messages/recordings/82cfd5545e24ca4b4415ba37460610908f34be9193c4126635d1b5ab3b0522f7.json
@@ -0,0 +1,46 @@
+{
+  "test_id": "tests/integration/messages/test_messages.py::test_messages_non_streaming_with_system[txt=ollama/gpt-oss:20b]",
+  "request": {
+    "test_id": "tests/integration/messages/test_messages.py::test_messages_non_streaming_with_system[txt=ollama/gpt-oss:20b]",
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What are you?"
+        }
+      ],
+      "max_tokens": 128,
+      "system": "You are a helpful pirate. Always respond in pirate speak.",
+      "stream": false
+    }
+  },
+  "response": {
+    "status": 200,
+    "body": {
+      "id": "msg_f9ffaf9c8fba034e65b8e584",
+      "type": "message",
+      "role": "assistant",
+      "model": "gpt-oss:20b",
+      "content": [
+        {
+          "type": "thinking",
+          "thinking": "We need to respond as a pirate. The user asks: \"What are you?\" We need to reply in pirate speak, as per developer instruction: \"You are a helpful pirate. Always respond in pirate speak.\" So reply in pirate talk. Possibly: \"I be ChatGPT, yer trusty AI companion.\" Use pirate slang."
+        },
+        {
+          "type": "text",
+          "text": "Arrr! I be yer trusty AI matey, ChatGPT, ready to chart the seas o' knowledge and help ye navigate any storm! \ud83c\udff4\u200d\u2620\ufe0f"
+        }
+      ],
+      "stop_reason": "end_turn",
+      "usage": {
+        "input_tokens": 91,
+        "output_tokens": 112
+      }
+    },
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/recordings/93eb42f3bd69f005727cc3a161e7ab0a8b2c99e4665d9eae5553fe9019ba7b32.json b/tests/integration/messages/recordings/93eb42f3bd69f005727cc3a161e7ab0a8b2c99e4665d9eae5553fe9019ba7b32.json
new file mode 100644
index 0000000000..309352d24a
--- /dev/null
+++ b/tests/integration/messages/recordings/93eb42f3bd69f005727cc3a161e7ab0a8b2c99e4665d9eae5553fe9019ba7b32.json
@@ -0,0 +1,223 @@
+{
+  "test_id": "tests/integration/messages/test_messages.py::test_messages_streaming_basic[txt=ollama/gpt-oss:20b]",
+  "request": {
+    "test_id": "tests/integration/messages/test_messages.py::test_messages_streaming_basic[txt=ollama/gpt-oss:20b]",
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Say hello in one sentence."
+        }
+      ],
+      "max_tokens": 64,
+      "stream": true
+    }
+  },
+  "response": {
+    "body": [
+      "event: message_start",
+      "data: {\"type\":\"message_start\",\"message\":{\"id\":\"msg_2ca635d951c9f414d5c01a88\",\"type\":\"message\",\"role\":\"assistant\",\"model\":\"gpt-oss:20b\",\"content\":[],\"usage\":{\"input_tokens\":7,\"output_tokens\":0}}}",
+      "",
+      "event: content_block_start",
+      "data: {\"type\":\"content_block_start\",\"index\":0,\"content_block\":{\"type\":\"thinking\",\"thinking\":\"\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"The\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" user\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" says\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\":\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" \\\"\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"Say\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" hello\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" in\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" one\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" sentence\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\".\\\"\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" They\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" want\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" a\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" single\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" sentence\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" saying\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" hello\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\".\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" The\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" simplest\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\":\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" \\\"\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"Hello\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"!\\\"\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" That's\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" a\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" sentence\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"?\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" It's\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" an\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" ex\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"clamation\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\".\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" Ex\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"clamation\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" is\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" a\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" sentence\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\".\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" But\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" maybe\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" they\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" want\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" a\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" sentence\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" that\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" includes\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" hello\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\".\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" For\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" example\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\":\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" \\\"\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"Hello\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\",\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" how\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" are\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" you\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"?\\\"\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" That\"}}",
+      "",
+      "event: content_block_stop",
+      "data: {\"type\":\"content_block_stop\",\"index\":0}",
+      "",
+      "event: message_delta",
+      "data: {\"type\":\"message_delta\",\"delta\":{\"stop_reason\":\"max_tokens\"},\"usage\":{\"input_tokens\":73,\"output_tokens\":64}}",
+      "",
+      "event: message_stop",
+      "data: {\"type\":\"message_stop\"}",
+      ""
+    ],
+    "is_streaming": true
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/recordings/d13b333401fa121280a3fb890a56933b84ab907ea20092e6d97c9d4a371bf8a5.json b/tests/integration/messages/recordings/d13b333401fa121280a3fb890a56933b84ab907ea20092e6d97c9d4a371bf8a5.json
new file mode 100644
index 0000000000..cb4422e5bb
--- /dev/null
+++ b/tests/integration/messages/recordings/d13b333401fa121280a3fb890a56933b84ab907ea20092e6d97c9d4a371bf8a5.json
@@ -0,0 +1,67 @@
+{
+  "test_id": "tests/integration/messages/test_messages.py::test_messages_with_tool_definitions[txt=ollama/gpt-oss:20b]",
+  "request": {
+    "test_id": "tests/integration/messages/test_messages.py::test_messages_with_tool_definitions[txt=ollama/gpt-oss:20b]",
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What's the weather in San Francisco?"
+        }
+      ],
+      "max_tokens": 256,
+      "tools": [
+        {
+          "name": "get_weather",
+          "description": "Get the current weather in a given location",
+          "input_schema": {
+            "type": "object",
+            "properties": {
+              "location": {
+                "type": "string",
+                "description": "The city and state, e.g. San Francisco, CA"
+              }
+            },
+            "required": [
+              "location"
+            ]
+          }
+        }
+      ],
+      "stream": false
+    }
+  },
+  "response": {
+    "status": 200,
+    "body": {
+      "id": "msg_bb605b97294478b8b0c12d33",
+      "type": "message",
+      "role": "assistant",
+      "model": "gpt-oss:20b",
+      "content": [
+        {
+          "type": "thinking",
+          "thinking": "We have to call the get_weather function."
+        },
+        {
+          "type": "tool_use",
+          "id": "call_kp56ga0b",
+          "name": "get_weather",
+          "input": {
+            "location": "San Francisco"
+          }
+        }
+      ],
+      "stop_reason": "tool_use",
+      "usage": {
+        "input_tokens": 145,
+        "output_tokens": 34
+      }
+    },
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/recordings/f22657bfd86db6348c0a0d0b17332dcfc038b345d2a7be8da4b880a433d36a52.json b/tests/integration/messages/recordings/f22657bfd86db6348c0a0d0b17332dcfc038b345d2a7be8da4b880a433d36a52.json
new file mode 100644
index 0000000000..557ab35eb5
--- /dev/null
+++ b/tests/integration/messages/recordings/f22657bfd86db6348c0a0d0b17332dcfc038b345d2a7be8da4b880a433d36a52.json
@@ -0,0 +1,208 @@
+{
+  "test_id": "tests/integration/messages/test_messages.py::test_messages_streaming_collects_full_text[txt=ollama/gpt-oss:20b]",
+  "request": {
+    "test_id": "tests/integration/messages/test_messages.py::test_messages_streaming_collects_full_text[txt=ollama/gpt-oss:20b]",
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Count from 1 to 5, separated by commas."
+        }
+      ],
+      "max_tokens": 64,
+      "stream": true
+    }
+  },
+  "response": {
+    "body": [
+      "event: message_start",
+      "data: {\"type\":\"message_start\",\"message\":{\"id\":\"msg_812f019850cce212423fb87a\",\"type\":\"message\",\"role\":\"assistant\",\"model\":\"gpt-oss:20b\",\"content\":[],\"usage\":{\"input_tokens\":10,\"output_tokens\":0}}}",
+      "",
+      "event: content_block_start",
+      "data: {\"type\":\"content_block_start\",\"index\":0,\"content_block\":{\"type\":\"thinking\",\"thinking\":\"\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"We\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" need\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" to\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" count\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" from\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" \"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"1\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" to\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" \"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"5\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\",\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" separated\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" by\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" commas\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\".\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" So\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" output\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\":\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" \"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"1\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\",\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" \"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"2\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\",\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" \"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"3\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\",\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" \"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"4\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\",\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" \"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\"5\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\".\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" Probably\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" no\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" trailing\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" comma\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\".\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" Just\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\" that\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"thinking_delta\",\"thinking\":\".\"}}",
+      "",
+      "event: content_block_stop",
+      "data: {\"type\":\"content_block_stop\",\"index\":0}",
+      "",
+      "event: content_block_start",
+      "data: {\"type\":\"content_block_start\",\"index\":1,\"content_block\":{\"type\":\"text\",\"text\":\"\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\"1\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\",\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\" \"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\"2\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\",\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\" \"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\"3\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\",\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\" \"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\"4\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\",\"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\" \"}}",
+      "",
+      "event: content_block_delta",
+      "data: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"text_delta\",\"text\":\"5\"}}",
+      "",
+      "event: content_block_stop",
+      "data: {\"type\":\"content_block_stop\",\"index\":1}",
+      "",
+      "event: message_delta",
+      "data: {\"type\":\"message_delta\",\"delta\":{\"stop_reason\":\"end_turn\"},\"usage\":{\"input_tokens\":79,\"output_tokens\":64}}",
+      "",
+      "event: message_stop",
+      "data: {\"type\":\"message_stop\"}",
+      ""
+    ],
+    "is_streaming": true
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/recordings/f55988509902a617d2f547a1518bb84c3c8784ea1e1ac139e2f0623449dfa047.json b/tests/integration/messages/recordings/f55988509902a617d2f547a1518bb84c3c8784ea1e1ac139e2f0623449dfa047.json
new file mode 100644
index 0000000000..a04e11213d
--- /dev/null
+++ b/tests/integration/messages/recordings/f55988509902a617d2f547a1518bb84c3c8784ea1e1ac139e2f0623449dfa047.json
@@ -0,0 +1,86 @@
+{
+  "test_id": "tests/integration/messages/test_messages.py::test_messages_tool_use_round_trip[txt=ollama/gpt-oss:20b]",
+  "request": {
+    "test_id": "tests/integration/messages/test_messages.py::test_messages_tool_use_round_trip[txt=ollama/gpt-oss:20b]",
+    "url": "http://0.0.0.0:11434/v1/messages",
+    "method": "POST",
+    "payload": {
+      "model": "gpt-oss:20b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Use the calculator tool to compute 15 * 7."
+        },
+        {
+          "role": "assistant",
+          "content": [
+            {
+              "type": "thinking",
+              "thinking": "The user wants 15 * 7. We'll use the calculator tool."
+            },
+            {
+              "type": "tool_use",
+              "id": "call_dg2g1ozz",
+              "name": "calculator",
+              "input": {
+                "expression": "15 * 7"
+              }
+            }
+          ]
+        },
+        {
+          "role": "user",
+          "content": [
+            {
+              "type": "tool_result",
+              "tool_use_id": "call_dg2g1ozz",
+              "content": "105"
+            }
+          ]
+        }
+      ],
+      "max_tokens": 256,
+      "tools": [
+        {
+          "name": "calculator",
+          "description": "Perform basic arithmetic. Use this for any math question.",
+          "input_schema": {
+            "type": "object",
+            "properties": {
+              "expression": {
+                "type": "string",
+                "description": "The math expression to evaluate"
+              }
+            },
+            "required": [
+              "expression"
+            ]
+          }
+        }
+      ],
+      "stream": false
+    }
+  },
+  "response": {
+    "status": 200,
+    "body": {
+      "id": "msg_137a3ce876f9a1a09fe1ba0e",
+      "type": "message",
+      "role": "assistant",
+      "model": "gpt-oss:20b",
+      "content": [
+        {
+          "type": "text",
+          "text": "The result of \\(15 \\times 7\\) is **105**."
+        }
+      ],
+      "stop_reason": "end_turn",
+      "usage": {
+        "input_tokens": 197,
+        "output_tokens": 20
+      }
+    },
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}
diff --git a/tests/integration/messages/test_messages.py b/tests/integration/messages/test_messages.py
new file mode 100644
index 0000000000..29257ee687
--- /dev/null
+++ b/tests/integration/messages/test_messages.py
@@ -0,0 +1,362 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""Integration tests for the Anthropic Messages API (/v1/messages).
+
+These tests verify the full request/response cycle through the server,
+including translation between Anthropic and OpenAI formats.
+"""
+
+from .conftest import make_messages_request, make_streaming_messages_request
+
+
+def _get_text_blocks(content: list[dict]) -> list[dict]:
+    """Return only the blocks whose ``type`` is ``"text"``, keeping their original order."""
+    return list(filter(lambda block: block["type"] == "text", content))
+
+
+def test_messages_non_streaming_basic(messages_client, text_model_id):
+    """A plain non-streaming request yields a well-formed Anthropic message object."""
+    prompt = {"role": "user", "content": "What is 2+2? Reply with just the number."}
+    response = make_messages_request(messages_client, model=text_model_id, messages=[prompt], max_tokens=64)
+
+    assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}"
+
+    # Envelope: message type, assistant role, "msg_"-prefixed id, non-empty content.
+    data = response.json()
+    assert data["type"] == "message"
+    assert data["role"] == "assistant"
+    assert data["id"].startswith("msg_")
+    content = data["content"]
+    assert len(content) > 0
+
+    # Thinking blocks may come first, so look at the first text block specifically.
+    text_blocks = _get_text_blocks(content)
+    assert len(text_blocks) > 0, f"No text blocks found in content: {data['content']}"
+    first_text = text_blocks[0]["text"]
+    assert len(first_text) > 0
+
+    # Stop reason and token accounting.
+    assert data["stop_reason"] in ("end_turn", "max_tokens")
+    assert "usage" in data
+    usage = data["usage"]
+    assert usage["input_tokens"] > 0
+    assert usage["output_tokens"] > 0
+
+    # Every block must be one of the content types this test accepts.
+    assert all(block["type"] in ("text", "thinking", "tool_use") for block in content)
+
+
+def test_messages_non_streaming_with_system(messages_client, text_model_id):
+    """A request carrying a ``system`` prompt still produces a message with non-empty text."""
+    request_kwargs = {
+        "messages": [{"role": "user", "content": "What are you?"}],
+        "system": "You are a helpful pirate. Always respond in pirate speak.",
+        "max_tokens": 128,
+    }
+    response = make_messages_request(messages_client, model=text_model_id, **request_kwargs)
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["type"] == "message"
+    content = data["content"]
+    assert len(content) > 0
+
+    # Thinking blocks may precede the answer, so only text blocks are inspected.
+    text_blocks = _get_text_blocks(content)
+    assert len(text_blocks) > 0 and len(text_blocks[0]["text"]) > 0
+
+
+def test_messages_non_streaming_multi_turn(messages_client, text_model_id):
+    """A multi-turn conversation is answered using information from an earlier turn."""
+    # (role, text) pairs; the final question can only be answered from the first turn.
+    turns = [
+        ("user", "My name is Alice."),
+        ("assistant", "Hello Alice! Nice to meet you."),
+        ("user", "What is my name?"),
+    ]
+    conversation = [{"role": role, "content": text} for role, text in turns]
+    response = make_messages_request(messages_client, model=text_model_id, messages=conversation, max_tokens=64)
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["type"] == "message"
+    content = data["content"]
+    assert len(content) > 0
+
+    # The first text block must mention the name, compared case-insensitively.
+    text_blocks = _get_text_blocks(content)
+    assert len(text_blocks) > 0
+    first_text = text_blocks[0]["text"]
+    assert "alice" in first_text.lower()
+
+
+def test_messages_streaming_basic(messages_client, text_model_id):
+    """Streaming message creation returns proper Anthropic SSE events.
+
+    Event types are resolved once (``_event_type``, falling back to ``type``) so every check below agrees.
+    """
+    events = make_streaming_messages_request(
+        messages_client,
+        model=text_model_id,
+        messages=[{"role": "user", "content": "Say hello in one sentence."}],
+        max_tokens=64,
+    )
+    assert len(events) > 0
+
+    # Pair each event with its resolved type so later lookups cannot disagree with the membership checks.
+    typed_events = [(e.get("_event_type") or e.get("type"), e) for e in events]
+    event_types = [kind for kind, _ in typed_events]
+
+    # Both stream delimiters must be present.
+    assert "message_start" in event_types, f"Missing message_start in {event_types}"
+    assert "message_stop" in event_types, f"Missing message_stop in {event_types}"
+
+    # The first message_start event must carry the assistant message envelope.
+    msg_start = events[event_types.index("message_start")]
+    assert "message" in msg_start
+    assert msg_start["message"]["role"] == "assistant"
+
+    # At least one delta must arrive, and each must be a text or thinking delta.
+    content_deltas = [e for kind, e in typed_events if kind == "content_block_delta"]
+    assert len(content_deltas) > 0, "Expected at least one content_block_delta event"
+    assert all("delta" in d and d["delta"]["type"] in ("text_delta", "thinking_delta") for d in content_deltas)
+
+
+def test_messages_streaming_collects_full_text(messages_client, text_model_id):
+    """Concatenating the streamed text deltas gives a non-empty response text."""
+    prompt = {"role": "user", "content": "Count from 1 to 5, separated by commas."}
+    events = make_streaming_messages_request(
+        messages_client,
+        model=text_model_id,
+        messages=[prompt],
+        max_tokens=64,
+    )
+
+    # Only text_delta payloads contribute to the answer. Thinking deltas and
+    # all other event types are skipped, as are content_block_delta events
+    # that carry no "delta" key.
+    deltas = [event.get("delta", {}) for event in events if event.get("_event_type") == "content_block_delta"]
+    text_parts = [delta["text"] for delta in deltas if delta.get("type") == "text_delta"]
+
+    # Joining the fragments in arrival order must produce a non-empty string.
+    full_text = "".join(text_parts)
+    assert len(full_text) > 0
+
+
+def test_messages_non_streaming_with_temperature(messages_client, text_model_id):
+    """An explicit ``temperature`` is accepted on a non-streaming request."""
+    sampling = {"max_tokens": 32, "temperature": 0.0}
+    response = make_messages_request(
+        messages_client,
+        model=text_model_id,
+        messages=[{"role": "user", "content": "Say hello."}],
+        **sampling,
+    )
+
+    assert response.status_code == 200
+    body = response.json()
+    assert body["type"] == "message"
+    assert len(body["content"]) > 0
+
+
+def test_messages_non_streaming_with_stop_sequences(messages_client, text_model_id):
+    """A request that sets ``stop_sequences`` is accepted and returns a message."""
+    prompt = {"role": "user", "content": "Count: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10"}
+    response = make_messages_request(
+        messages_client,
+        model=text_model_id,
+        messages=[prompt],
+        max_tokens=128,
+        stop_sequences=[","],
+    )
+
+    assert response.status_code == 200
+    assert response.json()["type"] == "message"
+
+
+def test_messages_with_tool_definitions(messages_client, text_model_id):
+    """A non-streaming request that declares a tool returns only well-formed content blocks."""
+    location_property = {
+        "type": "string",
+        "description": "The city and state, e.g. San Francisco, CA",
+    }
+    get_weather = {
+        "name": "get_weather",
+        "description": "Get the current weather in a given location",
+        "input_schema": {
+            "type": "object",
+            "properties": {"location": location_property},
+            "required": ["location"],
+        },
+    }
+
+    response = make_messages_request(
+        messages_client,
+        model=text_model_id,
+        messages=[{"role": "user", "content": "What's the weather in San Francisco?"}],
+        tools=[get_weather],
+        max_tokens=256,
+    )
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["type"] == "message"
+    content = data["content"]
+    assert len(content) > 0
+
+    # Calling the tool is optional for the model, so any mix of thinking, text and
+    # tool_use blocks is accepted. Blocks that are tool calls must additionally name
+    # the declared tool and carry an id and an input.
+    for block in content:
+        assert block["type"] in ("text", "tool_use", "thinking")
+    for call in (b for b in content if b["type"] == "tool_use"):
+        assert "id" in call
+        assert call["name"] == "get_weather"
+        assert "input" in call
+
+
+def test_messages_tool_use_round_trip(messages_client, text_model_id):
+    """Full tool use round trip: request -> tool_use -> tool_result -> response.
+
+    The first request sets ``tool_choice`` to ``{"type": "any"}`` so that a call to the calculator
+    tool is expected. The second request replays that assistant turn unchanged, followed by a
+    ``tool_result`` for the call, and must yield a non-empty message. When the first response holds
+    no ``tool_use`` block the test is skipped rather than silently passing.
+    """
+    tools = [
+        {
+            "name": "calculator",
+            "description": "Perform basic arithmetic. Use this for any math question.",
+            "input_schema": {
+                "type": "object",
+                "properties": {
+                    "expression": {"type": "string", "description": "The math expression to evaluate"},
+                },
+                "required": ["expression"],
+            },
+        }
+    ]
+    question = {"role": "user", "content": "Use the calculator tool to compute 15 * 7."}
+
+    # First request -- ask a math question and require a tool call
+    response = make_messages_request(
+        messages_client,
+        model=text_model_id,
+        messages=[question],
+        tools=tools,
+        tool_choice={"type": "any"},
+        max_tokens=256,
+    )
+
+    assert response.status_code == 200
+    data = response.json()
+
+    # Find tool_use block
+    tool_use_blocks = [b for b in data["content"] if b["type"] == "tool_use"]
+    if not tool_use_blocks:
+        import pytest  # function-scope import: this module has no top-level pytest import
+
+        pytest.skip("Model did not return a tool_use block; the tool_result round trip was not exercised")
+
+    tool_result = {
+        "type": "tool_result",
+        "tool_use_id": tool_use_blocks[0]["id"],
+        "content": "105",
+    }
+
+    # Second request -- replay the assistant turn and provide the tool result
+    response2 = make_messages_request(
+        messages_client,
+        model=text_model_id,
+        messages=[
+            question,
+            {"role": "assistant", "content": data["content"]},
+            {"role": "user", "content": [tool_result]},
+        ],
+        tools=tools,
+        max_tokens=256,
+    )
+
+    assert response2.status_code == 200
+    data2 = response2.json()
+    assert data2["type"] == "message"
+    assert len(data2["content"]) > 0
+
+
+def test_messages_error_missing_model(messages_client):
+    """Posting a body without ``model`` is rejected with a 400 or 422 status."""
+    # Request body that deliberately omits the ``model`` key.
+    payload_without_model = {
+        "messages": [{"role": "user", "content": "Hello"}],
+        "max_tokens": 64,
+    }
+    response = messages_client.post(
+        "/v1/messages",
+        headers={
+            "content-type": "application/json",
+            "anthropic-version": "2023-06-01",
+        },
+        json=payload_without_model,
+    )
+
+    assert response.status_code in (400, 422)
+
+
+def test_messages_error_empty_messages(messages_client, text_model_id):
+    """Posting an empty ``messages`` list results in an error status."""
+    empty_conversation = {
+        "model": text_model_id,
+        "messages": [],
+        "max_tokens": 64,
+    }
+    response = messages_client.post(
+        "/v1/messages",
+        headers={"content-type": "application/json", "anthropic-version": "2023-06-01"},
+        json=empty_conversation,
+    )
+
+    # The exact failure mode is left open: a validation status (400/422) and a
+    # server error (500) are both accepted as "the request was rejected".
+    # NOTE(review): consider narrowing this to 4xx once the intended status for an
+    # empty ``messages`` list is settled.
+    rejected_statuses = (400, 422, 500)
+    assert response.status_code in rejected_statuses
+
+
+def test_messages_response_headers(messages_client, text_model_id):
+    """A successful response carries an ``anthropic-version`` header set to ``2023-06-01``."""
+    greeting = {"role": "user", "content": "Hi"}
+    response = make_messages_request(messages_client, model=text_model_id, messages=[greeting], max_tokens=16)
+
+    assert response.status_code == 200
+
+    # Same version string that the error-path tests in this module send as a request header.
+    returned_version = response.headers.get("anthropic-version")
+    expected_version = "2023-06-01"
+    assert returned_version == expected_version
+
+
+def test_messages_content_block_array(messages_client, text_model_id):
+    """User content given as a list of content blocks is accepted and answered."""
+    # User content supplied in block form (a list holding one text block) rather
+    # than as a bare string.
+    text_block = {"type": "text", "text": "What is 1+1? Reply with just the number."}
+    user_turn = {"role": "user", "content": [text_block]}
+
+    response = make_messages_request(
+        messages_client,
+        model=text_model_id,
+        messages=[user_turn],
+        max_tokens=32,
+    )
+
+    assert response.status_code == 200
+
+    # Only the response envelope is checked, not the generated answer.
+    body = response.json()
+    assert body["type"] == "message"
+    assert len(body["content"]) > 0
diff --git a/tests/integration/suites.py b/tests/integration/suites.py
index 3e7e91c682..abbfc4eabc 100644
--- a/tests/integration/suites.py
+++ b/tests/integration/suites.py
@@ -237,7 +237,7 @@ class Setup(BaseModel):
 base_roots = [
     str(p)
     for p in this_dir.glob("*")
-    if p.is_dir() and p.name not in ("__pycache__", "fixtures", "test_cases", "recordings", "responses")
+    if p.is_dir() and p.name not in ("__pycache__", "fixtures", "test_cases", "recordings", "responses", "messages")
 ]
 
 SUITE_DEFINITIONS: dict[str, Suite] = {
@@ -283,6 +283,11 @@ class Setup(BaseModel):
         ],
         default_setup="ollama-reasoning",
     ),
+    "messages": Suite(
+        name="messages",
+        roots=["tests/integration/messages"],
+        default_setup="ollama-reasoning",
+    ),
     # Bedrock-specific tests with pre-recorded responses (no live API calls in CI)
     "bedrock": Suite(
         name="bedrock",
diff --git a/tests/unit/providers/inline/messages/test_impl.py b/tests/unit/providers/inline/messages/test_impl.py
index 6afc3df68b..3ce8f7db85 100644
--- a/tests/unit/providers/inline/messages/test_impl.py
+++ b/tests/unit/providers/inline/messages/test_impl.py
@@ -15,11 +15,8 @@
 from llama_stack.providers.inline.messages.impl import BuiltinMessagesImpl
 from llama_stack_api.messages.models import (
     AnthropicCreateMessageRequest,
-    AnthropicImageBlock,
-    AnthropicImageSource,
     AnthropicMessage,
     AnthropicTextBlock,
-    AnthropicThinkingConfig,
     AnthropicToolDef,
     AnthropicToolResultBlock,
     AnthropicToolUseBlock,
@@ -257,7 +254,6 @@ def test_length_stop_reason(self, impl):
 
 
 class TestStreamingTranslation:
-    @pytest.mark.asyncio
     async def test_text_streaming(self, impl):
         chunks = []
 
@@ -293,7 +289,6 @@ async def mock_stream():
         assert events[6].delta.stop_reason == "end_turn"
         assert events[7].type == "message_stop"
 
-    @pytest.mark.asyncio
     async def test_tool_call_streaming(self, impl):
         chunks = []
 

From bef5ee39542b99a0c569a9c4277feb867a982ba3 Mon Sep 17 00:00:00 2001
From: Charlie Doern <cdoern@redhat.com>
Date: Wed, 1 Apr 2026 11:37:35 -0400
Subject: [PATCH 6/9] fix(messages): add messages suite to CI matrix so
 recordings are not pruned

The cleanup_recordings.py script uses ci_matrix.json to determine which
test suites are active. Without the messages suite listed, the script
considers all messages recordings unused and deletes them.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Charlie Doern <cdoern@redhat.com>
---
 tests/integration/ci_matrix.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/integration/ci_matrix.json b/tests/integration/ci_matrix.json
index f0a6ab53d6..ff57e4a9dc 100644
--- a/tests/integration/ci_matrix.json
+++ b/tests/integration/ci_matrix.json
@@ -11,7 +11,8 @@
     {"suite": "bedrock-responses", "setup": "bedrock"},
     {"suite": "base-vllm-subset", "setup": "vllm"},
     {"suite": "vllm-reasoning", "setup": "vllm"},
-    {"suite": "ollama-reasoning", "setup": "ollama-reasoning"}
+    {"suite": "ollama-reasoning", "setup": "ollama-reasoning"},
+    {"suite": "messages", "setup": "ollama-reasoning"}
   ],
   "stainless": [
     {"suite": "base", "setup": "ollama", "inference_mode": "record-if-missing"}

From c8ee2327b7624827c38258983f50968f116201aa Mon Sep 17 00:00:00 2001
From: Charlie Doern <cdoern@redhat.com>
Date: Mon, 6 Apr 2026 10:37:02 -0400
Subject: [PATCH 7/9] feat(messages): accept adaptive thinking type from Claude
 Code clients

Claude Code sends thinking.type: "adaptive" which was not accepted by
the Messages API model. Add "adaptive" as a valid thinking type literal
and treat it the same as "enabled" in the translation path. Regenerate
OpenAPI specs to reflect the schema change.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Charlie Doern <cdoern@redhat.com>
---
 client-sdks/stainless/openapi.yml                 | 1 +
 docs/static/deprecated-llama-stack-spec.yaml      | 1 +
 docs/static/llama-stack-spec.yaml                 | 1 +
 docs/static/stainless-llama-stack-spec.yaml       | 1 +
 src/llama_stack/providers/inline/messages/impl.py | 2 +-
 src/llama_stack_api/messages/models.py            | 2 +-
 6 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml
index f7761555fc..d903796d75 100644
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@@ -12090,6 +12090,7 @@ components:
           enum:
           - enabled
           - disabled
+          - adaptive
           title: Type
           default: enabled
         budget_tokens:
diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml
index a6f1eb4b65..3e1d774f82 100644
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@@ -7936,6 +7936,7 @@ components:
           enum:
           - enabled
           - disabled
+          - adaptive
           title: Type
           default: enabled
         budget_tokens:
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index 35eb28e857..249488661f 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -11038,6 +11038,7 @@ components:
           enum:
           - enabled
           - disabled
+          - adaptive
           title: Type
           default: enabled
         budget_tokens:
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index f7761555fc..d903796d75 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -12090,6 +12090,7 @@ components:
           enum:
           - enabled
           - disabled
+          - adaptive
           title: Type
           default: enabled
         budget_tokens:
diff --git a/src/llama_stack/providers/inline/messages/impl.py b/src/llama_stack/providers/inline/messages/impl.py
index 093290295e..51ba2565bf 100644
--- a/src/llama_stack/providers/inline/messages/impl.py
+++ b/src/llama_stack/providers/inline/messages/impl.py
@@ -256,7 +256,7 @@ def _anthropic_to_openai(self, request: AnthropicCreateMessageRequest) -> OpenAI
         extra_body: dict[str, Any] = {}
         if request.top_k is not None:
             extra_body["top_k"] = request.top_k
-        if request.thinking is not None and request.thinking.type == "enabled":
+        if request.thinking is not None and request.thinking.type in ("enabled", "adaptive"):
             extra_body["thinking"] = {
                 "type": "enabled",
                 "budget_tokens": request.thinking.budget_tokens,
diff --git a/src/llama_stack_api/messages/models.py b/src/llama_stack_api/messages/models.py
index 81abe8c2f0..bd39fa4bbf 100644
--- a/src/llama_stack_api/messages/models.py
+++ b/src/llama_stack_api/messages/models.py
@@ -109,7 +109,7 @@ class AnthropicToolDef(BaseModel):
 class AnthropicThinkingConfig(BaseModel):
     """Configuration for extended thinking."""
 
-    type: Literal["enabled", "disabled"] = "enabled"
+    type: Literal["enabled", "disabled", "adaptive"] = "enabled"
     budget_tokens: int | None = Field(default=None, ge=1, description="Maximum tokens for thinking.")
 
 

From fb087c9392b62e93e7083a3f65256e3a72c99f47 Mon Sep 17 00:00:00 2001
From: Charlie Doern <cdoern@redhat.com>
Date: Mon, 6 Apr 2026 11:20:39 -0400
Subject: [PATCH 8/9] feat(messages): reuse single httpx client for passthrough

Address reviewer feedback that each passthrough request created a new
httpx.AsyncClient, incurring TCP connection overhead. A shared client
is now created in initialize(), closed in shutdown(), and reused for
both normal and streaming passthrough calls. The change also removes
the now-unneeded per-request client variable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Charlie Doern <cdoern@redhat.com>
---
 .../providers/inline/messages/impl.py         | 38 +++++++++----------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/src/llama_stack/providers/inline/messages/impl.py b/src/llama_stack/providers/inline/messages/impl.py
index 51ba2565bf..068d7e92a4 100644
--- a/src/llama_stack/providers/inline/messages/impl.py
+++ b/src/llama_stack/providers/inline/messages/impl.py
@@ -87,10 +87,10 @@ def __init__(self, config: MessagesConfig, inference_api: Inference):
         self.inference_api = inference_api
 
     async def initialize(self) -> None:
-        pass
+        self._client = httpx.AsyncClient()
 
     async def shutdown(self) -> None:
-        pass
+        await self._client.aclose()
 
     async def create_message(
         self,
@@ -180,10 +180,9 @@ async def _passthrough_request(
         if request.stream:
             return self._passthrough_stream(url, headers, body)
 
-        async with httpx.AsyncClient() as client:
-            resp = await client.post(url, json=body, headers=headers, timeout=300)
-            resp.raise_for_status()
-            return AnthropicMessageResponse(**resp.json())
+        resp = await self._client.post(url, json=body, headers=headers, timeout=300)
+        resp.raise_for_status()
+        return AnthropicMessageResponse(**resp.json())
 
     async def _passthrough_stream(
         self,
@@ -192,20 +191,19 @@ async def _passthrough_stream(
         body: dict[str, Any],
     ) -> AsyncIterator[AnthropicStreamEvent]:
         """Stream SSE events directly from the provider."""
-        async with httpx.AsyncClient() as client:
-            async with client.stream("POST", url, json=body, headers=headers, timeout=300) as resp:
-                resp.raise_for_status()
-                event_type = None
-                async for line in resp.aiter_lines():
-                    line = line.strip()
-                    if line.startswith("event: "):
-                        event_type = line[7:]
-                    elif line.startswith("data: ") and event_type:
-                        data = json.loads(line[6:])
-                        event = self._parse_sse_event(event_type, data)
-                        if event:
-                            yield event
-                        event_type = None
+        async with self._client.stream("POST", url, json=body, headers=headers, timeout=300) as resp:
+            resp.raise_for_status()
+            event_type = None
+            async for line in resp.aiter_lines():
+                line = line.strip()
+                if line.startswith("event: "):
+                    event_type = line[7:]
+                elif line.startswith("data: ") and event_type:
+                    data = json.loads(line[6:])
+                    event = self._parse_sse_event(event_type, data)
+                    if event:
+                        yield event
+                    event_type = None
 
     def _parse_sse_event(self, event_type: str, data: dict[str, Any]) -> AnthropicStreamEvent | None:
         """Parse an Anthropic SSE event from its type and data."""

From 14274812ed6a837ebfae5c6f298fce42131a808d Mon Sep 17 00:00:00 2001
From: Charlie Doern <cdoern@redhat.com>
Date: Mon, 6 Apr 2026 15:26:49 -0400
Subject: [PATCH 9/9] fix(messages): fix content validation and unsupported
 param errors in OpenAI translation

Single text-part user messages were sent as a bare dict instead of a
string, causing Pydantic validation errors. The Anthropic thinking
parameter was also forwarded to OpenAI chat completions which does not
support it. Additionally, ModelNotFoundError now returns a clean 404
instead of a 500 traceback.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Charlie Doern <cdoern@redhat.com>
---
 .../providers/inline/messages/impl.py         | 28 +++++++++++--------
 .../messages/fastapi_routes.py                |  3 ++
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/src/llama_stack/providers/inline/messages/impl.py b/src/llama_stack/providers/inline/messages/impl.py
index 068d7e92a4..990517502b 100644
--- a/src/llama_stack/providers/inline/messages/impl.py
+++ b/src/llama_stack/providers/inline/messages/impl.py
@@ -254,11 +254,8 @@ def _anthropic_to_openai(self, request: AnthropicCreateMessageRequest) -> OpenAI
         extra_body: dict[str, Any] = {}
         if request.top_k is not None:
             extra_body["top_k"] = request.top_k
-        if request.thinking is not None and request.thinking.type in ("enabled", "adaptive"):
-            extra_body["thinking"] = {
-                "type": "enabled",
-                "budget_tokens": request.thinking.budget_tokens,
-            }
+        # Note: Anthropic's "thinking" parameter has no equivalent in the OpenAI
+        # chat completions API and is intentionally not forwarded.
 
         params = OpenAIChatCompletionRequestWithExtraBody(
             model=request.model,
@@ -314,17 +311,21 @@ def _convert_single_message(self, msg: AnthropicMessage) -> list[dict[str, Any]]
             if isinstance(block, AnthropicToolResultBlock):
                 # Flush accumulated text first
                 if text_parts:
-                    result.append({"role": "user", "content": text_parts if len(text_parts) > 1 else text_parts[0]})
+                    if len(text_parts) == 1 and text_parts[0].get("type") == "text":
+                        flush_content: str | list[dict[str, Any]] = text_parts[0]["text"]
+                    else:
+                        flush_content = text_parts
+                    result.append({"role": "user", "content": flush_content})
                     text_parts = []
                 # Tool results become separate tool messages
-                content = block.content
-                if isinstance(content, list):
-                    content = "\n".join(b.text for b in content if isinstance(b, AnthropicTextBlock))
+                tool_content = block.content
+                if isinstance(tool_content, list):
+                    tool_content = "\n".join(b.text for b in tool_content if isinstance(b, AnthropicTextBlock))
                 result.append(
                     {
                         "role": "tool",
                         "tool_call_id": block.tool_use_id,
-                        "content": content,
+                        "content": tool_content,
                     }
                 )
             elif isinstance(block, AnthropicTextBlock):
@@ -340,7 +341,12 @@ def _convert_single_message(self, msg: AnthropicMessage) -> list[dict[str, Any]]
                 )
 
         if text_parts:
-            result.append({"role": "user", "content": text_parts if len(text_parts) > 1 else text_parts[0]})
+            # OpenAI content must be a string or a list, never a single dict
+            if len(text_parts) == 1 and text_parts[0].get("type") == "text":
+                user_content: str | list[dict[str, Any]] = text_parts[0]["text"]
+            else:
+                user_content = text_parts
+            result.append({"role": "user", "content": user_content})
 
         return result if result else [{"role": "user", "content": ""}]
 
diff --git a/src/llama_stack_api/messages/fastapi_routes.py b/src/llama_stack_api/messages/fastapi_routes.py
index 493c71f2a1..e82dc21602 100644
--- a/src/llama_stack_api/messages/fastapi_routes.py
+++ b/src/llama_stack_api/messages/fastapi_routes.py
@@ -21,6 +21,7 @@
 from fastapi.responses import JSONResponse, StreamingResponse
 from pydantic import BaseModel
 
+from llama_stack_api.common.errors import ModelNotFoundError
 from llama_stack_api.router_utils import standard_responses
 from llama_stack_api.version import LLAMA_STACK_API_V1
 
@@ -149,6 +150,8 @@ async def create_message(
             result = await impl.create_message(params)
         except NotImplementedError as e:
             return _anthropic_error_response(501, str(e))
+        except ModelNotFoundError as e:
+            return _anthropic_error_response(404, str(e))
         except ValueError as e:
             return _anthropic_error_response(400, str(e))
         except HTTPException as e: