
Commit f76c9c8

robinnarsinghranabhatrranabha authored and committed
fix: emit response.output_text.done streaming event per OpenAI spec
The LlamaStack server was missing the `response.output_text.done` streaming event, which the OpenAI Responses API spec requires between `output_text.delta` and `content_part.done`. This event carries the final accumulated text and logprobs.

Discovered by comparing streaming event sequences between OpenAI's gpt-5.1 (ground truth) and LlamaStack server output using the OpenAI Python client.

Changes:
- streaming.py: Import and emit OutputTextDone with the final text and logprobs before content_part.done
- openai_responses.py: Add a logprobs field to the OutputTextDone type definition (required per OpenAI spec)
- test_openai_responses.py: Verify output_text.done is emitted with the correct fields and ordering (before content_part.done)
1 parent f30e486 commit f76c9c8

8 files changed

Lines changed: 67 additions & 2 deletions
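
For context, the mismatch described in the commit message can be checked client-side by recording the event types a Responses API server emits while streaming. The sketch below is illustrative only; the base URL, API key, and model name are placeholders, not values taken from this commit.

    # Hedged sketch: capture the streaming event sequence and check the ordering
    # this commit is about. Endpoint, key, and model are hypothetical.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")  # hypothetical LlamaStack endpoint

    event_types = []
    for event in client.responses.create(model="my-model", input="Say hello", stream=True):
        event_types.append(event.type)

    # Per the OpenAI spec, output_text.done should appear after the text deltas
    # and before content_part.done.
    assert "response.output_text.done" in event_types
    assert event_types.index("response.output_text.done") < event_types.index("response.content_part.done")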


client-sdks/stainless/openapi.yml

Lines changed: 7 additions & 0 deletions
@@ -8476,6 +8476,13 @@ components:
         sequence_number:
           title: Sequence Number
           type: integer
+        logprobs:
+          anyOf:
+            - items:
+                $ref: '#/components/schemas/OpenAITokenLogProb'
+              type: array
+            - type: 'null'
+          nullable: true
         type:
           const: response.output_text.done
           default: response.output_text.done
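
The same optional logprobs field is added to each of the generated spec files below. For illustration only, an event payload consistent with the updated schema might look like the following; the values are invented, and logprobs may be either null or a list of OpenAITokenLogProb objects.

    # Illustrative payload (values are made up, not taken from the commit).
    output_text_done_event = {
        "type": "response.output_text.done",
        "item_id": "msg_abc123",   # hypothetical item id
        "output_index": 0,
        "content_index": 0,
        "text": "Hello there!",
        "logprobs": None,          # nullable per the new schema
        "sequence_number": 7,
    }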

docs/static/deprecated-llama-stack-spec.yaml

Lines changed: 7 additions & 0 deletions
@@ -5177,6 +5177,13 @@ components:
         sequence_number:
           title: Sequence Number
           type: integer
+        logprobs:
+          anyOf:
+            - items:
+                $ref: '#/components/schemas/OpenAITokenLogProb'
+              type: array
+            - type: 'null'
+          nullable: true
         type:
           const: response.output_text.done
           default: response.output_text.done

docs/static/experimental-llama-stack-spec.yaml

Lines changed: 7 additions & 0 deletions
@@ -5358,6 +5358,13 @@ components:
         sequence_number:
           title: Sequence Number
           type: integer
+        logprobs:
+          anyOf:
+            - items:
+                $ref: '#/components/schemas/OpenAITokenLogProb'
+              type: array
+            - type: 'null'
+          nullable: true
         type:
           const: response.output_text.done
           default: response.output_text.done

docs/static/llama-stack-spec.yaml

Lines changed: 7 additions & 0 deletions
@@ -7361,6 +7361,13 @@ components:
         sequence_number:
           title: Sequence Number
           type: integer
+        logprobs:
+          anyOf:
+            - items:
+                $ref: '#/components/schemas/OpenAITokenLogProb'
+              type: array
+            - type: 'null'
+          nullable: true
         type:
           const: response.output_text.done
           default: response.output_text.done

docs/static/stainless-llama-stack-spec.yaml

Lines changed: 7 additions & 0 deletions
@@ -8476,6 +8476,13 @@ components:
         sequence_number:
           title: Sequence Number
           type: integer
+        logprobs:
+          anyOf:
+            - items:
+                $ref: '#/components/schemas/OpenAITokenLogProb'
+              type: array
+            - type: 'null'
+          nullable: true
         type:
           const: response.output_text.done
           default: response.output_text.done

src/llama_stack/providers/inline/responses/builtin/responses/streaming.py

Lines changed: 12 additions & 1 deletion
@@ -74,6 +74,7 @@
     OpenAIResponseObjectStreamResponseOutputItemAdded,
     OpenAIResponseObjectStreamResponseOutputItemDone,
     OpenAIResponseObjectStreamResponseOutputTextDelta,
+    OpenAIResponseObjectStreamResponseOutputTextDone,
     OpenAIResponseObjectStreamResponseReasoningTextDelta,
     OpenAIResponseObjectStreamResponseReasoningTextDone,
     OpenAIResponseObjectStreamResponseRefusalDelta,
@@ -1145,9 +1146,19 @@ async def _process_streaming_chunks(
                 sequence_number=self.sequence_number,
             )
 
-        # Emit content_part.done event if text content was streamed (before content gets cleared)
+        # Emit output_text.done and content_part.done events if text content was streamed
         if content_part_emitted:
             final_text = "".join(chat_response_content)
+            # Emit output_text.done with the final accumulated text (per OpenAI protocol)
+            self.sequence_number += 1
+            yield OpenAIResponseObjectStreamResponseOutputTextDone(
+                content_index=content_index,
+                text=final_text,
+                item_id=message_item_id,
+                output_index=message_output_index,
+                sequence_number=self.sequence_number,
+                logprobs=chat_response_logprobs if chat_response_logprobs else [],
+            )
             self.sequence_number += 1
             yield OpenAIResponseObjectStreamResponseContentPartDone(
                 content_index=content_index,
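
With the event now emitted, a streaming consumer can take the final text directly from output_text.done instead of re-joining the deltas. A rough usage sketch, again with a placeholder endpoint and model name:

    # Hedged consumer-side sketch; endpoint, key, and model are hypothetical.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

    final_text = None
    for event in client.responses.create(model="my-model", input="Tell me a joke", stream=True):
        if event.type == "response.output_text.delta":
            print(event.delta, end="", flush=True)   # incremental rendering
        elif event.type == "response.output_text.done":
            final_text = event.text                  # full accumulated text in one field
    print()
    print("final:", final_text)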

src/llama_stack_api/openai_responses.py

Lines changed: 2 additions & 0 deletions
@@ -928,6 +928,7 @@ class OpenAIResponseObjectStreamResponseOutputTextDone(BaseModel):
     :param item_id: Unique identifier of the completed output item
     :param output_index: Index position of the item in the output list
     :param sequence_number: Sequential number for ordering streaming events
+    :param logprobs: Token log probability details for the completed text
     :param type: Event type identifier, always "response.output_text.done"
     """
 
@@ -936,6 +937,7 @@ class OpenAIResponseObjectStreamResponseOutputTextDone(BaseModel):
     item_id: str
     output_index: int
     sequence_number: int
+    logprobs: list[OpenAITokenLogProb] | None = None
     type: Literal["response.output_text.done"] = "response.output_text.done"
 
 
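
Because the new logprobs field defaults to None, events constructed without it remain valid. A small sketch of this (the import path is assumed from the file location above, and serialization assumes Pydantic v2):

    # Assumed import path, based on src/llama_stack_api/openai_responses.py.
    from llama_stack_api.openai_responses import (
        OpenAIResponseObjectStreamResponseOutputTextDone,
    )

    event = OpenAIResponseObjectStreamResponseOutputTextDone(
        content_index=0,
        text="Hello there!",
        item_id="msg_abc123",   # hypothetical id
        output_index=0,
        sequence_number=7,
    )
    assert event.logprobs is None                      # optional field defaults to None
    assert event.type == "response.output_text.done"
    print(event.model_dump_json())                     # Pydantic v2 serialization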

tests/unit/providers/responses/builtin/test_openai_responses.py

Lines changed: 18 additions & 1 deletion
@@ -241,7 +241,8 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     )
 
     # Should have content part events for text streaming
-    # Expected: response.created, response.in_progress, content_part.added, output_text.delta, content_part.done, response.completed
+    # Expected: response.created, response.in_progress, output_item.added, content_part.added,
+    # output_text.delta, output_text.done, content_part.done, output_item.done, response.completed
     assert len(chunks) >= 5
     assert chunks[0].type == "response.created"
     assert any(chunk.type == "response.in_progress" for chunk in chunks)
@@ -250,10 +251,12 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     content_part_added_events = [c for c in chunks if c.type == "response.content_part.added"]
     content_part_done_events = [c for c in chunks if c.type == "response.content_part.done"]
     text_delta_events = [c for c in chunks if c.type == "response.output_text.delta"]
+    text_done_events = [c for c in chunks if c.type == "response.output_text.done"]
 
     assert len(content_part_added_events) >= 1, "Should have content_part.added event for text"
     assert len(content_part_done_events) >= 1, "Should have content_part.done event for text"
     assert len(text_delta_events) >= 1, "Should have text delta events"
+    assert len(text_done_events) >= 1, "Should have output_text.done event with final accumulated text"
 
     added_event = content_part_added_events[0]
     done_event = content_part_done_events[0]
@@ -263,6 +266,20 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     assert added_event.item_id == done_event.item_id
     assert added_event.response_id == done_event.response_id
 
+    # Verify output_text.done contains the final accumulated text and correct indices
+    text_done_event = text_done_events[0]
+    assert text_done_event.content_index == 0
+    assert text_done_event.output_index == 0
+    assert text_done_event.item_id == added_event.item_id
+    assert isinstance(text_done_event.text, str)
+    assert len(text_done_event.text) > 0, "output_text.done should contain the final text"
+
+    # Verify output_text.done comes before content_part.done (per OpenAI protocol)
+    chunk_types = [c.type for c in chunks]
+    text_done_idx = chunk_types.index("response.output_text.done")
+    content_done_idx = chunk_types.index("response.content_part.done")
+    assert text_done_idx < content_done_idx, "output_text.done must precede content_part.done"
+
     # Verify final event is completion
     assert chunks[-1].type == "response.completed"
 
