fix: dedent structural tag block and preserve response format in system-instructions mode

garrio-1 · garrio-1 · commit dd23bae72aa8 · 2026-03-18T14:31:36.000-04:00
Fix two bugs identified in PR review:

1. The structural tag setup block was nested inside the `else` branch of
   `if self.use_harmony:`, making it unreachable for GPT-OSS (the primary
   target). Dedent the block so it runs after context selection on both
   the harmony and non-harmony paths (it is still guarded by the
   reasoning-parser check).

2. The `# Response Formats` schema section was lost when
   VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS was enabled, because
   get_developer_message() dropped all instructions in that mode. Add a
   separate response_format_section parameter so the schema is always
   included in the developer message regardless of the system-instructions
   flag.

Signed-off-by: Will Deines <will@garr.io>
diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py
@@ -1,12 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from unittest.mock import patch
+
 import pytest
 from openai_harmony import Message, Role
 
 from tests.entrypoints.openai.utils import verify_harmony_messages
 from vllm.entrypoints.openai.parser.harmony_utils import (
     auto_drop_analysis_messages,
+    get_developer_message,
     get_encoding,
     get_system_message,
     has_custom_tools,
@@ -955,3 +958,73 @@ def test_compact_json_no_spaces(self):
     def test_section_separated_by_blank_lines(self):
         result = inject_response_formats("Instructions here.", {"type": "object"})
         assert "\n\n# Response Formats\n\n## structured_output\n\n" in result
+
+
+class TestGetDeveloperMessageResponseFormats:
+    """Tests for response_format_section parameter in get_developer_message."""
+
+    ENV_VAR = (
+        "vllm.entrypoints.openai.parser.harmony_utils"
+        ".envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS"
+    )
+
+    def _extract_instructions_text(self, dev_msg: Message) -> str | None:
+        """Extract the raw text from a developer message's instructions."""
+        for content_item in dev_msg.content:
+            instructions = getattr(content_item, "instructions", None)
+            if instructions is not None:
+                return instructions
+        return None
+
+    def test_response_format_preserved_with_system_instructions(self):
+        """When VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS is True,
+        user instructions should be dropped but response format schema
+        should still appear in the developer message."""
+        schema_section = "# Response Formats\n\n## structured_output\n\n{}"
+        with patch(self.ENV_VAR, True):
+            dev_msg = get_developer_message(
+                instructions="Be concise.",
+                response_format_section=schema_section,
+            )
+        text = self._extract_instructions_text(dev_msg)
+        assert text is not None
+        assert "# Response Formats" in text
+        # User instructions should NOT be present
+        assert "Be concise." not in text
+
+    def test_response_format_and_instructions_without_system_instructions(self):
+        """When VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS is False,
+        both instructions and response format schema should appear."""
+        schema_section = "# Response Formats\n\n## structured_output\n\n{}"
+        with patch(self.ENV_VAR, False):
+            dev_msg = get_developer_message(
+                instructions="Be concise.",
+                response_format_section=schema_section,
+            )
+        text = self._extract_instructions_text(dev_msg)
+        assert text is not None
+        assert "Be concise." in text
+        assert "# Response Formats" in text
+
+    def test_response_format_only_no_instructions(self):
+        """With instructions=None, only the response format section appears."""
+        schema_section = "# Response Formats\n\n## structured_output\n\n{}"
+        with patch(self.ENV_VAR, False):
+            dev_msg = get_developer_message(
+                instructions=None,
+                response_format_section=schema_section,
+            )
+        text = self._extract_instructions_text(dev_msg)
+        assert text is not None
+        assert "# Response Formats" in text
+
+    def test_backward_compat_no_response_format(self):
+        """Without response_format_section, behavior matches the original."""
+        with patch(self.ENV_VAR, False):
+            dev_msg = get_developer_message(
+                instructions="Be concise.",
+            )
+        text = self._extract_instructions_text(dev_msg)
+        assert text is not None
+        assert "Be concise." in text
+        assert "# Response Formats" not in text
diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py
@@ -122,10 +122,16 @@ def create_tool_definition(tool: ChatCompletionToolsParam | Tool):
 def get_developer_message(
     instructions: str | None = None,
     tools: list[Tool | ChatCompletionToolsParam] | None = None,
+    response_format_section: str | None = None,
 ) -> Message:
     dev_msg_content = DeveloperContent.new()
+    parts: list[str] = []
     if instructions is not None and not envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS:
-        dev_msg_content = dev_msg_content.with_instructions(instructions)
+        parts.append(instructions)
+    if response_format_section is not None:
+        parts.append(response_format_section)
+    if parts:
+        dev_msg_content = dev_msg_content.with_instructions("\n\n".join(parts))
     if tools is not None:
         function_tools: list[Tool | ChatCompletionToolsParam] = []
         for tool in tools:
diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py
@@ -519,61 +519,61 @@ async def create_responses(
                     else:
                         context = SimpleContext()
 
-                    if self.parser and self.parser.reasoning_parser_cls is not None:
-                        reasoning_parser = self.parser.reasoning_parser_cls(tokenizer)
-                        struct_out = sampling_params.structured_outputs
-
-                        if isinstance(struct_out, StructuredOutputsParams):
-                            if struct_out.all_non_structural_tag_constraints_none():
-                                # No content constraint — just apply reasoning
-                                # channel tags
-                                sampling_params.structured_outputs = replace(
-                                    struct_out,
-                                    structural_tag=(
-                                        reasoning_parser.prepare_structured_tag(
-                                            struct_out.structural_tag,
-                                            self.tool_server,
-                                        )
-                                    ),
-                                )
-                            else:
-                                # Content constraint present (json, regex,
-                                # grammar, choice, json_object). Embed it in the
-                                # final channel tag within the structural tag.
-                                content_fmt = _constraint_to_content_format(struct_out)
-                                if content_fmt is not None:
-                                    structural_tag = (
-                                        reasoning_parser.prepare_structured_tag(
-                                            None,
-                                            self.tool_server,
-                                            final_content_format=content_fmt,
-                                        )
+                if self.parser and self.parser.reasoning_parser_cls is not None:
+                    reasoning_parser = self.parser.reasoning_parser_cls(tokenizer)
+                    struct_out = sampling_params.structured_outputs
+
+                    if isinstance(struct_out, StructuredOutputsParams):
+                        if struct_out.all_non_structural_tag_constraints_none():
+                            # No content constraint — just apply reasoning
+                            # channel tags
+                            sampling_params.structured_outputs = replace(
+                                struct_out,
+                                structural_tag=(
+                                    reasoning_parser.prepare_structured_tag(
+                                        struct_out.structural_tag,
+                                        self.tool_server,
                                     )
-                                    if structural_tag is not None:
-                                        # Clear content constraints, set
-                                        # structural_tag, but preserve options
-                                        # like disable_any_whitespace.
-                                        sampling_params.structured_outputs = replace(
-                                            struct_out,
-                                            json=None,
-                                            regex=None,
-                                            choice=None,
-                                            grammar=None,
-                                            json_object=None,
-                                            structural_tag=structural_tag,
-                                        )
-                        elif struct_out is None:
-                            # No structured output requested, but still need
-                            # reasoning channel tags
-                            tag = reasoning_parser.prepare_structured_tag(
-                                None, self.tool_server
+                                ),
                             )
-                            if tag is not None:
-                                sampling_params.structured_outputs = (
-                                    StructuredOutputsParams(
-                                        structural_tag=tag  # type: ignore[call-arg]
+                        else:
+                            # Content constraint present (json, regex,
+                            # grammar, choice, json_object). Embed it in the
+                            # final channel tag within the structural tag.
+                            content_fmt = _constraint_to_content_format(struct_out)
+                            if content_fmt is not None:
+                                structural_tag = (
+                                    reasoning_parser.prepare_structured_tag(
+                                        None,
+                                        self.tool_server,
+                                        final_content_format=content_fmt,
                                     )
                                 )
+                                if structural_tag is not None:
+                                    # Clear content constraints, set
+                                    # structural_tag, but preserve options
+                                    # like disable_any_whitespace.
+                                    sampling_params.structured_outputs = replace(
+                                        struct_out,
+                                        json=None,
+                                        regex=None,
+                                        choice=None,
+                                        grammar=None,
+                                        json_object=None,
+                                        structural_tag=structural_tag,
+                                    )
+                    elif struct_out is None:
+                        # No structured output requested, but still need
+                        # reasoning channel tags
+                        tag = reasoning_parser.prepare_structured_tag(
+                            None, self.tool_server
+                        )
+                        if tag is not None:
+                            sampling_params.structured_outputs = (
+                                StructuredOutputsParams(
+                                    structural_tag=tag  # type: ignore[call-arg]
+                                )
+                            )
                 generator = self._generate_with_builtin_tools(
                     request_id=request.request_id,
                     engine_prompt=engine_prompt,
@@ -1242,14 +1242,15 @@ def _construct_input_messages_with_harmony(
             needs_dev_msg = with_custom_tools or response_format_schema is not None
 
             if needs_dev_msg:
-                dev_instructions = request.instructions
+                response_format_text = None
                 if response_format_schema is not None:
-                    dev_instructions = inject_response_formats(
-                        dev_instructions, response_format_schema
+                    response_format_text = inject_response_formats(
+                        None, response_format_schema
                     )
                 dev_msg = get_developer_message(
-                    instructions=dev_instructions,
+                    instructions=request.instructions,
                     tools=request.tools if with_custom_tools else None,
+                    response_format_section=response_format_text,
                 )
                 messages.append(dev_msg)
             messages += construct_harmony_previous_input_messages(request)