25 changes: 25 additions & 0 deletions hindsight-api/hindsight_api/api/http.py
@@ -385,17 +385,31 @@ class ReflectRequest(BaseModel):
"query": "What do you think about artificial intelligence?",
"budget": "low",
"context": "This is for a research paper on AI ethics",
"max_tokens": 4096,
"include": {"facts": {}},
"response_schema": {
"type": "object",
"properties": {
"summary": {"type": "string"},
"key_points": {"type": "array", "items": {"type": "string"}},
},
"required": ["summary", "key_points"],
},
}
}
)

query: str
budget: Budget = Budget.LOW
context: str | None = None
max_tokens: int = Field(default=4096, description="Maximum tokens for the response")
include: ReflectIncludeOptions = Field(
default_factory=ReflectIncludeOptions, description="Options for including additional data (disabled by default)"
)
response_schema: dict | None = Field(
default=None,
description="Optional JSON Schema for structured output. When provided, the response will include a 'structured_output' field with the LLM response parsed according to this schema.",
)


class OpinionItem(BaseModel):
@@ -440,12 +454,20 @@ class ReflectResponse(BaseModel):
{"id": "123", "text": "AI is used in healthcare", "type": "world"},
{"id": "456", "text": "I discussed AI applications last week", "type": "experience"},
],
"structured_output": {
"summary": "AI is transformative",
"key_points": ["Used in healthcare", "Discussed recently"],
},
}
}
)

text: str
based_on: list[ReflectFact] = [] # Facts used to generate the response
structured_output: dict | None = Field(
default=None,
description="Structured output parsed according to the request's response_schema. Only present when response_schema was provided in the request.",
)


class BanksResponse(BaseModel):
Expand Down Expand Up @@ -1211,6 +1233,8 @@ async def api_reflect(
query=request.query,
budget=request.budget,
context=request.context,
max_tokens=request.max_tokens,
response_schema=request.response_schema,
request_context=request_context,
)

@@ -1233,6 +1257,7 @@ async def api_reflect(
return ReflectResponse(
text=core_result.text,
based_on=based_on_facts,
structured_output=core_result.structured_output,
)

except Exception as e:
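A minimal client-side sketch of the new request and response shape, assuming the handler above is mounted at `/reflect` on a locally running instance; the base URL, port, and path are illustrative, not taken from this diff:

```python
# Hypothetical client call; base URL, port, and the /reflect path are assumptions.
import requests

payload = {
    "query": "What do you think about artificial intelligence?",
    "budget": "low",
    "max_tokens": 4096,
    "response_schema": {
        "type": "object",
        "properties": {
            "summary": {"type": "string"},
            "key_points": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["summary", "key_points"],
    },
}

resp = requests.post("http://localhost:8000/reflect", json=payload, timeout=60)
resp.raise_for_status()
body = resp.json()

# structured_output is only populated when a response_schema was sent.
print(body["structured_output"]["summary"])
print(body["text"])  # empty string in the structured case, per the engine change
```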
4 changes: 4 additions & 0 deletions hindsight-api/hindsight_api/engine/interface.py
@@ -110,6 +110,8 @@ async def reflect_async(
*,
budget: "Budget | None" = None,
context: str | None = None,
max_tokens: int = 4096,
response_schema: dict | None = None,
request_context: "RequestContext",
) -> "ReflectResult":
"""
@@ -120,6 +122,8 @@
query: The question to reflect on.
budget: Search budget for retrieving context.
context: Additional context for the reflection.
max_tokens: Maximum tokens for the response.
response_schema: Optional JSON Schema for structured output.
request_context: Request context for authentication.

Returns:
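For callers of the engine interface, the new keyword-only arguments slot in as sketched below; how `engine`, the bank identifier, and the request context are obtained is assumed to come from existing application wiring rather than this diff:

```python
# Sketch of a caller using the extended interface; `engine`, `bank_id`, and
# `request_context` are assumed to be provided by existing wiring.
async def reflect_with_schema(engine, bank_id: str, request_context):
    result = await engine.reflect_async(
        bank_id,
        "Summarize recent AI discussions",
        budget=None,                      # let the engine pick its default
        max_tokens=2048,                  # new parameter from this PR
        response_schema={
            "type": "object",
            "properties": {"summary": {"type": "string"}},
            "required": ["summary"],
        },
        request_context=request_context,
    )
    # structured_output is set only when a response_schema was passed.
    return result.structured_output or result.text
```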
36 changes: 27 additions & 9 deletions hindsight-api/hindsight_api/engine/llm_wrapper.py
@@ -135,6 +135,7 @@ async def call(
initial_backoff: float = 1.0,
max_backoff: float = 60.0,
skip_validation: bool = False,
strict_schema: bool = False,
) -> Any:
"""
Make an LLM API call with retry logic.
@@ -149,6 +150,7 @@
initial_backoff: Initial backoff time in seconds.
max_backoff: Maximum backoff time in seconds.
skip_validation: Return raw JSON without Pydantic validation.
strict_schema: Use strict JSON schema enforcement (OpenAI only). Guarantees all required fields.

Returns:
Parsed response if response_format is provided, otherwise text content.
@@ -226,19 +228,35 @@ async def call(
for attempt in range(max_retries + 1):
try:
if response_format is not None:
# Add schema to system message for JSON mode
schema = None
if hasattr(response_format, "model_json_schema"):
schema = response_format.model_json_schema()
schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"

if call_params["messages"] and call_params["messages"][0].get("role") == "system":
call_params["messages"][0]["content"] += schema_msg
elif call_params["messages"]:
call_params["messages"][0]["content"] = (
schema_msg + "\n\n" + call_params["messages"][0]["content"]
)
if strict_schema and schema is not None:
# Use OpenAI's strict JSON schema enforcement
# This guarantees all required fields are returned
call_params["response_format"] = {
"type": "json_schema",
"json_schema": {
"name": "response",
"strict": True,
"schema": schema,
},
}
else:
# Soft enforcement: add schema to prompt and use json_object mode
if schema is not None:
schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"

if call_params["messages"] and call_params["messages"][0].get("role") == "system":
call_params["messages"][0]["content"] += schema_msg
elif call_params["messages"]:
call_params["messages"][0]["content"] = (
schema_msg + "\n\n" + call_params["messages"][0]["content"]
)

call_params["response_format"] = {"type": "json_object"}

call_params["response_format"] = {"type": "json_object"}
response = await self._client.chat.completions.create(**call_params)

content = response.choices[0].message.content
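A hypothetical caller of the wrapper showing the new `strict_schema` flag alongside an ordinary Pydantic `response_format`; the `Answer` model and the `llm_config` object are illustrative assumptions, not part of this PR:

```python
# Hypothetical caller of call() using the new flag; the Answer model and the
# llm_config object are illustrative, not part of this PR.
from pydantic import BaseModel


class Answer(BaseModel):
    summary: str
    confidence: float


async def ask_structured(llm_config) -> dict:
    # strict_schema=True switches response_format to OpenAI's json_schema mode,
    # so fields marked required in Answer's schema are guaranteed to come back.
    return await llm_config.call(
        messages=[{"role": "user", "content": "Summarize the findings."}],
        scope="memory_reflect",
        response_format=Answer,
        skip_validation=True,   # return the parsed dict rather than an Answer instance
        strict_schema=True,
    )
```

With `strict_schema=False` the wrapper falls back to the prompt-embedded schema plus `json_object` mode, so the same call still works against providers without structured-output support, just without the required-field guarantee.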
43 changes: 36 additions & 7 deletions hindsight-api/hindsight_api/engine/memory_engine.py
@@ -3076,6 +3076,8 @@ async def reflect_async(
*,
budget: Budget | None = None,
context: str | None = None,
max_tokens: int = 4096,
response_schema: dict | None = None,
request_context: "RequestContext",
) -> ReflectResult:
"""
Expand All @@ -3087,19 +3089,22 @@ async def reflect_async(
3. Retrieves existing opinions (bank's formed perspectives)
4. Uses LLM to formulate an answer
5. Extracts and stores any new opinions formed during reflection
6. Returns plain text answer and the facts used
6. Optionally generates structured output based on response_schema
7. Returns plain text answer and the facts used

Args:
bank_id: bank identifier
query: Question to answer
budget: Budget level for memory exploration (low=100, mid=300, high=600 units)
context: Additional context string to include in LLM prompt (not used in recall)
response_schema: Optional JSON Schema for structured output

Returns:
ReflectResult containing:
- text: Plain text answer (no markdown)
- based_on: Dict with 'world', 'experience', and 'opinion' fact lists (MemoryFact objects)
- new_opinions: List of newly formed opinions
- structured_output: Optional dict if response_schema was provided
"""
# Use cached LLM config
if self._llm_config is None:
@@ -3177,17 +3182,40 @@
log_buffer.append(f"[REFLECT {reflect_id}] Prompt: {len(prompt)} chars")

system_message = think_utils.get_system_message(disposition)
messages = [{"role": "system", "content": system_message}, {"role": "user", "content": prompt}]

# Prepare response_format if schema provided
response_format = None
if response_schema is not None:
# Wrapper class to provide Pydantic-like interface for raw JSON schemas
class JsonSchemaWrapper:
def __init__(self, schema: dict):
self._schema = schema

def model_json_schema(self):
return self._schema

response_format = JsonSchemaWrapper(response_schema)

llm_start = time.time()
answer_text = await self._llm_config.call(
messages=[{"role": "system", "content": system_message}, {"role": "user", "content": prompt}],
scope="memory_think",
temperature=0.9,
max_completion_tokens=1000,
result = await self._llm_config.call(
messages=messages,
scope="memory_reflect",
max_completion_tokens=max_tokens,
response_format=response_format,
skip_validation=True if response_format else False,
strict_schema=True if response_format else False,
)
llm_time = time.time() - llm_start

answer_text = answer_text.strip()
# Handle response based on whether structured output was requested
if response_schema is not None:
structured_output = result
answer_text = "" # Empty for backward compatibility
log_buffer.append(f"[REFLECT {reflect_id}] Structured output generated")
else:
structured_output = None
answer_text = result.strip()

# Submit form_opinion task for background processing
await self._task_backend.submit_task(
@@ -3205,6 +3233,7 @@
text=answer_text,
based_on={"world": world_results, "experience": agent_results, "opinion": opinion_results},
new_opinions=[], # Opinions are being extracted asynchronously
structured_output=structured_output,
)

# Call post-operation hook if validator is configured
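The `JsonSchemaWrapper` adapter above relies on a small duck-typing contract; here is a standalone sketch of that contract (names mirror the diff, but nothing is imported from it):

```python
# Standalone sketch of the duck-typing contract: llm_wrapper only needs a
# model_json_schema() hook, so a raw JSON Schema dict can be adapted without
# building a dynamic Pydantic model.
class JsonSchemaWrapper:
    def __init__(self, schema: dict):
        self._schema = schema

    def model_json_schema(self) -> dict:
        return self._schema


raw_schema = {"type": "object", "properties": {"summary": {"type": "string"}}}
wrapped = JsonSchemaWrapper(raw_schema)

# Same hook a Pydantic model exposes, so call(response_format=wrapped, ...) works.
assert hasattr(wrapped, "model_json_schema")
assert wrapped.model_json_schema() == raw_schema
```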
8 changes: 7 additions & 1 deletion hindsight-api/hindsight_api/engine/response_models.py
@@ -123,7 +123,8 @@ class ReflectResult(BaseModel):
Result from a reflect operation.

Contains the formulated answer, the facts it was based on (organized by type),
and any new opinions that were formed during the reflection process.
any new opinions that were formed during the reflection process, and optionally
structured output if a response schema was provided.
"""

model_config = ConfigDict(
@@ -145,6 +146,7 @@ class ReflectResult(BaseModel):
"opinion": [],
},
"new_opinions": ["Machine learning has great potential in healthcare"],
"structured_output": {"summary": "ML in healthcare", "confidence": 0.9},
}
}
)
@@ -154,6 +156,10 @@
description="Facts used to formulate the answer, organized by type (world, experience, opinion)"
)
new_opinions: list[str] = Field(default_factory=list, description="List of newly formed opinions during reflection")
structured_output: dict[str, Any] | None = Field(
default=None,
description="Structured output parsed according to the provided response schema. Only present when response_schema was provided.",
)


class Opinion(BaseModel):
Expand Down