diff --git a/dspy/adapters/json_adapter.py b/dspy/adapters/json_adapter.py
index ae534d778f..302b4fd2f1 100644
--- a/dspy/adapters/json_adapter.py
+++ b/dspy/adapters/json_adapter.py
@@ -5,12 +5,12 @@
 import json_repair
 import litellm
 import pydantic
-import regex
 from pydantic.fields import FieldInfo
 
 from dspy.adapters.chat_adapter import ChatAdapter, FieldInfoWithName
 from dspy.adapters.types.tool import ToolCalls
 from dspy.adapters.utils import (
+    _extract_first_json_object,
     format_field_value,
     get_annotation_name,
     parse_value,
@@ -151,10 +151,10 @@ def format_assistant_message_content(
         return self.format_field_with_value(fields_with_values, role="assistant")
 
     def parse(self, signature: type[Signature], completion: str) -> dict[str, Any]:
-        pattern = r"\{(?:[^{}]|(?R))*\}"
-        match = regex.search(pattern, completion, regex.DOTALL)
-        if match:
-            completion = match.group(0)
+        # Drop prose around the JSON object; if none is found, json_repair gets the raw text.
+        extracted_object = _extract_first_json_object(completion)
+        if extracted_object is not None:
+            completion = extracted_object
         fields = json_repair.loads(completion)
 
         if not isinstance(fields, dict):
diff --git a/dspy/adapters/utils.py b/dspy/adapters/utils.py
index f38a77ee8a..63740933e9 100644
--- a/dspy/adapters/utils.py
+++ b/dspy/adapters/utils.py
@@ -280,3 +280,52 @@ def _quoted_string_for_literal_type_annotation(s: str) -> str:
     else:
         # Neither => enclose in single quotes
         return f"'{s}'"
+
+
+def _extract_first_json_object(text: str) -> str | None:
+    """Return the first balanced JSON object found in ``text``, or None.
+
+    Scanning starts at the first ``{``; if the object opened there is never
+    closed, None is returned and no later ``{`` is tried. Braces are counted
+    only outside double-quoted strings, so a brace inside a string value
+    (for example ``"if (user) {"``) does not affect nesting. A backslash
+    inside a string escapes the following character.
+
+    Args:
+        text: Arbitrary text that may embed a JSON object.
+
+    Returns:
+        The substring from the first ``{`` through its matching ``}``, or
+        None when no balanced object is found.
+    """
+    start_idx: int | None = None
+    depth = 0
+    in_string = False
+    escape = False
+
+    for idx, char in enumerate(text):
+        if start_idx is None:
+            # Quotes and stray "}" ahead of the first "{" are ordinary prose.
+            if char == "{":
+                start_idx = idx
+                depth = 1
+            continue
+
+        if in_string:
+            if escape:
+                # An escaped character can never terminate the string.
+                escape = False
+            elif char == "\\":
+                escape = True
+            elif char == '"':
+                in_string = False
+        elif char == '"':
+            in_string = True
+        elif char == "{":
+            depth += 1
+        elif char == "}":
+            depth -= 1
+            if depth == 0:
+                return text[start_idx : idx + 1]
+
+    return None
diff --git a/tests/adapters/test_adapter_utils.py b/tests/adapters/test_adapter_utils.py
index d39ddcf8cb..cefcf65070 100644
--- a/tests/adapters/test_adapter_utils.py
+++ b/tests/adapters/test_adapter_utils.py
@@ -5,7 +5,7 @@
 import pytest
 from pydantic import BaseModel
 
-from dspy.adapters.utils import parse_value
+from dspy.adapters.utils import _extract_first_json_object, parse_value
 
 
 class Profile(BaseModel):
@@ -105,3 +105,47 @@ def test_parse_value_json_repair():
     malformed = "not json or literal"
     with pytest.raises(Exception):
         parse_value(malformed, dict)
+
+
+@pytest.mark.parametrize(
+    "text,expected",
+    [
+        # JSON at the start of text
+        ('{"name": "John", "age": 30} and some trailing text', '{"name": "John", "age": 30}'),
+        # JSON in the middle of text
+        ('Here is your result: {"status": "success", "data": [1, 2, 3]} done', '{"status": "success", "data": [1, 2, 3]}'),
+        # JSON at the end of text
+        ('The answer is {"result": 42}', '{"result": 42}'),
+        # Nested JSON objects
+        ('Response: {"outer": {"inner": {"deep": "value"}}, "count": 5}', '{"outer": {"inner": {"deep": "value"}}, "count": 5}'),
+        # JSON with braces inside string values
+        ('{"message": "Use {placeholders} like {this}", "valid": true}', '{"message": "Use {placeholders} like {this}", "valid": true}'),
+        # JSON with escaped quotes in strings
+        ('{"quote": "She said \\"hello\\" to me"}', '{"quote": "She said \\"hello\\" to me"}'),
+        # No JSON present
+        ("This is just plain text without any JSON", None),
+        # Empty JSON object
+        ("Here is an empty object: {}", "{}"),
+        # Unbalanced braces (no valid JSON)
+        ("This has { an opening but no closing", None),
+        # Multiple JSON objects - should extract only the first
+        ('{"first": 1} and then {"second": 2}', '{"first": 1}'),
+        # Stray closing brace before the object is skipped
+        ('} then {"a": 1}', '{"a": 1}'),
+        # First object is never closed - no later brace is tried as a start
+        ('Truncated: {"a": {"b": 1}', None),
+        # JSON with newlines
+        ("""Here is the result:
+        {
+            "name": "Alice",
+            "scores": [95, 87, 92]
+        }
+        End of message""", """{
+            "name": "Alice",
+            "scores": [95, 87, 92]
+        }"""),
+    ],
+)
+def test_extract_first_json_object(text, expected):
+    result = _extract_first_json_object(text)
+    assert result == expected
diff --git a/tests/adapters/test_json_adapter.py b/tests/adapters/test_json_adapter.py
index 2acd4e63c3..46214071ea 100644
--- a/tests/adapters/test_json_adapter.py
+++ b/tests/adapters/test_json_adapter.py
@@ -239,6 +239,41 @@ def test_json_adapter_parse_raise_error_on_mismatch_fields():
     )
 
 
+def test_json_adapter_parse_handles_braces_inside_string_values():
+    class CodeIssue(pydantic.BaseModel):
+        issue_type: str
+        severity_level: str
+        problem_code_snippet: str
+
+    class CodeReview(dspy.Signature):
+        reasoning: str = dspy.OutputField(desc="Short chain-of-thought analysis")
+        issue_list: list[CodeIssue] = dspy.OutputField(desc="Detected issues")
+
+    adapter = dspy.JSONAdapter()
+    completion = (
+        "Here is the review output you asked for:\n\n"
+        "{\n"
+        ' "reasoning": "Inspecting the conditional reveals an unmatched brace.",\n'
+        ' "issue_list": [\n'
+        " {\n"
+        ' "issue_type": "style",\n'
+        ' "severity_level": "fatal",\n'
+        ' "problem_code_snippet": "if (user) {"\n'
+        " }\n"
+        " ]\n"
+        "}\n"
+    )
+
+    result = adapter.parse(CodeReview, completion)
+
+    assert result["reasoning"] == "Inspecting the conditional reveals an unmatched brace."
+    assert len(result["issue_list"]) == 1
+    issue = result["issue_list"][0]
+    assert issue.issue_type == "style"
+    assert issue.severity_level == "fatal"
+    assert issue.problem_code_snippet == "if (user) {"
+
+
 def test_json_adapter_formats_image():
     # Test basic image formatting
     image = dspy.Image(url="https://example.com/image.jpg")