From 2c6132ba563ae81a471dfed1799a5aa31d2e2192 Mon Sep 17 00:00:00 2001 From: JRussas <159085336+JMRussas@users.noreply.github.com> Date: Fri, 6 Mar 2026 22:12:23 -0500 Subject: [PATCH 1/2] Switch plan generation from JSON to XML format Claude now produces XML plans which are stored as source of truth in the new plan_xml column. The decomposer and routes prefer plan_xml with automatic fallback to plan_json for backward compatibility. - Add xml_utils.py with extract_xml_plan() and parse_plan_xml() - Rewrite planner prompt suffixes (L1/L2/L3/C#) to request XML output - Add plan_xml TEXT column (migration 015, inline schema, models_metadata) - Update decomposer to parse plan_xml when available - Update routes to prefer plan_xml in read/clone/export paths - Add plan_xml field to PlanOut schema - 18 new XML parsing tests, updated planner test mocks to return XML Co-Authored-By: Claude Opus 4.6 --- backend/db/connection.py | 1 + backend/db/models_metadata.py | 1 + .../migrations/versions/015_add_plan_xml.py | 32 ++ backend/models/schemas.py | 3 +- backend/routes/projects.py | 23 +- backend/services/decomposer.py | 12 +- backend/services/planner.py | 315 +++++++++-------- backend/utils/xml_utils.py | 161 +++++++++ tests/unit/test_csharp_planner.py | 21 +- tests/unit/test_planner_service.py | 22 +- tests/unit/test_planning_rigor.py | 49 ++- tests/unit/test_xml_utils.py | 324 ++++++++++++++++++ 12 files changed, 781 insertions(+), 183 deletions(-) create mode 100644 backend/migrations/versions/015_add_plan_xml.py create mode 100644 backend/utils/xml_utils.py create mode 100644 tests/unit/test_xml_utils.py diff --git a/backend/db/connection.py b/backend/db/connection.py index a42d39b..b429753 100644 --- a/backend/db/connection.py +++ b/backend/db/connection.py @@ -59,6 +59,7 @@ completion_tokens INTEGER NOT NULL DEFAULT 0, cost_usd REAL NOT NULL DEFAULT 0.0, plan_json TEXT NOT NULL, + plan_xml TEXT, status TEXT NOT NULL DEFAULT 'draft', created_at REAL NOT NULL ); diff --git a/backend/db/models_metadata.py b/backend/db/models_metadata.py index 594e487..219973d 100644 --- a/backend/db/models_metadata.py +++ b/backend/db/models_metadata.py @@ -63,6 +63,7 @@ Column("completion_tokens", Integer, nullable=False, server_default="0"), Column("cost_usd", Float, nullable=False, server_default="0.0"), Column("plan_json", Text, nullable=False), + Column("plan_xml", Text, nullable=True), Column("status", Text, nullable=False, server_default="draft"), Column("created_at", Float, nullable=False), ) diff --git a/backend/migrations/versions/015_add_plan_xml.py b/backend/migrations/versions/015_add_plan_xml.py new file mode 100644 index 0000000..da03d08 --- /dev/null +++ b/backend/migrations/versions/015_add_plan_xml.py @@ -0,0 +1,32 @@ +# Orchestration Engine - Migration 015 +# +# Add plan_xml column to plans table for XML plan storage. +# Existing plans remain in plan_json; new plans write both columns. +# +# Depends on: 014_add_api_keys_and_claim_tracking +# Used by: services/planner.py, services/decomposer.py + +"""Add plan_xml column to plans table. + +Revision ID: 015 +Revises: 014 +Create Date: 2026-03-06 +""" + +from alembic import op +import sqlalchemy as sa + +revision = "015" +down_revision = "014" +branch_labels = None +depends_on = None + + +def upgrade(): + with op.batch_alter_table("plans") as batch_op: + batch_op.add_column(sa.Column("plan_xml", sa.Text(), nullable=True)) + + +def downgrade(): + with op.batch_alter_table("plans") as batch_op: + batch_op.drop_column("plan_xml") diff --git a/backend/models/schemas.py b/backend/models/schemas.py index 3a7b6c8..6e89db7 100644 --- a/backend/models/schemas.py +++ b/backend/models/schemas.py @@ -95,7 +95,8 @@ class PlanOut(BaseModel): prompt_tokens: int completion_tokens: int cost_usd: float - plan: dict # The structured plan JSON + plan: dict # The structured plan data (parsed from XML or JSON) + plan_xml: str | None = None # Raw XML plan (if available) status: PlanStatus created_at: float diff --git a/backend/routes/projects.py b/backend/routes/projects.py index 0eef113..575a8d3 100644 --- a/backend/routes/projects.py +++ b/backend/routes/projects.py @@ -35,6 +35,15 @@ # Helpers # --------------------------------------------------------------------------- +def _parse_plan_from_row(row) -> dict: + """Parse plan data from a DB row, preferring plan_xml over plan_json.""" + plan_xml_raw = row["plan_xml"] + if plan_xml_raw: + from backend.utils.xml_utils import parse_plan_xml + return parse_plan_xml(plan_xml_raw) + return json.loads(row["plan_json"]) + + async def _row_to_project( row, db: Database, include_task_summary: bool = False, @@ -298,7 +307,8 @@ async def list_plans( prompt_tokens=r["prompt_tokens"], completion_tokens=r["completion_tokens"], cost_usd=r["cost_usd"], - plan=json.loads(r["plan_json"]), + plan=_parse_plan_from_row(r), + plan_xml=r["plan_xml"], status=r["status"], created_at=r["created_at"], ) @@ -438,9 +448,10 @@ async def clone_project( new_plan_id = uuid.uuid4().hex[:12] await db.execute_write( "INSERT INTO plans (id, project_id, version, model_used, prompt_tokens, " - "completion_tokens, cost_usd, plan_json, status, created_at) " - "VALUES (?, ?, 1, ?, 0, 0, 0.0, ?, 'draft', ?)", - (new_plan_id, new_project_id, plan_row["model_used"], plan_row["plan_json"], now), + "completion_tokens, cost_usd, plan_json, plan_xml, status, created_at) " + "VALUES (?, ?, 1, ?, 0, 0, 0.0, ?, ?, 'draft', ?)", + (new_plan_id, new_project_id, plan_row["model_used"], + plan_row["plan_json"], plan_row["plan_xml"], now), ) # 3. Clone tasks (reset status, clear output/cost/retry) @@ -512,8 +523,8 @@ async def export_project( { "id": p["id"], "version": p["version"], "model_used": p["model_used"], "prompt_tokens": p["prompt_tokens"], "completion_tokens": p["completion_tokens"], - "cost_usd": p["cost_usd"], "plan": json.loads(p["plan_json"]), - "status": p["status"], "created_at": p["created_at"], + "cost_usd": p["cost_usd"], "plan": _parse_plan_from_row(p), + "plan_xml": p["plan_xml"], "status": p["status"], "created_at": p["created_at"], } for p in plan_rows ] diff --git a/backend/services/decomposer.py b/backend/services/decomposer.py index 830b47d..29e0642 100644 --- a/backend/services/decomposer.py +++ b/backend/services/decomposer.py @@ -1,8 +1,8 @@ # Orchestration Engine - Plan Decomposer # -# Converts an approved plan JSON into task rows with dependency edges. +# Converts an approved plan (XML or JSON) into task rows with dependency edges. # -# Depends on: backend/config.py, services/model_router.py +# Depends on: backend/config.py, services/model_router.py, utils/xml_utils.py # Used by: routes/projects.py, container.py import json @@ -66,7 +66,13 @@ async def decompose(self, project_id: str, plan_id: str) -> dict: if plan_row["project_id"] != project_id: raise NotFoundError(f"Plan {plan_id} does not belong to project {project_id}") - plan_data = json.loads(plan_row["plan_json"]) + # Prefer XML plan (source of truth) with JSON fallback + plan_xml_raw = plan_row["plan_xml"] + if plan_xml_raw: + from backend.utils.xml_utils import parse_plan_xml + plan_data = parse_plan_xml(plan_xml_raw) + else: + plan_data = json.loads(plan_row["plan_json"]) tasks_data, phase_names = _flatten_plan_tasks(plan_data) if not tasks_data: diff --git a/backend/services/planner.py b/backend/services/planner.py index b436d0f..b962db1 100644 --- a/backend/services/planner.py +++ b/backend/services/planner.py @@ -17,6 +17,7 @@ from backend.models.enums import PlanningRigor, PlanStatus, ProjectStatus from backend.services.model_router import calculate_cost from backend.utils.json_utils import extract_json_object, parse_requirements +from backend.utils.xml_utils import extract_xml_plan, parse_plan_xml logger = logging.getLogger("orchestration.planner") @@ -71,110 +72,128 @@ """ -_TASK_SCHEMA = """{ - "title": "Short task title", - "description": "Detailed description...", - "task_type": "code|research|analysis|asset|integration|documentation", - "complexity": "simple|medium|complex", - "depends_on": [], - "tools_needed": ["search_knowledge", "lookup_type", "local_llm", "generate_image", "read_file", "write_file"], - "requirement_ids": ["R1", "R3"], - "verification_criteria": "How to verify this task was completed correctly", - "affected_files": ["src/auth.ts", "db/schema.sql"] - }""" - -_RIGOR_SUFFIX_L1 = f"""Produce a JSON plan with this exact structure: -{{ - "summary": "Brief summary of what will be built", - "tasks": [ - {_TASK_SCHEMA} - ] -}} - +_TASK_SCHEMA_XML = """ + Short task title + Detailed description of what this task does + code + medium + + search_knowledge,read_file,write_file + R1,R3 + How to verify this task was completed correctly + src/auth.ts,db/schema.sql + """ + +_RIGOR_SUFFIX_L1 = f"""Produce an XML plan with this exact structure: + + + Brief summary of what will be built + +{_TASK_SCHEMA_XML} + + + +Task field notes: +- task_type: code|research|analysis|asset|integration|documentation +- complexity: simple|medium|complex +- depends_on: comma-separated 0-based task indices (empty if no dependencies) +- tools_needed: comma-separated from: search_knowledge, lookup_type, local_llm, generate_image, read_file, write_file +- requirement_ids: comma-separated (e.g. R1,R3) +- affected_files: comma-separated file paths - Aim for 3-15 tasks. Too few means tasks are too large; too many means overhead. - -Respond with ONLY the JSON plan, no markdown fences or explanation.""" - -_RIGOR_SUFFIX_L2 = f"""Produce a JSON plan organized into phases. Each phase groups related tasks into a logical stage of work. - -{{ - "summary": "Brief summary of what will be built", - "phases": [ - {{ - "name": "Phase name (e.g. 'Foundation', 'Core Logic', 'Integration')", - "description": "What this phase accomplishes and why it comes at this point", - "tasks": [ - {_TASK_SCHEMA} - ] - }} - ], - "open_questions": [ - {{ - "question": "An ambiguity or decision in the requirements", - "proposed_answer": "How you propose to handle it", - "impact": "What changes if the answer differs" - }} - ] -}} +- Use XML entities for special characters in descriptions: < > & + +Respond with ONLY the XML plan, no markdown fences or explanation.""" + +_RIGOR_SUFFIX_L2 = f"""Produce an XML plan organized into phases. Each phase groups related tasks into a logical stage of work. + + + Brief summary of what will be built + + + What this phase accomplishes and why it comes at this point +{_TASK_SCHEMA_XML} + + + + + An ambiguity or decision in the requirements + How you propose to handle it + What changes if the answer differs + + + + +Task field notes: +- task_type: code|research|analysis|asset|integration|documentation +- complexity: simple|medium|complex +- depends_on: comma-separated 0-based task indices, GLOBAL across all phases (empty if none) +- tools_needed: comma-separated from: search_knowledge, lookup_type, local_llm, generate_image, read_file, write_file +- requirement_ids: comma-separated (e.g. R1,R3) +- affected_files: comma-separated file paths +- Use XML entities for special characters in descriptions: < > & Phase guidelines: - Group related tasks into 2-5 phases that represent logical stages of work. - Name phases clearly: "Research & Discovery", "Core Implementation", "Integration & Testing", etc. - Earlier phases should have no dependencies on later phases. -- depends_on indices are GLOBAL across all phases (0-based from the first task in the first phase). - Aim for 3-15 total tasks across all phases. Open questions: - Surface 1-5 ambiguities, assumptions, or decisions that could affect the plan. -- Each must include a proposed_answer so the user can approve or override quickly. - -Respond with ONLY the JSON plan, no markdown fences or explanation.""" - -_RIGOR_SUFFIX_L3 = f"""Produce a thorough JSON plan organized into phases with risk analysis and test strategy. - -{{ - "summary": "Brief summary of what will be built", - "phases": [ - {{ - "name": "Phase name (e.g. 'Foundation', 'Core Logic', 'Integration')", - "description": "What this phase accomplishes and why it comes at this point", - "tasks": [ - {_TASK_SCHEMA} - ] - }} - ], - "open_questions": [ - {{ - "question": "An ambiguity or decision in the requirements", - "proposed_answer": "How you propose to handle it", - "impact": "What changes if the answer differs" - }} - ], - "risk_assessment": [ - {{ - "risk": "Description of a technical or schedule risk", - "likelihood": "low|medium|high", - "impact": "low|medium|high", - "mitigation": "How to reduce or handle this risk" - }} - ], - "test_strategy": {{ - "approach": "Overall testing approach description", - "test_tasks": ["Task titles that represent test/verification work"], - "coverage_notes": "What areas need testing and how" - }} -}} +- Each must include a proposed answer so the user can approve or override quickly. + +Respond with ONLY the XML plan, no markdown fences or explanation.""" + +_RIGOR_SUFFIX_L3 = f"""Produce a thorough XML plan organized into phases with risk analysis and test strategy. + + + Brief summary of what will be built + + + What this phase accomplishes +{_TASK_SCHEMA_XML} + + + + + An ambiguity or decision in the requirements + How you propose to handle it + What changes if the answer differs + + + + + Description of a technical or schedule risk + medium + high + How to reduce or handle this risk + + + + Overall testing approach description + Task title 1,Task title 2 + What areas need testing and how + + + +Task field notes: +- task_type: code|research|analysis|asset|integration|documentation +- complexity: simple|medium|complex +- depends_on: comma-separated 0-based task indices, GLOBAL across all phases (empty if none) +- tools_needed: comma-separated from: search_knowledge, lookup_type, local_llm, generate_image, read_file, write_file +- requirement_ids: comma-separated (e.g. R1,R3) +- affected_files: comma-separated file paths +- Use XML entities for special characters in descriptions: < > & Phase guidelines: - Group related tasks into 2-5 phases that represent logical stages of work. - Name phases clearly: "Research & Discovery", "Core Implementation", "Integration & Testing", etc. - Earlier phases should have no dependencies on later phases. -- depends_on indices are GLOBAL across all phases (0-based from the first task in the first phase). - Aim for 5-15 total tasks across all phases. Open questions: - Surface 1-5 ambiguities, assumptions, or decisions that could affect the plan. -- Each must include a proposed_answer so the user can approve or override quickly. Risk assessment: - Identify 2-5 technical, integration, or scope risks. @@ -185,7 +204,7 @@ - Reference specific tasks that perform testing/verification. - Note coverage gaps the user should be aware of. -You may optionally begin your response with a block to reason through dependencies, risks, and trade-offs before producing the plan. After your reasoning (if any), output the JSON plan with no markdown fences.""" +You may optionally begin your response with a block to reason through the plan before producing it. After your reasoning (if any), output the XML plan.""" _RIGOR_SUFFIXES = { PlanningRigor.L1: _RIGOR_SUFFIX_L1, @@ -228,57 +247,63 @@ def _build_system_prompt(rigor: PlanningRigor) -> str: """ -_CSHARP_TASK_SCHEMA = """{ - "title": "ClassName.MethodName", - "description": "What this method does, including behavioral contract and edge cases", - "task_type": "csharp_method", - "complexity": "simple|medium|complex", - "depends_on": [], - "target_class": "Namespace.ClassName", - "target_signature": "public async Task MethodName(ParamType param)", - "available_methods": ["signatures of other methods in the same class or injected services"], - "constructor_params": ["IDbContext db", "ILogger logger"], - "requirement_ids": ["R1"], - "verification_criteria": "How to verify this method works correctly", - "affected_files": ["src/Services/MyService.cs"] - }""" - -_CSHARP_RIGOR_SUFFIX = f"""Produce a JSON plan organized into phases. Each phase corresponds to one class being modified or created. - -{{ - "summary": "Brief summary of the feature being implemented", - "phases": [ - {{ - "name": "ClassName (e.g. 'UserService', 'OrderValidator')", - "description": "What this class does and why these methods are needed", - "tasks": [ - {_CSHARP_TASK_SCHEMA} - ] - }} - ], - "open_questions": [ - {{ - "question": "An ambiguity or decision in the requirements", - "proposed_answer": "How you propose to handle it", - "impact": "What changes if the answer differs" - }} - ], - "assembly_config": {{ - "new_files": ["Paths to new .cs files that need to be created"], - "modified_files": ["Paths to existing .cs files that will be modified"] - }} -}} +_CSHARP_TASK_SCHEMA_XML = """ + ClassName.MethodName + What this method does, including behavioral contract and edge cases + csharp_method + medium + + Namespace.ClassName + public async Task<bool> MethodName(ParamType param) + signatures of other methods in the same class + IDbContext db,ILogger logger + R1 + How to verify this method works correctly + src/Services/MyService.cs + """ + +_CSHARP_RIGOR_SUFFIX = f"""Produce an XML plan organized into phases. Each phase corresponds to one class being modified or created. + + + Brief summary of the feature being implemented + + + What this class does and why these methods are needed +{_CSHARP_TASK_SCHEMA_XML} + + + + + An ambiguity or decision in the requirements + How you propose to handle it + What changes if the answer differs + + + + path/to/NewFile.cs + path/to/ExistingFile.cs + + + +Task field notes: +- task_type is always csharp_method for method-level tasks +- complexity: simple|medium|complex +- depends_on: comma-separated 0-based task indices, GLOBAL across all phases +- target_class: full namespace-qualified class name +- target_signature: exact method signature (use < > for generics) +- available_methods: comma-separated signatures of other methods in the class +- constructor_params: comma-separated injected dependencies +- Use XML entities for special characters: < > & Phase guidelines: - One phase per class. Phase name = class name. - Within a phase, order tasks so independent methods come first. -- depends_on indices are GLOBAL across all phases (0-based from the first task in the first phase). - After all method tasks in a phase, the system will auto-create an assembly task to stitch and build. Open questions: - Surface 1-5 ambiguities about the requirements or existing code structure. -Respond with ONLY the JSON plan, no markdown fences or explanation.""" +Respond with ONLY the XML plan, no markdown fences or explanation.""" def _build_csharp_system_prompt(type_map: str) -> str: @@ -421,16 +446,23 @@ async def generate( completion_tokens = response.usage.output_tokens cost = calculate_cost(PLANNING_MODEL, prompt_tokens, completion_tokens) - # Parse the plan JSON - try: - plan_data = json.loads(response_text) - except json.JSONDecodeError: - # Try to extract JSON from the response (in case of markdown fences). - # Use a balanced-brace approach to find the outermost JSON object, - # instead of a greedy regex that could match too much. - plan_data = extract_json_object(response_text) - if plan_data is None: - raise PlanParseError("Failed to parse plan JSON from Claude response") + # Parse the plan (XML primary, JSON fallback) + plan_xml_str = extract_xml_plan(response_text) + if plan_xml_str: + try: + plan_data = parse_plan_xml(plan_xml_str) + except Exception as xml_err: + logger.warning("XML plan parse failed, trying JSON fallback: %s", xml_err) + plan_xml_str = None # Clear so we don't store bad XML + + if not plan_xml_str: + # Fallback: try JSON (backward compat or if Claude ignored XML instruction) + try: + plan_data = json.loads(response_text) + except json.JSONDecodeError: + plan_data = extract_json_object(response_text) + if plan_data is None: + raise PlanParseError("Failed to parse plan from Claude response") except Exception: # Record actual API spend even if parsing failed — prevents budget leak @@ -476,10 +508,11 @@ async def generate( now = time.time() await db.execute_write( "INSERT INTO plans (id, project_id, version, model_used, prompt_tokens, " - "completion_tokens, cost_usd, plan_json, status, created_at) " - "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + "completion_tokens, cost_usd, plan_json, plan_xml, status, created_at) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (plan_id, project_id, version, PLANNING_MODEL, prompt_tokens, - completion_tokens, cost, json.dumps(plan_data), PlanStatus.DRAFT, now), + completion_tokens, cost, json.dumps(plan_data), plan_xml_str, + PlanStatus.DRAFT, now), ) # Record spending and release reservation diff --git a/backend/utils/xml_utils.py b/backend/utils/xml_utils.py new file mode 100644 index 0000000..3061516 --- /dev/null +++ b/backend/utils/xml_utils.py @@ -0,0 +1,161 @@ +# Orchestration Engine - XML Plan Utilities +# +# Extraction and parsing of XML plans from LLM output. +# Converts XML plan format to dicts matching the existing PlanData shape +# so downstream code (decomposer, routes, frontend) stays unchanged. +# +# Depends on: (none — stdlib only) +# Used by: services/planner.py, services/decomposer.py + +import re +import xml.etree.ElementTree as ET + + +def extract_xml_plan(text: str) -> str | None: + """Extract the ... block from LLM response text. + + Handles markdown fences, preamble text, and blocks. + Returns the raw XML string (including the tags) or None. + """ + # Strip markdown fences + text = re.sub(r"```(?:xml)?\s*\n?", "", text) + + # Find + start = text.find("") + if end == -1: + return None + + return text[start:end + len("")] + + +def _text(el: ET.Element | None) -> str: + """Get text content of an element, defaulting to empty string.""" + if el is None: + return "" + return (el.text or "").strip() + + +def _split_csv(value: str) -> list[str]: + """Split a comma-separated string, filtering empty values.""" + return [v.strip() for v in value.split(",") if v.strip()] + + +def _parse_task(task_el: ET.Element) -> dict: + """Convert a element to a dict matching the JSON task schema.""" + task = { + "title": _text(task_el.find("title")), + "description": _text(task_el.find("description")), + "task_type": _text(task_el.find("task_type")) or "code", + "complexity": _text(task_el.find("complexity")) or "medium", + "tools_needed": _split_csv(_text(task_el.find("tools_needed"))), + "requirement_ids": _split_csv(_text(task_el.find("requirement_ids"))), + "verification_criteria": _text(task_el.find("verification_criteria")), + "affected_files": _split_csv(_text(task_el.find("affected_files"))), + } + + # depends_on: comma-separated integers + deps_text = _text(task_el.find("depends_on")) + if deps_text: + task["depends_on"] = [int(d.strip()) for d in deps_text.split(",") if d.strip()] + else: + task["depends_on"] = [] + + # C# specific fields (optional) + for field in ("target_class", "target_signature"): + val = _text(task_el.find(field)) + if val: + task[field] = val + + # C# list fields + for field in ("available_methods", "constructor_params"): + val = _text(task_el.find(field)) + if val: + task[field] = _split_csv(val) + + return task + + +def _parse_question(q_el: ET.Element) -> dict: + """Convert a element to a dict.""" + return { + "question": _text(q_el.find("ask")), + "proposed_answer": _text(q_el.find("proposed")), + "impact": _text(q_el.find("impact")), + } + + +def _parse_risk(r_el: ET.Element) -> dict: + """Convert a element to a dict.""" + return { + "risk": _text(r_el.find("description")), + "likelihood": _text(r_el.find("likelihood")) or "medium", + "impact": _text(r_el.find("impact")) or "medium", + "mitigation": _text(r_el.find("mitigation")), + } + + +def parse_plan_xml(xml_str: str) -> dict: + """Parse an XML plan string into a dict matching the PlanData shape. + + Supports L1 (flat tasks), L2 (phased + questions), L3 (+ risks + test strategy), + and C# reflection plans (+ target_class, target_signature, assembly_config). + + Returns a dict identical in structure to what the JSON planner produces, + so downstream code (decomposer, routes, frontend) needs no changes. + """ + root = ET.fromstring(xml_str) + + result: dict = { + "summary": _text(root.find("summary")), + } + + # L1: flat container + tasks_el = root.find("tasks") + if tasks_el is not None: + result["tasks"] = [_parse_task(t) for t in tasks_el.findall("task")] + + # L2+: container + phases_el = root.find("phases") + if phases_el is not None: + phases = [] + for phase_el in phases_el.findall("phase"): + phase = { + "name": phase_el.get("name", ""), + "description": _text(phase_el.find("description")), + "tasks": [_parse_task(t) for t in phase_el.findall("task")], + } + phases.append(phase) + result["phases"] = phases + + # L2+: + questions_el = root.find("questions") + if questions_el is not None: + result["open_questions"] = [_parse_question(q) for q in questions_el.findall("question")] + + # L3: + risks_el = root.find("risks") + if risks_el is not None: + result["risk_assessment"] = [_parse_risk(r) for r in risks_el.findall("risk")] + + # L3: + ts_el = root.find("test_strategy") + if ts_el is not None: + result["test_strategy"] = { + "approach": _text(ts_el.find("approach")), + "test_tasks": _split_csv(_text(ts_el.find("test_tasks"))), + "coverage_notes": _text(ts_el.find("coverage_notes")), + } + + # C#: + ac_el = root.find("assembly_config") + if ac_el is not None: + result["assembly_config"] = { + "new_files": _split_csv(_text(ac_el.find("new_files"))), + "modified_files": _split_csv(_text(ac_el.find("modified_files"))), + } + + return result diff --git a/tests/unit/test_csharp_planner.py b/tests/unit/test_csharp_planner.py index 5fbdfbf..446af35 100644 --- a/tests/unit/test_csharp_planner.py +++ b/tests/unit/test_csharp_planner.py @@ -30,10 +30,10 @@ def test_includes_csharp_preamble(self): def test_includes_task_schema(self): prompt = _build_csharp_system_prompt("types") - assert "target_signature" in prompt - assert "target_class" in prompt - assert "available_methods" in prompt - assert "constructor_params" in prompt + assert "" in prompt + assert "" in prompt + assert "" in prompt + assert "" in prompt def test_includes_strategy_rules(self): prompt = _build_csharp_system_prompt("types") @@ -50,7 +50,7 @@ def test_generic_prompt_unchanged(self): """Verify the generic prompt path still works.""" prompt = _build_system_prompt(PlanningRigor.L2) assert "project planner" in prompt - assert "reflected_types" not in prompt + assert "" not in prompt def _make_planner_db_mock(config_json): @@ -69,8 +69,13 @@ def _make_planner_db_mock(config_json): return mock_db -def _make_anthropic_mock(response_text='{"summary": "test", "phases": []}'): +_DEFAULT_CSHARP_XML = 'test' + + +def _make_anthropic_mock(response_text=None): """Create a mock anthropic module + client.""" + if response_text is None: + response_text = _DEFAULT_CSHARP_XML mock_anthropic = AsyncMock() mock_client = AsyncMock() mock_response = AsyncMock() @@ -116,7 +121,9 @@ async def test_csharp_strategy_fallback_on_reflection_failure(self): mock_reflect.return_value = None # Reflection failed with patch("backend.services.planner.anthropic") as mock_anthropic_mod: - mock_anthropic, mock_client = _make_anthropic_mock('{"summary": "test", "tasks": []}') + mock_anthropic, mock_client = _make_anthropic_mock( + 'test' + ) mock_anthropic_mod.AsyncAnthropic.return_value = mock_client await planner.generate("proj1") diff --git a/tests/unit/test_planner_service.py b/tests/unit/test_planner_service.py index 0673337..f406d88 100644 --- a/tests/unit/test_planner_service.py +++ b/tests/unit/test_planner_service.py @@ -5,7 +5,6 @@ # Depends on: backend/services/planner.py, backend/db/connection.py # Used by: pytest -import json import time from unittest.mock import AsyncMock, MagicMock, patch @@ -56,14 +55,25 @@ def test_json_after_markdown_fence(self): # TestPlannerServiceGenerate # --------------------------------------------------------------------------- +_DEFAULT_XML_PLAN = """ + Test plan + + + Task 1 + Do it + code + simple + + + + +""" + + def _make_plan_response(plan_text=None, pt=100, ct=200): """Build a mock Claude response for planning.""" if plan_text is None: - plan_text = json.dumps({ - "summary": "Test plan", - "tasks": [{"title": "Task 1", "description": "Do it", "task_type": "code", - "complexity": "simple", "depends_on": [], "tools_needed": []}], - }) + plan_text = _DEFAULT_XML_PLAN response = MagicMock() response.content = [MagicMock(text=plan_text, type="text")] response.usage = MagicMock(input_tokens=pt, output_tokens=ct) diff --git a/tests/unit/test_planning_rigor.py b/tests/unit/test_planning_rigor.py index 3a783bd..314555b 100644 --- a/tests/unit/test_planning_rigor.py +++ b/tests/unit/test_planning_rigor.py @@ -31,20 +31,20 @@ class TestBuildSystemPrompt: def test_l1_prompt_contains_flat_tasks(self): prompt = _build_system_prompt(PlanningRigor.L1) - assert '"tasks"' in prompt - assert '"phases"' not in prompt + assert "" in prompt + assert "" not in prompt def test_l2_prompt_contains_phases_and_questions(self): prompt = _build_system_prompt(PlanningRigor.L2) - assert '"phases"' in prompt - assert '"open_questions"' in prompt - assert '"risk_assessment"' not in prompt + assert "" in prompt + assert "" in prompt + assert "" not in prompt def test_l3_prompt_contains_risk_and_test_strategy(self): prompt = _build_system_prompt(PlanningRigor.L3) - assert '"phases"' in prompt - assert '"risk_assessment"' in prompt - assert '"test_strategy"' in prompt + assert "" in prompt + assert "" in prompt + assert "" in prompt def test_all_rigor_levels_have_suffix(self): for rigor in PlanningRigor: @@ -161,13 +161,24 @@ def test_global_dependency_indexing_preserved(self): # PlannerService rigor from project config # --------------------------------------------------------------------------- +_DEFAULT_XML_PLAN = """ + Test plan + + + T1 + Do it + code + simple + + + + +""" + + def _make_plan_response(plan_text=None, pt=100, ct=200): if plan_text is None: - plan_text = json.dumps({ - "summary": "Test plan", - "tasks": [{"title": "T1", "description": "Do it", "task_type": "code", - "complexity": "simple", "depends_on": [], "tools_needed": []}], - }) + plan_text = _DEFAULT_XML_PLAN response = MagicMock() response.content = [MagicMock(text=plan_text, type="text")] response.usage = MagicMock(input_tokens=pt, output_tokens=ct) @@ -206,8 +217,8 @@ async def test_l1_uses_flat_prompt(self, _mock_cost, rigor_db): call_kwargs = mock_client.messages.create.call_args.kwargs system = call_kwargs["system"] - assert '"tasks"' in system - assert '"phases"' not in system + assert "" in system + assert "" not in system assert call_kwargs["max_tokens"] == _MAX_TOKENS_BY_RIGOR[PlanningRigor.L1] @patch("backend.services.planner.calculate_cost", return_value=0.01) @@ -226,8 +237,8 @@ async def test_l3_uses_thorough_prompt(self, _mock_cost, rigor_db): call_kwargs = mock_client.messages.create.call_args.kwargs system = call_kwargs["system"] - assert '"risk_assessment"' in system - assert '"test_strategy"' in system + assert "" in system + assert "" in system assert call_kwargs["max_tokens"] == _MAX_TOKENS_BY_RIGOR[PlanningRigor.L3] @patch("backend.services.planner.calculate_cost", return_value=0.01) @@ -246,8 +257,8 @@ async def test_missing_rigor_defaults_to_l2(self, _mock_cost, rigor_db): call_kwargs = mock_client.messages.create.call_args.kwargs system = call_kwargs["system"] - assert '"phases"' in system - assert '"open_questions"' in system + assert "" in system + assert "" in system assert call_kwargs["max_tokens"] == _MAX_TOKENS_BY_RIGOR[PlanningRigor.L2] diff --git a/tests/unit/test_xml_utils.py b/tests/unit/test_xml_utils.py new file mode 100644 index 0000000..ac9add9 --- /dev/null +++ b/tests/unit/test_xml_utils.py @@ -0,0 +1,324 @@ +# Orchestration Engine - XML Plan Utilities Tests +# +# Tests for extract_xml_plan() and parse_plan_xml() in xml_utils.py. +# +# Depends on: backend/utils/xml_utils.py +# Used by: CI + +from backend.utils.xml_utils import extract_xml_plan, parse_plan_xml + + +# --- extract_xml_plan tests --- + + +def test_extract_simple_plan(): + text = 'Test' + assert extract_xml_plan(text) == text + + +def test_extract_with_preamble(): + text = 'Here is my plan:\n\nS\n\nDone.' + result = extract_xml_plan(text) + assert result.startswith("") + + +def test_extract_with_markdown_fences(): + text = '```xml\nS\n```' + result = extract_xml_plan(text) + assert result is not None + assert "S" in result + + +def test_extract_with_thinking_block(): + text = 'Let me reason...\n\nS' + result = extract_xml_plan(text) + assert result.startswith("" not in result + + +def test_extract_no_plan_returns_none(): + assert extract_xml_plan("Just some text with no plan") is None + + +def test_extract_unclosed_plan_returns_none(): + assert extract_xml_plan('S') is None + + +# --- parse_plan_xml tests --- + + +_L1_XML = """ + Build a widget + + + Create widget + Build the widget component + code + medium + + read_file,write_file + R1 + Widget renders + src/widget.ts + + + Test widget + Add unit tests + code + simple + 0 + write_file + R1 + Tests pass + tests/widget.test.ts + + +""" + + +def test_parse_l1_flat_tasks(): + result = parse_plan_xml(_L1_XML) + assert result["summary"] == "Build a widget" + assert len(result["tasks"]) == 2 + assert result["tasks"][0]["title"] == "Create widget" + assert result["tasks"][0]["task_type"] == "code" + assert result["tasks"][0]["depends_on"] == [] + assert result["tasks"][0]["tools_needed"] == ["read_file", "write_file"] + assert result["tasks"][0]["requirement_ids"] == ["R1"] + assert result["tasks"][0]["affected_files"] == ["src/widget.ts"] + assert result["tasks"][1]["depends_on"] == [0] + + +_L2_XML = """ + Build auth system + + + Set up core auth infrastructure + + User model + Define user table + code + medium + + write_file + R1 + Migration runs + db/models.py + + + + Wire auth into API + + Auth middleware + JWT validation + code + complex + 0 + read_file,write_file + R2,R3 + Auth tests pass + src/middleware.py + + + + + + Use JWT or sessions? + JWT for stateless auth + Sessions would need Redis + + +""" + + +def test_parse_l2_phased(): + result = parse_plan_xml(_L2_XML) + assert result["summary"] == "Build auth system" + assert len(result["phases"]) == 2 + assert result["phases"][0]["name"] == "Foundation" + assert result["phases"][0]["description"] == "Set up core auth infrastructure" + assert len(result["phases"][0]["tasks"]) == 1 + assert result["phases"][1]["tasks"][0]["depends_on"] == [0] + assert result["phases"][1]["tasks"][0]["requirement_ids"] == ["R2", "R3"] + + +def test_parse_l2_open_questions(): + result = parse_plan_xml(_L2_XML) + assert len(result["open_questions"]) == 1 + q = result["open_questions"][0] + assert q["question"] == "Use JWT or sessions?" + assert q["proposed_answer"] == "JWT for stateless auth" + assert q["impact"] == "Sessions would need Redis" + + +_L3_XML = """ + Payment integration + + + Payment processing + + Stripe client + Wrap Stripe API + code + complex + + write_file + R1 + API calls succeed + src/stripe.py + + + + + + Which payment provider? + Stripe + Different SDK + + + + + Stripe rate limits during peak + low + high + Implement retry with exponential backoff + + + + Mock Stripe API in tests + Stripe client,Payment flow + Cover refund edge cases + +""" + + +def test_parse_l3_risks(): + result = parse_plan_xml(_L3_XML) + assert len(result["risk_assessment"]) == 1 + r = result["risk_assessment"][0] + assert r["risk"] == "Stripe rate limits during peak" + assert r["likelihood"] == "low" + assert r["impact"] == "high" + assert "backoff" in r["mitigation"] + + +def test_parse_l3_test_strategy(): + result = parse_plan_xml(_L3_XML) + ts = result["test_strategy"] + assert ts["approach"] == "Mock Stripe API in tests" + assert ts["test_tasks"] == ["Stripe client", "Payment flow"] + assert "refund" in ts["coverage_notes"] + + +_CSHARP_XML = """ + Implement user service + + + Core user operations + + UserService.GetUser + Fetch user by ID + csharp_method + medium + + MyApp.Services.UserService + public async Task<User> GetUser(Guid id) + Save(User u),Delete(Guid id) + IDbContext db,ILogger logger + R1 + Returns user or throws + src/Services/UserService.cs + + + + + + Use nullable return or exception? + Exception for not found + Changes caller error handling + + + + src/Services/UserService.cs + src/DI/Container.cs + +""" + + +def test_parse_csharp_plan(): + result = parse_plan_xml(_CSHARP_XML) + task = result["phases"][0]["tasks"][0] + assert task["task_type"] == "csharp_method" + assert task["target_class"] == "MyApp.Services.UserService" + assert "Task" in task["target_signature"] # XML entity decoded + assert task["constructor_params"] == ["IDbContext db", "ILogger logger"] + assert task["available_methods"] == ["Save(User u)", "Delete(Guid id)"] + + +def test_parse_csharp_assembly_config(): + result = parse_plan_xml(_CSHARP_XML) + ac = result["assembly_config"] + assert ac["new_files"] == ["src/Services/UserService.cs"] + assert ac["modified_files"] == ["src/DI/Container.cs"] + + +def test_parse_depends_on_multiple(): + xml = """S + TD + 0,1 + """ + result = parse_plan_xml(xml) + assert result["tasks"][0]["depends_on"] == [0, 1] + + +def test_parse_empty_depends_on(): + xml = """S + TD + + """ + result = parse_plan_xml(xml) + assert result["tasks"][0]["depends_on"] == [] + + +def test_parse_missing_optional_fields(): + xml = """S + TD + """ + result = parse_plan_xml(xml) + t = result["tasks"][0] + assert t["tools_needed"] == [] + assert t["affected_files"] == [] + assert t["depends_on"] == [] + assert t["task_type"] == "code" + assert t["complexity"] == "medium" + + +def test_parse_xml_entities(): + """XML entities like < and & are decoded properly.""" + xml = """S + T + Use <T> and & operator + """ + result = parse_plan_xml(xml) + assert result["tasks"][0]["description"] == "Use and & operator" + + +def test_roundtrip_dict_shape(): + """Parsed XML dict has the same keys as what JSON planner produces.""" + result = parse_plan_xml(_L2_XML) + # Must have these top-level keys + assert "summary" in result + assert "phases" in result + assert "open_questions" in result + # Each phase must have these keys + phase = result["phases"][0] + assert "name" in phase + assert "description" in phase + assert "tasks" in phase + # Each task must have these keys + task = phase["tasks"][0] + for key in ("title", "description", "task_type", "complexity", + "depends_on", "tools_needed", "requirement_ids", + "verification_criteria", "affected_files"): + assert key in task, f"Missing key: {key}" From 0a9b9dca099f2c78fa44cdd68e593e4e4c619cb8 Mon Sep 17 00:00:00 2001 From: JRussas <159085336+JMRussas@users.noreply.github.com> Date: Fri, 6 Mar 2026 22:14:36 -0500 Subject: [PATCH 2/2] Update CLAUDE.md with XML plan format and test count Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index b5eb728..5bdae49 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -69,6 +69,7 @@ docker run -p 5200:5200 -v ./config.json:/app/config.json orchestration | `backend/services/git_service.py` | Stateless git operations via subprocess + asyncio.to_thread | | `backend/services/resource_monitor.py` | Health checks (Ollama, ComfyUI, Claude) | | `backend/services/progress.py` | SSE broadcast, event persistence | +| `backend/utils/xml_utils.py` | XML plan extraction and parsing (extract_xml_plan, parse_plan_xml) | | `backend/tools/registry.py` | Injectable `ToolRegistry` class | | `backend/tools/` | Tool implementations (RAG, Ollama, ComfyUI, file) | | `frontend/` | React 19 + TypeScript + Vite UI (ErrorBoundary, 404 page) | @@ -93,6 +94,7 @@ docker run -p 5200:5200 -v ./config.json:/app/config.json orchestration - **Auth**: JWT Bearer tokens for REST, API keys (`orch_` prefix) for MCP/external executors, short-lived SSE tokens for EventSource. First registered user becomes admin. - **Ownership**: projects have `owner_id`. Users see/modify only their own projects. Admins can access all. - **Budget**: every API call recorded in `usage_log`, checked against limits before execution. Budget endpoints are admin-only. +- **Plans**: XML format (source of truth in `plan_xml` column). Dual-column: `plan_xml` + `plan_json` for backward compat. Decomposer/routes prefer XML with JSON fallback. Planner has JSON fallback if Claude returns JSON despite XML prompt. - **Models**: Ollama (free) for simple tasks, Haiku ($) for medium, Sonnet ($$) for complex - **Tools**: registered in `ToolRegistry` class, injected via DI container - **SSE**: short-lived token via `POST /api/events/{project_id}/token`, then stream via `GET /api/events/{project_id}?token=...` @@ -108,7 +110,7 @@ docker run -p 5200:5200 -v ./config.json:/app/config.json orchestration - **Traceability**: requirements numbered [R1], [R2], mapped to tasks; coverage endpoint shows gaps - **External execution**: MCP server (`backend/mcp/server.py`) for Claude Code integration. Execution modes: auto (engine-only), hybrid (Ollama internal, Claude external), external (all external). Tasks claimed atomically via CAS, results submitted with cost tracking. - **Git integration**: optional per-project (`repo_path` nullable). `GitService` wraps subprocess via `asyncio.to_thread()`. Config in `git.*` section. Phase 1 (foundation) complete; execution wiring (Phase 2+) pending. -- **Tests**: Backend: pytest-asyncio (auto mode), 731 tests. Frontend: vitest + @testing-library/react, 137 tests. Load tests: 7 (excluded from CI via `slow` marker) +- **Tests**: Backend: pytest-asyncio (auto mode), 797 tests. Frontend: vitest + @testing-library/react, 137 tests. Load tests: 7 (excluded from CI via `slow` marker) ## Git Workflow