From fc02c0b5e0adb0058184881c6a3009c2cc3d0fdf Mon Sep 17 00:00:00 2001
From: JRussas <159085336+JMRussas@users.noreply.github.com>
Date: Sat, 7 Mar 2026 13:07:50 -0500
Subject: [PATCH 1/8] Add /internal/chat endpoint for multi-model editor integration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Standalone chat proxy that routes to CLI providers (Claude, Gemini,
Codex) via subprocess and Ollama via HTTP API. Supports conversation
history (last 20 messages), model selection, and native Ollama
/api/chat messages.

No auth — intended for trusted network access (editor integrations).

Co-Authored-By: Claude Opus 4.6
---
 backend/app.py             |   4 +
 backend/routes/internal.py | 207 +++++++++++++++++++++++++++++++++++++
 2 files changed, 211 insertions(+)
 create mode 100644 backend/routes/internal.py

diff --git a/backend/app.py b/backend/app.py
index 3f41e1c..de9d4d5 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -40,6 +40,7 @@ from backend.routes.checkpoints import router as checkpoints_router
 from backend.routes.events import router as events_router
 from backend.routes.external import router as external_router
+from backend.routes.internal import router as internal_router
 from backend.routes.projects import router as projects_router
 from backend.routes.rag import router as rag_router
 from backend.routes.services import health_router, router as services_router
@@ -255,6 +256,9 @@ async def send_with_headers(message):
 app.include_router(rag_router, prefix="/api", dependencies=_auth_dep)
 app.include_router(external_router, prefix="/api", dependencies=_auth_dep)
 
+# Internal routes — no auth (trusted by network isolation)
+app.include_router(internal_router, prefix="/api")
+
 # Events route uses query-param token auth (EventSource can't send headers)
 app.include_router(events_router, prefix="/api")
 
diff --git a/backend/routes/internal.py b/backend/routes/internal.py
new file mode 100644
index 0000000..1c1c6cc
--- /dev/null
+++ b/backend/routes/internal.py
@@ -0,0 +1,207 @@
+# Orchestration Engine - Internal Routes
+#
+# Unauthenticated endpoints for internal use: chat proxy for editor
+# integration, multi-model routing across CLI providers and Ollama.
+#
+# Depends on: (none — standalone, no DB or DI required)
+# Used by: app.py
+
+import asyncio
+import logging
+import os
+from typing import Optional
+
+import httpx
+
+from fastapi import APIRouter
+from pydantic import BaseModel
+
+logger = logging.getLogger("orchestration.internal")
+
+router = APIRouter(prefix="/internal", tags=["internal"])
+
+
+# ---------------------------------------------------------------------------
+# Request / Response schemas
+# ---------------------------------------------------------------------------
+
+
+class ChatMessageEntry(BaseModel):
+    """A single message in a conversation history."""
+
+    role: str  # "user", "assistant", "system"
+    content: str
+
+
+class ChatRequest(BaseModel):
+    prompt: str
+    context: Optional[str] = None
+    provider: Optional[str] = None
+    messages: Optional[list[ChatMessageEntry]] = None
+    model: Optional[str] = None
+
+
+class ChatResponse(BaseModel):
+    response: str
+    provider: Optional[str] = None
+    model_used: Optional[str] = None
+
+
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+
+
+@router.post("/chat")
+async def chat(request: ChatRequest):
+    """Chat endpoint for editor integration with conversation history support.
+
+    Routes to CLI providers (Claude, Gemini, Codex) via subprocess or to
+    Ollama via HTTP API. Supports conversation history and model selection.
+    """
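+    # Illustrative request body (shape follows ChatRequest above; the
+    # values are invented for the example):
+    #   {"prompt": "Explain this traceback", "provider": "ollama",
+    #    "model": "qwen2.5-coder:14b",
+    #    "messages": [{"role": "user", "content": "earlier turn"},
+    #                 {"role": "assistant", "content": "earlier reply"}]}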
+    # Build prompt with optional context and conversation history
+    parts: list[str] = []
+
+    if request.context:
+        parts.append(f"Context: {request.context}")
+
+    if request.messages:
+        # Include last 20 messages to keep prompt size manageable
+        recent = request.messages[-20:]
+        history_lines = [f"[{m.role}]: {m.content}" for m in recent]
+        parts.append("Previous conversation:\n" + "\n".join(history_lines))
+
+    parts.append(f"Current request:\n{request.prompt}" if request.messages else request.prompt)
+
+    full_prompt = "\n\n".join(parts)
+
+    # Determine provider (default to gemini)
+    provider = (request.provider or "gemini").lower()
+
+    # Ollama: use HTTP API directly (supports messages natively)
+    if provider == "ollama":
+        return await _chat_ollama(request, full_prompt)
+
+    # CLI-based providers
+    model_used: Optional[str] = None
+
+    if provider == "claude":
+        cmd_args = ["claude", "-p", full_prompt, "--output-format", "text"]
+        # Claude -p mode doesn't support model selection
+    elif provider == "codex":
+        cmd_args = ["codex", "exec"]
+        if request.model:
+            cmd_args.extend(["--model", request.model])
+            model_used = request.model
+        cmd_args.extend(["--", full_prompt])
+    else:  # gemini default
+        cmd_args = ["gemini"]
+        if request.model:
+            cmd_args.extend(["-m", request.model])
+            model_used = request.model
+        cmd_args.extend(["-p", full_prompt])
+
+    try:
+        proc = await asyncio.create_subprocess_exec(
+            *cmd_args,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        try:
+            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=120)
+        except asyncio.TimeoutError:
+            proc.kill()
+            await proc.wait()
+            return ChatResponse(
+                response="Request timed out",
+                provider=provider,
+                model_used=model_used,
+            )
+
+        stdout_text = stdout.decode().strip()
+        stderr_text = stderr.decode().strip()
+
+        if proc.returncode != 0:
+            error_msg = stderr_text or f"Command exited with code {proc.returncode}"
+            return ChatResponse(
+                response=f"Error: {error_msg}",
+                provider=provider,
+                model_used=model_used,
+            )
+
+        return ChatResponse(
+            response=stdout_text,
+            provider=provider,
+            model_used=model_used,
+        )
+    except FileNotFoundError:
+        return ChatResponse(
+            response=f"Provider '{provider}' CLI not found",
+            provider=provider,
+            model_used=model_used,
+        )
+
+
+async def _chat_ollama(request: ChatRequest, full_prompt: str) -> ChatResponse:
+    """Route chat to Ollama HTTP API with native message support."""
+    ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
+    ollama_model = request.model or os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:14b")
+
+    # Build Ollama messages array if conversation history provided
+    if request.messages:
+        messages: list[dict] = []
+
+        if request.context:
+            messages.append({"role": "system", "content": request.context})
+
+        for m in request.messages[-20:]:
+            messages.append({"role": m.role, "content": m.content})
+
+        # Add current prompt as the latest user message
+        messages.append({"role": "user", "content": request.prompt})
+
+        payload = {
+            "model": ollama_model,
+            "messages": messages,
+            "stream": False,
+        }
+        api_path = "/api/chat"
+    else:
+        # Simple generate mode (no history)
+        payload = {
+            "model": ollama_model,
+            "prompt": full_prompt,
+            "stream": False,
+        }
+        api_path = "/api/generate"
+
+    try:
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            resp = await client.post(f"{ollama_url}{api_path}", json=payload)
+            resp.raise_for_status()
+            data = resp.json()
+
+        # /api/chat returns {"message": {"content": "..."}},
+        # /api/generate returns {"response": "..."}
+        if "message" in data:
+            response_text = data["message"].get("content", "")
+        else:
+            response_text = data.get("response", "")
+
+        return ChatResponse(
+            response=response_text.strip(),
+            provider="ollama",
+            model_used=ollama_model,
+        )
+    except httpx.HTTPStatusError as exc:
+        return ChatResponse(
+            response=f"Ollama error: {exc.response.status_code} {exc.response.text}",
+            provider="ollama",
+            model_used=ollama_model,
+        )
+    except httpx.ConnectError:
+        return ChatResponse(
+            response=f"Cannot connect to Ollama at {ollama_url}",
+            provider="ollama",
+            model_used=ollama_model,
+        )
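
For reference, a minimal client for this endpoint might look like the
following sketch (it assumes the backend listens on localhost:8000; the
route is mounted under /api by app.py above, and the timeout allows for
the server's own 120s subprocess limit):

    import asyncio
    import httpx

    async def ask(prompt: str) -> str:
        payload = {
            "prompt": prompt,
            "provider": "ollama",  # or "claude", "gemini", "codex"
            "messages": [  # optional; only the last 20 entries are used
                {"role": "user", "content": "What does app.py do?"},
                {"role": "assistant", "content": "It wires up the routers."},
            ],
        }
        async with httpx.AsyncClient(timeout=130.0) as client:
            resp = await client.post(
                "http://localhost:8000/api/internal/chat", json=payload
            )
            resp.raise_for_status()
            return resp.json()["response"]

    print(asyncio.run(ask("Summarize the conversation so far.")))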

From 18acb947c22418b5d0661f5645705347e0550fd6 Mon Sep 17 00:00:00 2001
From: JRussas <159085336+JMRussas@users.noreply.github.com>
Date: Sat, 7 Mar 2026 13:30:30 -0500
Subject: [PATCH 2/8] Fix Windows CLI resolution for npm global binaries

shutil.which() resolves bare command names to full .cmd paths on
Windows. Avoids shell=True (command injection risk) while finding npm
global CLIs.

Co-Authored-By: Claude Opus 4.6
---
 backend/routes/internal.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/backend/routes/internal.py b/backend/routes/internal.py
index 1c1c6cc..29ffd9e 100644
--- a/backend/routes/internal.py
+++ b/backend/routes/internal.py
@@ -9,6 +9,8 @@
 import asyncio
 import logging
 import os
+import shutil
+import sys
 from typing import Optional
 
 import httpx
@@ -101,6 +103,12 @@ async def chat(request: ChatRequest):
             model_used = request.model
         cmd_args.extend(["-p", full_prompt])
 
+    # On Windows, npm global binaries are .cmd — resolve to full path
+    if sys.platform == "win32":
+        resolved = shutil.which(cmd_args[0])
+        if resolved:
+            cmd_args[0] = resolved
+
     try:
         proc = await asyncio.create_subprocess_exec(
             *cmd_args,
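
The failure mode this fixes: on Windows, create_subprocess_exec only
launches real executables, so a bare "claude" (an npm .cmd shim) raises
FileNotFoundError unless resolved first. A rough illustration, with
hypothetical paths:

    import shutil

    shutil.which("claude")
    # Windows: something like 'C:\\Users\\dev\\AppData\\Roaming\\npm\\claude.CMD'
    # POSIX: '/usr/local/bin/claude', or None if the CLI is not installed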

From 91ebbd3dcc16eed7e8ae609cbb2713ed5b628583 Mon Sep 17 00:00:00 2001
From: JRussas <159085336+JMRussas@users.noreply.github.com>
Date: Sat, 7 Mar 2026 13:44:11 -0500
Subject: [PATCH 3/8] Route planner through CLI providers instead of Anthropic API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New llm_router.py: call_llm() routes through CLI subprocess (Claude,
Gemini, Codex) or Ollama HTTP with automatic fallback chain. Zero cost
on subscription billing.

Planner no longer requires ANTHROPIC_API_KEY. Budget reservation
removed (cost is always $0 on subscription). Provider fallback:
gemini → claude → codex.

Co-Authored-By: Claude Opus 4.6
---
 backend/services/llm_router.py | 168 +++++++++++++++++++++++++++++++++
 backend/services/planner.py    | 123 +++++++-----------------
 2 files changed, 201 insertions(+), 90 deletions(-)
 create mode 100644 backend/services/llm_router.py

diff --git a/backend/services/llm_router.py b/backend/services/llm_router.py
new file mode 100644
index 0000000..7d574d6
--- /dev/null
+++ b/backend/services/llm_router.py
@@ -0,0 +1,168 @@
+# Orchestration Engine - LLM Router
+#
+# Routes LLM calls through CLI providers (subscription billing) instead of
+# the Anthropic API. Supports Claude, Gemini, Codex CLIs and Ollama HTTP.
+#
+# Depends on: backend/config.py
+# Used by: planner.py, decomposer.py, verifier.py, knowledge_extractor.py

+import asyncio
+import logging
+import os
+import shutil
+import sys
+from dataclasses import dataclass
+from typing import Optional
+
+import httpx
+
+logger = logging.getLogger("orchestration.llm_router")
+
+# Provider preference order for planning (complex reasoning tasks)
+_PLANNING_PROVIDERS = ["gemini", "claude", "codex"]
+
+# Provider preference order for simple tasks (verification, extraction)
+_SIMPLE_PROVIDERS = ["gemini", "ollama", "codex"]
+
+
+@dataclass
+class LLMResponse:
+    """Response from an LLM call."""
+
+    text: str
+    provider: str
+    model: Optional[str] = None
+    # CLI providers don't expose token counts — cost is $0 on subscription
+    prompt_tokens: int = 0
+    completion_tokens: int = 0
+    cost_usd: float = 0.0
+
+
+def _resolve_cmd(name: str) -> str:
+    """Resolve a CLI command name to full path (handles .cmd on Windows)."""
+    if sys.platform == "win32":
+        resolved = shutil.which(name)
+        if resolved:
+            return resolved
+    return name
+
+
+async def _call_cli(provider: str, system_prompt: str, user_message: str,
+                    model: Optional[str] = None) -> LLMResponse:
+    """Call a CLI provider with system prompt and user message.
+
+    Combines system_prompt + user_message into a single prompt since CLIs
+    don't natively support separate system/user roles.
+    """
+    full_prompt = f"{system_prompt}\n\n---\n\n{user_message}"
+
+    if provider == "claude":
+        cmd_args = [_resolve_cmd("claude"), "-p", full_prompt, "--output-format", "text"]
+    elif provider == "codex":
+        cmd_args = [_resolve_cmd("codex"), "exec"]
+        if model:
+            cmd_args.extend(["--model", model])
+        cmd_args.extend(["--", full_prompt])
+    elif provider == "gemini":
+        cmd_args = [_resolve_cmd("gemini")]
+        if model:
+            cmd_args.extend(["-m", model])
+        cmd_args.extend(["-p", full_prompt])
+    else:
+        raise ValueError(f"Unknown CLI provider: {provider}")
+
+    proc = await asyncio.create_subprocess_exec(
+        *cmd_args,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+    )
+
+    stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=300)
+    stdout_text = stdout.decode().strip()
+    stderr_text = stderr.decode().strip()
+
+    if proc.returncode != 0:
+        raise RuntimeError(f"{provider} CLI failed (exit {proc.returncode}): {stderr_text}")
+
+    return LLMResponse(text=stdout_text, provider=provider, model=model)
+
+
+async def _call_ollama(system_prompt: str, user_message: str,
+                       model: Optional[str] = None) -> LLMResponse:
+    """Call Ollama HTTP API."""
+    ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
+    ollama_model = model or os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:14b")
+
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_message},
+    ]
+
+    async with httpx.AsyncClient(timeout=300.0) as client:
+        resp = await client.post(
+            f"{ollama_url}/api/chat",
+            json={"model": ollama_model, "messages": messages, "stream": False},
+        )
+        resp.raise_for_status()
+        data = resp.json()
+
+    text = data.get("message", {}).get("content", "")
+    return LLMResponse(text=text.strip(), provider="ollama", model=ollama_model)
+
+
+async def call_llm(
+    system_prompt: str,
+    user_message: str,
+    *,
+    provider: Optional[str] = None,
+    providers: Optional[list[str]] = None,
+    model: Optional[str] = None,
+    task_type: str = "planning",
+) -> LLMResponse:
+    """Route an LLM call through CLI providers with fallback.
+
+    Args:
+        system_prompt: System instructions for the LLM.
+        user_message: The user/task content.
+        provider: Explicit single provider to use (no fallback).
+        providers: Ordered list of providers to try (with fallback).
+        model: Optional model override for the provider.
+        task_type: "planning" or "simple" — determines default provider order.
+
+    Returns:
+        LLMResponse with the text output.
+
+    Raises:
+        RuntimeError if all providers fail.
+    """
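+    # Illustrative routing: with defaults, a failed gemini call falls
+    # through to claude, then codex; task_type="simple" tries gemini,
+    # ollama, codex instead (see the preference lists above).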
+    if provider:
+        chain = [provider]
+    elif providers:
+        chain = providers
+    elif task_type == "simple":
+        chain = list(_SIMPLE_PROVIDERS)
+    else:
+        chain = list(_PLANNING_PROVIDERS)
+
+    errors = []
+    for p in chain:
+        try:
+            logger.info("Calling %s for %s task", p, task_type)
+            if p == "ollama":
+                return await _call_ollama(system_prompt, user_message, model)
+            else:
+                return await _call_cli(p, system_prompt, user_message, model)
+        except FileNotFoundError:
+            msg = f"{p} CLI not found"
+            logger.warning(msg)
+            errors.append(msg)
+        except asyncio.TimeoutError:
+            msg = f"{p} timed out (300s)"
+            logger.warning(msg)
+            errors.append(msg)
+        except Exception as e:
+            msg = f"{p} failed: {e}"
+            logger.warning(msg)
+            errors.append(msg)
+
+    raise RuntimeError(f"All providers failed: {'; '.join(errors)}")
diff --git a/backend/services/planner.py b/backend/services/planner.py
index b436d0f..8684ab2 100644
--- a/backend/services/planner.py
+++ b/backend/services/planner.py
@@ -9,28 +9,16 @@
 import logging
 import time
 import uuid
+from typing import Optional
 
-import anthropic
-
-from backend.config import ANTHROPIC_API_KEY, API_TIMEOUT, PLANNING_MODEL
-from backend.exceptions import BudgetExhaustedError, NotFoundError, PlanParseError
+from backend.config import PLANNING_MODEL
+from backend.exceptions import NotFoundError, PlanParseError
 from backend.models.enums import PlanningRigor, PlanStatus, ProjectStatus
-from backend.services.model_router import calculate_cost
+from backend.services.llm_router import call_llm
 from backend.utils.json_utils import extract_json_object, parse_requirements
 
 logger = logging.getLogger("orchestration.planner")
 
-# Token estimates for budget reservation before API calls
-_EST_PLANNING_INPUT_TOKENS = 2000  # system prompt (~1.5k) + requirements
-_EST_PLANNING_OUTPUT_TOKENS = 2000  # plan JSON response
-
-# Max output tokens by rigor level (more structured output needs more tokens)
-_MAX_TOKENS_BY_RIGOR = {
-    PlanningRigor.L1: 4096,
-    PlanningRigor.L2: 6144,
-    PlanningRigor.L3: 8192,
-}
-
 # Backward-compat alias for external importers
 _extract_json_object = extract_json_object
 
@@ -336,18 +324,21 @@ async def _get_csharp_type_map(self, config: dict) -> str | None:
     async def generate(
         self,
         project_id: str,
-        client: anthropic.AsyncAnthropic | None = None,
+        provider: Optional[str] = None,
+        client=None,  # Deprecated — kept for backward compat, ignored
     ) -> dict:
-        """Generate a structured plan for a project using Claude.
+        """Generate a structured plan for a project using CLI providers.
+
+        Routes through llm_router (CLI subprocess) instead of Anthropic API.
+        Zero cost on subscription billing.
 
         Args:
             project_id: The project to plan for.
-            client: Optional shared Anthropic client. If None, creates (and closes) one.
+            provider: Optional explicit provider (gemini, claude, codex). Defaults to fallback chain.
 
         Returns the plan dict and updates the database.
         """
         db = self._db
-        budget = self._budget
 
         # Get project
         row = await db.fetchone("SELECT * FROM projects WHERE id = ?", (project_id,))
@@ -373,15 +364,8 @@ async def generate(
 
         if csharp_type_map is not None:
             system_prompt = _build_csharp_system_prompt(csharp_type_map)
-            max_tokens = 8192  # C# plans are detailed
         else:
             system_prompt = _build_system_prompt(rigor)
-            max_tokens = _MAX_TOKENS_BY_RIGOR[rigor]
-
-        # Reserve budget before making the API call (prevents TOCTOU race)
-        estimated_cost = calculate_cost(PLANNING_MODEL, _EST_PLANNING_INPUT_TOKENS, _EST_PLANNING_OUTPUT_TOKENS)
-        if not await budget.reserve_spend(estimated_cost):
-            raise BudgetExhaustedError("Budget limit reached. Cannot generate plan.")
 
         # Update project status
         await db.execute_write(
@@ -389,11 +373,6 @@ async def generate(
             (ProjectStatus.PLANNING, time.time(), project_id),
         )
 
-        # Use provided client or create a temporary one
-        owns_client = client is None
-        if owns_client:
-            client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY)
-
         # Number requirements for traceability (paragraph-based splitting)
         req_blocks = parse_requirements(requirements)
         if req_blocks:
@@ -402,61 +381,35 @@ async def generate(
             numbered = requirements
 
         user_msg = f"Project: {project_name}\n\nRequirements:\n{numbered}"
-        response = None
         try:
-            response = await client.messages.create(
-                model=PLANNING_MODEL,
-                max_tokens=max_tokens,
-                system=system_prompt,
-                messages=[{"role": "user", "content": user_msg}],
-                timeout=API_TIMEOUT,
+            llm_response = await call_llm(
+                system_prompt,
+                user_msg,
+                provider=provider,
+                task_type="planning",
             )
 
-            # Extract text and tokens
-            if not response.content:
-                raise PlanParseError("Claude returned an empty response")
-
-            response_text = response.content[0].text
-            prompt_tokens = response.usage.input_tokens
-            completion_tokens = response.usage.output_tokens
-            cost = calculate_cost(PLANNING_MODEL, prompt_tokens, completion_tokens)
+            response_text = llm_response.text
+            if not response_text:
+                raise PlanParseError("LLM returned an empty response")
 
             # Parse the plan JSON
            try:
                plan_data = json.loads(response_text)
            except json.JSONDecodeError:
-                # Try to extract JSON from the response (in case of markdown fences).
-                # Use a balanced-brace approach to find the outermost JSON object,
-                # instead of a greedy regex that could match too much.
                plan_data = extract_json_object(response_text)
                if plan_data is None:
-                    raise PlanParseError("Failed to parse plan JSON from Claude response")
+                    raise PlanParseError(
+                        f"Failed to parse plan JSON from {llm_response.provider} response"
+                    )
         except Exception:
-            # Record actual API spend even if parsing failed — prevents budget leak
-            if response is not None and hasattr(response, "usage"):
-                pt = response.usage.input_tokens
-                ct = response.usage.output_tokens
-                actual_cost = calculate_cost(PLANNING_MODEL, pt, ct)
-                await budget.record_spend(
-                    cost_usd=actual_cost,
-                    prompt_tokens=pt,
-                    completion_tokens=ct,
-                    provider="anthropic",
-                    model=PLANNING_MODEL,
-                    purpose="planning",
-                    project_id=project_id,
-                )
             # Reset project status so it's not stuck in PLANNING
             await db.execute_write(
                 "UPDATE projects SET status = ?, updated_at = ? WHERE id = ?",
                 (ProjectStatus.DRAFT, time.time(), project_id),
             )
-            await budget.release_reservation(estimated_cost)
             raise
-        finally:
-            if owns_client:
-                await client.close()
 
         # Determine plan version
         version_row = await db.fetchone(
@@ -471,28 +424,17 @@ async def generate(
             (PlanStatus.SUPERSEDED, project_id, PlanStatus.DRAFT),
         )
 
-        # Store the plan
+        # Store the plan — cost is $0 on subscription billing
         plan_id = uuid.uuid4().hex[:12]
+        model_used = f"{llm_response.provider}/{llm_response.model or 'default'}"
         now = time.time()
         await db.execute_write(
             "INSERT INTO plans (id, project_id, version, model_used, prompt_tokens, "
             "completion_tokens, cost_usd, plan_json, status, created_at) "
             "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
-            (plan_id, project_id, version, PLANNING_MODEL, prompt_tokens,
-             completion_tokens, cost, json.dumps(plan_data), PlanStatus.DRAFT, now),
-        )
-
-        # Record spending and release reservation
-        await budget.record_spend(
-            cost_usd=cost,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            provider="anthropic",
-            model=PLANNING_MODEL,
-            purpose="planning",
-            project_id=project_id,
+            (plan_id, project_id, version, model_used, 0, 0, 0.0,
+             json.dumps(plan_data), PlanStatus.DRAFT, now),
         )
-        await budget.release_reservation(estimated_cost)
 
         # Update project status back to draft (awaiting approval)
         await db.execute_write(
@@ -504,10 +446,10 @@ async def generate(
             "plan_id": plan_id,
             "version": version,
             "plan": plan_data,
-            "model_used": PLANNING_MODEL,
-            "prompt_tokens": prompt_tokens,
-            "completion_tokens": completion_tokens,
-            "cost_usd": cost,
+            "model_used": model_used,
+            "prompt_tokens": 0,
+            "completion_tokens": 0,
+            "cost_usd": 0.0,
         }
 
 
@@ -516,7 +458,8 @@ async def generate_plan(
     *,
     db,
     budget,
-    client: anthropic.AsyncAnthropic | None = None,
+    provider: Optional[str] = None,
+    client=None,  # Deprecated — ignored
 ) -> dict:
     """Convenience wrapper for backward compatibility with tests and direct callers."""
-    return await PlannerService(db=db, budget=budget).generate(project_id, client=client)
+    return await PlannerService(db=db, budget=budget).generate(project_id, provider=provider)
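
To make the new entry point concrete, a caller wanting planner-style
routing would do roughly this (a minimal sketch; the decomposer/verifier
wiring named in the file header is not shown):

    from backend.services.llm_router import call_llm

    async def plan_summary(requirements: str) -> str:
        resp = await call_llm(
            "You are a planning assistant. Reply with a one-line summary.",
            requirements,
            task_type="planning",  # tries gemini, then claude, then codex
        )
        # resp.cost_usd is always 0.0: CLI providers bill by subscription
        return f"{resp.provider}/{resp.model or 'default'}: {resp.text}"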
""" db = self._db - budget = self._budget # Get project row = await db.fetchone("SELECT * FROM projects WHERE id = ?", (project_id,)) @@ -373,15 +364,8 @@ async def generate( if csharp_type_map is not None: system_prompt = _build_csharp_system_prompt(csharp_type_map) - max_tokens = 8192 # C# plans are detailed else: system_prompt = _build_system_prompt(rigor) - max_tokens = _MAX_TOKENS_BY_RIGOR[rigor] - - # Reserve budget before making the API call (prevents TOCTOU race) - estimated_cost = calculate_cost(PLANNING_MODEL, _EST_PLANNING_INPUT_TOKENS, _EST_PLANNING_OUTPUT_TOKENS) - if not await budget.reserve_spend(estimated_cost): - raise BudgetExhaustedError("Budget limit reached. Cannot generate plan.") # Update project status await db.execute_write( @@ -389,11 +373,6 @@ async def generate( (ProjectStatus.PLANNING, time.time(), project_id), ) - # Use provided client or create a temporary one - owns_client = client is None - if owns_client: - client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY) - # Number requirements for traceability (paragraph-based splitting) req_blocks = parse_requirements(requirements) if req_blocks: @@ -402,61 +381,35 @@ async def generate( numbered = requirements user_msg = f"Project: {project_name}\n\nRequirements:\n{numbered}" - response = None try: - response = await client.messages.create( - model=PLANNING_MODEL, - max_tokens=max_tokens, - system=system_prompt, - messages=[{"role": "user", "content": user_msg}], - timeout=API_TIMEOUT, + llm_response = await call_llm( + system_prompt, + user_msg, + provider=provider, + task_type="planning", ) - # Extract text and tokens - if not response.content: - raise PlanParseError("Claude returned an empty response") - - response_text = response.content[0].text - prompt_tokens = response.usage.input_tokens - completion_tokens = response.usage.output_tokens - cost = calculate_cost(PLANNING_MODEL, prompt_tokens, completion_tokens) + response_text = llm_response.text + if not response_text: + raise PlanParseError("LLM returned an empty response") # Parse the plan JSON try: plan_data = json.loads(response_text) except json.JSONDecodeError: - # Try to extract JSON from the response (in case of markdown fences). - # Use a balanced-brace approach to find the outermost JSON object, - # instead of a greedy regex that could match too much. plan_data = extract_json_object(response_text) if plan_data is None: - raise PlanParseError("Failed to parse plan JSON from Claude response") + raise PlanParseError( + f"Failed to parse plan JSON from {llm_response.provider} response" + ) except Exception: - # Record actual API spend even if parsing failed — prevents budget leak - if response is not None and hasattr(response, "usage"): - pt = response.usage.input_tokens - ct = response.usage.output_tokens - actual_cost = calculate_cost(PLANNING_MODEL, pt, ct) - await budget.record_spend( - cost_usd=actual_cost, - prompt_tokens=pt, - completion_tokens=ct, - provider="anthropic", - model=PLANNING_MODEL, - purpose="planning", - project_id=project_id, - ) # Reset project status so it's not stuck in PLANNING await db.execute_write( "UPDATE projects SET status = ?, updated_at = ? 

From 76b8bd6667a6807582c9be827f89e718aca84c6f Mon Sep 17 00:00:00 2001
From: JRussas <159085336+JMRussas@users.noreply.github.com>
Date: Sat, 7 Mar 2026 13:53:17 -0500
Subject: [PATCH 5/8] Wire internal routes into DI container for planner access

Co-Authored-By: Claude Opus 4.6
---
 backend/container.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backend/container.py b/backend/container.py
index 34294b2..94add8a 100644
--- a/backend/container.py
+++ b/backend/container.py
@@ -50,6 +50,7 @@ class Container(containers.DeclarativeContainer):
             "backend.routes.rag",
             "backend.routes.auth_oidc",
             "backend.routes.external",
+            "backend.routes.internal",
             "backend.middleware.auth",
         ]
     )

From 42ca92ce6b25cb1c7065e84bd91f1ea272aeec34 Mon Sep 17 00:00:00 2001
From: JRussas <159085336+JMRussas@users.noreply.github.com>
Date: Sat, 7 Mar 2026 13:57:18 -0500
Subject: [PATCH 6/8] Fix CLI prompt routing to use stdin instead of command line args

Windows has a ~32K command line length limit. Large planning prompts
(system prompt + requirements) exceed this when passed as -p arguments.
Now pipes prompts via stdin for all CLI providers.

Co-Authored-By: Claude Opus 4.6
---
 backend/services/llm_router.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/backend/services/llm_router.py b/backend/services/llm_router.py
index 7d574d6..ae58e44 100644
--- a/backend/services/llm_router.py
+++ b/backend/services/llm_router.py
@@ -51,33 +51,36 @@ async def _call_cli(provider: str, system_prompt: str, user_message: str,
                     model: Optional[str] = None) -> LLMResponse:
     """Call a CLI provider with system prompt and user message.
 
-    Combines system_prompt + user_message into a single prompt since CLIs
-    don't natively support separate system/user roles.
+    Pipes the prompt via stdin to avoid Windows command line length limits.
+    All CLIs support reading prompts from stdin.
     """
     full_prompt = f"{system_prompt}\n\n---\n\n{user_message}"
 
     if provider == "claude":
-        cmd_args = [_resolve_cmd("claude"), "-p", full_prompt, "--output-format", "text"]
+        # Claude: -p is --print (non-interactive mode), reads prompt from stdin
+        cmd_args = [_resolve_cmd("claude"), "-p", "--output-format", "text"]
     elif provider == "codex":
         cmd_args = [_resolve_cmd("codex"), "exec"]
         if model:
             cmd_args.extend(["--model", model])
-        cmd_args.extend(["--", full_prompt])
     elif provider == "gemini":
-        cmd_args = [_resolve_cmd("gemini")]
+        # Gemini: -p "" triggers non-interactive mode, stdin is prepended to prompt
+        cmd_args = [_resolve_cmd("gemini"), "-p", ""]
         if model:
             cmd_args.extend(["-m", model])
-        cmd_args.extend(["-p", full_prompt])
     else:
         raise ValueError(f"Unknown CLI provider: {provider}")
 
     proc = await asyncio.create_subprocess_exec(
         *cmd_args,
+        stdin=asyncio.subprocess.PIPE,
         stdout=asyncio.subprocess.PIPE,
         stderr=asyncio.subprocess.PIPE,
     )
 
-    stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=300)
+    stdout, stderr = await asyncio.wait_for(
+        proc.communicate(input=full_prompt.encode()), timeout=300,
+    )
     stdout_text = stdout.decode().strip()
     stderr_text = stderr.decode().strip()
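
The same pattern in isolation, for reproducing the fix outside this
codebase (a minimal sketch, not the patched function itself):

    import asyncio

    async def run_cli(cmd: list[str], prompt: str) -> str:
        # argv is capped at roughly 32K characters on Windows; stdin is
        # not, so the prompt travels over the pipe instead.
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, _ = await proc.communicate(input=prompt.encode())
        return stdout.decode().strip()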
""" full_prompt = f"{system_prompt}\n\n---\n\n{user_message}" if provider == "claude": - cmd_args = [_resolve_cmd("claude"), "-p", full_prompt, "--output-format", "text"] + # Claude: -p is --print (non-interactive mode), reads prompt from stdin + cmd_args = [_resolve_cmd("claude"), "-p", "--output-format", "text"] elif provider == "codex": cmd_args = [_resolve_cmd("codex"), "exec"] if model: cmd_args.extend(["--model", model]) - cmd_args.extend(["--", full_prompt]) elif provider == "gemini": - cmd_args = [_resolve_cmd("gemini")] + # Gemini: -p "" triggers non-interactive mode, stdin is prepended to prompt + cmd_args = [_resolve_cmd("gemini"), "-p", ""] if model: cmd_args.extend(["-m", model]) - cmd_args.extend(["-p", full_prompt]) else: raise ValueError(f"Unknown CLI provider: {provider}") proc = await asyncio.create_subprocess_exec( *cmd_args, + stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) - stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=300) + stdout, stderr = await asyncio.wait_for( + proc.communicate(input=full_prompt.encode()), timeout=300, + ) stdout_text = stdout.decode().strip() stderr_text = stderr.decode().strip() From 09ff64b60e7908042dc16c52674b2fbb99bd30ce Mon Sep 17 00:00:00 2001 From: JRussas <159085336+JMRussas@users.noreply.github.com> Date: Sat, 7 Mar 2026 14:01:13 -0500 Subject: [PATCH 7/8] Fix MCP plan_project to handle plan_id key from planner service Planner returns plan_id, not id. Use .get() with fallback for resilience. Co-Authored-By: Claude Opus 4.6 --- backend/mcp/server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/mcp/server.py b/backend/mcp/server.py index 233f04a..2d414fd 100644 --- a/backend/mcp/server.py +++ b/backend/mcp/server.py @@ -140,9 +140,9 @@ async def plan_project(project_id: str) -> str: summary = plan.get("summary", "No summary") out = "--- Plan Generated ---\n" - out += f"Plan ID: {result['id']}\n" - out += f"Model: {result['model_used']}\n" - out += f"Cost: ${result['cost_usd']:.4f}\n\n" + out += f"Plan ID: {result.get('plan_id', result.get('id', '?'))}\n" + out += f"Model: {result.get('model_used', '?')}\n" + out += f"Cost: ${result.get('cost_usd', 0):.4f}\n\n" out += f"Summary: {summary}\n\n" if phases: From 8abce73fab2ffd5bfc8f9cd27e73357b32e30d69 Mon Sep 17 00:00:00 2001 From: JRussas <159085336+JMRussas@users.noreply.github.com> Date: Sat, 7 Mar 2026 14:11:49 -0500 Subject: [PATCH 8/8] Fix chat endpoint to pipe prompts via stdin like llm_router MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same Windows command line length fix as llm_router.py — pipe prompts via stdin instead of passing as -p arguments to CLI providers. 

From 8abce73fab2ffd5bfc8f9cd27e73357b32e30d69 Mon Sep 17 00:00:00 2001
From: JRussas <159085336+JMRussas@users.noreply.github.com>
Date: Sat, 7 Mar 2026 14:11:49 -0500
Subject: [PATCH 8/8] Fix chat endpoint to pipe prompts via stdin like llm_router
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same Windows command line length fix as llm_router.py — pipe prompts
via stdin instead of passing as -p arguments to CLI providers.

Co-Authored-By: Claude Opus 4.6
---
 backend/routes/internal.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/backend/routes/internal.py b/backend/routes/internal.py
index 056fe92..f09a849 100644
--- a/backend/routes/internal.py
+++ b/backend/routes/internal.py
@@ -90,24 +90,21 @@ async def chat(request: ChatRequest):
     if provider == "ollama":
         return await _chat_ollama(request, full_prompt)
 
-    # CLI-based providers
+    # CLI-based providers — pipe prompt via stdin to avoid command line length limits
     model_used: Optional[str] = None
 
     if provider == "claude":
-        cmd_args = ["claude", "-p", full_prompt, "--output-format", "text"]
-        # Claude -p mode doesn't support model selection
+        cmd_args = ["claude", "-p", "--output-format", "text"]
     elif provider == "codex":
         cmd_args = ["codex", "exec"]
         if request.model:
             cmd_args.extend(["--model", request.model])
             model_used = request.model
-        cmd_args.extend(["--", full_prompt])
     else:  # gemini default
-        cmd_args = ["gemini"]
+        cmd_args = ["gemini", "-p", ""]
         if request.model:
             cmd_args.extend(["-m", request.model])
             model_used = request.model
-        cmd_args.extend(["-p", full_prompt])
 
     # On Windows, npm global binaries are .cmd — resolve to full path
     if sys.platform == "win32":
         resolved = shutil.which(cmd_args[0])
         if resolved:
             cmd_args[0] = resolved
@@ -118,11 +115,14 @@
     try:
         proc = await asyncio.create_subprocess_exec(
             *cmd_args,
+            stdin=asyncio.subprocess.PIPE,
             stdout=asyncio.subprocess.PIPE,
             stderr=asyncio.subprocess.PIPE,
         )
         try:
-            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=120)
+            stdout, stderr = await asyncio.wait_for(
+                proc.communicate(input=full_prompt.encode()), timeout=120,
+            )
         except asyncio.TimeoutError:
             proc.kill()
             await proc.wait()