diff --git a/.agents/skills/nemoclaw-user-configure-inference/SKILL.md b/.agents/skills/nemoclaw-user-configure-inference/SKILL.md index ffc9ea1917..22f79a7c53 100644 --- a/.agents/skills/nemoclaw-user-configure-inference/SKILL.md +++ b/.agents/skills/nemoclaw-user-configure-inference/SKILL.md @@ -128,6 +128,35 @@ $ openshell inference set --provider compatible-anthropic-endpoint --model **Note:** `NEMOCLAW_INFERENCE_API_OVERRIDE` patches the config at container startup but +> does not update the Dockerfile ARG baked into the image. +> If you recreate the sandbox without the override env var, the image reverts to +> the original API path. +> A fresh `nemoclaw onboard` is the reliable fix because it updates both the +> session and the baked image. + ## Step 2: Cross-Provider Switching Switching to a different provider family (for example, from NVIDIA Endpoints to Anthropic) requires updating both the gateway route and the sandbox config. @@ -147,7 +176,7 @@ $ nemoclaw onboard --resume --recreate-sandbox ``` The entrypoint patches `openclaw.json` at container startup with the override values. -No image rebuild is needed. +You do not need to rebuild the image. Remove the env vars and recreate the sandbox to revert to the original model. `NEMOCLAW_INFERENCE_API_OVERRIDE` accepts `openai-completions` (for NVIDIA, OpenAI, Gemini, compatible endpoints) or `anthropic-messages` (for Anthropic and Anthropic-compatible endpoints). @@ -281,6 +310,33 @@ $ NEMOCLAW_PROVIDER=custom \ | `NEMOCLAW_MODEL` | Model ID as reported by the server. | | `COMPATIBLE_API_KEY` | API key for the endpoint. Use any non-empty value if authentication is not required. | +### Forcing Chat Completions API + +Some OpenAI-compatible servers (such as SGLang) expose `/v1/responses` but do +not emit the granular streaming events that OpenClaw requires. +NemoClaw tests streaming events during onboarding and falls back to +`/v1/chat/completions` automatically when it detects incomplete streaming. 
+ +If you need to bypass the `/v1/responses` probe entirely, set +`NEMOCLAW_PREFERRED_API` before running `nemoclaw onboard`: + +```console +$ NEMOCLAW_PREFERRED_API=openai-completions nemoclaw onboard +``` + +When this variable is set, the wizard skips the `/v1/responses` probe and uses +`/v1/chat/completions` directly. +You can use it in both interactive and non-interactive mode. + +| Variable | Values | Default | +|---|---|---| +| `NEMOCLAW_PREFERRED_API` | `openai-completions`, `chat-completions` | unset (auto-detect) | + +If you have already onboarded and the sandbox is failing at runtime, re-run +`nemoclaw onboard` to re-probe the endpoint and bake the correct API path +into the image. +Refer to Switch Inference Models (see the `nemoclaw-user-configure-inference` skill) for details. + ## Step 7: Anthropic-Compatible Server If your local server implements the Anthropic Messages API (`/v1/messages`), choose **Other Anthropic-compatible endpoint** during onboarding instead. diff --git a/.agents/skills/nemoclaw-user-reference/references/troubleshooting.md b/.agents/skills/nemoclaw-user-reference/references/troubleshooting.md index 333ed815e3..3304cb082c 100644 --- a/.agents/skills/nemoclaw-user-reference/references/troubleshooting.md +++ b/.agents/skills/nemoclaw-user-reference/references/troubleshooting.md @@ -256,6 +256,33 @@ $ export NEMOCLAW_LOCAL_INFERENCE_TIMEOUT=300 $ nemoclaw onboard ``` +### Agent fails at runtime after onboarding succeeds with a compatible endpoint + +Some OpenAI-compatible servers (such as SGLang) expose `/v1/responses` and pass +the onboarding validation probe, but their streaming mode is incomplete. +OpenClaw requires granular streaming events such as `response.output_text.delta` +that these backends do not emit. + +NemoClaw now tests streaming events during the `/v1/responses` probe and falls +back to `/v1/chat/completions` automatically when the required events are missing. 
+If you onboarded before this check was added, re-run onboarding so the wizard +re-probes the endpoint and bakes the correct API path into the image: + +```console +$ nemoclaw onboard +``` + +To force `/v1/chat/completions` without re-probing, set `NEMOCLAW_PREFERRED_API`: + +```console +$ NEMOCLAW_PREFERRED_API=openai-completions nemoclaw onboard +``` + +Do not rely on `NEMOCLAW_INFERENCE_API_OVERRIDE` alone — it patches the config +at container startup but does not update the Dockerfile ARG baked into the +image. +A fresh `nemoclaw onboard` is the reliable fix. + ### `NEMOCLAW_DISABLE_DEVICE_AUTH=1` does not change an existing sandbox This is expected behavior. diff --git a/docs/inference/switch-inference-providers.md b/docs/inference/switch-inference-providers.md index cda2e7a3cd..2ac5df5ad3 100644 --- a/docs/inference/switch-inference-providers.md +++ b/docs/inference/switch-inference-providers.md @@ -73,6 +73,37 @@ $ openshell inference set --provider compatible-anthropic-endpoint --model { expect(result.stderr).toContain("spawn ENOENT"); }); }); + +describe("runStreamingEventProbe", () => { + /** Helper to build a spawnSyncImpl that writes SSE content to the -o file. 
*/ + function mockStreaming(sseBody: string, exitCode = 0) { + return (_command: string, args: readonly string[]) => { + const oIdx = args.indexOf("-o"); + if (oIdx !== -1) { + const outputPath = args[oIdx + 1] as string; + fs.writeFileSync(outputPath, sseBody); + } + return { + pid: 1, + output: [], + stdout: "", + stderr: "", + status: exitCode, + signal: null, + }; + }; + } + + it("passes when all required streaming events are present", () => { + const sseBody = [ + "event: response.created", + 'data: {"id":"resp_1"}', + "", + "event: response.in_progress", + 'data: {"id":"resp_1"}', + "", + "event: response.output_item.added", + 'data: {"id":"resp_1"}', + "", + "event: response.content_part.added", + 'data: {"id":"resp_1"}', + "", + "event: response.output_text.delta", + 'data: {"delta":"OK"}', + "", + "event: response.output_text.done", + 'data: {"text":"OK"}', + "", + "event: response.content_part.done", + 'data: {"id":"resp_1"}', + "", + "event: response.completed", + 'data: {"id":"resp_1"}', + "", + ].join("\n"); + + const result = runStreamingEventProbe( + ["-sS", "--max-time", "15", "https://example.test/v1/responses"], + { spawnSyncImpl: mockStreaming(sseBody) }, + ); + + expect(result.ok).toBe(true); + expect(result.missingEvents).toEqual([]); + }); + + it("fails when only basic lifecycle events are present (SGLang-like)", () => { + const sseBody = [ + "event: response.created", + 'data: {"id":"resp_1"}', + "", + "event: response.in_progress", + 'data: {"id":"resp_1"}', + "", + "event: response.completed", + 'data: {"id":"resp_1","text":"OK"}', + "", + ].join("\n"); + + const result = runStreamingEventProbe( + ["-sS", "--max-time", "15", "https://example.test/v1/responses"], + { spawnSyncImpl: mockStreaming(sseBody) }, + ); + + expect(result.ok).toBe(false); + expect(result.missingEvents).toContain("response.output_text.delta"); + expect(result.message).toContain("response.output_text.delta"); + }); + + it("still passes if curl exits with 28 (timeout) 
but events were captured", () => { + const sseBody = [ + "event: response.created", + 'data: {"id":"resp_1"}', + "", + "event: response.output_text.delta", + 'data: {"delta":"O"}', + "", + ].join("\n"); + + const result = runStreamingEventProbe( + ["-sS", "--max-time", "15", "https://example.test/v1/responses"], + { spawnSyncImpl: mockStreaming(sseBody, 28) }, + ); + + expect(result.ok).toBe(true); + expect(result.missingEvents).toEqual([]); + }); + + it("fails on spawn error", () => { + const result = runStreamingEventProbe( + ["-sS", "https://example.test/v1/responses"], + { + spawnSyncImpl: () => { + const error = Object.assign(new Error("spawn ENOENT"), { code: "ENOENT" }); + return { + pid: 1, + output: [], + stdout: "", + stderr: "", + status: null, + signal: null, + error, + }; + }, + }, + ); + + expect(result.ok).toBe(false); + expect(result.message).toContain("Streaming probe failed"); + }); + + it("cleans up temp files after probe", () => { + let outputPath = ""; + runStreamingEventProbe( + ["-sS", "--max-time", "15", "https://example.test/v1/responses"], + { + spawnSyncImpl: (_command, args) => { + const oIdx = args.indexOf("-o"); + if (oIdx !== -1) { + outputPath = args[oIdx + 1] as string; + fs.writeFileSync(outputPath, "event: response.output_text.delta\ndata: {}\n"); + } + return { + pid: 1, + output: [], + stdout: "", + stderr: "", + status: 0, + signal: null, + }; + }, + }, + ); + + expect(outputPath).not.toBe(""); + expect(fs.existsSync(outputPath)).toBe(false); + expect(fs.existsSync(path.dirname(outputPath))).toBe(false); + }); +}); diff --git a/src/lib/http-probe.ts b/src/lib/http-probe.ts index 9883c1f78a..983ccd9fe3 100644 --- a/src/lib/http-probe.ts +++ b/src/lib/http-probe.ts @@ -26,6 +26,12 @@ export interface CurlProbeOptions { ) => SpawnSyncReturns; } +export interface StreamingProbeResult { + ok: boolean; + missingEvents: string[]; + message: string; +} + function secureTempFile(prefix: string, ext = ""): string { const dir = 
/**
 * The minimum set of streaming events that OpenClaw requires from a
 * `/v1/responses` endpoint. Backends that only emit the top-level lifecycle
 * events (created / in_progress / completed) will cause runtime failures
 * because OpenClaw never receives the incremental content deltas.
 *
 * Declared readonly so the shared list cannot be mutated through a probe
 * result; results always carry their own array.
 */
const REQUIRED_STREAMING_EVENTS: readonly string[] = ["response.output_text.delta"];

/**
 * Send a streaming request to a `/v1/responses`-style endpoint and verify
 * that the SSE event stream includes the granular events OpenClaw needs.
 *
 * This catches backends like SGLang that return valid non-streaming
 * responses but emit only `response.created`, `response.in_progress`, and
 * `response.completed` in streaming mode — missing the content deltas that
 * OpenClaw relies on.
 *
 * The result distinguishes three outcomes:
 * - `ok: true`, `missingEvents: []` — every required event was seen.
 * - `ok: false`, non-empty `missingEvents` — curl ran (or hit its own
 *   timeout) but the captured stream lacks the listed events.
 * - `ok: false`, `missingEvents: []` — the probe itself could not run
 *   (spawn error, non-timeout curl failure, or an exception while reading
 *   the output). Callers that branch on `missingEvents.length` can therefore
 *   tell "backend lacks streaming events" apart from "probe failed".
 *
 * @param argv curl arguments; the LAST element is taken as the request URL
 *   and re-appended after the `-N -o <tempfile>` flags this function adds.
 * @param opts optional `cwd`, extra `env`, and a `spawnSyncImpl` override.
 * @returns the probe verdict; `message` is empty on success.
 */
export function runStreamingEventProbe(
  argv: string[],
  opts: CurlProbeOptions = {},
): StreamingProbeResult {
  const bodyFile = secureTempFile("nemoclaw-streaming-probe", ".sse");
  try {
    const args = [...argv];
    const url = args.pop();
    const spawnSyncImpl = opts.spawnSyncImpl ?? spawnSync;
    // -N disables curl's output buffering so events land in the file as they
    // arrive; -o redirects the SSE body to the temp file for parsing below.
    const result = spawnSyncImpl(
      "curl",
      [...args, "-N", "-o", bodyFile, String(url || "")],
      {
        cwd: opts.cwd ?? ROOT,
        encoding: "utf8",
        timeout: 30_000,
        env: {
          ...process.env,
          ...opts.env,
        },
      },
    );

    // curl exit 28 = timeout, which is expected — we cap with --max-time
    // and may still have collected enough events before the timeout.
    const curlFailed = result.status !== null && result.status !== 0 && result.status !== 28;
    if (result.error || curlFailed) {
      const detail = result.error
        ? String((result.error as Error).message || result.error)
        : String(result.stderr || "");
      return {
        ok: false,
        // Nothing is known about the stream when the probe could not run, so
        // no events are reported as missing. Reporting the required list here
        // would make this failure indistinguishable from an incomplete stream.
        missingEvents: [],
        message: `Streaming probe failed: ${compactText(detail).slice(0, 200)}`,
      };
    }

    const body = fs.existsSync(bodyFile) ? fs.readFileSync(bodyFile, "utf8") : "";

    // Parse SSE event types from the raw output.
    // Each event line looks like: "event: response.output_text.delta"
    const eventLine = /^event:\s*(.+)$/i;
    const eventTypes = new Set<string>();
    for (const line of body.split("\n")) {
      // trim() also drops the "\r" left behind by CRLF-terminated streams.
      const match = eventLine.exec(line.trim());
      if (match) {
        eventTypes.add(match[1].trim());
      }
    }

    const missing = REQUIRED_STREAMING_EVENTS.filter((e) => !eventTypes.has(e));
    if (missing.length > 0) {
      return {
        ok: false,
        missingEvents: missing,
        message:
          `Responses API streaming is missing required events: ${missing.join(", ")}. ` +
          "Falling back to chat completions API.",
      };
    }

    return { ok: true, missingEvents: [], message: "" };
  } catch (error) {
    // Unexpected execution failure (e.g. the output file could not be read):
    // reported like a transport failure, with no events listed as missing.
    const detail = error instanceof Error ? error.message : String(error);
    return {
      ok: false,
      missingEvents: [],
      message: `Streaming probe error: ${detail}`,
    };
  } finally {
    // Always remove the temp file and its private directory.
    cleanupTempDir(bodyFile, "nemoclaw-streaming-probe");
  }
}
probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options = {}) { for (const probe of probes) { const result = probe.execute(); if (result.ok) { + // Streaming event validation — catch backends like SGLang that return + // valid non-streaming responses but emit incomplete SSE events in + // streaming mode. Only run for /responses probes on custom endpoints + // where probeStreaming was requested. + if (probe.api === "openai-responses" && options.probeStreaming === true) { + const streamResult = runStreamingEventProbe([ + "-sS", + ...getValidationProbeCurlArgs(), + "-H", + "Content-Type: application/json", + ...(apiKey ? ["-H", `Authorization: Bearer ${normalizeCredentialValue(apiKey)}`] : []), + "-d", + JSON.stringify({ + model, + input: "Reply with exactly: OK", + stream: true, + }), + `${String(endpointUrl).replace(/\/+$/, "")}/responses`, + ]); + if (!streamResult.ok && streamResult.missingEvents.length > 0) { + // Backend responds but lacks required streaming events — fall back + // to /chat/completions silently. + console.log(` ℹ ${streamResult.message}`); + failures.push({ + name: probe.name + " (streaming)", + httpStatus: 0, + curlStatus: 0, + message: streamResult.message, + body: "", + }); + continue; + } + if (!streamResult.ok) { + // Transport or execution failure — surface as a hard error instead + // of silently switching APIs. 
+ return { + ok: false, + message: `${probe.name} (streaming): ${streamResult.message}`, + failures: [ + { + name: probe.name + " (streaming)", + httpStatus: 0, + curlStatus: 0, + message: streamResult.message, + body: "", + }, + ], + }; + } + } return { ok: true, api: probe.api, label: probe.name }; } // Preserve the raw response body alongside the summarized message so the @@ -1336,6 +1387,8 @@ async function validateCustomOpenAiLikeSelection( const apiKey = getCredential(credentialEnv); const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, { requireResponsesToolCalling: true, + skipResponsesProbe: shouldForceCompletionsApi(process.env.NEMOCLAW_PREFERRED_API), + probeStreaming: true, }); if (probe.ok) { console.log(` ${probe.label} available — OpenClaw will use ${probe.api}.`); diff --git a/src/lib/validation.test.ts b/src/lib/validation.test.ts index 7aa135ba27..e1b9774b88 100644 --- a/src/lib/validation.test.ts +++ b/src/lib/validation.test.ts @@ -12,6 +12,7 @@ import { isNvcfFunctionNotFoundForAccount, nvcfFunctionNotFoundMessage, shouldSkipResponsesProbe, + shouldForceCompletionsApi, } from "../../dist/lib/validation"; describe("classifyValidationFailure", () => { @@ -222,3 +223,29 @@ describe("shouldSkipResponsesProbe", () => { expect(shouldSkipResponsesProbe("")).toBe(false); }); }); + +describe("shouldForceCompletionsApi", () => { + it("returns true when passed openai-completions", () => { + expect(shouldForceCompletionsApi("openai-completions")).toBe(true); + }); + + it("returns true for the chat-completions alias", () => { + expect(shouldForceCompletionsApi("chat-completions")).toBe(true); + }); + + it("is case-insensitive", () => { + expect(shouldForceCompletionsApi("OpenAI-Completions")).toBe(true); + }); + + it("returns false when undefined", () => { + expect(shouldForceCompletionsApi(undefined)).toBe(false); + }); + + it("returns false for openai-responses", () => { + expect(shouldForceCompletionsApi("openai-responses")).toBe(false); + 
/**
 * Report whether a preferred-API setting explicitly selects the chat
 * completions path.
 *
 * Intended to be fed the value of `NEMOCLAW_PREFERRED_API` (or an equivalent
 * setting from any other source). The comparison ignores surrounding
 * whitespace and letter case. A missing or empty value never forces the
 * completions path.
 *
 * @param preferredApi raw setting value; may be omitted.
 * @returns `true` only for `openai-completions` or its `chat-completions`
 *   alias.
 */
export function shouldForceCompletionsApi(preferredApi?: string): boolean {
  const normalized = (preferredApi || "").trim().toLowerCase();
  switch (normalized) {
    case "openai-completions":
    case "chat-completions":
      return true;
    default:
      return false;
  }
}