From db3d6f85a05daf981945239faf43917d2321b41e Mon Sep 17 00:00:00 2001 From: Jason Warta Date: Mon, 13 Apr 2026 16:02:41 -0800 Subject: [PATCH 1/3] feat: add LM Studio provider using native /api/v1 API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New provider talks to LM Studio's native REST endpoints (/api/v1/models and /api/v1/chat) rather than the OpenAI-compat surface, so we surface local-inference detail (tokens/sec, ttft, model load time) at debug level for diagnosing perf on your own hardware. Auto-registers when LM Studio is reachable. Defaults to http://localhost:1234, override with LMSTUDIO_URL for LAN use. Filters embedding models via the `type` field from /api/v1/models. JIT-loading is delegated to LM Studio — not-loaded models load on first query. Route explicitly with the `lmstudio/` prefix. Known limitation: /api/v1/chat's message-array input shape isn't documented on the probed instance, so system_prompt is prepended to the single-string input form. Single-turn prompts (all HydraMCP tools today) work correctly. Co-Authored-By: Claude Opus 4.6 (1M context) --- .env.example | 6 ++ README.md | 10 ++- src/index.ts | 10 +++ src/providers/lmstudio.ts | 175 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 src/providers/lmstudio.ts diff --git a/.env.example b/.env.example index 4c74913..c7ddd61 100644 --- a/.env.example +++ b/.env.example @@ -7,5 +7,11 @@ CLIPROXYAPI_KEY=sk-cliproxy-local-001 # Ollama backend (local models) OLLAMA_URL=http://localhost:11434 +# LM Studio backend (local or LAN) +# Defaults to http://localhost:1234 if unset. Override to point at a +# machine running LM Studio on the network, e.g. 
http://192.168.40.10:1234 +# LMSTUDIO_URL=http://localhost:1234 +# LMSTUDIO_API_KEY= + # Logging level: debug | info | warn | error HYDRA_LOG_LEVEL=info diff --git a/README.md b/README.md index fb29a09..ac0419f 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,7 @@ Claude Code |-- Anthropic -> api.anthropic.com (API key) |-- Sub -> CLI tools (Gemini CLI, Claude Code, Codex CLI) |-- Ollama -> local models (your hardware) + |-- LM Studio -> local or LAN LM Studio server ``` ## Three Ways to Connect Models @@ -127,6 +128,12 @@ Install [Ollama](https://ollama.com), pull a model, done. Auto-detected. ollama pull qwen2.5-coder:14b ``` +Or run [LM Studio](https://lmstudio.ai), load a model, and start its server. Auto-detected on `localhost:1234`. Point at a different host with: + +```bash +LMSTUDIO_URL=http://192.168.40.10:1234 +``` + ### Mix and Match All three methods stack. Use API keys for some providers, subscriptions for others, and Ollama for local. They all show up in `list_models` together. @@ -136,6 +143,7 @@ Route explicitly with prefixes: - `google/gemini-2.5-flash` — force Google API - `sub/gemini-2.5-flash` — force subscription CLI - `ollama/qwen2.5-coder:14b` — force local +- `lmstudio/` — force LM Studio - `gpt-5` — auto-detect (tries each provider) ## Setup Details @@ -185,7 +193,7 @@ interface Provider { See `src/providers/ollama.ts` for a working example. Implement it, register in `src/index.ts`, done. -Providers we'd love to see: LM Studio, OpenRouter, Groq, Together AI, or anything that speaks HTTP. +Providers we'd love to see: OpenRouter, Groq, Together AI, or anything that speaks HTTP. ## License diff --git a/src/index.ts b/src/index.ts index bdc0d45..01e2037 100644 --- a/src/index.ts +++ b/src/index.ts @@ -17,6 +17,7 @@ * * Local models: * OLLAMA_URL → Ollama local models (auto-detected) + * LMSTUDIO_URL → LM Studio (defaults to http://localhost:1234) * * Set any combination. HydraMCP registers what's available. 
* @@ -27,6 +28,7 @@ * "sub/gemini-2.5-flash" → Gemini CLI subscription * "sub/claude-..." → Claude CLI subscription * "ollama/llama3" → local Ollama instance + * "lmstudio/" → local/LAN LM Studio instance * "gpt-4o" → auto-detect (tries each provider) */ @@ -36,6 +38,7 @@ import { GoogleProvider } from "./providers/google.js"; import { AnthropicProvider } from "./providers/anthropic.js"; import { SubscriptionProvider } from "./providers/subscription.js"; import { OllamaProvider } from "./providers/ollama.js"; +import { LMStudioProvider } from "./providers/lmstudio.js"; import { MultiProvider } from "./providers/multi-provider.js"; import { SmartProvider } from "./orchestrator/index.js"; import { createServer } from "./server.js"; @@ -90,6 +93,12 @@ async function main() { active.push("Ollama"); } + const lmstudio = new LMStudioProvider(); + if (await lmstudio.healthCheck()) { + multi.register("lmstudio", lmstudio); + active.push("LM Studio"); + } + // --- Startup summary --- if (active.length === 0) { @@ -108,6 +117,7 @@ async function main() { "\n" + " Local models:\n" + " Install Ollama → ollama pull llama3\n" + + " LM Studio → LMSTUDIO_URL=http://host:1234 (default: localhost:1234)\n" + "\n" + "HydraMCP will start anyway and retry on first request." ); diff --git a/src/providers/lmstudio.ts b/src/providers/lmstudio.ts new file mode 100644 index 0000000..5c6b39f --- /dev/null +++ b/src/providers/lmstudio.ts @@ -0,0 +1,175 @@ +/** + * LM Studio Backend — uses LM Studio's native REST API at /api/v1/*. + * + * Endpoints used: + * GET /api/v1/models — rich model metadata (type, capabilities, loaded + * instances with their runtime ctx, max ctx) + * POST /api/v1/chat — inference; JIT-loads the model if not loaded + * + * This is the LM Studio native API — distinct from their OpenAI-compat + * surface at /v1/*. It returns local-inference detail the OpenAI shape + * strips (tokens/sec, time-to-first-token, load time, model instance id). 
+ * + * Known limitation — system prompts: + * /api/v1/chat accepts `input` as either a string or a message array, + * but the message-array content-part discriminator is not documented + * on my probed instance and the exact shape couldn't be determined + * from error messages. Until that's pinned down we use string `input` + * and prepend the system prompt as a framed prefix. Works correctly + * for single-turn prompts, which is all HydraMCP's tools currently do. + * + * Known limitation — context size: + * We report whatever LM Studio has configured. If a model is loaded, + * we see `loaded_instances[0].config.context_length`. If it's not, + * /api/v1/chat JIT-loads it with whatever default LM Studio was last + * set to for that model. To run at larger context, bump it in the + * LM Studio UI — we don't force a reload here. + * + * Default endpoint: http://localhost:1234 + * Override with LMSTUDIO_URL (e.g. http://192.168.40.10:1234 when + * LM Studio runs on another machine on the LAN). + * + * Optional LMSTUDIO_API_KEY — sent as Bearer token if set. Only needed + * if LM Studio is behind a reverse proxy that enforces auth. + */ + +import { Provider, ModelInfo, QueryOptions, QueryResponse } from "./provider.js"; +import { logger } from "../utils/logger.js"; + +interface LMStudioModel { + key: string; + type?: string; + display_name?: string; + max_context_length?: number; + loaded_instances?: Array<{ + id: string; + config?: { context_length?: number }; + }>; +} + +export class LMStudioProvider implements Provider { + name = "LM Studio"; + private baseUrl: string; + private apiKey: string; + + constructor(baseUrl?: string, apiKey?: string) { + this.baseUrl = baseUrl ?? process.env.LMSTUDIO_URL ?? "http://localhost:1234"; + this.apiKey = apiKey ?? process.env.LMSTUDIO_API_KEY ?? 
"";
+  }
+
+  private headers(): Record<string, string> {
+    const h: Record<string, string> = { "Content-Type": "application/json" };
+    if (this.apiKey) h["Authorization"] = `Bearer ${this.apiKey}`;
+    return h;
+  }
+
+  async healthCheck(): Promise<boolean> {
+    try {
+      const res = await fetch(`${this.baseUrl}/api/v1/models`, {
+        headers: this.headers(),
+      });
+      return res.ok;
+    } catch {
+      return false;
+    }
+  }
+
+  async listModels(): Promise<ModelInfo[]> {
+    const res = await fetch(`${this.baseUrl}/api/v1/models`, {
+      headers: this.headers(),
+    });
+    if (!res.ok) {
+      throw new Error(`LM Studio: failed to list models (${res.status})`);
+    }
+
+    const data = (await res.json()) as { models?: LMStudioModel[] };
+
+    // Only chat-capable types. /api/v1/models uses `embedding` (singular)
+    // for embedding models; exclude them and anything else non-chat.
+    return (data.models ?? [])
+      .filter((m) => m.type === "llm" || m.type === "vlm")
+      .map((m) => ({ id: m.key, name: m.display_name ?? m.key, provider: "lmstudio" }));
+  }
+
+  async query(
+    model: string,
+    prompt: string,
+    options?: QueryOptions
+  ): Promise<QueryResponse> {
+    const startTime = Date.now();
+
+    // See "Known limitation — system prompts" in the file header.
+    const input = options?.system_prompt
+      ? 
`[SYSTEM]\n${options.system_prompt}\n\n[USER]\n${prompt}`
+      : prompt;
+
+    const body: Record<string, unknown> = { model, input };
+    if (options?.temperature !== undefined) body.temperature = options.temperature;
+    if (options?.max_tokens !== undefined) body.max_output_tokens = options.max_tokens;
+
+    const res = await fetch(`${this.baseUrl}/api/v1/chat`, {
+      method: "POST",
+      headers: this.headers(),
+      body: JSON.stringify(body),
+    });
+
+    if (!res.ok) {
+      const errorText = await res.text();
+      throw new Error(`LM Studio query failed (${res.status}): ${errorText}`);
+    }
+
+    const data = (await res.json()) as {
+      model_instance_id?: string;
+      output?: Array<{ type?: string; content?: string }>;
+      stats?: {
+        input_tokens?: number;
+        total_output_tokens?: number;
+        reasoning_output_tokens?: number;
+        tokens_per_second?: number;
+        time_to_first_token_seconds?: number;
+        model_load_time_seconds?: number;
+        stop_reason?: string;
+      };
+      response_id?: string;
+    };
+
+    const latency_ms = Date.now() - startTime;
+
+    // /api/v1/chat returns output[] where type="message" holds the assistant
+    // content. Reasoning models may also emit type="reasoning" items we skip.
+    const message = data.output?.find((o) => o.type === "message" || o.type === undefined);
+    const content = message?.content ?? "";
+
+    const prompt_tokens = data.stats?.input_tokens ?? 0;
+    const completion_tokens = data.stats?.total_output_tokens ?? 0;
+
+    // Surface LM Studio-only fields via debug log. These don't belong in
+    // QueryResponse (which is provider-agnostic) but help diagnose local
+    // inference performance.
+ if (data.stats) { + const parts: string[] = [`lmstudio ${model}`]; + if (data.stats.tokens_per_second !== undefined) { + parts.push(`${data.stats.tokens_per_second.toFixed(1)} tok/s`); + } + if (data.stats.time_to_first_token_seconds !== undefined) { + parts.push(`ttft ${(data.stats.time_to_first_token_seconds * 1000).toFixed(0)}ms`); + } + if (data.stats.model_load_time_seconds !== undefined) { + parts.push(`load ${(data.stats.model_load_time_seconds * 1000).toFixed(0)}ms`); + } + logger.debug(parts.join(" | ")); + } + + return { + model, + content, + usage: { + prompt_tokens, + completion_tokens, + total_tokens: prompt_tokens + completion_tokens, + }, + latency_ms, + finish_reason: data.stats?.stop_reason ?? "stop", + }; + } +} From 1f63744791a03683d7bd0345548a9ede137a8b84 Mon Sep 17 00:00:00 2001 From: Jason Warta Date: Mon, 13 Apr 2026 16:14:10 -0800 Subject: [PATCH 2/3] fix: register local providers unconditionally so they recover at runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously Ollama and LM Studio were only registered if a one-shot boot-time health check succeeded. If the local server happened to be down or unreachable at startup, the provider was silently dropped for the entire lifetime of the MCP process — `/mcp` reconnect would not fix it; only a full Claude Code restart or `claude mcp remove && add` would. This surprised at least one user who had to remove+re-add to recover after waking a LAN LM Studio machine. Local servers restart independently of the MCP process, so gating registration on a boot-time check is the wrong shape. This aligns them with cloud providers, which register based on env-var presence alone. Now listModels and query reach out live on each tool call (backed by the existing 30s model-list cache) and Promise.allSettled in MultiProvider means unreachable providers just contribute no models. 
To keep `list_models` snappy when a provider is down, add a 3s AbortController timeout to LM Studio's healthCheck and listModels. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/index.ts | 26 ++++++++++++++------------ src/providers/lmstudio.ts | 35 +++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/src/index.ts b/src/index.ts index 01e2037..ae3bd0c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -86,18 +86,20 @@ async function main() { } // --- Local models --- - - const ollama = new OllamaProvider(); - if (await ollama.healthCheck()) { - multi.register("ollama", ollama); - active.push("Ollama"); - } - - const lmstudio = new LMStudioProvider(); - if (await lmstudio.healthCheck()) { - multi.register("lmstudio", lmstudio); - active.push("LM Studio"); - } + // + // Register unconditionally. Local servers (Ollama, LM Studio) can start or + // restart independently of the MCP process, so we don't gate registration + // on a one-shot boot-time health check — that used to silently drop a + // provider for the life of the process if it happened to be down during + // startup. Instead, listModels/query reach out live on each tool call + // (backed by the 30s model-list cache), and listModels uses + // Promise.allSettled so unreachable providers just contribute no models. + + multi.register("ollama", new OllamaProvider()); + active.push("Ollama"); + + multi.register("lmstudio", new LMStudioProvider()); + active.push("LM Studio"); // --- Startup summary --- diff --git a/src/providers/lmstudio.ts b/src/providers/lmstudio.ts index 5c6b39f..2612302 100644 --- a/src/providers/lmstudio.ts +++ b/src/providers/lmstudio.ts @@ -63,11 +63,32 @@ export class LMStudioProvider implements Provider { return h; } + /** + * Fetch with a hard timeout so a dead/unreachable LM Studio can't stall + * tool calls. list_models waits on this inside Promise.allSettled alongside + * other providers, so we want it to fail fast. 
+   */
+  private async fetchWithTimeout(
+    url: string,
+    init: RequestInit,
+    timeoutMs: number
+  ): Promise<Response> {
+    const ctrl = new AbortController();
+    const timer = setTimeout(() => ctrl.abort(), timeoutMs);
+    try {
+      return await fetch(url, { ...init, signal: ctrl.signal });
+    } finally {
+      clearTimeout(timer);
+    }
+  }
+
   async healthCheck(): Promise<boolean> {
     try {
-      const res = await fetch(`${this.baseUrl}/api/v1/models`, {
-        headers: this.headers(),
-      });
+      const res = await this.fetchWithTimeout(
+        `${this.baseUrl}/api/v1/models`,
+        { headers: this.headers() },
+        3_000
+      );
       return res.ok;
     } catch {
       return false;
@@ -75,9 +96,11 @@
   }
 
   async listModels(): Promise<ModelInfo[]> {
-    const res = await fetch(`${this.baseUrl}/api/v1/models`, {
-      headers: this.headers(),
-    });
+    const res = await this.fetchWithTimeout(
+      `${this.baseUrl}/api/v1/models`,
+      { headers: this.headers() },
+      3_000
+    );
     if (!res.ok) {
       throw new Error(`LM Studio: failed to list models (${res.status})`);
     }

From bf1e8e0367683d8e5f17547fd6016df32f63140b Mon Sep 17 00:00:00 2001
From: Jason Warta
Date: Mon, 13 Apr 2026 16:29:17 -0800
Subject: [PATCH 3/3] docs: show claude mcp add -e form for LMSTUDIO_URL

The existing snippet only showed the bare shell env var, but the
canonical way to pass config to a Claude Code MCP server is via
`claude mcp add -e` (matches the Quick Start example at the top of the
README). Document both so readers see the integration form first and
the raw-env form as an alternative.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 README.md | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ac0419f..6c825de 100644
--- a/README.md
+++ b/README.md
@@ -128,7 +128,15 @@ Install [Ollama](https://ollama.com), pull a model, done. Auto-detected.
 ollama pull qwen2.5-coder:14b
 ```
 
-Or run [LM Studio](https://lmstudio.ai), load a model, and start its server. Auto-detected on `localhost:1234`. 
Point at a different host with: +Or run [LM Studio](https://lmstudio.ai), load a model, and start its server. Auto-detected on `localhost:1234`. If LM Studio is on another machine, point HydraMCP at it. Either pass it to Claude Code when registering the server: + +```bash +claude mcp add hydramcp \ + -e LMSTUDIO_URL=http://192.168.40.10:1234 \ + -- npx hydramcp +``` + +Or set it in your shell / `~/.hydramcp/.env` before starting HydraMCP: ```bash LMSTUDIO_URL=http://192.168.40.10:1234