From db3d6f85a05daf981945239faf43917d2321b41e Mon Sep 17 00:00:00 2001 From: Jason Warta Date: Mon, 13 Apr 2026 16:02:41 -0800 Subject: [PATCH 1/3] feat: add LM Studio provider using native /api/v1 API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New provider talks to LM Studio's native REST endpoints (/api/v1/models and /api/v1/chat) rather than the OpenAI-compat surface, so we surface local-inference detail (tokens/sec, ttft, model load time) at debug level for diagnosing perf on your own hardware. Auto-registers when LM Studio is reachable. Defaults to http://localhost:1234, override with LMSTUDIO_URL for LAN use. Filters embedding models via the `type` field from /api/v1/models. JIT-loading is delegated to LM Studio — not-loaded models load on first query. Route explicitly with the `lmstudio/` prefix. Known limitation: /api/v1/chat's message-array input shape isn't documented on the probed instance, so system_prompt is prepended to the single-string input form. Single-turn prompts (all HydraMCP tools today) work correctly. Co-Authored-By: Claude Opus 4.6 (1M context) --- .env.example | 6 ++ README.md | 10 ++- src/index.ts | 10 +++ src/providers/lmstudio.ts | 175 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 src/providers/lmstudio.ts diff --git a/.env.example b/.env.example index 4c74913..c7ddd61 100644 --- a/.env.example +++ b/.env.example @@ -7,5 +7,11 @@ CLIPROXYAPI_KEY=sk-cliproxy-local-001 # Ollama backend (local models) OLLAMA_URL=http://localhost:11434 +# LM Studio backend (local or LAN) +# Defaults to http://localhost:1234 if unset. Override to point at a +# machine running LM Studio on the network, e.g. 
http://192.168.40.10:1234 +# LMSTUDIO_URL=http://localhost:1234 +# LMSTUDIO_API_KEY= + # Logging level: debug | info | warn | error HYDRA_LOG_LEVEL=info diff --git a/README.md b/README.md index fb29a09..ac0419f 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,7 @@ Claude Code |-- Anthropic -> api.anthropic.com (API key) |-- Sub -> CLI tools (Gemini CLI, Claude Code, Codex CLI) |-- Ollama -> local models (your hardware) + |-- LM Studio -> local or LAN LM Studio server ``` ## Three Ways to Connect Models @@ -127,6 +128,12 @@ Install [Ollama](https://ollama.com), pull a model, done. Auto-detected. ollama pull qwen2.5-coder:14b ``` +Or run [LM Studio](https://lmstudio.ai), load a model, and start its server. Auto-detected on `localhost:1234`. Point at a different host with: + +```bash +LMSTUDIO_URL=http://192.168.40.10:1234 +``` + ### Mix and Match All three methods stack. Use API keys for some providers, subscriptions for others, and Ollama for local. They all show up in `list_models` together. @@ -136,6 +143,7 @@ Route explicitly with prefixes: - `google/gemini-2.5-flash` — force Google API - `sub/gemini-2.5-flash` — force subscription CLI - `ollama/qwen2.5-coder:14b` — force local +- `lmstudio/` — force LM Studio - `gpt-5` — auto-detect (tries each provider) ## Setup Details @@ -185,7 +193,7 @@ interface Provider { See `src/providers/ollama.ts` for a working example. Implement it, register in `src/index.ts`, done. -Providers we'd love to see: LM Studio, OpenRouter, Groq, Together AI, or anything that speaks HTTP. +Providers we'd love to see: OpenRouter, Groq, Together AI, or anything that speaks HTTP. ## License diff --git a/src/index.ts b/src/index.ts index bdc0d45..01e2037 100644 --- a/src/index.ts +++ b/src/index.ts @@ -17,6 +17,7 @@ * * Local models: * OLLAMA_URL → Ollama local models (auto-detected) + * LMSTUDIO_URL → LM Studio (defaults to http://localhost:1234) * * Set any combination. HydraMCP registers what's available. 
* @@ -27,6 +28,7 @@ * "sub/gemini-2.5-flash" → Gemini CLI subscription * "sub/claude-..." → Claude CLI subscription * "ollama/llama3" → local Ollama instance + * "lmstudio/" → local/LAN LM Studio instance * "gpt-4o" → auto-detect (tries each provider) */ @@ -36,6 +38,7 @@ import { GoogleProvider } from "./providers/google.js"; import { AnthropicProvider } from "./providers/anthropic.js"; import { SubscriptionProvider } from "./providers/subscription.js"; import { OllamaProvider } from "./providers/ollama.js"; +import { LMStudioProvider } from "./providers/lmstudio.js"; import { MultiProvider } from "./providers/multi-provider.js"; import { SmartProvider } from "./orchestrator/index.js"; import { createServer } from "./server.js"; @@ -90,6 +93,12 @@ async function main() { active.push("Ollama"); } + const lmstudio = new LMStudioProvider(); + if (await lmstudio.healthCheck()) { + multi.register("lmstudio", lmstudio); + active.push("LM Studio"); + } + // --- Startup summary --- if (active.length === 0) { @@ -108,6 +117,7 @@ async function main() { "\n" + " Local models:\n" + " Install Ollama → ollama pull llama3\n" + + " LM Studio → LMSTUDIO_URL=http://host:1234 (default: localhost:1234)\n" + "\n" + "HydraMCP will start anyway and retry on first request." ); diff --git a/src/providers/lmstudio.ts b/src/providers/lmstudio.ts new file mode 100644 index 0000000..5c6b39f --- /dev/null +++ b/src/providers/lmstudio.ts @@ -0,0 +1,175 @@ +/** + * LM Studio Backend — uses LM Studio's native REST API at /api/v1/*. + * + * Endpoints used: + * GET /api/v1/models — rich model metadata (type, capabilities, loaded + * instances with their runtime ctx, max ctx) + * POST /api/v1/chat — inference; JIT-loads the model if not loaded + * + * This is the LM Studio native API — distinct from their OpenAI-compat + * surface at /v1/*. It returns local-inference detail the OpenAI shape + * strips (tokens/sec, time-to-first-token, load time, model instance id). 
+ * + * Known limitation — system prompts: + * /api/v1/chat accepts `input` as either a string or a message array, + * but the message-array content-part discriminator is not documented + * on my probed instance and the exact shape couldn't be determined + * from error messages. Until that's pinned down we use string `input` + * and prepend the system prompt as a framed prefix. Works correctly + * for single-turn prompts, which is all HydraMCP's tools currently do. + * + * Known limitation — context size: + * We report whatever LM Studio has configured. If a model is loaded, + * we see `loaded_instances[0].config.context_length`. If it's not, + * /api/v1/chat JIT-loads it with whatever default LM Studio was last + * set to for that model. To run at larger context, bump it in the + * LM Studio UI — we don't force a reload here. + * + * Default endpoint: http://localhost:1234 + * Override with LMSTUDIO_URL (e.g. http://192.168.40.10:1234 when + * LM Studio runs on another machine on the LAN). + * + * Optional LMSTUDIO_API_KEY — sent as Bearer token if set. Only needed + * if LM Studio is behind a reverse proxy that enforces auth. + */ + +import { Provider, ModelInfo, QueryOptions, QueryResponse } from "./provider.js"; +import { logger } from "../utils/logger.js"; + +interface LMStudioModel { + key: string; + type?: string; + display_name?: string; + max_context_length?: number; + loaded_instances?: Array<{ + id: string; + config?: { context_length?: number }; + }>; +} + +export class LMStudioProvider implements Provider { + name = "LM Studio"; + private baseUrl: string; + private apiKey: string; + + constructor(baseUrl?: string, apiKey?: string) { + this.baseUrl = baseUrl ?? process.env.LMSTUDIO_URL ?? "http://localhost:1234"; + this.apiKey = apiKey ?? process.env.LMSTUDIO_API_KEY ?? 
"";
+  }
+
+  private headers(): Record<string, string> {
+    const h: Record<string, string> = { "Content-Type": "application/json" };
+    if (this.apiKey) h["Authorization"] = `Bearer ${this.apiKey}`;
+    return h;
+  }
+
+  async healthCheck(): Promise<boolean> {
+    try {
+      const res = await fetch(`${this.baseUrl}/api/v1/models`, {
+        headers: this.headers(),
+      });
+      return res.ok;
+    } catch {
+      return false;
+    }
+  }
+
+  async listModels(): Promise<ModelInfo[]> {
+    const res = await fetch(`${this.baseUrl}/api/v1/models`, {
+      headers: this.headers(),
+    });
+    if (!res.ok) {
+      throw new Error(`LM Studio: failed to list models (${res.status})`);
+    }
+
+    const data = (await res.json()) as { models?: LMStudioModel[] };
+
+    // Only chat-capable types. /api/v1/models uses `embedding` (singular)
+    // for embedding models; exclude them and anything else non-chat.
+    return (data.models ?? [])
+      .filter((m) => m.type === "llm" || m.type === "vlm")
+      .map((m) => ({ id: m.key, name: m.display_name ?? m.key, provider: "lmstudio" }));
+  }
+
+  async query(
+    model: string,
+    prompt: string,
+    options?: QueryOptions
+  ): Promise<QueryResponse> {
+    const startTime = Date.now();
+
+    // See "Known limitation — system prompts" in the file header.
+    const input = options?.system_prompt
+      ? 
`[SYSTEM]\n${options.system_prompt}\n\n[USER]\n${prompt}`
+      : prompt;
+
+    const body: Record<string, unknown> = { model, input };
+    if (options?.temperature !== undefined) body.temperature = options.temperature;
+    if (options?.max_tokens !== undefined) body.max_output_tokens = options.max_tokens;
+
+    const res = await fetch(`${this.baseUrl}/api/v1/chat`, {
+      method: "POST",
+      headers: this.headers(),
+      body: JSON.stringify(body),
+    });
+
+    if (!res.ok) {
+      const errorText = await res.text();
+      throw new Error(`LM Studio query failed (${res.status}): ${errorText}`);
+    }
+
+    const data = (await res.json()) as {
+      model_instance_id?: string;
+      output?: Array<{ type?: string; content?: string }>;
+      stats?: {
+        input_tokens?: number;
+        total_output_tokens?: number;
+        reasoning_output_tokens?: number;
+        tokens_per_second?: number;
+        time_to_first_token_seconds?: number;
+        model_load_time_seconds?: number;
+        stop_reason?: string;
+      };
+      response_id?: string;
+    };
+
+    const latency_ms = Date.now() - startTime;
+
+    // /api/v1/chat returns output[] where type="message" holds the assistant
+    // content. Reasoning models may also emit type="reasoning" items we skip.
+    const message = data.output?.find((o) => o.type === "message" || o.type === undefined);
+    const content = message?.content ?? "";
+
+    const prompt_tokens = data.stats?.input_tokens ?? 0;
+    const completion_tokens = data.stats?.total_output_tokens ?? 0;
+
+    // Surface LM Studio-only fields via debug log. These don't belong in
+    // QueryResponse (which is provider-agnostic) but help diagnose local
+    // inference performance.
+ if (data.stats) { + const parts: string[] = [`lmstudio ${model}`]; + if (data.stats.tokens_per_second !== undefined) { + parts.push(`${data.stats.tokens_per_second.toFixed(1)} tok/s`); + } + if (data.stats.time_to_first_token_seconds !== undefined) { + parts.push(`ttft ${(data.stats.time_to_first_token_seconds * 1000).toFixed(0)}ms`); + } + if (data.stats.model_load_time_seconds !== undefined) { + parts.push(`load ${(data.stats.model_load_time_seconds * 1000).toFixed(0)}ms`); + } + logger.debug(parts.join(" | ")); + } + + return { + model, + content, + usage: { + prompt_tokens, + completion_tokens, + total_tokens: prompt_tokens + completion_tokens, + }, + latency_ms, + finish_reason: data.stats?.stop_reason ?? "stop", + }; + } +} From 1f63744791a03683d7bd0345548a9ede137a8b84 Mon Sep 17 00:00:00 2001 From: Jason Warta Date: Mon, 13 Apr 2026 16:14:10 -0800 Subject: [PATCH 2/3] fix: register local providers unconditionally so they recover at runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously Ollama and LM Studio were only registered if a one-shot boot-time health check succeeded. If the local server happened to be down or unreachable at startup, the provider was silently dropped for the entire lifetime of the MCP process — `/mcp` reconnect would not fix it; only a full Claude Code restart or `claude mcp remove && add` would. This surprised at least one user who had to remove+re-add to recover after waking a LAN LM Studio machine. Local servers restart independently of the MCP process, so gating registration on a boot-time check is the wrong shape. This aligns them with cloud providers, which register based on env-var presence alone. Now listModels and query reach out live on each tool call (backed by the existing 30s model-list cache) and Promise.allSettled in MultiProvider means unreachable providers just contribute no models. 
To keep `list_models` snappy when a provider is down, add a 3s AbortController timeout to LM Studio's healthCheck and listModels. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/index.ts | 26 ++++++++++++++------------ src/providers/lmstudio.ts | 35 +++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/src/index.ts b/src/index.ts index 01e2037..ae3bd0c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -86,18 +86,20 @@ async function main() { } // --- Local models --- - - const ollama = new OllamaProvider(); - if (await ollama.healthCheck()) { - multi.register("ollama", ollama); - active.push("Ollama"); - } - - const lmstudio = new LMStudioProvider(); - if (await lmstudio.healthCheck()) { - multi.register("lmstudio", lmstudio); - active.push("LM Studio"); - } + // + // Register unconditionally. Local servers (Ollama, LM Studio) can start or + // restart independently of the MCP process, so we don't gate registration + // on a one-shot boot-time health check — that used to silently drop a + // provider for the life of the process if it happened to be down during + // startup. Instead, listModels/query reach out live on each tool call + // (backed by the 30s model-list cache), and listModels uses + // Promise.allSettled so unreachable providers just contribute no models. + + multi.register("ollama", new OllamaProvider()); + active.push("Ollama"); + + multi.register("lmstudio", new LMStudioProvider()); + active.push("LM Studio"); // --- Startup summary --- diff --git a/src/providers/lmstudio.ts b/src/providers/lmstudio.ts index 5c6b39f..2612302 100644 --- a/src/providers/lmstudio.ts +++ b/src/providers/lmstudio.ts @@ -63,11 +63,32 @@ export class LMStudioProvider implements Provider { return h; } + /** + * Fetch with a hard timeout so a dead/unreachable LM Studio can't stall + * tool calls. list_models waits on this inside Promise.allSettled alongside + * other providers, so we want it to fail fast. 
+   */
+  private async fetchWithTimeout(
+    url: string,
+    init: RequestInit,
+    timeoutMs: number
+  ): Promise<Response> {
+    const ctrl = new AbortController();
+    const timer = setTimeout(() => ctrl.abort(), timeoutMs);
+    try {
+      return await fetch(url, { ...init, signal: ctrl.signal });
+    } finally {
+      clearTimeout(timer);
+    }
+  }
+
   async healthCheck(): Promise<boolean> {
     try {
-      const res = await fetch(`${this.baseUrl}/api/v1/models`, {
-        headers: this.headers(),
-      });
+      const res = await this.fetchWithTimeout(
+        `${this.baseUrl}/api/v1/models`,
+        { headers: this.headers() },
+        3_000
+      );
       return res.ok;
     } catch {
       return false;
@@ -75,9 +96,11 @@
   }
 
   async listModels(): Promise<ModelInfo[]> {
-    const res = await fetch(`${this.baseUrl}/api/v1/models`, {
-      headers: this.headers(),
-    });
+    const res = await this.fetchWithTimeout(
+      `${this.baseUrl}/api/v1/models`,
+      { headers: this.headers() },
+      3_000
+    );
     if (!res.ok) {
       throw new Error(`LM Studio: failed to list models (${res.status})`);
     }

From bf1e8e0367683d8e5f17547fd6016df32f63140b Mon Sep 17 00:00:00 2001
From: Jason Warta
Date: Mon, 13 Apr 2026 16:29:17 -0800
Subject: [PATCH 3/3] docs: show claude mcp add -e form for LMSTUDIO_URL

The existing snippet only showed the bare shell env var, but the
canonical way to pass config to a Claude Code MCP server is via
`claude mcp add -e` (matches the Quick Start example at the top of the
README). Document both so readers see the integration form first and
the raw-env form as an alternative.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 README.md | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ac0419f..6c825de 100644
--- a/README.md
+++ b/README.md
@@ -128,7 +128,15 @@ Install [Ollama](https://ollama.com), pull a model, done. Auto-detected.
 ollama pull qwen2.5-coder:14b
 ```
 
-Or run [LM Studio](https://lmstudio.ai), load a model, and start its server. Auto-detected on `localhost:1234`. 
Point at a different host with: +Or run [LM Studio](https://lmstudio.ai), load a model, and start its server. Auto-detected on `localhost:1234`. If LM Studio is on another machine, point HydraMCP at it. Either pass it to Claude Code when registering the server: + +```bash +claude mcp add hydramcp \ + -e LMSTUDIO_URL=http://192.168.40.10:1234 \ + -- npx hydramcp +``` + +Or set it in your shell / `~/.hydramcp/.env` before starting HydraMCP: ```bash LMSTUDIO_URL=http://192.168.40.10:1234