6 changes: 6 additions & 0 deletions .env.example
@@ -7,5 +7,11 @@ CLIPROXYAPI_KEY=sk-cliproxy-local-001
# Ollama backend (local models)
OLLAMA_URL=http://localhost:11434

# LM Studio backend (local or LAN)
# Defaults to http://localhost:1234 if unset. Override to point at a
# machine running LM Studio on the network, e.g. http://192.168.40.10:1234
# LMSTUDIO_URL=http://localhost:1234
# LMSTUDIO_API_KEY=

# Logging level: debug | info | warn | error
HYDRA_LOG_LEVEL=info
18 changes: 17 additions & 1 deletion README.md
@@ -89,6 +89,7 @@ Claude Code
|-- Anthropic -> api.anthropic.com (API key)
|-- Sub -> CLI tools (Gemini CLI, Claude Code, Codex CLI)
|-- Ollama -> local models (your hardware)
|-- LM Studio -> local or LAN LM Studio server
```

## Three Ways to Connect Models
@@ -127,6 +128,20 @@ Install [Ollama](https://ollama.com), pull a model, done. Auto-detected.
ollama pull qwen2.5-coder:14b
```

Or run [LM Studio](https://lmstudio.ai), load a model, and start its server. Auto-detected on `localhost:1234`. If LM Studio runs on another machine, point HydraMCP at it via `LMSTUDIO_URL`. Either pass it to Claude Code when registering the server:

```bash
claude mcp add hydramcp \
-e LMSTUDIO_URL=http://192.168.40.10:1234 \
-- npx hydramcp
```

Or set it in your shell / `~/.hydramcp/.env` before starting HydraMCP:

```bash
LMSTUDIO_URL=http://192.168.40.10:1234
```
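
Before wiring a LAN address in, it's worth confirming the LM Studio server is reachable from the machine running HydraMCP. The provider in this PR polls `GET /api/v1/models`, so a quick manual check (the IP is just an example) looks like:

```bash
# Should return JSON describing the models LM Studio has available
curl http://192.168.40.10:1234/api/v1/models
```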

### Mix and Match

All three methods stack. Use API keys for some providers, subscriptions for others, and Ollama for local. They all show up in `list_models` together.
@@ -136,6 +151,7 @@ Route explicitly with prefixes:
- `google/gemini-2.5-flash` — force Google API
- `sub/gemini-2.5-flash` — force subscription CLI
- `ollama/qwen2.5-coder:14b` — force local
- `lmstudio/<model>` — force LM Studio
- `gpt-5` — auto-detect (tries each provider)

## Setup Details
@@ -185,7 +201,7 @@ interface Provider {

See `src/providers/ollama.ts` for a working example. Implement it, register in `src/index.ts`, done.

Providers we'd love to see: LM Studio, OpenRouter, Groq, Together AI, or anything that speaks HTTP.
Providers we'd love to see: OpenRouter, Groq, Together AI, or anything that speaks HTTP.
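
As a rough sketch of what that takes, here is a hypothetical provider for an OpenAI-compatible HTTP backend. The `AcmeProvider` name, `ACME_URL` variable, and endpoint paths are placeholders, and the exact `Provider`, `ModelInfo`, and `QueryResponse` fields should be checked against the interface in `src/providers/provider.ts` rather than taken from this example:

```ts
import { Provider, ModelInfo, QueryOptions, QueryResponse } from "./provider.js";

// Hypothetical "Acme" backend speaking an OpenAI-style HTTP API.
// The URL, env var, and endpoint paths are placeholders for illustration.
export class AcmeProvider implements Provider {
  name = "Acme";
  private baseUrl = process.env.ACME_URL ?? "http://localhost:8080";

  async healthCheck(): Promise<boolean> {
    try {
      const res = await fetch(`${this.baseUrl}/v1/models`);
      return res.ok;
    } catch {
      return false;
    }
  }

  async listModels(): Promise<ModelInfo[]> {
    const res = await fetch(`${this.baseUrl}/v1/models`);
    if (!res.ok) throw new Error(`Acme: failed to list models (${res.status})`);
    const data = (await res.json()) as { data?: Array<{ id: string }> };
    return (data.data ?? []).map((m) => ({ id: m.id, name: m.id, provider: "acme" }));
  }

  async query(model: string, prompt: string, options?: QueryOptions): Promise<QueryResponse> {
    const started = Date.now();
    const res = await fetch(`${this.baseUrl}/v1/chat/completions`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        model,
        messages: [{ role: "user", content: prompt }],
        temperature: options?.temperature,
      }),
    });
    if (!res.ok) throw new Error(`Acme query failed (${res.status})`);
    const data = (await res.json()) as {
      choices?: Array<{ message?: { content?: string }; finish_reason?: string }>;
      usage?: { prompt_tokens?: number; completion_tokens?: number };
    };
    const prompt_tokens = data.usage?.prompt_tokens ?? 0;
    const completion_tokens = data.usage?.completion_tokens ?? 0;
    return {
      model,
      content: data.choices?.[0]?.message?.content ?? "",
      usage: { prompt_tokens, completion_tokens, total_tokens: prompt_tokens + completion_tokens },
      latency_ms: Date.now() - started,
      finish_reason: data.choices?.[0]?.finish_reason ?? "stop",
    };
  }
}
```

Register an instance in `src/index.ts` alongside the others and its models should surface under whatever prefix you register it with.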

## License

22 changes: 17 additions & 5 deletions src/index.ts
@@ -17,6 +17,7 @@
*
* Local models:
* OLLAMA_URL → Ollama local models (auto-detected)
* LMSTUDIO_URL → LM Studio (defaults to http://localhost:1234)
*
* Set any combination. HydraMCP registers what's available.
*
@@ -27,6 +28,7 @@
* "sub/gemini-2.5-flash" → Gemini CLI subscription
* "sub/claude-..." → Claude CLI subscription
* "ollama/llama3" → local Ollama instance
* "lmstudio/<model>" → local/LAN LM Studio instance
* "gpt-4o" → auto-detect (tries each provider)
*/

@@ -36,6 +38,7 @@ import { GoogleProvider } from "./providers/google.js";
import { AnthropicProvider } from "./providers/anthropic.js";
import { SubscriptionProvider } from "./providers/subscription.js";
import { OllamaProvider } from "./providers/ollama.js";
import { LMStudioProvider } from "./providers/lmstudio.js";
import { MultiProvider } from "./providers/multi-provider.js";
import { SmartProvider } from "./orchestrator/index.js";
import { createServer } from "./server.js";
@@ -83,12 +86,20 @@ async function main() {
}

// --- Local models ---
//
// Register unconditionally. Local servers (Ollama, LM Studio) can start or
// restart independently of the MCP process, so we don't gate registration
// on a one-shot boot-time health check — that used to silently drop a
// provider for the life of the process if it happened to be down during
// startup. Instead, listModels/query reach out live on each tool call
// (backed by the 30s model-list cache), and listModels uses
// Promise.allSettled so unreachable providers just contribute no models.

const ollama = new OllamaProvider();
if (await ollama.healthCheck()) {
multi.register("ollama", ollama);
active.push("Ollama");
}
multi.register("ollama", new OllamaProvider());
active.push("Ollama");

multi.register("lmstudio", new LMStudioProvider());
active.push("LM Studio");

// --- Startup summary ---

@@ -108,6 +119,7 @@ async function main() {
"\n" +
" Local models:\n" +
" Install Ollama → ollama pull llama3\n" +
" LM Studio → LMSTUDIO_URL=http://host:1234 (default: localhost:1234)\n" +
"\n" +
"HydraMCP will start anyway and retry on first request."
);
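
The registration comment above leans on `list_models` aggregating providers with `Promise.allSettled`, so an unreachable backend just contributes nothing. That aggregation lives in `MultiProvider`, which this diff does not touch; as an illustrative sketch of the pattern only (names and import path are assumptions, not MultiProvider's actual code):

```ts
import { Provider, ModelInfo } from "./providers/provider.js";

// Sketch only: not MultiProvider's actual implementation.
async function listAllModels(providers: Provider[]): Promise<ModelInfo[]> {
  const results = await Promise.allSettled(providers.map((p) => p.listModels()));
  // Rejected entries (provider down, request timed out) contribute no models.
  return results.flatMap((r) => (r.status === "fulfilled" ? r.value : []));
}
```
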
198 changes: 198 additions & 0 deletions src/providers/lmstudio.ts
@@ -0,0 +1,198 @@
/**
 * LM Studio Backend — uses LM Studio's native REST API at /api/v1/*.
 *
 * Endpoints used:
 *   GET /api/v1/models — rich model metadata (type, capabilities, loaded
 *     instances with their runtime ctx, max ctx)
 *   POST /api/v1/chat — inference; JIT-loads the model if not loaded
 *
 * This is the LM Studio native API — distinct from their OpenAI-compat
 * surface at /v1/*. It returns local-inference detail the OpenAI shape
 * strips (tokens/sec, time-to-first-token, load time, model instance id).
 *
 * Known limitation — system prompts:
 *   /api/v1/chat accepts `input` as either a string or a message array,
 *   but the message-array content-part discriminator is not documented,
 *   and the exact shape couldn't be determined from the instance we probed
 *   or its error messages. Until that's pinned down we use string `input`
 *   and prepend the system prompt as a framed prefix. Works correctly
 *   for single-turn prompts, which is all HydraMCP's tools currently do.
 *
 * Known limitation — context size:
 *   We report whatever LM Studio has configured. If a model is loaded,
 *   we see `loaded_instances[0].config.context_length`. If it's not,
 *   /api/v1/chat JIT-loads it with whatever default LM Studio was last
 *   set to for that model. To run at a larger context, bump it in the
 *   LM Studio UI — we don't force a reload here.
 *
 * Default endpoint: http://localhost:1234
 * Override with LMSTUDIO_URL (e.g. http://192.168.40.10:1234 when
 * LM Studio runs on another machine on the LAN).
 *
 * Optional LMSTUDIO_API_KEY — sent as Bearer token if set. Only needed
 * if LM Studio is behind a reverse proxy that enforces auth.
 */

import { Provider, ModelInfo, QueryOptions, QueryResponse } from "./provider.js";
import { logger } from "../utils/logger.js";

interface LMStudioModel {
  key: string;
  type?: string;
  display_name?: string;
  max_context_length?: number;
  loaded_instances?: Array<{
    id: string;
    config?: { context_length?: number };
  }>;
}

export class LMStudioProvider implements Provider {
  name = "LM Studio";
  private baseUrl: string;
  private apiKey: string;

  constructor(baseUrl?: string, apiKey?: string) {
    this.baseUrl = baseUrl ?? process.env.LMSTUDIO_URL ?? "http://localhost:1234";
    this.apiKey = apiKey ?? process.env.LMSTUDIO_API_KEY ?? "";
  }

  private headers(): Record<string, string> {
    const h: Record<string, string> = { "Content-Type": "application/json" };
    if (this.apiKey) h["Authorization"] = `Bearer ${this.apiKey}`;
    return h;
  }

  /**
   * Fetch with a hard timeout so a dead/unreachable LM Studio can't stall
   * tool calls. list_models waits on this inside Promise.allSettled alongside
   * other providers, so we want it to fail fast.
   */
  private async fetchWithTimeout(
    url: string,
    init: RequestInit,
    timeoutMs: number
  ): Promise<Response> {
    const ctrl = new AbortController();
    const timer = setTimeout(() => ctrl.abort(), timeoutMs);
    try {
      return await fetch(url, { ...init, signal: ctrl.signal });
    } finally {
      clearTimeout(timer);
    }
  }

  async healthCheck(): Promise<boolean> {
    try {
      const res = await this.fetchWithTimeout(
        `${this.baseUrl}/api/v1/models`,
        { headers: this.headers() },
        3_000
      );
      return res.ok;
    } catch {
      return false;
    }
  }

  async listModels(): Promise<ModelInfo[]> {
    const res = await this.fetchWithTimeout(
      `${this.baseUrl}/api/v1/models`,
      { headers: this.headers() },
      3_000
    );
    if (!res.ok) {
      throw new Error(`LM Studio: failed to list models (${res.status})`);
    }

    const data = (await res.json()) as { models?: LMStudioModel[] };

    // Only chat-capable types. /api/v1/models uses `embedding` (singular)
    // for embedding models; exclude them and anything else non-chat.
    return (data.models ?? [])
      .filter((m) => m.type === "llm" || m.type === "vlm")
      .map((m) => ({ id: m.key, name: m.display_name ?? m.key, provider: "lmstudio" }));
  }

  async query(
    model: string,
    prompt: string,
    options?: QueryOptions
  ): Promise<QueryResponse> {
    const startTime = Date.now();

    // See "Known limitation — system prompts" in the file header.
    const input = options?.system_prompt
      ? `[SYSTEM]\n${options.system_prompt}\n\n[USER]\n${prompt}`
      : prompt;

    const body: Record<string, unknown> = { model, input };
    if (options?.temperature !== undefined) body.temperature = options.temperature;
    if (options?.max_tokens !== undefined) body.max_output_tokens = options.max_tokens;

    const res = await fetch(`${this.baseUrl}/api/v1/chat`, {
      method: "POST",
      headers: this.headers(),
      body: JSON.stringify(body),
    });

    if (!res.ok) {
      const errorText = await res.text();
      throw new Error(`LM Studio query failed (${res.status}): ${errorText}`);
    }

    const data = (await res.json()) as {
      model_instance_id?: string;
      output?: Array<{ type?: string; content?: string }>;
      stats?: {
        input_tokens?: number;
        total_output_tokens?: number;
        reasoning_output_tokens?: number;
        tokens_per_second?: number;
        time_to_first_token_seconds?: number;
        model_load_time_seconds?: number;
        stop_reason?: string;
      };
      response_id?: string;
    };

    const latency_ms = Date.now() - startTime;

    // /api/v1/chat returns output[] where type="message" holds the assistant
    // content. Reasoning models may also emit type="reasoning" items we skip.
    const message = data.output?.find((o) => o.type === "message" || o.type === undefined);
    const content = message?.content ?? "";

    const prompt_tokens = data.stats?.input_tokens ?? 0;
    const completion_tokens = data.stats?.total_output_tokens ?? 0;

    // Surface LM Studio-only fields via debug log. These don't belong in
    // QueryResponse (which is provider-agnostic) but help diagnose local
    // inference performance.
    if (data.stats) {
      const parts: string[] = [`lmstudio ${model}`];
      if (data.stats.tokens_per_second !== undefined) {
        parts.push(`${data.stats.tokens_per_second.toFixed(1)} tok/s`);
      }
      if (data.stats.time_to_first_token_seconds !== undefined) {
        parts.push(`ttft ${(data.stats.time_to_first_token_seconds * 1000).toFixed(0)}ms`);
      }
      if (data.stats.model_load_time_seconds !== undefined) {
        parts.push(`load ${(data.stats.model_load_time_seconds * 1000).toFixed(0)}ms`);
      }
      logger.debug(parts.join(" | "));
    }

    return {
      model,
      content,
      usage: {
        prompt_tokens,
        completion_tokens,
        total_tokens: prompt_tokens + completion_tokens,
      },
      latency_ms,
      finish_reason: data.stats?.stop_reason ?? "stop",
    };
  }
}
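
For debugging a LAN setup outside the MCP flow, the provider can also be exercised directly from a small script. A minimal sketch, assuming an ESM/async context under `src/` and a reachable LM Studio instance (the URL and prompt below are placeholders):

```ts
import { LMStudioProvider } from "./providers/lmstudio.js";

// URL and prompt are placeholders; adjust for your LAN setup.
const lmstudio = new LMStudioProvider("http://192.168.40.10:1234");

if (await lmstudio.healthCheck()) {
  const models = await lmstudio.listModels();
  console.log("available:", models.map((m) => m.id));

  if (models.length > 0) {
    // Query the first available model with a low temperature.
    const reply = await lmstudio.query(models[0].id, "Say hello in one sentence.", {
      temperature: 0.2,
    });
    console.log(reply.content, `(${reply.latency_ms} ms)`);
  }
}
```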