diff --git a/.env.example b/.env.example
index 4c74913..c7ddd61 100644
--- a/.env.example
+++ b/.env.example
@@ -7,5 +7,11 @@ CLIPROXYAPI_KEY=sk-cliproxy-local-001
 # Ollama backend (local models)
 OLLAMA_URL=http://localhost:11434
 
+# LM Studio backend (local or LAN)
+# Defaults to http://localhost:1234 if unset. Override to point at a
+# machine running LM Studio on the network, e.g. http://192.168.40.10:1234
+# LMSTUDIO_URL=http://localhost:1234
+# LMSTUDIO_API_KEY=
+
 # Logging level: debug | info | warn | error
 HYDRA_LOG_LEVEL=info
diff --git a/README.md b/README.md
index fb29a09..6c825de 100644
--- a/README.md
+++ b/README.md
@@ -89,6 +89,7 @@ Claude Code
     |-- Anthropic -> api.anthropic.com (API key)
     |-- Sub -> CLI tools (Gemini CLI, Claude Code, Codex CLI)
     |-- Ollama -> local models (your hardware)
+    |-- LM Studio -> local or LAN LM Studio server
 ```
 
 ## Three Ways to Connect Models
@@ -127,6 +128,20 @@ Install [Ollama](https://ollama.com), pull a model, done. Auto-detected.
 ollama pull qwen2.5-coder:14b
 ```
 
+Or run [LM Studio](https://lmstudio.ai), load a model, and start its server. Auto-detected on `localhost:1234`. If LM Studio is on another machine, point HydraMCP at it. Either pass it to Claude Code when registering the server:
+
+```bash
+claude mcp add hydramcp \
+  -e LMSTUDIO_URL=http://192.168.40.10:1234 \
+  -- npx hydramcp
+```
+
+Or set it in your shell / `~/.hydramcp/.env` before starting HydraMCP:
+
+```bash
+LMSTUDIO_URL=http://192.168.40.10:1234
+```
+
 ### Mix and Match
 
 All three methods stack. Use API keys for some providers, subscriptions for others, and Ollama for local. They all show up in `list_models` together.
@@ -136,6 +151,7 @@ Route explicitly with prefixes:
 - `google/gemini-2.5-flash` — force Google API
 - `sub/gemini-2.5-flash` — force subscription CLI
 - `ollama/qwen2.5-coder:14b` — force local
+- `lmstudio/` — force LM Studio
 - `gpt-5` — auto-detect (tries each provider)
 
 ## Setup Details
@@ -185,7 +201,7 @@ interface Provider {
 
 See `src/providers/ollama.ts` for a working example. Implement it, register in `src/index.ts`, done.
 
-Providers we'd love to see: LM Studio, OpenRouter, Groq, Together AI, or anything that speaks HTTP.
+Providers we'd love to see: OpenRouter, Groq, Together AI, or anything that speaks HTTP.
 
 ## License
diff --git a/src/index.ts b/src/index.ts
index bdc0d45..ae3bd0c 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -17,6 +17,7 @@
  *
  * Local models:
  *   OLLAMA_URL → Ollama local models (auto-detected)
+ *   LMSTUDIO_URL → LM Studio (defaults to http://localhost:1234)
  *
  * Set any combination. HydraMCP registers what's available.
  *
@@ -27,6 +28,7 @@
  *   "sub/gemini-2.5-flash" → Gemini CLI subscription
  *   "sub/claude-..." → Claude CLI subscription
  *   "ollama/llama3" → local Ollama instance
+ *   "lmstudio/" → local/LAN LM Studio instance
  *   "gpt-4o" → auto-detect (tries each provider)
  */
 
@@ -36,6 +38,7 @@ import { GoogleProvider } from "./providers/google.js";
 import { AnthropicProvider } from "./providers/anthropic.js";
 import { SubscriptionProvider } from "./providers/subscription.js";
 import { OllamaProvider } from "./providers/ollama.js";
+import { LMStudioProvider } from "./providers/lmstudio.js";
 import { MultiProvider } from "./providers/multi-provider.js";
 import { SmartProvider } from "./orchestrator/index.js";
 import { createServer } from "./server.js";
@@ -83,12 +86,20 @@ async function main() {
   }
 
   // --- Local models ---
+  //
+  // Register unconditionally. Local servers (Ollama, LM Studio) can start or
+  // restart independently of the MCP process, so we don't gate registration
+  // on a one-shot boot-time health check — that used to silently drop a
+  // provider for the life of the process if it happened to be down during
+  // startup. Instead, listModels/query reach out live on each tool call
+  // (backed by the 30s model-list cache), and listModels uses
+  // Promise.allSettled so unreachable providers just contribute no models.
-  const ollama = new OllamaProvider();
-  if (await ollama.healthCheck()) {
-    multi.register("ollama", ollama);
-    active.push("Ollama");
-  }
+  multi.register("ollama", new OllamaProvider());
+  active.push("Ollama");
+
+  multi.register("lmstudio", new LMStudioProvider());
+  active.push("LM Studio");
 
   // --- Startup summary ---
@@ -108,6 +119,7 @@ async function main() {
       "\n" +
       "  Local models:\n" +
       "    Install Ollama → ollama pull llama3\n" +
+      "    LM Studio → LMSTUDIO_URL=http://host:1234 (default: localhost:1234)\n" +
       "\n" +
       "HydraMCP will start anyway and retry on first request."
     );
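The registration comment above leans on `MultiProvider.listModels` aggregating results with `Promise.allSettled`; `src/providers/multi-provider.ts` itself isn't touched by this diff. For reference, a minimal sketch of the aggregation pattern being described, where the `listAllModels` name, the `providers` map, and the import paths are illustrative assumptions rather than the actual multi-provider code:

```ts
// Sketch only: aggregate listModels() across providers with Promise.allSettled
// so one unreachable backend contributes nothing instead of failing the call.
import { Provider, ModelInfo } from "./providers/provider.js"; // assumed path
import { logger } from "./utils/logger.js"; // assumed path

async function listAllModels(providers: Map<string, Provider>): Promise<ModelInfo[]> {
  const entries = [...providers.entries()];
  const results = await Promise.allSettled(entries.map(([, p]) => p.listModels()));

  const models: ModelInfo[] = [];
  results.forEach((result, i) => {
    if (result.status === "fulfilled") {
      models.push(...result.value);
    } else {
      // Unreachable provider: skip it and note why at debug level.
      logger.debug(`listModels: ${entries[i][0]} unavailable: ${result.reason}`);
    }
  });
  return models;
}
```

The only point is that a rejected `listModels()` from one backend (say, LM Studio not running) is skipped and logged rather than failing the whole `list_models` tool call.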
diff --git a/src/providers/lmstudio.ts b/src/providers/lmstudio.ts
new file mode 100644
index 0000000..2612302
--- /dev/null
+++ b/src/providers/lmstudio.ts
@@ -0,0 +1,198 @@
+/**
+ * LM Studio Backend — uses LM Studio's native REST API at /api/v1/*.
+ *
+ * Endpoints used:
+ *   GET /api/v1/models — rich model metadata (type, capabilities, loaded
+ *     instances with their runtime ctx, max ctx)
+ *   POST /api/v1/chat — inference; JIT-loads the model if not loaded
+ *
+ * This is the LM Studio native API — distinct from their OpenAI-compat
+ * surface at /v1/*. It returns local-inference detail the OpenAI shape
+ * strips (tokens/sec, time-to-first-token, load time, model instance id).
+ *
+ * Known limitation — system prompts:
+ *   /api/v1/chat accepts `input` as either a string or a message array,
+ *   but the message-array content-part discriminator is not documented
+ *   on the instance we probed and the exact shape couldn't be determined
+ *   from error messages. Until that's pinned down we use string `input`
+ *   and prepend the system prompt as a framed prefix. Works correctly
+ *   for single-turn prompts, which is all HydraMCP's tools currently do.
+ *
+ * Known limitation — context size:
+ *   We report whatever LM Studio has configured. If a model is loaded,
+ *   we see `loaded_instances[0].config.context_length`. If it's not,
+ *   /api/v1/chat JIT-loads it with whatever default LM Studio was last
+ *   set to for that model. To run at larger context, bump it in the
+ *   LM Studio UI — we don't force a reload here.
+ *
+ * Default endpoint: http://localhost:1234
+ * Override with LMSTUDIO_URL (e.g. http://192.168.40.10:1234 when
+ * LM Studio runs on another machine on the LAN).
+ *
+ * Optional LMSTUDIO_API_KEY — sent as Bearer token if set. Only needed
+ * if LM Studio is behind a reverse proxy that enforces auth.
+ */
+
+import { Provider, ModelInfo, QueryOptions, QueryResponse } from "./provider.js";
+import { logger } from "../utils/logger.js";
+
+interface LMStudioModel {
+  key: string;
+  type?: string;
+  display_name?: string;
+  max_context_length?: number;
+  loaded_instances?: Array<{
+    id: string;
+    config?: { context_length?: number };
+  }>;
+}
+
+export class LMStudioProvider implements Provider {
+  name = "LM Studio";
+  private baseUrl: string;
+  private apiKey: string;
+
+  constructor(baseUrl?: string, apiKey?: string) {
+    this.baseUrl = baseUrl ?? process.env.LMSTUDIO_URL ?? "http://localhost:1234";
+    this.apiKey = apiKey ?? process.env.LMSTUDIO_API_KEY ?? "";
+  }
+
+  private headers(): Record<string, string> {
+    const h: Record<string, string> = { "Content-Type": "application/json" };
+    if (this.apiKey) h["Authorization"] = `Bearer ${this.apiKey}`;
+    return h;
+  }
+
+  /**
+   * Fetch with a hard timeout so a dead/unreachable LM Studio can't stall
+   * tool calls. list_models waits on this inside Promise.allSettled alongside
+   * other providers, so we want it to fail fast.
+   */
+  private async fetchWithTimeout(
+    url: string,
+    init: RequestInit,
+    timeoutMs: number
+  ): Promise<Response> {
+    const ctrl = new AbortController();
+    const timer = setTimeout(() => ctrl.abort(), timeoutMs);
+    try {
+      return await fetch(url, { ...init, signal: ctrl.signal });
+    } finally {
+      clearTimeout(timer);
+    }
+  }
+
+  async healthCheck(): Promise<boolean> {
+    try {
+      const res = await this.fetchWithTimeout(
+        `${this.baseUrl}/api/v1/models`,
+        { headers: this.headers() },
+        3_000
+      );
+      return res.ok;
+    } catch {
+      return false;
+    }
+  }
+
+  async listModels(): Promise<ModelInfo[]> {
+    const res = await this.fetchWithTimeout(
+      `${this.baseUrl}/api/v1/models`,
+      { headers: this.headers() },
+      3_000
+    );
+    if (!res.ok) {
+      throw new Error(`LM Studio: failed to list models (${res.status})`);
+    }
+
+    const data = (await res.json()) as { models?: LMStudioModel[] };
+
+    // Only chat-capable types. /api/v1/models uses `embedding` (singular)
+    // for embedding models; exclude them and anything else non-chat.
+    return (data.models ?? [])
+      .filter((m) => m.type === "llm" || m.type === "vlm")
+      .map((m) => ({ id: m.key, name: m.display_name ?? m.key, provider: "lmstudio" }));
+  }
+
+  async query(
+    model: string,
+    prompt: string,
+    options?: QueryOptions
+  ): Promise<QueryResponse> {
+    const startTime = Date.now();
+
+    // See "Known limitation — system prompts" in the file header.
+    const input = options?.system_prompt
+      ? `[SYSTEM]\n${options.system_prompt}\n\n[USER]\n${prompt}`
+      : prompt;
+
+    const body: Record<string, unknown> = { model, input };
+    if (options?.temperature !== undefined) body.temperature = options.temperature;
+    if (options?.max_tokens !== undefined) body.max_output_tokens = options.max_tokens;
+
+    const res = await fetch(`${this.baseUrl}/api/v1/chat`, {
+      method: "POST",
+      headers: this.headers(),
+      body: JSON.stringify(body),
+    });
+
+    if (!res.ok) {
+      const errorText = await res.text();
+      throw new Error(`LM Studio query failed (${res.status}): ${errorText}`);
+    }
+
+    const data = (await res.json()) as {
+      model_instance_id?: string;
+      output?: Array<{ type?: string; content?: string }>;
+      stats?: {
+        input_tokens?: number;
+        total_output_tokens?: number;
+        reasoning_output_tokens?: number;
+        tokens_per_second?: number;
+        time_to_first_token_seconds?: number;
+        model_load_time_seconds?: number;
+        stop_reason?: string;
+      };
+      response_id?: string;
+    };
+
+    const latency_ms = Date.now() - startTime;
+
+    // /api/v1/chat returns output[] where type="message" holds the assistant
+    // content. Reasoning models may also emit type="reasoning" items we skip.
+    const message = data.output?.find((o) => o.type === "message" || o.type === undefined);
+    const content = message?.content ?? "";
+
+    const prompt_tokens = data.stats?.input_tokens ?? 0;
+    const completion_tokens = data.stats?.total_output_tokens ?? 0;
+
+    // Surface LM Studio-only fields via debug log. These don't belong in
+    // QueryResponse (which is provider-agnostic) but help diagnose local
+    // inference performance.
+    if (data.stats) {
+      const parts: string[] = [`lmstudio ${model}`];
+      if (data.stats.tokens_per_second !== undefined) {
+        parts.push(`${data.stats.tokens_per_second.toFixed(1)} tok/s`);
+      }
+      if (data.stats.time_to_first_token_seconds !== undefined) {
+        parts.push(`ttft ${(data.stats.time_to_first_token_seconds * 1000).toFixed(0)}ms`);
+      }
+      if (data.stats.model_load_time_seconds !== undefined) {
+        parts.push(`load ${(data.stats.model_load_time_seconds * 1000).toFixed(0)}ms`);
+      }
+      logger.debug(parts.join(" | "));
+    }
+
+    return {
+      model,
+      content,
+      usage: {
+        prompt_tokens,
+        completion_tokens,
+        total_tokens: prompt_tokens + completion_tokens,
+      },
+      latency_ms,
+      finish_reason: data.stats?.stop_reason ?? "stop",
+    };
+  }
+}
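To exercise the new provider end to end without going through the MCP server, a throwaway script along these lines works against the methods defined above. It's a sketch, not part of the change: the import path, the LAN URL, and the choice to query the first listed model are placeholders for whatever your setup actually has.

```ts
// smoke-test.ts (illustrative only): health check, model listing, one short query.
import { LMStudioProvider } from "./src/providers/lmstudio.js"; // assumed path from repo root

async function main() {
  // URL is a placeholder; omit the argument to fall back to LMSTUDIO_URL
  // or the http://localhost:1234 default from the constructor.
  const lmstudio = new LMStudioProvider("http://192.168.40.10:1234");

  if (!(await lmstudio.healthCheck())) {
    console.error("LM Studio not reachable; is its server running?");
    return;
  }

  const models = await lmstudio.listModels();
  if (models.length === 0) {
    console.error("No chat models reported by /api/v1/models.");
    return;
  }
  console.log("models:", models.map((m) => m.id).join(", "));

  const res = await lmstudio.query(models[0].id, "Reply with one short sentence.", {
    temperature: 0.2,
    max_tokens: 64,
  });
  console.log(res.content);
  console.log(`${res.usage.total_tokens} tokens in ${res.latency_ms} ms (${res.finish_reason})`);
}

main().catch((err) => console.error(err));
```

Run it with a TypeScript runner such as `tsx` while LM Studio's server is up; a wrong URL is reported quickly because `healthCheck` rides on the 3-second fetch timeout.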