release: v2026.4.7.1918 — Ollama input modalities fix

profbernardoj · profbernardoj · commit 3ff1739ffa97 · 2026-04-07T14:18:47.000-05:00
- Fixed gateway startup failure on fresh installs/upgrades
- OpenClaw validator only allows [text, image] in model input
- EverClaw templates had [text, image, audio] for Gemma 4 E2B/E4B
- Added migration in setup.mjs to sanitize input arrays
- Added A11 check in diagnose.sh to detect bad values
- Updated both templates and setup-ollama.sh
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,26 @@
 
 All notable changes to EverClaw are documented here.
 
+## [2026.4.7.1918] - 2026-04-07 — Ollama Input Modalities Fix
+
+### Fixed
+- **Gateway startup failure on fresh installs and upgrades** — OpenClaw's config validator only allows `["text", "image"]` in model `input` arrays. EverClaw templates and `setup-ollama.sh` were generating `["text", "image", "audio"]` for Gemma 4 E2B/E4B (which natively support audio), causing the gateway to reject the config and fail to start. Now all input arrays are capped to `["text", "image"]` until OpenClaw adds audio support.
+
+### Added
+- **Model input modality migration** — `setup.mjs` now sanitizes all provider model `input` arrays at apply time, removing unsupported values like `"audio"`. Defensive migration runs on all providers (not just Ollama) for future-proofing.
+- **A11 diagnostic check** — `diagnose.sh` now detects unsupported input modalities and suggests running `setup.mjs --apply` to auto-fix.
+
+### Changed
+- **`templates/openclaw-config-mac.json`** — Ollama model input: `["text", "image", "audio"]` → `["text", "image"]`
+- **`templates/openclaw-config-linux.json`** — Same
+- **`scripts/setup-ollama.sh`** — `get_model_input_modalities()` now returns `["text", "image"]` for all models (E2B/E4B/26B/31B) with comment explaining OpenClaw validator limitation
+- **Comment updated** — `_comment` in both templates changed from "Vision + audio enabled where supported" to "Vision enabled where supported"
+
+### Technical Notes
+- Root cause: OpenClaw recently tightened the config schema for `models[].input` to only allow `"text"` and `"image"`. EverClaw templates were ahead of the validator (Gemma 4 E2B/E4B do support audio natively), but the mismatch caused gateway rejections.
+- Migration is defensive: strips any value not in `{"text", "image"}` across ALL providers, not just Ollama.
+- Silent no-op if config is already valid (no write, no log message).
+
 ## [2026.4.7.1756] - 2026-04-07 — Local Embeddings Fix (node-llama-cpp)
 
 ### Fixed
diff --git a/Dockerfile b/Dockerfile
@@ -170,7 +170,7 @@ RUN FDIR="/home/node/.openclaw/workspace/skills/everclaw/templates/flavors/${FLA
 
 # ─── Environment ──────────────────────────────────────────────────────────────
 
-ARG EVERCLAW_VERSION=2026.4.7.1756
+ARG EVERCLAW_VERSION=2026.4.7.1918
 ENV EVERCLAW_VERSION=${EVERCLAW_VERSION}
 ENV NODE_ENV=production
 ENV EVERCLAW_PROXY_PORT=8083
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -13,7 +13,7 @@
 
 services:
   everclaw:
-    image: ghcr.io/everclaw/everclaw:2026.4.7.1756
+    image: ghcr.io/everclaw/everclaw:2026.4.7.1918
     build:
       context: .
       dockerfile: Dockerfile
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "everclaw",
-  "version": "2026.4.7.1756",
+  "version": "2026.4.7.1918",
   "type": "module",
   "description": "Open-source first AI inference via Morpheus decentralized network",
   "scripts": {
diff --git a/scripts/diagnose.sh b/scripts/diagnose.sh
@@ -305,6 +305,29 @@ print(count)
       fix "Memory search works without this, but only with a remote embedding provider"
     fi
   fi
+
+  # A11: Do any model input arrays contain unsupported values?
+  # OpenClaw validator only allows "text" and "image". Values like "audio"
+  # (valid for Gemma 4 E2B/E4B) cause gateway startup failure.
+  if [[ -f "$OPENCLAW_CONFIG" ]]; then
+    local bad_inputs
+    bad_inputs=$(jq -r '
+      [.models.providers // {} | to_entries[] |
+       .key as $prov | .value.models // [] | .[] |
+       .input // [] | .[] |
+       select(. != "text" and . != "image") |
+       $prov + ": " + .] | unique | .[]
+    ' "$OPENCLAW_CONFIG" 2>/dev/null)
+    if [[ -n "$bad_inputs" ]]; then
+      fail "Unsupported model input modalities found (gateway will reject these):"
+      while IFS= read -r line; do
+        fix "  $line"
+      done <<< "$bad_inputs"
+      fix "Run: node scripts/setup.mjs --apply (auto-sanitizes input modalities)"
+    else
+      pass "All model input modalities valid (text/image only)"
+    fi
+  fi
 }
 
 # ═════════════════════════════════════════════════════════════════════════════
diff --git a/scripts/setup-ollama.sh b/scripts/setup-ollama.sh
@@ -85,11 +85,12 @@ get_model_context_window() {
 }
 
 # Get input modalities by model
+# Note: Gemma 4 E2B/E4B models support "audio" natively, but OpenClaw's
+# config validator currently only allows ["text", "image"]. We cap all models
+# to ["text", "image"] until the validator is updated.
 get_model_input_modalities() {
   local model="$1"
   case "$model" in
-    gemma4:e2b*|gemma4-e2b*|gemma4:e4b*|gemma4-e4b*) echo '["text", "image", "audio"]' ;;
-    gemma4:26b*|gemma4-26b*|gemma4:31b*|gemma4-31b*) echo '["text", "image"]' ;;
     *) echo '["text", "image"]' ;;
   esac
 }
diff --git a/scripts/setup.mjs b/scripts/setup.mjs
@@ -679,6 +679,37 @@ if (applyMode) {
     console.log(`  ⚠️  Ollama migration check failed (not fatal): ${e.message}`);
   }
 
+  // ─── Model Input Modality Sanitization ─────────────────────────────
+  // OpenClaw's config validator only allows "text" and "image" in model
+  // input arrays. Gemma 4 E2B/E4B support "audio" natively, but the
+  // validator rejects it — causing gateway startup failure on fresh
+  // installs and upgrades. Strip unsupported values defensively across
+  // ALL providers (not just Ollama) to future-proof.
+  try {
+    const ALLOWED_INPUTS = new Set(['text', 'image']);
+    const providers = merged.models?.providers;
+    let sanitized = false;
+    if (providers && typeof providers === 'object') {
+      for (const [provId, prov] of Object.entries(providers)) {
+        if (!Array.isArray(prov.models)) continue;
+        for (const m of prov.models) {
+          if (!Array.isArray(m.input)) continue;
+          const filtered = m.input.filter(v => ALLOWED_INPUTS.has(v));
+          if (filtered.length !== m.input.length) {
+            m.input = filtered;
+            sanitized = true;
+          }
+        }
+      }
+    }
+    if (sanitized) {
+      writeFileSync(configPath, JSON.stringify(merged, null, 2) + '\n');
+      console.log('  🔧 Sanitized model input modalities (removed unsupported values like "audio")');
+    }
+  } catch (e) {
+    console.log(`  ⚠️  Input modality sanitization failed (not fatal): ${e.message}`);
+  }
+
   // ─── Stage 4: Security Tier ───────────────────────────────────────
 
   if (!noSecurity) {
diff --git a/templates/openclaw-config-linux.json b/templates/openclaw-config-linux.json
@@ -81,7 +81,7 @@
         ]
       },
       "ollama": {
-        "_comment": "Local Ollama inference fallback. Run 'bash scripts/setup-ollama.sh --apply' to auto-detect your RAM/GPU and install the optimal Gemma 4 model (E2B/E4B/26B/31B). Vision + audio enabled where supported.",
+        "_comment": "Local Ollama inference fallback. Run 'bash scripts/setup-ollama.sh --apply' to auto-detect your RAM/GPU and install the optimal Gemma 4 model (E2B/E4B/26B/31B). Vision enabled where supported.",
         "baseUrl": "http://127.0.0.1:11434/v1",
         "api": "ollama",
         "models": [
@@ -91,8 +91,7 @@
             "reasoning": false,
             "input": [
               "text",
-              "image",
-              "audio"
+              "image"
             ],
             "cost": {
               "input": 0,
diff --git a/templates/openclaw-config-mac.json b/templates/openclaw-config-mac.json
@@ -79,7 +79,7 @@
         ]
       },
       "ollama": {
-        "_comment": "Local Ollama inference fallback. Run 'bash scripts/setup-ollama.sh --apply' to auto-detect your RAM/GPU and install the optimal Gemma 4 model (E2B/E4B/26B/31B). Vision + audio enabled where supported.",
+        "_comment": "Local Ollama inference fallback. Run 'bash scripts/setup-ollama.sh --apply' to auto-detect your RAM/GPU and install the optimal Gemma 4 model (E2B/E4B/26B/31B). Vision enabled where supported.",
         "baseUrl": "http://127.0.0.1:11434/v1",
         "api": "ollama",
         "models": [
@@ -89,8 +89,7 @@
             "reasoning": false,
             "input": [
               "text",
-              "image",
-              "audio"
+              "image"
             ],
             "cost": {
               "input": 0,

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "everclaw",`
`3`		`- "version": "2026.4.7.1756",`
	`3`	`+ "version": "2026.4.7.1918",`
`4`	`4`	`"type": "module",`
`5`	`5`	`"description": "Open-source first AI inference via Morpheus decentralized network",`
`6`	`6`	`"scripts": {`
Original file line number	Diff line number	Diff line change
`@@ -85,11 +85,12 @@ get_model_context_window() {`
`85`	`85`	`}`
`86`	`86`
`87`	`87`	`# Get input modalities by model`
	`88`	`+# Note: Gemma 4 E2B/E4B models support "audio" natively, but OpenClaw's`
	`89`	`+# config validator currently only allows ["text", "image"]. We cap all models`
	`90`	`+# to ["text", "image"] until the validator is updated.`
`88`	`91`	`get_model_input_modalities() {`
`89`	`92`	`local model="$1"`
`90`	`93`	`case "$model" in`
`91`		`- gemma4:e2b*\|gemma4-e2b*\|gemma4:e4b*\|gemma4-e4b*) echo '["text", "image", "audio"]' ;;`
`92`		`- gemma4:26b*\|gemma4-26b*\|gemma4:31b*\|gemma4-31b*) echo '["text", "image"]' ;;`
`93`	`94`	`*) echo '["text", "image"]' ;;`
`94`	`95`	`esac`
`95`	`96`	`}`