Skip to content

Commit ba72fb3

Browse files
author
Yehudit Kerido
committed
fix skipped tests
Signed-off-by: Yehudit Kerido <[email protected]>
1 parent 816dbec commit ba72fb3

File tree

5 files changed

+119
-88
lines changed

5 files changed

+119
-88
lines changed

candle-binding/semantic-router_test.go

Lines changed: 34 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1478,59 +1478,55 @@ func TestGetEmbeddingSmart(t *testing.T) {
14781478
// Initialize embedding models first
14791479
err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true)
14801480
if err != nil {
1481-
if isModelInitializationError(err) {
1482-
t.Skipf("Skipping GetEmbeddingSmart tests due to model initialization error: %v", err)
1483-
}
14841481
t.Fatalf("Failed to initialize embedding models: %v", err)
14851482
}
14861483

14871484
t.Run("ShortTextHighLatency", func(t *testing.T) {
1488-
// Short text with high latency priority should use Traditional BERT
1485+
// Short text with high latency priority - uses Qwen3 (1024) since Gemma is not available
14891486
text := "Hello world"
14901487
embedding, err := GetEmbeddingSmart(text, 0.3, 0.8)
14911488

14921489
if err != nil {
1493-
t.Logf("GetEmbeddingSmart returned error (expected for placeholder): %v", err)
1494-
// This is expected since we're using placeholder implementation
1495-
return
1490+
t.Fatalf("GetEmbeddingSmart failed: %v", err)
14961491
}
14971492

1498-
if len(embedding) != 768 {
1499-
t.Errorf("Expected 768-dim embedding, got %d", len(embedding))
1493+
// Expect Qwen3 (1024) dimension since Gemma is not available
1494+
if len(embedding) != 1024 {
1495+
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
15001496
}
15011497

15021498
t.Logf("Short text embedding generated: dim=%d", len(embedding))
15031499
})
15041500

15051501
t.Run("MediumTextBalanced", func(t *testing.T) {
1506-
// Medium text with balanced priorities - may select Qwen3 (1024) or Gemma (768)
1502+
// Medium text with balanced priorities - uses Qwen3 (1024) since Gemma is not available
15071503
text := strings.Repeat("This is a medium length text with enough words to exceed 512 tokens. ", 10)
15081504
embedding, err := GetEmbeddingSmart(text, 0.5, 0.5)
15091505

15101506
if err != nil {
15111507
t.Fatalf("GetEmbeddingSmart failed: %v", err)
15121508
}
15131509

1514-
// Accept both Qwen3 (1024) and Gemma (768) dimensions
1515-
if len(embedding) != 768 && len(embedding) != 1024 {
1516-
t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
1510+
// Expect Qwen3 (1024) dimension since Gemma is not available
1511+
if len(embedding) != 1024 {
1512+
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
15171513
}
15181514

15191515
t.Logf("Medium text embedding generated: dim=%d", len(embedding))
15201516
})
15211517

15221518
t.Run("LongTextHighQuality", func(t *testing.T) {
1523-
// Long text with high quality priority should use Qwen3
1519+
// Long text with high quality priority should use Qwen3 (1024)
15241520
text := strings.Repeat("This is a very long document that requires Qwen3's 32K context support. ", 50)
15251521
embedding, err := GetEmbeddingSmart(text, 0.9, 0.2)
15261522

15271523
if err != nil {
1528-
t.Logf("GetEmbeddingSmart returned error (expected for placeholder): %v", err)
1529-
return
1524+
t.Fatalf("GetEmbeddingSmart failed: %v", err)
15301525
}
15311526

1532-
if len(embedding) != 768 {
1533-
t.Errorf("Expected 768-dim embedding, got %d", len(embedding))
1527+
// Expect Qwen3 (1024) dimension
1528+
if len(embedding) != 1024 {
1529+
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
15341530
}
15351531

15361532
t.Logf("Long text embedding generated: dim=%d", len(embedding))
@@ -1573,9 +1569,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
15731569
return
15741570
}
15751571

1576-
// Smart routing may select Qwen3 (1024) or Gemma (768) based on priorities
1577-
if len(embedding) != 768 && len(embedding) != 1024 {
1578-
t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
1572+
// Expect Qwen3 (1024) since Gemma is not available
1573+
if len(embedding) != 1024 {
1574+
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
15791575
}
15801576
t.Logf("Priority test %s: generated %d-dim embedding", tc.desc, len(embedding))
15811577
})
@@ -1598,9 +1594,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
15981594
continue
15991595
}
16001596

1601-
// Smart routing may select Qwen3 (1024) or Gemma (768)
1602-
if len(embedding) != 768 && len(embedding) != 1024 {
1603-
t.Errorf("Iteration %d: Expected 768 or 1024-dim embedding, got %d", i, len(embedding))
1597+
// Expect Qwen3 (1024) since Gemma is not available
1598+
if len(embedding) != 1024 {
1599+
t.Errorf("Iteration %d: Expected 1024-dim embedding, got %d", i, len(embedding))
16041600
}
16051601

16061602
// Verify no nil pointers
@@ -1639,11 +1635,12 @@ func BenchmarkGetEmbeddingSmart(b *testing.B) {
16391635
}
16401636

16411637
// Test constants for embedding models (Phase 4.2)
1638+
// Note: Gemma model is gated and requires HF_TOKEN, so tests use Qwen3 only
16421639
const (
16431640
Qwen3EmbeddingModelPath = "../models/Qwen3-Embedding-0.6B"
1644-
GemmaEmbeddingModelPath = "../models/embeddinggemma-300m"
1641+
GemmaEmbeddingModelPath = "" // Gemma is gated, not used in CI tests
16451642
TestEmbeddingText = "This is a test sentence for embedding generation"
1646-
TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3 or Gemma"
1643+
TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3"
16471644
)
16481645

16491646
// Test constants for Qwen3 Multi-LoRA
@@ -1705,23 +1702,8 @@ func TestInitEmbeddingModels(t *testing.T) {
17051702
})
17061703

17071704
t.Run("InitGemmaOnly", func(t *testing.T) {
1708-
// Similar to InitBothModels, accept already-initialized state
1709-
err := InitEmbeddingModels("", GemmaEmbeddingModelPath, true)
1710-
if err != nil {
1711-
t.Logf("InitEmbeddingModels (Gemma only) returned error (may already be initialized): %v", err)
1712-
1713-
// Verify functionality
1714-
_, testErr := GetEmbeddingSmart("test", 0.5, 0.5)
1715-
if testErr == nil {
1716-
t.Log("✓ ModelFactory is functional (already initialized)")
1717-
} else {
1718-
if isModelInitializationError(testErr) {
1719-
t.Skipf("Skipping test due to model unavailability: %v", testErr)
1720-
}
1721-
}
1722-
} else {
1723-
t.Log("✓ Gemma model initialized successfully")
1724-
}
1705+
// Gemma is a gated model requiring HF_TOKEN, skip in CI
1706+
t.Skip("Skipping Gemma-only test: Gemma is a gated model requiring HF_TOKEN")
17251707
})
17261708

17271709
t.Run("InitWithInvalidPaths", func(t *testing.T) {
@@ -1739,9 +1721,6 @@ func TestGetEmbeddingWithDim(t *testing.T) {
17391721
// Initialize embedding models first
17401722
err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true)
17411723
if err != nil {
1742-
if isModelInitializationError(err) {
1743-
t.Skipf("Skipping GetEmbeddingWithDim tests due to model initialization error: %v", err)
1744-
}
17451724
t.Fatalf("Failed to initialize embedding models: %v", err)
17461725
}
17471726

@@ -1806,16 +1785,16 @@ func TestGetEmbeddingWithDim(t *testing.T) {
18061785

18071786
t.Run("OversizedDimension", func(t *testing.T) {
18081787
// Test graceful degradation when requested dimension exceeds model capacity
1809-
// Qwen3: 1024, Gemma: 768, so 2048 should fall back to full dimension
1788+
// Qwen3: 1024, so 2048 should fall back to full dimension
18101789
embedding, err := GetEmbeddingWithDim(TestEmbeddingText, 0.5, 0.5, 2048)
18111790
if err != nil {
18121791
t.Errorf("Should gracefully handle oversized dimension, got error: %v", err)
18131792
return
18141793
}
18151794

1816-
// Should return full dimension (1024 for Qwen3 or 768 for Gemma)
1817-
if len(embedding) != 1024 && len(embedding) != 768 {
1818-
t.Errorf("Expected full dimension (1024 or 768), got %d", len(embedding))
1795+
// Should return full dimension (1024 for Qwen3)
1796+
if len(embedding) != 1024 {
1797+
t.Errorf("Expected full dimension (1024), got %d", len(embedding))
18191798
} else {
18201799
t.Logf("✓ Oversized dimension gracefully degraded to full dimension: %d", len(embedding))
18211800
}
@@ -1841,9 +1820,6 @@ func TestGetEmbeddingWithDim(t *testing.T) {
18411820
func TestEmbeddingConsistency(t *testing.T) {
18421821
err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true)
18431822
if err != nil {
1844-
if isModelInitializationError(err) {
1845-
t.Skipf("Skipping consistency tests due to model initialization error: %v", err)
1846-
}
18471823
t.Fatalf("Failed to initialize embedding models: %v", err)
18481824
}
18491825

@@ -1911,12 +1887,11 @@ func TestEmbeddingConsistency(t *testing.T) {
19111887
func TestEmbeddingPriorityRouting(t *testing.T) {
19121888
err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true)
19131889
if err != nil {
1914-
if isModelInitializationError(err) {
1915-
t.Skipf("Skipping priority routing tests due to model initialization error: %v", err)
1916-
}
19171890
t.Fatalf("Failed to initialize embedding models: %v", err)
19181891
}
19191892

1893+
// Note: These tests use Matryoshka dimension truncation (768) with Qwen3 model
1894+
// The dimension is truncated from Qwen3's full 1024 dimensions
19201895
testCases := []struct {
19211896
name string
19221897
text string
@@ -1931,23 +1906,23 @@ func TestEmbeddingPriorityRouting(t *testing.T) {
19311906
qualityPriority: 0.2,
19321907
latencyPriority: 0.9,
19331908
expectedDim: 768,
1934-
description: "Should prefer faster embedding model (Gemma > Qwen3)",
1909+
description: "Uses Qwen3 with Matryoshka 768 truncation",
19351910
},
19361911
{
19371912
name: "HighQualityPriority",
19381913
text: strings.Repeat("Long context text ", 30),
19391914
qualityPriority: 0.9,
19401915
latencyPriority: 0.2,
19411916
expectedDim: 768,
1942-
description: "Should prefer quality model (Qwen3/Gemma)",
1917+
description: "Uses Qwen3 with Matryoshka 768 truncation",
19431918
},
19441919
{
19451920
name: "BalancedPriority",
19461921
text: "Medium length text for embedding",
19471922
qualityPriority: 0.5,
19481923
latencyPriority: 0.5,
19491924
expectedDim: 768,
1950-
description: "Should select based on text length",
1925+
description: "Uses Qwen3 with Matryoshka 768 truncation",
19511926
},
19521927
}
19531928

candle-binding/src/classifiers/unified.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use parking_lot::RwLock;
1111
use std::collections::HashMap;
1212
use std::time::Instant;
1313

14+
use crate::ffi::embedding::GLOBAL_MODEL_FACTORY;
1415
use crate::model_architectures::config::{DualPathConfig, LoRAConfig, TraditionalConfig};
1516
use crate::model_architectures::routing::{DualPathRouter, ProcessingRequirements};
1617
use crate::model_architectures::traits::*;
@@ -1024,6 +1025,45 @@ impl DualPathUnifiedClassifier {
10241025
model_type
10251026
};
10261027

1028+
// Validate model availability and fall back if necessary
1029+
let model_type = match model_type {
1030+
ModelType::GemmaEmbedding => {
1031+
// Check if Gemma is available
1032+
if let Some(factory) = GLOBAL_MODEL_FACTORY.get() {
1033+
if factory.get_gemma_model().is_none() {
1034+
// Gemma not available, fall back to Qwen3
1035+
eprintln!(
1036+
"WARNING: GemmaEmbedding selected but not available, falling back to Qwen3Embedding"
1037+
);
1038+
ModelType::Qwen3Embedding
1039+
} else {
1040+
ModelType::GemmaEmbedding
1041+
}
1042+
} else {
1043+
// No factory available, fall back to Qwen3
1044+
eprintln!(
1045+
"WARNING: ModelFactory not initialized, falling back to Qwen3Embedding"
1046+
);
1047+
ModelType::Qwen3Embedding
1048+
}
1049+
}
1050+
ModelType::Qwen3Embedding => {
1051+
// Qwen3 is the default, should always be available
1052+
// But verify just in case
1053+
if let Some(factory) = GLOBAL_MODEL_FACTORY.get() {
1054+
if factory.get_qwen3_model().is_none() {
1055+
return Err(UnifiedClassifierError::ProcessingError(
1056+
"Qwen3Embedding selected but not available and no fallback available"
1057+
.to_string(),
1058+
));
1059+
}
1060+
}
1061+
ModelType::Qwen3Embedding
1062+
}
1063+
// For non-embedding types, pass through
1064+
other => other,
1065+
};
1066+
10271067
// Log routing decision for monitoring
10281068
if self.config.embedding.enable_performance_tracking {
10291069
println!(

candle-binding/src/ffi/embedding.rs

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ enum PaddingSide {
2929
}
3030

3131
/// Global singleton for ModelFactory
32-
static GLOBAL_MODEL_FACTORY: OnceLock<ModelFactory> = OnceLock::new();
32+
pub(crate) static GLOBAL_MODEL_FACTORY: OnceLock<ModelFactory> = OnceLock::new();
3333

3434
/// Generic internal helper for single text embedding generation
3535
///
@@ -77,14 +77,18 @@ where
7777

7878
// Apply Matryoshka truncation if requested
7979
let result = if let Some(dim) = target_dim {
80-
if dim > embedding_vec.len() {
81-
return Err(format!(
82-
"Target dimension {} exceeds model dimension {}",
80+
// Gracefully degrade to model's max dimension if requested dimension is too large
81+
let actual_dim = if dim > embedding_vec.len() {
82+
eprintln!(
83+
"WARNING: Requested dimension {} exceeds model dimension {}, using full dimension",
8384
dim,
8485
embedding_vec.len()
85-
));
86-
}
87-
embedding_vec[..dim].to_vec()
86+
);
87+
embedding_vec.len()
88+
} else {
89+
dim
90+
};
91+
embedding_vec[..actual_dim].to_vec()
8892
} else {
8993
embedding_vec
9094
};
@@ -185,15 +189,19 @@ where
185189

186190
// Apply Matryoshka truncation if requested
187191
let result_embeddings = if let Some(dim) = target_dim {
188-
if dim > embedding_dim {
189-
return Err(format!(
190-
"Target dimension {} exceeds model dimension {}",
192+
// Gracefully degrade to model's max dimension if requested dimension is too large
193+
let actual_dim = if dim > embedding_dim {
194+
eprintln!(
195+
"WARNING: Requested dimension {} exceeds model dimension {}, using full dimension",
191196
dim, embedding_dim
192-
));
193-
}
197+
);
198+
embedding_dim
199+
} else {
200+
dim
201+
};
194202
embeddings_data
195203
.into_iter()
196-
.map(|emb| emb[..dim].to_vec())
204+
.map(|emb| emb[..actual_dim].to_vec())
197205
.collect()
198206
} else {
199207
embeddings_data
@@ -207,11 +215,11 @@ where
207215
/// # Safety
208216
/// - `qwen3_model_path` and `gemma_model_path` must be valid null-terminated C strings or null
209217
/// - Must be called before any embedding generation functions
210-
/// - Can only be called once (subsequent calls will be ignored)
218+
/// - Can only be called once (subsequent calls will return true as already initialized)
211219
///
212220
/// # Returns
213-
/// - `true` if initialization succeeded
214-
/// - `false` if initialization failed or already initialized
221+
/// - `true` if initialization succeeded or already initialized
222+
/// - `false` if initialization failed
215223
#[no_mangle]
216224
pub extern "C" fn init_embedding_models(
217225
qwen3_model_path: *const c_char,
@@ -220,6 +228,12 @@ pub extern "C" fn init_embedding_models(
220228
) -> bool {
221229
use candle_core::Device;
222230

231+
// Check if already initialized (OnceLock can only be set once)
232+
if GLOBAL_MODEL_FACTORY.get().is_some() {
233+
eprintln!("WARNING: ModelFactory already initialized");
234+
return true; // Already initialized, return success
235+
}
236+
223237
// Parse model paths
224238
let qwen3_path = if qwen3_model_path.is_null() {
225239
None

0 commit comments

Comments (0)