11 changes: 11 additions & 0 deletions .github/workflows/integration-test-k8s.yml
@@ -75,6 +75,17 @@ jobs:
run: |
make build-e2e

- name: Free up disk space
Collaborator comment:
This is a good start; I think the /mnt directory can be used if possible (e.g., move the models there and symlink them back). A sketch of that approach follows this diff.

run: |
# Remove unnecessary toolchains to free ~25GB disk space
# This helps prevent "no space left on device" errors
echo "Disk before cleanup:"
df -h /
# Note: Do NOT remove $AGENT_TOOLSDIRECTORY - it contains Go/Rust from setup actions
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/share/boost /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true
echo "Disk after cleanup:"
df -h /

- name: Run Integration E2E tests (${{ matrix.profile }})
id: e2e-test
run: |
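The reviewer's /mnt suggestion is worth sketching. GitHub-hosted Ubuntu runners mount a second, largely empty volume at /mnt, so rather than only deleting toolchains, the workflow could relocate the downloaded models there and symlink them back into place. A minimal sketch, assuming the models land in ./models in an earlier step (the step name and paths are illustrative, not part of this PR):

```yaml
- name: Move models to /mnt and symlink back
  run: |
    sudo mkdir -p /mnt/models
    sudo chown "$USER" /mnt/models
    # Move any existing contents, then swap the directory for a symlink
    if [ -d models ] && [ ! -L models ]; then
      mv models/* /mnt/models/ 2>/dev/null || true
      rm -rf models
    fi
    ln -sfn /mnt/models models
    df -h / /mnt
```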
93 changes: 34 additions & 59 deletions candle-binding/semantic-router_test.go
@@ -1478,59 +1478,55 @@ func TestGetEmbeddingSmart(t *testing.T) {
// Initialize embedding models first
err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true)
if err != nil {
if isModelInitializationError(err) {
t.Skipf("Skipping GetEmbeddingSmart tests due to model initialization error: %v", err)
}
t.Fatalf("Failed to initialize embedding models: %v", err)
}

t.Run("ShortTextHighLatency", func(t *testing.T) {
// Short text with high latency priority should use Traditional BERT
// Short text with high latency priority - uses Qwen3 (1024) since Gemma is not available
text := "Hello world"
embedding, err := GetEmbeddingSmart(text, 0.3, 0.8)

if err != nil {
t.Logf("GetEmbeddingSmart returned error (expected for placeholder): %v", err)
// This is expected since we're using placeholder implementation
return
t.Fatalf("GetEmbeddingSmart failed: %v", err)
}

if len(embedding) != 768 {
t.Errorf("Expected 768-dim embedding, got %d", len(embedding))
// Expect Qwen3 (1024) dimension since Gemma is not available
if len(embedding) != 1024 {
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
}

t.Logf("Short text embedding generated: dim=%d", len(embedding))
})

t.Run("MediumTextBalanced", func(t *testing.T) {
// Medium text with balanced priorities - may select Qwen3 (1024) or Gemma (768)
// Medium text with balanced priorities - uses Qwen3 (1024) since Gemma is not available
text := strings.Repeat("This is a medium length text with enough words to exceed 512 tokens. ", 10)
embedding, err := GetEmbeddingSmart(text, 0.5, 0.5)

if err != nil {
t.Fatalf("GetEmbeddingSmart failed: %v", err)
}

// Accept both Qwen3 (1024) and Gemma (768) dimensions
if len(embedding) != 768 && len(embedding) != 1024 {
t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
// Expect Qwen3 (1024) dimension since Gemma is not available
if len(embedding) != 1024 {
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
}

t.Logf("Medium text embedding generated: dim=%d", len(embedding))
})

t.Run("LongTextHighQuality", func(t *testing.T) {
// Long text with high quality priority should use Qwen3
// Long text with high quality priority should use Qwen3 (1024)
text := strings.Repeat("This is a very long document that requires Qwen3's 32K context support. ", 50)
embedding, err := GetEmbeddingSmart(text, 0.9, 0.2)

if err != nil {
t.Logf("GetEmbeddingSmart returned error (expected for placeholder): %v", err)
return
t.Fatalf("GetEmbeddingSmart failed: %v", err)
}

if len(embedding) != 768 {
t.Errorf("Expected 768-dim embedding, got %d", len(embedding))
// Expect Qwen3 (1024) dimension
if len(embedding) != 1024 {
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
}

t.Logf("Long text embedding generated: dim=%d", len(embedding))
@@ -1573,9 +1569,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
return
}

// Smart routing may select Qwen3 (1024) or Gemma (768) based on priorities
if len(embedding) != 768 && len(embedding) != 1024 {
t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
// Expect Qwen3 (1024) since Gemma is not available
if len(embedding) != 1024 {
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
}
t.Logf("Priority test %s: generated %d-dim embedding", tc.desc, len(embedding))
})
@@ -1598,9 +1594,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
continue
}

// Smart routing may select Qwen3 (1024) or Gemma (768)
if len(embedding) != 768 && len(embedding) != 1024 {
t.Errorf("Iteration %d: Expected 768 or 1024-dim embedding, got %d", i, len(embedding))
// Expect Qwen3 (1024) since Gemma is not available
if len(embedding) != 1024 {
t.Errorf("Iteration %d: Expected 1024-dim embedding, got %d", i, len(embedding))
}

// Verify no nil pointers
@@ -1639,11 +1635,12 @@ func BenchmarkGetEmbeddingSmart(b *testing.B) {
}

// Test constants for embedding models (Phase 4.2)
// Note: Gemma model is gated and requires HF_TOKEN, so tests use Qwen3 only
const (
Qwen3EmbeddingModelPath = "../models/Qwen3-Embedding-0.6B"
GemmaEmbeddingModelPath = "../models/embeddinggemma-300m"
GemmaEmbeddingModelPath = "" // Gemma is gated, not used in CI tests
TestEmbeddingText = "This is a test sentence for embedding generation"
TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3 or Gemma"
TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3"
)

// Test constants for Qwen3 Multi-LoRA
@@ -1705,23 +1702,8 @@ func TestInitEmbeddingModels(t *testing.T) {
})

t.Run("InitGemmaOnly", func(t *testing.T) {
// Similar to InitBothModels, accept already-initialized state
err := InitEmbeddingModels("", GemmaEmbeddingModelPath, true)
if err != nil {
t.Logf("InitEmbeddingModels (Gemma only) returned error (may already be initialized): %v", err)

// Verify functionality
_, testErr := GetEmbeddingSmart("test", 0.5, 0.5)
if testErr == nil {
t.Log("✓ ModelFactory is functional (already initialized)")
} else {
if isModelInitializationError(testErr) {
t.Skipf("Skipping test due to model unavailability: %v", testErr)
}
}
} else {
t.Log("✓ Gemma model initialized successfully")
}
// Gemma is a gated model requiring HF_TOKEN, skip in CI
t.Skip("Skipping Gemma-only test: Gemma is a gated model requiring HF_TOKEN")
})

t.Run("InitWithInvalidPaths", func(t *testing.T) {
@@ -1739,9 +1721,6 @@ func TestGetEmbeddingWithDim(t *testing.T) {
// Initialize embedding models first
err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true)
if err != nil {
if isModelInitializationError(err) {
t.Skipf("Skipping GetEmbeddingWithDim tests due to model initialization error: %v", err)
}
t.Fatalf("Failed to initialize embedding models: %v", err)
}

@@ -1806,16 +1785,16 @@ func TestGetEmbeddingWithDim(t *testing.T) {

t.Run("OversizedDimension", func(t *testing.T) {
// Test graceful degradation when requested dimension exceeds model capacity
// Qwen3: 1024, Gemma: 768, so 2048 should fall back to full dimension
// Qwen3: 1024, so 2048 should fall back to full dimension
embedding, err := GetEmbeddingWithDim(TestEmbeddingText, 0.5, 0.5, 2048)
if err != nil {
t.Errorf("Should gracefully handle oversized dimension, got error: %v", err)
return
}

// Should return full dimension (1024 for Qwen3 or 768 for Gemma)
if len(embedding) != 1024 && len(embedding) != 768 {
t.Errorf("Expected full dimension (1024 or 768), got %d", len(embedding))
// Should return full dimension (1024 for Qwen3)
if len(embedding) != 1024 {
t.Errorf("Expected full dimension (1024), got %d", len(embedding))
} else {
t.Logf("✓ Oversized dimension gracefully degraded to full dimension: %d", len(embedding))
}
@@ -1841,9 +1820,6 @@ func TestGetEmbeddingWithDim(t *testing.T) {
func TestEmbeddingConsistency(t *testing.T) {
err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true)
if err != nil {
if isModelInitializationError(err) {
t.Skipf("Skipping consistency tests due to model initialization error: %v", err)
}
t.Fatalf("Failed to initialize embedding models: %v", err)
}

@@ -1911,12 +1887,11 @@ func TestEmbeddingConsistency(t *testing.T) {
func TestEmbeddingPriorityRouting(t *testing.T) {
err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true)
if err != nil {
if isModelInitializationError(err) {
t.Skipf("Skipping priority routing tests due to model initialization error: %v", err)
}
t.Fatalf("Failed to initialize embedding models: %v", err)
}

// Note: These tests use Matryoshka dimension truncation (768) with Qwen3 model
// The dimension is truncated from Qwen3's full 1024 dimensions
testCases := []struct {
name string
text string
@@ -1931,23 +1906,23 @@ func TestEmbeddingPriorityRouting(t *testing.T) {
qualityPriority: 0.2,
latencyPriority: 0.9,
expectedDim: 768,
description: "Should prefer faster embedding model (Gemma > Qwen3)",
description: "Uses Qwen3 with Matryoshka 768 truncation",
},
{
name: "HighQualityPriority",
text: strings.Repeat("Long context text ", 30),
qualityPriority: 0.9,
latencyPriority: 0.2,
expectedDim: 768,
description: "Should prefer quality model (Qwen3/Gemma)",
description: "Uses Qwen3 with Matryoshka 768 truncation",
},
{
name: "BalancedPriority",
text: "Medium length text for embedding",
qualityPriority: 0.5,
latencyPriority: 0.5,
expectedDim: 768,
description: "Should select based on text length",
description: "Uses Qwen3 with Matryoshka 768 truncation",
},
}

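The Matryoshka note above is the behavioral crux of this file's changes: with Gemma gated behind HF_TOKEN and its path left empty, every 768-dim expectation is met by truncating Qwen3's full 1024-dim output rather than by model selection. A short usage sketch against the bindings these tests exercise (the import path and package alias are assumptions; the signatures mirror the calls above):

```go
package main

import (
	"fmt"
	"log"

	candle "github.com/vllm-project/semantic-router/candle-binding" // assumed import path
)

func main() {
	// Qwen3 only; the Gemma path is empty because the model is gated (requires HF_TOKEN).
	if err := candle.InitEmbeddingModels("models/Qwen3-Embedding-0.6B", "", true); err != nil {
		log.Fatalf("init: %v", err)
	}
	// quality=0.5, latency=0.5; request a 768-dim Matryoshka truncation.
	emb, err := candle.GetEmbeddingWithDim("hello world", 0.5, 0.5, 768)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(len(emb)) // 768, truncated from Qwen3's native 1024
}
```

Requests above the model's capacity (e.g. 2048) no longer error; per the FFI change below, they degrade to the full 1024 dimensions with a warning.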
40 changes: 40 additions & 0 deletions candle-binding/src/classifiers/unified.rs
@@ -11,6 +11,7 @@ use parking_lot::RwLock;
use std::collections::HashMap;
use std::time::Instant;

use crate::ffi::embedding::GLOBAL_MODEL_FACTORY;
use crate::model_architectures::config::{DualPathConfig, LoRAConfig, TraditionalConfig};
use crate::model_architectures::routing::{DualPathRouter, ProcessingRequirements};
use crate::model_architectures::traits::*;
@@ -1024,6 +1025,45 @@ impl DualPathUnifiedClassifier {
model_type
};

// Validate model availability and fall back if necessary
Collaborator comment:
It may not be the best option to fall back to a different model. Would you mind disabling the Gemma test and model download in CI?

let model_type = match model_type {
ModelType::GemmaEmbedding => {
// Check if Gemma is available
if let Some(factory) = GLOBAL_MODEL_FACTORY.get() {
if factory.get_gemma_model().is_none() {
// Gemma not available, fall back to Qwen3
eprintln!(
"WARNING: GemmaEmbedding selected but not available, falling back to Qwen3Embedding"
);
ModelType::Qwen3Embedding
} else {
ModelType::GemmaEmbedding
}
} else {
// No factory available, fall back to Qwen3
eprintln!(
"WARNING: ModelFactory not initialized, falling back to Qwen3Embedding"
);
ModelType::Qwen3Embedding
}
}
ModelType::Qwen3Embedding => {
// Qwen3 is the default, should always be available
// But verify just in case
if let Some(factory) = GLOBAL_MODEL_FACTORY.get() {
if factory.get_qwen3_model().is_none() {
return Err(UnifiedClassifierError::ProcessingError(
"Qwen3Embedding selected but not available and no fallback available"
.to_string(),
));
}
}
ModelType::Qwen3Embedding
}
// For non-embedding types, pass through
other => other,
};

// Log routing decision for monitoring
if self.config.embedding.enable_performance_tracking {
println!(
46 changes: 30 additions & 16 deletions candle-binding/src/ffi/embedding.rs
@@ -29,7 +29,7 @@ enum PaddingSide {
}

/// Global singleton for ModelFactory
static GLOBAL_MODEL_FACTORY: OnceLock<ModelFactory> = OnceLock::new();
pub(crate) static GLOBAL_MODEL_FACTORY: OnceLock<ModelFactory> = OnceLock::new();

/// Generic internal helper for single text embedding generation
///
@@ -77,14 +77,18 @@ where

// Apply Matryoshka truncation if requested
let result = if let Some(dim) = target_dim {
if dim > embedding_vec.len() {
return Err(format!(
"Target dimension {} exceeds model dimension {}",
// Gracefully degrade to model's max dimension if requested dimension is too large
let actual_dim = if dim > embedding_vec.len() {
eprintln!(
"WARNING: Requested dimension {} exceeds model dimension {}, using full dimension",
dim,
embedding_vec.len()
));
}
embedding_vec[..dim].to_vec()
);
embedding_vec.len()
} else {
dim
};
embedding_vec[..actual_dim].to_vec()
} else {
embedding_vec
};
@@ -185,15 +189,19 @@ where

// Apply Matryoshka truncation if requested
let result_embeddings = if let Some(dim) = target_dim {
if dim > embedding_dim {
return Err(format!(
"Target dimension {} exceeds model dimension {}",
// Gracefully degrade to model's max dimension if requested dimension is too large
let actual_dim = if dim > embedding_dim {
eprintln!(
"WARNING: Requested dimension {} exceeds model dimension {}, using full dimension",
dim, embedding_dim
));
}
);
embedding_dim
} else {
dim
};
embeddings_data
.into_iter()
.map(|emb| emb[..dim].to_vec())
.map(|emb| emb[..actual_dim].to_vec())
.collect()
} else {
embeddings_data
@@ -207,11 +215,11 @@
/// # Safety
/// - `qwen3_model_path` and `gemma_model_path` must be valid null-terminated C strings or null
/// - Must be called before any embedding generation functions
/// - Can only be called once (subsequent calls will be ignored)
/// - Can only be called once (subsequent calls will return true as already initialized)
///
/// # Returns
/// - `true` if initialization succeeded
/// - `false` if initialization failed or already initialized
/// - `true` if initialization succeeded or already initialized
/// - `false` if initialization failed
#[no_mangle]
pub extern "C" fn init_embedding_models(
qwen3_model_path: *const c_char,
Expand All @@ -220,6 +228,12 @@ pub extern "C" fn init_embedding_models(
) -> bool {
use candle_core::Device;

// Check if already initialized (OnceLock can only be set once)
if GLOBAL_MODEL_FACTORY.get().is_some() {
eprintln!("WARNING: ModelFactory already initialized");
return true; // Already initialized, return success
}

// Parse model paths
let qwen3_path = if qwen3_model_path.is_null() {
None
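One contract change in this file is easy to miss in the doc-comment diff: a repeated init_embedding_models call now returns true instead of false, because a populated OnceLock is treated as success. From the Go side, re-initialization across tests becomes a harmless no-op. A hedged sketch under the same assumed binding alias as above (paths illustrative):

```go
package main

import (
	"log"

	candle "github.com/vllm-project/semantic-router/candle-binding" // assumed import path
)

func main() {
	// First call populates the Rust-side OnceLock with the ModelFactory.
	if err := candle.InitEmbeddingModels("models/Qwen3-Embedding-0.6B", "", true); err != nil {
		log.Fatalf("init: %v", err)
	}
	// Second call: the FFI logs "WARNING: ModelFactory already initialized"
	// and returns true, so the Go wrapper surfaces no error.
	if err := candle.InitEmbeddingModels("models/Qwen3-Embedding-0.6B", "", true); err != nil {
		log.Fatalf("unexpected error on re-init: %v", err)
	}
}
```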
2 changes: 1 addition & 1 deletion e2e/profiles/aibrix/profile.go
@@ -21,7 +21,7 @@ import (
const (
// Version Configuration
// AIBrix version - can be overridden via AIBRIX_VERSION environment variable
defaultAIBrixVersion = "v0.4.1"
defaultAIBrixVersion = "v0.5.0"

// Kubernetes Namespaces - used frequently throughout
namespaceSemanticRouter = "vllm-semantic-router-system"