diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..321a24c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,15 @@ +# Changelog + +## [Unreleased] + +### Added +- Jina AI embeddings v5 text backends (`jina-v5-nano`, `jina-v5-small`) via local ONNX inference +- Matryoshka representation learning: configurable truncate_dim for jina-v5 backends +- Asymmetric retrieval: `retrieval.query:` / `retrieval.passage:` instruction prefixes +- Auto re-embed on embedder dimension change (`--no-auto-reembed` to opt out) +- `embed_query` / `embed_document` distinction on the `Embedder` trait +- `icm config show` now displays active backend name and license tag +- `icm recall` now shows active model name in output header + +### License note +Jina v5 model weights are CC BY-NC 4.0 (non-commercial). Commercial use requires a license from Jina AI. diff --git a/Cargo.lock b/Cargo.lock index 73f2c1d..ed3c30b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -819,6 +819,9 @@ name = "esaxx-rs" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" +dependencies = [ + "cc", +] [[package]] name = "exr" @@ -962,6 +965,21 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + [[package]] name = "futures-channel" version = "0.3.32" @@ -969,6 +987,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" dependencies = [ "futures-core", + "futures-sink", ] [[package]] @@ -977,6 +996,17 @@ version = "0.3.32" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-io" version = "0.3.32" @@ -1012,6 +1042,7 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ + "futures-channel", "futures-core", "futures-io", "futures-macro", @@ -1140,6 +1171,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "hf-hub" version = "0.4.3" @@ -1147,16 +1184,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97" dependencies = [ "dirs", + "futures", "http", "indicatif", "libc", "log", "native-tls", + "num_cpus", "rand", "reqwest", "serde", "serde_json", "thiserror", + "tokio", "ureq", "windows-sys 0.60.2", ] @@ -1312,7 +1352,7 @@ dependencies = [ [[package]] name = "icm-cli" -version = "0.10.29" +version = "0.10.30" dependencies = [ "anyhow", "axum", @@ -1352,9 +1392,13 @@ dependencies = [ "chrono", "directories", "fastembed", + "hf-hub", + "ndarray", + "ort", "serde", "serde_json", "thiserror", + "tokenizers", "toml", "ulid", ] @@ -1711,6 +1755,16 @@ dependencies = [ "cc", ] +[[package]] +name = "libloading" +version = "0.8.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "libredox" version = "0.1.14" @@ -2049,6 +2103,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "number_prefix" version = "0.4.0" @@ -2155,6 +2219,7 @@ version = "2.0.0-rc.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52afb44b6b0cffa9bf45e4d37e5a4935b0334a51570658e279e9e3e6cf324aa5" dependencies = [ + "libloading", "ndarray", "ort-sys", "tracing", @@ -3209,6 +3274,7 @@ dependencies = [ "derive_builder", "esaxx-rs", "getrandom 0.3.4", + "indicatif", "itertools 0.14.0", "log", "macro_rules_attribute", diff --git a/Cargo.toml b/Cargo.toml index 822f16a..5b59336 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,18 @@ zerocopy = { version = "0.8", features = ["derive"] } # Embeddings (optional) fastembed = "4" +# Jina v5 embedder. These are workspace version pins only; each +# consumer crate must mark them `optional = true` and gate them behind +# the `jina-v5` feature. Default builds must NOT pull these crates. +# +# `ort` defaults pull `download-binaries` which fetches the ONNX Runtime +# at build time. We disable defaults and load the system runtime via +# `load-dynamic`; `ndarray` is required for `try_extract_tensor`. 
+hf-hub = "0.4" +ort = { version = "2.0.0-rc.9", default-features = false, features = ["load-dynamic", "ndarray"] } +tokenizers = "0.21" +ndarray = "0.16" + # Serialization serde = { version = "1", features = ["derive"] } serde_json = { version = "1", features = ["preserve_order"] } diff --git a/README.md b/README.md index ab4c0db..f78d77e 100644 --- a/README.md +++ b/README.md @@ -392,6 +392,36 @@ C:\Users\\AppData\Roaming\icm\icm\config\config.toml # Window See [config/default.toml](config/default.toml) for all options. +## Embedder backends + +ICM supports three local ONNX embedding backends — no external API required. + +| Backend | Dims | License | Notes | +|---------|------|---------|-------| +| `fastembed` (default) | 384 / 768 / 1024 (model-dependent) | Apache-2.0 | multilingual-e5-base and others via fastembed | +| `jina-v5-nano` | 32, 64, 128, 256, 512, 768 (default: 768) | CC BY-NC 4.0 | jinaai/jina-embeddings-v5-text-nano-retrieval | +| `jina-v5-small` | 32, 64, 128, 256, 512, 768, 1024 (default: 1024) | CC BY-NC 4.0 | jinaai/jina-embeddings-v5-text-small-retrieval (Qwen3-based) | + +> **IMPORTANT — Non-commercial restriction:** Jina v5 model weights are licensed under +> [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/). **Use in commercial products +> requires a commercial Jina AI license.** See https://jina.ai/contact-sales for details. +> The default `fastembed` backend (Apache-2.0) has no such restriction. + +Weights download automatically to `~/.cache/huggingface/` on first run. No account or API key needed. + +```toml +# config.toml +[embeddings] +backend = "jina-v5-nano" +truncate_dim = 512 # optional Matryoshka dim (omit to use model default) +``` + +Matryoshka truncation lets you trade accuracy for speed and storage. 
Valid dims per backend: + +- `jina-v5-nano`: 32, 64, 128, 256, 512, 768 (default: 768) +- `jina-v5-small`: 32, 64, 128, 256, 512, 768, 1024 (default: 1024) +- `fastembed`: no truncation; dim is fixed by the chosen model + ## Auto-extraction ICM extracts memories automatically via three layers: diff --git a/config/default.toml b/config/default.toml index db79ae0..3e37502 100644 --- a/config/default.toml +++ b/config/default.toml @@ -24,7 +24,15 @@ prune_threshold = 0.1 # Set to false to disable embeddings entirely (no model download, keyword search only) # enabled = false -# Embedding model (fastembed model_code). Default: multilingual-e5-small (384d, 100+ languages) +# Embedder backend: "fastembed" (default, Apache-2.0) | "jina-v5-nano" | "jina-v5-small" +# +# Jina v5 backends: CC BY-NC 4.0 (non-commercial). For commercial use, acquire a +# commercial license from Jina AI before deploying. Weights download to +# ~/.cache/huggingface/ on first run. +# +# backend = "jina-v5-nano" + +# Embedding model (fastembed model_code). Default: multilingual-e5-base (768d, 100+ languages) # Other options: # "BAAI/bge-small-en-v1.5" — 384d, English-only (fastest) # "Xenova/bge-small-en-v1.5" — 384d, quantized English-only @@ -34,6 +42,11 @@ prune_threshold = 0.1 # "jinaai/jina-embeddings-v2-base-code" — 768d, code-optimized model = "intfloat/multilingual-e5-base" +# Matryoshka truncation (jina-v5 backends only). 
Valid dims: +# jina-v5-nano: 32, 64, 128, 256, 512, 768 (default: 768) +# jina-v5-small: 32, 64, 128, 256, 512, 768, 1024 (default: 1024) +# truncate_dim = 512 + [extraction] # Layer 0: rule-based fact extraction (zero LLM cost) enabled = true diff --git a/crates/icm-cli/Cargo.toml b/crates/icm-cli/Cargo.toml index 5e6f546..f4dd55d 100644 --- a/crates/icm-cli/Cargo.toml +++ b/crates/icm-cli/Cargo.toml @@ -15,6 +15,7 @@ path = "src/main.rs" [features] default = ["embeddings", "tui"] embeddings = ["icm-core/embeddings", "icm-mcp/embeddings"] +jina-v5 = ["icm-core/jina-v5"] tui = ["dep:ratatui", "dep:crossterm"] web = ["dep:axum", "dep:tokio", "dep:tower-http", "dep:rust-embed", "dep:mime_guess", "dep:getrandom"] vendored-openssl = ["openssl/vendored"] diff --git a/crates/icm-cli/src/config.rs b/crates/icm-cli/src/config.rs index ec8e3f9..15b5740 100644 --- a/crates/icm-cli/src/config.rs +++ b/crates/icm-cli/src/config.rs @@ -46,21 +46,42 @@ pub struct MemoryConfig { pub auto_consolidate_threshold: usize, } +/// Which embedder backend to use for `EmbeddingsConfig.backend`. +#[derive(Debug, Deserialize, Default, PartialEq, Eq, Clone)] +#[serde(rename_all = "kebab-case")] +pub enum EmbedderBackend { + /// fastembed (default) — multilingual-e5-base etc., Apache-2.0 weights. + #[default] + Fastembed, + /// jina-embeddings-v5-text-nano-retrieval — local ONNX, CC-BY-NC-4.0. + JinaV5Nano, + /// jina-embeddings-v5-text-small-retrieval (Qwen3-based) — local ONNX, CC-BY-NC-4.0. + JinaV5Small, +} + /// Embedding model settings. #[derive(Debug, Deserialize)] #[serde(default)] pub struct EmbeddingsConfig { /// Enable embeddings (set to false to skip model download entirely). pub enabled: bool, - /// Model identifier (fastembed model_code, e.g. "intfloat/multilingual-e5-small"). + /// Which embedder backend to use. + pub backend: EmbedderBackend, + /// Model identifier for the fastembed backend + /// (e.g. "intfloat/multilingual-e5-base"). Ignored by other backends. 
pub model: String, + /// Matryoshka truncation dimension. `None` = use the model's default + /// output dimension. Consumed by the jina-v5-nano and jina-v5-small backends. + pub truncate_dim: Option, } impl Default for EmbeddingsConfig { fn default() -> Self { Self { enabled: true, + backend: EmbedderBackend::Fastembed, model: "intfloat/multilingual-e5-base".into(), + truncate_dim: None, } } } diff --git a/crates/icm-cli/src/learn_tests.rs b/crates/icm-cli/src/learn_tests.rs index 1412e1e..fd45d96 100644 --- a/crates/icm-cli/src/learn_tests.rs +++ b/crates/icm-cli/src/learn_tests.rs @@ -9,7 +9,7 @@ mod tests { fn test_store() -> (TempDir, SqliteStore) { let tmp = TempDir::new().expect("failed to create temp dir"); let db_path = tmp.path().join("test.db"); - let store = SqliteStore::with_dims(&db_path, 384).expect("failed to create store"); + let (store, _) = SqliteStore::with_dims(&db_path, 384).expect("failed to create store"); (tmp, store) } diff --git a/crates/icm-cli/src/main.rs b/crates/icm-cli/src/main.rs index eb78a42..1d7065e 100644 --- a/crates/icm-cli/src/main.rs +++ b/crates/icm-cli/src/main.rs @@ -6,6 +6,8 @@ mod extract; mod import; #[cfg(test)] mod learn_tests; +#[cfg(test)] +mod migration_tests; #[cfg(feature = "tui")] mod tui; mod upgrade; @@ -26,6 +28,8 @@ use icm_core::{ }; use icm_store::SqliteStore; +use crate::config::EmbedderBackend; + #[derive(Parser)] #[command( name = "icm", @@ -41,6 +45,10 @@ struct Cli { #[arg(long, global = true)] no_embeddings: bool, + /// Skip automatic re-embedding when the embedder model changes. 
+ #[arg(long, global = true, default_value_t = false)] + no_auto_reembed: bool, + #[command(subcommand)] command: Commands, } @@ -873,19 +881,76 @@ fn default_db_path() -> PathBuf { .unwrap_or_else(|| PathBuf::from("memories.db")) } -fn open_store(db: Option, embedding_dims: usize) -> Result { +fn open_store( + db: Option, + embedding_dims: usize, +) -> Result<(SqliteStore, icm_store::MigrationStatus)> { let path = db.unwrap_or_else(default_db_path); SqliteStore::with_dims(&path, embedding_dims).context("failed to open database") } -#[cfg(feature = "embeddings")] -fn init_embedder(model: &str) -> Option { - Some(icm_core::FastEmbedder::with_model(model)) +#[cfg(any(feature = "embeddings", feature = "jina-v5"))] +fn init_embedder( + cfg: &config::EmbeddingsConfig, +) -> Result>> { + use config::EmbedderBackend; + match cfg.backend { + #[cfg(feature = "embeddings")] + EmbedderBackend::Fastembed => Ok(Some(Box::new(icm_core::FastEmbedder::with_model( + &cfg.model, + )))), + #[cfg(not(feature = "embeddings"))] + EmbedderBackend::Fastembed => Err(anyhow::anyhow!( + "config requests backend `fastembed` but this binary was built \ + without the `embeddings` feature. Rebuild with \ + `--features embeddings` or set `embeddings.backend = \"jina-v5-nano\"`." + )), + #[cfg(feature = "jina-v5")] + EmbedderBackend::JinaV5Nano => { + let emb = icm_core::JinaV5NanoEmbedder::new(cfg.truncate_dim) + .map_err(|e| anyhow::anyhow!("jina-v5-nano init: {e}"))?; + Ok(Some(Box::new(emb))) + } + #[cfg(not(feature = "jina-v5"))] + EmbedderBackend::JinaV5Nano => Err(anyhow::anyhow!( + "config requests backend `jina-v5-nano` but this binary was built \ + without the `jina-v5` feature. Rebuild with `--features jina-v5` \ + or set `embeddings.backend = \"fastembed\"`." 
+ )), + #[cfg(feature = "jina-v5")] + EmbedderBackend::JinaV5Small => { + let emb = icm_core::JinaV5SmallEmbedder::new(cfg.truncate_dim) + .map_err(|e| anyhow::anyhow!("jina-v5-small init: {e}"))?; + Ok(Some(Box::new(emb))) + } + #[cfg(not(feature = "jina-v5"))] + EmbedderBackend::JinaV5Small => Err(anyhow::anyhow!( + "config requests backend `jina-v5-small` but this binary was built \ + without the `jina-v5` feature. Rebuild with `--features jina-v5` \ + or set `embeddings.backend = \"fastembed\"`." + )), + } } -#[cfg(not(feature = "embeddings"))] -fn init_embedder(_model: &str) -> Option<()> { - None +#[cfg(not(any(feature = "embeddings", feature = "jina-v5")))] +fn init_embedder( + cfg: &config::EmbeddingsConfig, +) -> Result>> { + use config::EmbedderBackend; + match cfg.backend { + EmbedderBackend::Fastembed => Err(anyhow::anyhow!( + "config requests backend `fastembed` but this binary was built \ + without the `embeddings` feature." + )), + EmbedderBackend::JinaV5Nano => Err(anyhow::anyhow!( + "config requests backend `jina-v5-nano` but this binary was built \ + without the `jina-v5` feature." + )), + EmbedderBackend::JinaV5Small => Err(anyhow::anyhow!( + "config requests backend `jina-v5-small` but this binary was built \ + without the `jina-v5` feature." + )), + } } fn main() -> Result<()> { @@ -907,21 +972,54 @@ fn main() -> Result<()> { let cfg = config::load_config()?; let embeddings_enabled = cfg.embeddings.enabled && !cli.no_embeddings && std::env::var("ICM_NO_EMBEDDINGS").is_err(); - #[allow(unused_variables)] - let embedder = if embeddings_enabled { - init_embedder(&cfg.embeddings.model) + let embedder: Option> = if embeddings_enabled { + init_embedder(&cfg.embeddings)? 
} else { None }; let embedding_dims = embedder .as_ref() - .map(|e| { - use icm_core::Embedder; - e.dimensions() - }) + .map(|e| e.dimensions()) .unwrap_or(icm_core::DEFAULT_EMBEDDING_DIMS); let db_path = cli.db.clone().unwrap_or_else(default_db_path); - let store = open_store(cli.db, embedding_dims)?; + let (store, migration_status) = open_store(cli.db, embedding_dims)?; + if migration_status.dim_changed { + eprintln!( + "Embedding dim changed ({} -> {}): {} memories cleared.", + migration_status.old_dim, + migration_status.new_dim, + migration_status.affected_rows + ); + // The MCP server (Commands::Serve) always auto-reembeds; the + // --no-auto-reembed flag only applies to interactive CLI invocations. + let is_serve = matches!(&cli.command, Commands::Serve { .. }); + if cli.no_auto_reembed && !is_serve { + eprintln!( + "Skipping auto re-embed (--no-auto-reembed). \ + Run `icm embed --all` manually." + ); + } else { + #[cfg(any(feature = "embeddings", feature = "jina-v5"))] + if let Some(emb) = embedder.as_deref() { + eprintln!( + "Auto re-embedding {} memories \ + (use --no-auto-reembed to skip)...", + migration_status.affected_rows + ); + cmd_embed(&store, emb, None, false, 32)?; + } else { + eprintln!( + "No embedder active — run `icm embed --all` \ + after enabling embeddings." + ); + } + #[cfg(not(any(feature = "embeddings", feature = "jina-v5")))] + eprintln!( + "No embedder active — run `icm embed --all` \ + after enabling embeddings." 
+ ); + } + } match cli.command { Commands::Store { @@ -931,10 +1029,7 @@ fn main() -> Result<()> { keywords, raw, } => { - #[cfg(feature = "embeddings")] - let emb_ref = embedder.as_ref().map(|e| e as &dyn icm_core::Embedder); - #[cfg(not(feature = "embeddings"))] - let emb_ref: Option<&dyn icm_core::Embedder> = None; + let emb_ref = embedder.as_deref(); cmd_store( &store, emb_ref, @@ -951,10 +1046,7 @@ fn main() -> Result<()> { limit, keyword, } => { - #[cfg(feature = "embeddings")] - let emb_ref = embedder.as_ref().map(|e| e as &dyn icm_core::Embedder); - #[cfg(not(feature = "embeddings"))] - let emb_ref: Option<&dyn icm_core::Embedder> = None; + let emb_ref = embedder.as_deref(); cmd_recall( &store, emb_ref, @@ -972,10 +1064,7 @@ fn main() -> Result<()> { importance, keywords, } => { - #[cfg(feature = "embeddings")] - let emb_ref = embedder.as_ref().map(|e| e as &dyn icm_core::Embedder); - #[cfg(not(feature = "embeddings"))] - let emb_ref: Option<&dyn icm_core::Embedder> = None; + let emb_ref = embedder.as_deref(); cmd_update(&store, emb_ref, &id, content, importance, keywords) } Commands::Health { topic } => cmd_health(&store, topic.as_deref()), @@ -1063,7 +1152,7 @@ fn main() -> Result<()> { } => { #[cfg(feature = "embeddings")] { - let emb = match embedder.as_ref() { + let emb = match embedder.as_deref() { Some(e) => e, None => bail!("embeddings not available — check your configuration"), }; @@ -1157,7 +1246,7 @@ fn main() -> Result<()> { importance, keywords, } => { - let emb_ref = embedder.as_ref().map(|e| e as &dyn icm_core::Embedder); + let emb_ref = embedder.as_deref(); cmd_save_project(&store, emb_ref, &content, importance.into(), keywords) } Commands::Learn { dir, name } => { @@ -1199,10 +1288,7 @@ fn main() -> Result<()> { password, ); } - #[cfg(feature = "embeddings")] - let emb_ref = embedder.as_ref().map(|e| e as &dyn icm_core::Embedder); - #[cfg(not(feature = "embeddings"))] - let emb_ref: Option<&dyn icm_core::Embedder> = None; + let emb_ref = 
embedder.as_deref(); // --compact flag overrides, otherwise use config (default: true) let use_compact = compact || cfg.mcp.compact; icm_mcp::run_server(&store, emb_ref, use_compact) @@ -1317,7 +1403,7 @@ fn cmd_recall( // Try hybrid search if embedder is available if let Some(emb) = embedder { - if let Ok(query_emb) = emb.embed(query) { + if let Ok(query_emb) = emb.embed_query(query) { if let Ok(results) = store.search_hybrid(query, &query_emb, limit) { let mut scored = results; if let Some(t) = topic { @@ -1339,6 +1425,10 @@ fn cmd_recall( return Ok(()); } + if !emb.model_name().is_empty() { + println!("model: {}", emb.model_name()); + println!(); + } let ids: Vec<&str> = expanded.iter().map(|(m, _)| m.id.as_str()).collect(); let _ = store.batch_update_access(&ids); for (mem, score) in &expanded { @@ -3583,6 +3673,33 @@ fn inject_opencode_mcp_server(config_path: &Path, name: &str, icm_bin: &str) -> Ok("configured".into()) } +/// Returns `(display_name, spdx_license)` for the given embedder backend. +/// This is the single source of truth used by both `cmd_config` output and tests. +fn backend_info(backend: &EmbedderBackend) -> (&'static str, &'static str) { + match backend { + EmbedderBackend::Fastembed => ("fastembed", "Apache-2.0"), + EmbedderBackend::JinaV5Nano => ("jina-v5-nano", "CC-BY-NC-4.0, non-commercial"), + EmbedderBackend::JinaV5Small => ("jina-v5-small", "CC-BY-NC-4.0, non-commercial"), + } +} + +/// Formats the `[embeddings]` config section into a `String`. +/// Extracted so that tests can assert on the real rendered output. 
+fn format_embeddings_section(cfg: &config::EmbeddingsConfig) -> String { + let (backend_name, license) = backend_info(&cfg.backend); + let mut out = String::new(); + out.push_str("[embeddings]\n"); + out.push_str(&format!(" backend = {backend_name}\n")); + out.push_str(&format!(" license = {license}\n")); + if !cfg.model.is_empty() && cfg.backend == EmbedderBackend::Fastembed { + out.push_str(&format!(" model = {}\n", cfg.model)); + } + if let Some(dim) = cfg.truncate_dim { + out.push_str(&format!(" truncate_dim = {dim}\n")); + } + out +} + fn cmd_config() -> Result<()> { let cfg = config::load_config()?; println!("Config: {}", config::show_config_path()); @@ -3601,8 +3718,7 @@ fn cmd_config() -> Result<()> { println!(" decay_rate = {}", cfg.memory.decay_rate); println!(" prune_threshold = {}", cfg.memory.prune_threshold); println!(); - println!("[embeddings]"); - println!(" model = {}", cfg.embeddings.model); + print!("{}", format_embeddings_section(&cfg.embeddings)); println!(); println!("[extraction]"); println!(" enabled = {}", cfg.extraction.enabled); @@ -3837,7 +3953,7 @@ fn cmd_save_project( ) } -#[cfg(feature = "embeddings")] +#[cfg(any(feature = "embeddings", feature = "jina-v5"))] fn cmd_embed( store: &SqliteStore, embedder: &dyn icm_core::Embedder, @@ -4135,10 +4251,12 @@ fn cmd_bench_recall(model: &str, runs: usize, verbose: bool) -> Result<()> { }); std::fs::write(&mcp_config_path, serde_json::to_string_pretty(&mcp_config)?)?; { - let _ = SqliteStore::new(&icm_db)?; + let (_, _) = SqliteStore::new(&icm_db)?; } // === WITHOUT ICM === + + eprintln!("=== WITHOUT ICM ==="); let s1_prompt = format!( "{}{}", @@ -4179,7 +4297,7 @@ fn cmd_bench_recall(model: &str, runs: usize, verbose: bool) -> Result<()> { eprintln!(" done ({:.1}s)", s1_icm.duration_ms as f64 / 1000.0); { - let store = SqliteStore::new(&icm_db)?; + let (store, _) = SqliteStore::new(&icm_db)?; let ext1 = extract::extract_and_store(&store, bench_knowledge::SOURCE_DOCUMENT, "meridian")?; let 
ext2 = extract::extract_and_store(&store, &s1_icm.response, "meridian")?; @@ -4197,7 +4315,7 @@ fn cmd_bench_recall(model: &str, runs: usize, verbose: bool) -> Result<()> { let mut scores_with: Vec<(usize, usize, f64)> = Vec::new(); let mut responses_with: Vec = Vec::new(); for (i, q) in questions.iter().enumerate() { - let store = SqliteStore::new(&icm_db)?; + let (store, _) = SqliteStore::new(&icm_db)?; let ctx = extract::recall_context(&store, q.prompt, None, 15)?; if verbose && !ctx.is_empty() { eprintln!(" [verbose] Context injected for Q{}:", i + 1); @@ -4220,7 +4338,7 @@ fn cmd_bench_recall(model: &str, runs: usize, verbose: bool) -> Result<()> { eprintln!(" Response: {}", truncate_words(&result.response, 200)); } { - let store = SqliteStore::new(&icm_db)?; + let (store, _) = SqliteStore::new(&icm_db)?; let _ = extract::extract_and_store(&store, &result.response, "meridian"); } scores_with.push(score); @@ -4415,13 +4533,13 @@ fn cmd_bench_agent(sessions: usize, model: &str, runs: usize, verbose: bool) -> }); std::fs::write(&mcp_config_path, serde_json::to_string_pretty(&mcp_config)?)?; { - let _ = SqliteStore::new(&icm_db)?; + let (_, _) = SqliteStore::new(&icm_db)?; } let mut results_with: Vec = Vec::new(); for (i, prompt) in prompts.iter().enumerate() { let effective_prompt = if i > 0 { - let store = SqliteStore::new(&icm_db)?; + let (store, _) = SqliteStore::new(&icm_db)?; let ctx = extract::recall_context(&store, prompt, None, 15)?; if verbose && !ctx.is_empty() { eprintln!(" [verbose] Context injected for session {}:", i + 1); @@ -4443,7 +4561,7 @@ fn cmd_bench_agent(sessions: usize, model: &str, runs: usize, verbose: bool) -> Ok(result) => { eprintln!(" done ({:.1}s)", result.duration_ms as f64 / 1000.0); { - let store = SqliteStore::new(&icm_db)?; + let (store, _) = SqliteStore::new(&icm_db)?; let extracted = extract::extract_and_store(&store, &result.response, "mathlib")?; if extracted > 0 { @@ -6024,3 +6142,108 @@ mod inject_settings_hook_tests { ); 
} } + +#[cfg(test)] +mod config_show_tests { + use crate::config::{EmbedderBackend, EmbeddingsConfig}; + use crate::{backend_info, format_embeddings_section}; + + /// The rendered output for jina-v5-nano must contain the exact backend name + /// and license string that `cmd_config` prints. This catches display regressions + /// because it exercises the real `format_embeddings_section` formatter. + #[test] + fn jina_v5_nano_section_contains_backend_and_license() { + let mut cfg = EmbeddingsConfig::default(); + cfg.backend = EmbedderBackend::JinaV5Nano; + + let rendered = format_embeddings_section(&cfg); + + assert!( + rendered.contains("backend = jina-v5-nano"), + "expected 'backend = jina-v5-nano' in:\n{rendered}" + ); + assert!( + rendered.contains("license = CC-BY-NC-4.0, non-commercial"), + "expected 'license = CC-BY-NC-4.0, non-commercial' in:\n{rendered}" + ); + // model line must be suppressed for non-fastembed backends + assert!( + !rendered.contains("model ="), + "model line should be omitted for jina backends, got:\n{rendered}" + ); + } + + /// jina-v5-small mirrors nano — separate test so renaming one variant doesn't + /// mask a broken mapping for the other. + #[test] + fn jina_v5_small_section_contains_backend_and_license() { + let mut cfg = EmbeddingsConfig::default(); + cfg.backend = EmbedderBackend::JinaV5Small; + + let rendered = format_embeddings_section(&cfg); + + assert!( + rendered.contains("backend = jina-v5-small"), + "expected 'backend = jina-v5-small' in:\n{rendered}" + ); + assert!( + rendered.contains("license = CC-BY-NC-4.0, non-commercial"), + "expected 'license = CC-BY-NC-4.0, non-commercial' in:\n{rendered}" + ); + } + + /// fastembed must show Apache-2.0 and the model line (no truncate_dim by default). 
+ #[test] + fn fastembed_section_shows_apache_license_and_model() { + let cfg = EmbeddingsConfig::default(); // backend = Fastembed + + let rendered = format_embeddings_section(&cfg); + + assert!( + rendered.contains("backend = fastembed"), + "expected 'backend = fastembed' in:\n{rendered}" + ); + assert!( + rendered.contains("license = Apache-2.0"), + "expected 'license = Apache-2.0' in:\n{rendered}" + ); + assert!( + rendered.contains("model ="), + "fastembed section should include model line, got:\n{rendered}" + ); + assert!( + !rendered.contains("truncate_dim"), + "truncate_dim should be absent when None, got:\n{rendered}" + ); + } + + /// truncate_dim appears in the rendered output when set. + #[test] + fn truncate_dim_appears_when_set() { + let mut cfg = EmbeddingsConfig::default(); + cfg.backend = EmbedderBackend::JinaV5Nano; + cfg.truncate_dim = Some(512); + + let rendered = format_embeddings_section(&cfg); + + assert!( + rendered.contains("truncate_dim = 512"), + "expected 'truncate_dim = 512' in:\n{rendered}" + ); + } + + /// `backend_info` is the single source of truth — verify it returns the exact + /// strings the spec mandates so any future rename is caught here first. 
+ #[test] + fn backend_info_returns_canonical_strings() { + assert_eq!(backend_info(&EmbedderBackend::Fastembed), ("fastembed", "Apache-2.0")); + assert_eq!( + backend_info(&EmbedderBackend::JinaV5Nano), + ("jina-v5-nano", "CC-BY-NC-4.0, non-commercial") + ); + assert_eq!( + backend_info(&EmbedderBackend::JinaV5Small), + ("jina-v5-small", "CC-BY-NC-4.0, non-commercial") + ); + } +} diff --git a/crates/icm-cli/src/migration_tests.rs b/crates/icm-cli/src/migration_tests.rs new file mode 100644 index 0000000..5705303 --- /dev/null +++ b/crates/icm-cli/src/migration_tests.rs @@ -0,0 +1,62 @@ +#[cfg(test)] +mod tests { + use icm_core::{Importance, Memory, MemoryStore}; + use icm_store::SqliteStore; + use tempfile::TempDir; + + fn make_memory(idx: u32, dims: usize) -> Memory { + let mut m = Memory::new( + "test-topic".into(), + format!("test memory {idx}"), + Importance::Medium, + ); + m.id = format!("mem-{idx:04}"); + m.embedding = Some(vec![0.0f32; dims]); + m + } + + #[test] + fn test_dim_change_detection_and_nulling() { + let tmp = TempDir::new().expect("tempdir"); + let db_path = tmp.path().join("migration_test.db"); + + // Open with dim 384 and store 5 memories with embeddings. + { + let (store, status) = + SqliteStore::with_dims(&db_path, 384).expect("open store @384"); + assert!(!status.dim_changed, "fresh store should not report dim change"); + + for i in 0..5u32 { + store.store(make_memory(i, 384)).expect("store memory"); + } + + // Verify all 5 have embeddings. + let all = store.list_all().expect("list_all"); + assert_eq!(all.len(), 5); + assert!( + all.iter().all(|m| m.embedding.is_some()), + "all memories should have embeddings after initial store" + ); + } + + // Re-open with a different dim (768) — triggers the migration. 
+ let (store2, status) = + SqliteStore::with_dims(&db_path, 768).expect("open store @768"); + + assert!(status.dim_changed, "dim_changed should be true"); + assert_eq!(status.old_dim, 384, "old_dim should be 384"); + assert_eq!(status.new_dim, 768, "new_dim should be 768"); + assert_eq!( + status.affected_rows, 5, + "affected_rows should equal the 5 stored memories" + ); + + // Verify all rows now have embedding == None. + let all = store2.list_all().expect("list_all after migration"); + assert_eq!(all.len(), 5, "all 5 memories should still exist"); + assert!( + all.iter().all(|m| m.embedding.is_none()), + "all embeddings should be NULL after dim-change migration" + ); + } +} diff --git a/crates/icm-core/Cargo.toml b/crates/icm-core/Cargo.toml index 6a3c67a..599ed65 100644 --- a/crates/icm-core/Cargo.toml +++ b/crates/icm-core/Cargo.toml @@ -6,6 +6,7 @@ edition = "2021" [features] default = [] embeddings = ["fastembed", "directories"] +jina-v5 = ["dep:ort", "dep:tokenizers", "dep:hf-hub", "dep:ndarray"] [dependencies] chrono = { workspace = true } @@ -16,3 +17,7 @@ ulid = { workspace = true } toml = { workspace = true } fastembed = { workspace = true, optional = true } directories = { workspace = true, optional = true } +ort = { workspace = true, optional = true } +tokenizers = { workspace = true, optional = true } +hf-hub = { workspace = true, optional = true } +ndarray = { workspace = true, optional = true } diff --git a/crates/icm-core/src/embedder.rs b/crates/icm-core/src/embedder.rs index 3c3dbae..06091cd 100644 --- a/crates/icm-core/src/embedder.rs +++ b/crates/icm-core/src/embedder.rs @@ -1,7 +1,34 @@ use crate::error::IcmResult; pub trait Embedder: Send + Sync { + // --- required (existing) --- fn embed(&self, text: &str) -> IcmResult>; fn embed_batch(&self, texts: &[&str]) -> IcmResult>>; fn dimensions(&self) -> usize; + + // --- optional with defaults (new) --- + + /// Embed a query string. Default: delegates to `embed`. 
Override for + /// asymmetric retrieval models that use different prefixes for queries. + fn embed_query(&self, text: &str) -> IcmResult> { + self.embed(text) + } + + /// Embed a document/passage for storage. Default: delegates to `embed`. + /// Override for asymmetric retrieval models. + fn embed_document(&self, text: &str) -> IcmResult> { + self.embed(text) + } + + /// Human-readable model identifier + /// (e.g. "jina-embeddings-v5-text-nano-retrieval"). + fn model_name(&self) -> &str { + "" + } + + /// SPDX license expression for the model weights (e.g. "CC-BY-NC-4.0"). + /// Empty string for open/Apache models. Consumed by `icm config show` (S-5). + fn license(&self) -> &str { + "" + } } diff --git a/crates/icm-core/src/jina_v5_nano.rs b/crates/icm-core/src/jina_v5_nano.rs new file mode 100644 index 0000000..8450e70 --- /dev/null +++ b/crates/icm-core/src/jina_v5_nano.rs @@ -0,0 +1,386 @@ +//! Jina v5-text-nano embedder — local ONNX inference via `ort` + `tokenizers`. +//! +//! This backend is gated behind the `jina-v5` Cargo feature and is OFF by +//! default. License: CC-BY-NC-4.0 (non-commercial). See: +//! + +#[cfg(feature = "jina-v5")] +mod inner { + use hf_hub::api::sync::Api; + use ndarray::Array2; + use ort::session::{builder::GraphOptimizationLevel, Session}; + use tokenizers::Tokenizer; + + use crate::embedder::Embedder; + use crate::error::{IcmError, IcmResult}; + + const HF_MODEL_ID: &str = "jinaai/jina-embeddings-v5-text-nano-retrieval"; + const DEFAULT_DIM: usize = 768; + const VALID_DIMS: &[usize] = &[32, 64, 128, 256, 512, 768]; + + /// Internal abstraction for text encoding — enables dependency injection in tests. + pub trait TextEncoder: Send + Sync { + /// Encode a batch of texts and return full-dim (untruncated) embeddings. + fn encode(&self, texts: &[&str]) -> IcmResult>>; + } + + /// Production encoder: tokenizes with HuggingFace `tokenizers` and runs ONNX inference. 
+ struct OrtEncoder { + session: Session, + tokenizer: Tokenizer, + } + + impl TextEncoder for OrtEncoder { + fn encode(&self, texts: &[&str]) -> IcmResult>> { + if texts.is_empty() { + return Ok(Vec::new()); + } + + let encodings = self + .tokenizer + .encode_batch(texts.iter().map(|s| s.to_string()).collect(), true) + .map_err(|e| IcmError::Embedding(e.to_string()))?; + + let batch_size = texts.len(); + let seq_len = encodings + .iter() + .map(|e| e.get_ids().len()) + .max() + .unwrap_or(0); + + if seq_len == 0 { + return Err(IcmError::Embedding("empty token sequence".into())); + } + + let mut input_ids = Array2::::zeros((batch_size, seq_len)); + let mut attention_mask = Array2::::zeros((batch_size, seq_len)); + + for (i, enc) in encodings.iter().enumerate() { + for (j, &id) in enc.get_ids().iter().enumerate() { + input_ids[[i, j]] = id as i64; + } + for (j, &m) in enc.get_attention_mask().iter().enumerate() { + attention_mask[[i, j]] = m as i64; + } + } + + // ort 2.0.0-rc.9: `inputs!` returns `Result, ...)>>`. + let session_inputs = ort::inputs! { + "input_ids" => input_ids.view(), + "attention_mask" => attention_mask.view(), + } + .map_err(|e| IcmError::Embedding(format!("ort inputs!: {e}")))?; + + let outputs = self + .session + .run(session_inputs) + .map_err(|e| IcmError::Embedding(format!("ort run: {e}")))?; + + // HF transformer ONNX models commonly emit `last_hidden_state`; + // some BERT-style exports use `token_embeddings`. Prefer the + // canonical name and fall back gracefully. 
+ let hidden = outputs + .get("last_hidden_state") + .or_else(|| outputs.get("token_embeddings")) + .ok_or_else(|| { + IcmError::Embedding("ONNX output key not found".into()) + })?; + + let hidden_view = hidden + .try_extract_tensor::() + .map_err(|e| IcmError::Embedding(format!("extract tensor: {e}")))?; + + let shape = hidden_view.shape(); + if shape.len() != 3 { + return Err(IcmError::Embedding(format!( + "expected last_hidden_state rank 3, got shape {shape:?}" + ))); + } + if shape[0] != batch_size { + return Err(IcmError::Embedding(format!( + "ONNX output batch dim mismatch: expected {batch_size}, got {shape:?}" + ))); + } + if shape[1] < seq_len { + return Err(IcmError::Embedding(format!( + "ONNX output seq dim too small: expected >= {seq_len}, got {shape:?}" + ))); + } + let hidden_dim = shape[2]; + + let mut embeddings = Vec::with_capacity(batch_size); + for i in 0..batch_size { + let mask = attention_mask.row(i); + let mut pooled = vec![0f32; hidden_dim]; + let mut count = 0usize; + + for j in 0..seq_len { + if mask[j] == 1 { + for k in 0..hidden_dim { + pooled[k] += hidden_view[[i, j, k]]; + } + count += 1; + } + } + + if count > 0 { + let inv = 1.0_f32 / count as f32; + for v in &mut pooled { + *v *= inv; + } + } + + // L2-normalize the pooled vector. + let norm: f32 = pooled.iter().map(|x| x * x).sum::().sqrt(); + if norm > 1e-8 { + let inv = 1.0_f32 / norm; + for v in &mut pooled { + *v *= inv; + } + } + + // Return full-dim embedding — truncation happens in the Embedder methods. 
+ embeddings.push(pooled); + } + + Ok(embeddings) + } + } + + pub struct JinaV5NanoEmbedder { + encoder: std::sync::Arc, + truncate_dim: usize, + } + + impl JinaV5NanoEmbedder { + pub fn new(truncate_dim: Option) -> IcmResult { + let dim = match truncate_dim { + Some(d) if VALID_DIMS.contains(&d) => d, + Some(d) => { + return Err(IcmError::Embedding(format!( + "invalid truncate_dim {d} for jina-v5-nano; valid: {VALID_DIMS:?}" + ))); + } + None => DEFAULT_DIM, + }; + + let api = Api::new().map_err(|e| IcmError::Embedding(e.to_string()))?; + let repo = api.model(HF_MODEL_ID.to_string()); + + eprintln!( + "Loading jina-v5-text-nano-retrieval (downloads on first run, cached thereafter)..." + ); + let onnx_path = repo + .get("onnx/model.onnx") + .map_err(|e| IcmError::Embedding(format!("download ONNX: {e}")))?; + let tokenizer_path = repo + .get("tokenizer.json") + .map_err(|e| IcmError::Embedding(format!("download tokenizer: {e}")))?; + + let intra_threads = std::thread::available_parallelism() + .map(|n| n.get().min(4)) + .unwrap_or(1); + + let session = Session::builder() + .map_err(|e| IcmError::Embedding(format!("ort session builder: {e}")))? + .with_optimization_level(GraphOptimizationLevel::Level3) + .map_err(|e| IcmError::Embedding(format!("ort opt level: {e}")))? + .with_intra_threads(intra_threads) + .map_err(|e| IcmError::Embedding(format!("ort threads: {e}")))? + .commit_from_file(&onnx_path) + .map_err(|e| { + IcmError::Embedding(format!("load ONNX from {onnx_path:?}: {e}")) + })?; + + let tokenizer = Tokenizer::from_file(&tokenizer_path) + .map_err(|e| IcmError::Embedding(e.to_string()))?; + + Ok(Self { + encoder: std::sync::Arc::new(OrtEncoder { session, tokenizer }), + truncate_dim: dim, + }) + } + + /// Test constructor for dependency injection — not part of public API. 
+ #[cfg(test)] + pub fn new_with_encoder(encoder: std::sync::Arc, truncate_dim: usize) -> Self { + Self { encoder, truncate_dim } + } + } + + impl Embedder for JinaV5NanoEmbedder { + /// Embed a query with the asymmetric retrieval prefix `"retrieval.query: "`. + fn embed_query(&self, text: &str) -> IcmResult> { + let prefixed = format!("retrieval.query: {text}"); + let full = self.encoder.encode(&[prefixed.as_str()])?; + let vec = full.into_iter().next().ok_or_else(|| { + IcmError::Embedding("encoder returned empty batch".into()) + })?; + Ok(truncate_and_renorm(&vec, self.truncate_dim)) + } + + /// Embed a document/passage with the asymmetric retrieval prefix `"retrieval.passage: "`. + fn embed_document(&self, text: &str) -> IcmResult> { + let prefixed = format!("retrieval.passage: {text}"); + let full = self.encoder.encode(&[prefixed.as_str()])?; + let vec = full.into_iter().next().ok_or_else(|| { + IcmError::Embedding("encoder returned empty batch".into()) + })?; + Ok(truncate_and_renorm(&vec, self.truncate_dim)) + } + + /// Delegates to `embed_document` — treats an unqualified embed as a document. + fn embed(&self, text: &str) -> IcmResult> { + self.embed_document(text) + } + + fn embed_batch(&self, texts: &[&str]) -> IcmResult>> { + if texts.is_empty() { + return Ok(Vec::new()); + } + let prefixed: Vec = texts + .iter() + .map(|t| format!("retrieval.passage: {t}")) + .collect(); + let prefixed_refs: Vec<&str> = prefixed.iter().map(|s| s.as_str()).collect(); + let full = self.encoder.encode(&prefixed_refs)?; + Ok(full + .into_iter() + .map(|v| truncate_and_renorm(&v, self.truncate_dim)) + .collect()) + } + + fn dimensions(&self) -> usize { + self.truncate_dim + } + + fn model_name(&self) -> &str { + HF_MODEL_ID + } + + fn license(&self) -> &str { + "CC-BY-NC-4.0" + } + } + + /// Slice `v` to its first `n` dims (or `v.len()` if smaller) and L2-renormalize. + /// Used for Matryoshka representation truncation. 
+ pub fn truncate_and_renorm(v: &[f32], n: usize) -> Vec { + let take = n.min(v.len()); + let sliced = &v[..take]; + let norm: f32 = sliced.iter().map(|x| x * x).sum::().sqrt(); + if norm > 1e-8 { + let inv = 1.0_f32 / norm; + sliced.iter().map(|x| x * inv).collect() + } else { + sliced.to_vec() + } + } +} + +#[cfg(feature = "jina-v5")] +pub use inner::truncate_and_renorm; +#[cfg(feature = "jina-v5")] +pub use inner::JinaV5NanoEmbedder; + +#[cfg(all(test, feature = "jina-v5"))] +mod tests { + use super::inner::truncate_and_renorm; + + #[test] + fn truncate_and_renorm_shape_and_unit_norm() { + // Input: a known unnormalized 4-dim vector. + let input = [3.0f32, 4.0, 0.0, 0.0]; // L2 norm = 5.0 + // First L2-normalize it (simulating model output). + let normalized: Vec = input.iter().map(|x| x / 5.0).collect(); + // Truncate to 2 dims. + let out = truncate_and_renorm(&normalized, 2); + assert_eq!(out.len(), 2); + let norm: f32 = out.iter().map(|x| x * x).sum::().sqrt(); + assert!( + (norm - 1.0).abs() < 1e-6, + "output must be unit-norm, got {norm}" + ); + // Expected: normalize([0.6, 0.8]) = [0.6, 0.8] / 1.0 = [0.6, 0.8]. 
+ assert!((out[0] - 0.6).abs() < 1e-6); + assert!((out[1] - 0.8).abs() < 1e-6); + } + + #[test] + fn truncate_and_renorm_n_equals_len() { + let v = vec![1.0f32 / 3.0f32.sqrt(); 3]; + let out = truncate_and_renorm(&v, 3); + assert_eq!(out.len(), 3); + let norm: f32 = out.iter().map(|x| x * x).sum::().sqrt(); + assert!((norm - 1.0).abs() < 1e-6); + } + + #[test] + fn truncate_and_renorm_n_larger_than_len_is_clamped() { + let v = vec![0.6_f32, 0.8]; + let out = truncate_and_renorm(&v, 8); + assert_eq!(out.len(), 2); + let norm: f32 = out.iter().map(|x| x * x).sum::().sqrt(); + assert!((norm - 1.0).abs() < 1e-6); + } + + #[test] + fn truncate_and_renorm_zero_vector_passthrough() { + let v = vec![0.0_f32; 4]; + let out = truncate_and_renorm(&v, 2); + assert_eq!(out, vec![0.0, 0.0]); + } +} + +#[cfg(all(test, feature = "jina-v5"))] +mod prefix_tests { + use std::sync::Mutex; + use crate::embedder::Embedder; + use crate::error::IcmResult; + use super::inner::{TextEncoder, JinaV5NanoEmbedder}; + + struct MockTextEncoder { + captured: Mutex>, + } + + impl TextEncoder for MockTextEncoder { + fn encode(&self, texts: &[&str]) -> IcmResult>> { + self.captured.lock().unwrap().extend(texts.iter().map(|s| s.to_string())); + // Return unit vectors (dim 768) so downstream truncate_and_renorm doesn't panic + Ok(texts.iter().map(|_| { + let n = 768usize; + vec![1.0_f32 / (n as f32).sqrt(); n] + }).collect()) + } + } + + #[test] + fn embed_query_prepends_retrieval_query_prefix() { + let enc = std::sync::Arc::new(MockTextEncoder { captured: Mutex::new(Vec::new()) }); + let embedder = JinaV5NanoEmbedder::new_with_encoder(enc.clone(), 768); + let _ = embedder.embed_query("hello"); + let captured = enc.captured.lock().unwrap(); + assert_eq!(captured.len(), 1); + assert_eq!(captured[0], "retrieval.query: hello"); + } + + #[test] + fn embed_document_prepends_retrieval_passage_prefix() { + let enc = std::sync::Arc::new(MockTextEncoder { captured: Mutex::new(Vec::new()) }); + let embedder = 
JinaV5NanoEmbedder::new_with_encoder(enc.clone(), 768); + let _ = embedder.embed_document("hello"); + let captured = enc.captured.lock().unwrap(); + assert_eq!(captured.len(), 1); + assert_eq!(captured[0], "retrieval.passage: hello"); + } + + #[test] + fn embed_delegates_to_embed_document() { + let enc = std::sync::Arc::new(MockTextEncoder { captured: Mutex::new(Vec::new()) }); + let embedder = JinaV5NanoEmbedder::new_with_encoder(enc.clone(), 768); + let _ = embedder.embed("hello"); + let captured = enc.captured.lock().unwrap(); + assert_eq!(captured.len(), 1); + assert_eq!(captured[0], "retrieval.passage: hello", "embed must delegate to embed_document"); + } +} diff --git a/crates/icm-core/src/jina_v5_small.rs b/crates/icm-core/src/jina_v5_small.rs new file mode 100644 index 0000000..7f8c91a --- /dev/null +++ b/crates/icm-core/src/jina_v5_small.rs @@ -0,0 +1,399 @@ +//! Jina v5-text-small (Qwen3-based) embedder — local ONNX inference via `ort` + `tokenizers`. +//! +//! This backend is gated behind the `jina-v5` Cargo feature and is OFF by +//! default. License: CC-BY-NC-4.0 (non-commercial). See: +//! + +#[cfg(feature = "jina-v5")] +mod inner { + use hf_hub::api::sync::Api; + use ndarray::Array2; + use ort::session::{builder::GraphOptimizationLevel, Session}; + use tokenizers::Tokenizer; + + use crate::embedder::Embedder; + use crate::error::{IcmError, IcmResult}; + // Reuse the Matryoshka truncation utility from the nano module — do NOT duplicate. + use crate::jina_v5_nano::truncate_and_renorm; + + const HF_MODEL_ID: &str = "jinaai/jina-embeddings-v5-text-small-retrieval"; + const DEFAULT_DIM: usize = 1024; + const VALID_DIMS: &[usize] = &[32, 64, 128, 256, 512, 768, 1024]; + + /// Internal abstraction for text encoding — enables dependency injection in tests. + pub trait TextEncoder: Send + Sync { + /// Encode a batch of texts and return full-dim (untruncated) embeddings. 
+ fn encode(&self, texts: &[&str]) -> IcmResult>>; + } + + /// Production encoder: tokenizes with HuggingFace `tokenizers` and runs ONNX inference. + struct OrtEncoder { + session: Session, + tokenizer: Tokenizer, + } + + impl TextEncoder for OrtEncoder { + fn encode(&self, texts: &[&str]) -> IcmResult>> { + if texts.is_empty() { + return Ok(Vec::new()); + } + + let encodings = self + .tokenizer + .encode_batch(texts.iter().map(|s| s.to_string()).collect(), true) + .map_err(|e| IcmError::Embedding(e.to_string()))?; + + let batch_size = texts.len(); + let seq_len = encodings + .iter() + .map(|e| e.get_ids().len()) + .max() + .unwrap_or(0); + + if seq_len == 0 { + return Err(IcmError::Embedding("empty token sequence".into())); + } + + let mut input_ids = Array2::::zeros((batch_size, seq_len)); + let mut attention_mask = Array2::::zeros((batch_size, seq_len)); + + for (i, enc) in encodings.iter().enumerate() { + for (j, &id) in enc.get_ids().iter().enumerate() { + input_ids[[i, j]] = id as i64; + } + for (j, &m) in enc.get_attention_mask().iter().enumerate() { + attention_mask[[i, j]] = m as i64; + } + } + + // ort 2.0.0-rc.9: `inputs!` returns `Result, ...)>>`. + let session_inputs = ort::inputs! { + "input_ids" => input_ids.view(), + "attention_mask" => attention_mask.view(), + } + .map_err(|e| IcmError::Embedding(format!("ort inputs!: {e}")))?; + + let outputs = self + .session + .run(session_inputs) + .map_err(|e| IcmError::Embedding(format!("ort run: {e}")))?; + + // HF transformer ONNX models commonly emit `last_hidden_state`; + // some BERT-style exports use `token_embeddings`. Prefer the + // canonical name and fall back gracefully. + // Note: Qwen3-based architecture (jina-v5-small) follows the same + // ONNX export convention as EuroBERT (jina-v5-nano) from our + // inference perspective — both output `last_hidden_state`. 
+ let hidden = outputs + .get("last_hidden_state") + .or_else(|| outputs.get("token_embeddings")) + .ok_or_else(|| { + IcmError::Embedding("ONNX output key not found".into()) + })?; + + let hidden_view = hidden + .try_extract_tensor::() + .map_err(|e| IcmError::Embedding(format!("extract tensor: {e}")))?; + + let shape = hidden_view.shape(); + if shape.len() != 3 { + return Err(IcmError::Embedding(format!( + "expected last_hidden_state rank 3, got shape {shape:?}" + ))); + } + if shape[0] != batch_size { + return Err(IcmError::Embedding(format!( + "ONNX output batch dim mismatch: expected {batch_size}, got {shape:?}" + ))); + } + if shape[1] < seq_len { + return Err(IcmError::Embedding(format!( + "ONNX output seq dim too small: expected >= {seq_len}, got {shape:?}" + ))); + } + let hidden_dim = shape[2]; + + let mut embeddings = Vec::with_capacity(batch_size); + for i in 0..batch_size { + let mask = attention_mask.row(i); + let mut pooled = vec![0f32; hidden_dim]; + let mut count = 0usize; + + for j in 0..seq_len { + if mask[j] == 1 { + for k in 0..hidden_dim { + pooled[k] += hidden_view[[i, j, k]]; + } + count += 1; + } + } + + if count > 0 { + let inv = 1.0_f32 / count as f32; + for v in &mut pooled { + *v *= inv; + } + } + + // L2-normalize the pooled vector. + let norm: f32 = pooled.iter().map(|x| x * x).sum::().sqrt(); + if norm > 1e-8 { + let inv = 1.0_f32 / norm; + for v in &mut pooled { + *v *= inv; + } + } + + // Return full-dim embedding — truncation happens in the Embedder methods. 
+ embeddings.push(pooled); + } + + Ok(embeddings) + } + } + + pub struct JinaV5SmallEmbedder { + encoder: std::sync::Arc, + truncate_dim: usize, + } + + impl JinaV5SmallEmbedder { + pub fn new(truncate_dim: Option) -> IcmResult { + let dim = match truncate_dim { + Some(d) if VALID_DIMS.contains(&d) => d, + Some(d) => { + return Err(IcmError::Embedding(format!( + "invalid truncate_dim {d} for jina-v5-small; valid: {VALID_DIMS:?}" + ))); + } + None => DEFAULT_DIM, + }; + + let api = Api::new().map_err(|e| IcmError::Embedding(e.to_string()))?; + let repo = api.model(HF_MODEL_ID.to_string()); + + eprintln!( + "Loading jina-v5-text-small-retrieval (downloads on first run, cached thereafter)..." + ); + let onnx_path = repo + .get("onnx/model.onnx") + .map_err(|e| IcmError::Embedding(format!("download ONNX: {e}")))?; + let tokenizer_path = repo + .get("tokenizer.json") + .map_err(|e| IcmError::Embedding(format!("download tokenizer: {e}")))?; + + let intra_threads = std::thread::available_parallelism() + .map(|n| n.get().min(4)) + .unwrap_or(1); + + let session = Session::builder() + .map_err(|e| IcmError::Embedding(format!("ort session builder: {e}")))? + .with_optimization_level(GraphOptimizationLevel::Level3) + .map_err(|e| IcmError::Embedding(format!("ort opt level: {e}")))? + .with_intra_threads(intra_threads) + .map_err(|e| IcmError::Embedding(format!("ort threads: {e}")))? + .commit_from_file(&onnx_path) + .map_err(|e| { + IcmError::Embedding(format!("load ONNX from {onnx_path:?}: {e}")) + })?; + + let tokenizer = Tokenizer::from_file(&tokenizer_path) + .map_err(|e| IcmError::Embedding(e.to_string()))?; + + Ok(Self { + encoder: std::sync::Arc::new(OrtEncoder { session, tokenizer }), + truncate_dim: dim, + }) + } + + /// Test constructor for dependency injection — not part of public API. 
+ #[cfg(test)] + pub fn new_with_encoder(encoder: std::sync::Arc, truncate_dim: usize) -> Self { + Self { encoder, truncate_dim } + } + } + + impl Embedder for JinaV5SmallEmbedder { + /// Embed a query with the asymmetric retrieval prefix `"retrieval.query: "`. + fn embed_query(&self, text: &str) -> IcmResult> { + let prefixed = format!("retrieval.query: {text}"); + let full = self.encoder.encode(&[prefixed.as_str()])?; + let vec = full.into_iter().next().ok_or_else(|| { + IcmError::Embedding("encoder returned empty batch".into()) + })?; + Ok(truncate_and_renorm(&vec, self.truncate_dim)) + } + + /// Embed a document/passage with the asymmetric retrieval prefix `"retrieval.passage: "`. + fn embed_document(&self, text: &str) -> IcmResult> { + let prefixed = format!("retrieval.passage: {text}"); + let full = self.encoder.encode(&[prefixed.as_str()])?; + let vec = full.into_iter().next().ok_or_else(|| { + IcmError::Embedding("encoder returned empty batch".into()) + })?; + Ok(truncate_and_renorm(&vec, self.truncate_dim)) + } + + /// Delegates to `embed_document` — treats an unqualified embed as a document. 
+ fn embed(&self, text: &str) -> IcmResult> { + self.embed_document(text) + } + + fn embed_batch(&self, texts: &[&str]) -> IcmResult>> { + if texts.is_empty() { + return Ok(Vec::new()); + } + let prefixed: Vec = texts + .iter() + .map(|t| format!("retrieval.passage: {t}")) + .collect(); + let prefixed_refs: Vec<&str> = prefixed.iter().map(|s| s.as_str()).collect(); + let full = self.encoder.encode(&prefixed_refs)?; + Ok(full + .into_iter() + .map(|v| truncate_and_renorm(&v, self.truncate_dim)) + .collect()) + } + + fn dimensions(&self) -> usize { + self.truncate_dim + } + + fn model_name(&self) -> &str { + HF_MODEL_ID + } + + fn license(&self) -> &str { + "CC-BY-NC-4.0" + } + } +} + +#[cfg(feature = "jina-v5")] +pub use inner::JinaV5SmallEmbedder; + +#[cfg(all(test, feature = "jina-v5"))] +mod tests { + // `truncate_and_renorm` itself is already tested exhaustively in jina_v5_nano. + // These tests exercise the dim-validation logic and the small-specific + // constants (DEFAULT_DIM = 1024, VALID_DIMS includes 1024). + use crate::jina_v5_nano::truncate_and_renorm; + + /// Truncating to 512 (a valid sub-dimension) produces a 512-dim unit vector. + #[test] + fn truncate_correct_dim() { + // Build a synthetic 1024-dim unit vector. + let dim = 1024usize; + let v: Vec = (0..dim) + .map(|i| (i as f32).sin()) + .collect::>() + .iter() + .map(|x| x / (dim as f32).sqrt()) + .collect(); + // Truncate to 512. + let out = truncate_and_renorm(&v, 512); + assert_eq!(out.len(), 512); + let norm: f32 = out.iter().map(|x| x * x).sum::().sqrt(); + assert!( + (norm - 1.0).abs() < 1e-5, + "output must be unit-norm after truncation, got {norm}" + ); + } + + /// Truncating to 1024 (the maximum = DEFAULT_DIM) is equivalent to no truncation. + #[test] + fn truncate_max_dim() { + let dim = 1024usize; + // Build a unit vector. 
+ let raw: Vec = (0..dim).map(|i| (i as f32 + 1.0).recip()).collect(); + let sum_sq: f32 = raw.iter().map(|x| x * x).sum(); + let norm = sum_sq.sqrt(); + let v: Vec = raw.iter().map(|x| x / norm).collect(); + + let out = truncate_and_renorm(&v, 1024); + assert_eq!(out.len(), 1024); + let out_norm: f32 = out.iter().map(|x| x * x).sum::().sqrt(); + assert!( + (out_norm - 1.0).abs() < 1e-5, + "unit-norm preserved at max dim, got {out_norm}" + ); + } + + /// Requesting a dimension not in VALID_DIMS must return an error without + /// attempting any network I/O (dim validation happens before Api::new()). + #[test] + fn invalid_dim_rejected() { + // 999 is not in VALID_DIMS = [32, 64, 128, 256, 512, 768, 1024]. + // Dim validation is the first check in `new()`, so Err is returned + // before any HF/network access — no offline mitigation needed. + use crate::error::IcmError; + let result = super::inner::JinaV5SmallEmbedder::new(Some(999)); + assert!( + result.is_err(), + "expected Err for invalid truncate_dim 999, got Ok" + ); + match result { + Err(IcmError::Embedding(msg)) => { + assert!( + msg.contains("999"), + "error message should mention the invalid dim, got: {msg}" + ); + } + Err(other) => panic!("expected IcmError::Embedding, got {other:?}"), + Ok(_) => panic!("expected Err, got Ok"), + } + } +} + +#[cfg(all(test, feature = "jina-v5"))] +mod prefix_tests { + use std::sync::Mutex; + use crate::embedder::Embedder; + use crate::error::IcmResult; + use super::inner::{TextEncoder, JinaV5SmallEmbedder}; + + struct MockTextEncoder { + captured: Mutex>, + } + + impl TextEncoder for MockTextEncoder { + fn encode(&self, texts: &[&str]) -> IcmResult>> { + self.captured.lock().unwrap().extend(texts.iter().map(|s| s.to_string())); + // Return unit vectors (dim 1024) so downstream truncate_and_renorm doesn't panic + Ok(texts.iter().map(|_| { + let n = 1024usize; + vec![1.0_f32 / (n as f32).sqrt(); n] + }).collect()) + } + } + + #[test] + fn 
embed_query_prepends_retrieval_query_prefix() { + let enc = std::sync::Arc::new(MockTextEncoder { captured: Mutex::new(Vec::new()) }); + let embedder = JinaV5SmallEmbedder::new_with_encoder(enc.clone(), 1024); + let _ = embedder.embed_query("hello"); + let captured = enc.captured.lock().unwrap(); + assert_eq!(captured.len(), 1); + assert_eq!(captured[0], "retrieval.query: hello"); + } + + #[test] + fn embed_document_prepends_retrieval_passage_prefix() { + let enc = std::sync::Arc::new(MockTextEncoder { captured: Mutex::new(Vec::new()) }); + let embedder = JinaV5SmallEmbedder::new_with_encoder(enc.clone(), 1024); + let _ = embedder.embed_document("hello"); + let captured = enc.captured.lock().unwrap(); + assert_eq!(captured.len(), 1); + assert_eq!(captured[0], "retrieval.passage: hello"); + } + + #[test] + fn embed_delegates_to_embed_document() { + let enc = std::sync::Arc::new(MockTextEncoder { captured: Mutex::new(Vec::new()) }); + let embedder = JinaV5SmallEmbedder::new_with_encoder(enc.clone(), 1024); + let _ = embedder.embed("hello"); + let captured = enc.captured.lock().unwrap(); + assert_eq!(captured.len(), 1); + assert_eq!(captured[0], "retrieval.passage: hello", "embed must delegate to embed_document"); + } +} diff --git a/crates/icm-core/src/lib.rs b/crates/icm-core/src/lib.rs index 5580db2..8a2d669 100644 --- a/crates/icm-core/src/lib.rs +++ b/crates/icm-core/src/lib.rs @@ -4,6 +4,10 @@ pub mod error; #[cfg(feature = "embeddings")] pub mod fastembed_embedder; pub mod feedback; +#[cfg(feature = "jina-v5")] +pub mod jina_v5_nano; +#[cfg(feature = "jina-v5")] +mod jina_v5_small; pub mod feedback_store; pub mod learn; pub mod memoir; @@ -22,6 +26,10 @@ pub use embedder::Embedder; pub use error::{IcmError, IcmResult}; #[cfg(feature = "embeddings")] pub use fastembed_embedder::FastEmbedder; +#[cfg(feature = "jina-v5")] +pub use jina_v5_nano::{truncate_and_renorm, JinaV5NanoEmbedder}; +#[cfg(feature = "jina-v5")] +pub use jina_v5_small::JinaV5SmallEmbedder; pub 
use feedback::{Feedback, FeedbackStats}; pub use feedback_store::FeedbackStore; pub use memoir::{Concept, ConceptLink, Label, Memoir, MemoirStats, Relation}; diff --git a/crates/icm-mcp/src/tools.rs b/crates/icm-mcp/src/tools.rs index 1844669..344dca4 100644 --- a/crates/icm-mcp/src/tools.rs +++ b/crates/icm-mcp/src/tools.rs @@ -1135,7 +1135,7 @@ fn tool_recall( // Try hybrid search if embedder is available if let Some(emb) = embedder { - if let Ok(query_emb) = emb.embed(query) { + if let Ok(query_emb) = emb.embed_query(query) { if let Ok(results) = store.search_hybrid(query, &query_emb, limit) { let mut scored_results = results; if let Some(t) = topic { diff --git a/crates/icm-store/src/lib.rs b/crates/icm-store/src/lib.rs index 27b3dd9..f64f94e 100644 --- a/crates/icm-store/src/lib.rs +++ b/crates/icm-store/src/lib.rs @@ -2,3 +2,15 @@ mod schema; mod store; pub use store::SqliteStore; + +/// Returned by store-open functions to report whether a dimension-change +/// migration was executed. `dim_changed = false` means the vector table was +/// untouched; `true` means the old table was dropped, all `memories.embedding` +/// rows were set to NULL, and the table was recreated with `new_dim` columns. +#[derive(Debug, Clone, Default)] +pub struct MigrationStatus { + pub dim_changed: bool, + pub old_dim: usize, + pub new_dim: usize, + pub affected_rows: usize, +} diff --git a/crates/icm-store/src/schema.rs b/crates/icm-store/src/schema.rs index f1d72de..1826217 100644 --- a/crates/icm-store/src/schema.rs +++ b/crates/icm-store/src/schema.rs @@ -3,6 +3,7 @@ use rusqlite::Connection; use icm_core::{IcmError, IcmResult}; use crate::store::db_err; +use crate::MigrationStatus; /// Check if a FTS virtual table exists in sqlite_master. fn fts_table_exists(conn: &Connection, name: &str) -> Result { @@ -36,13 +37,15 @@ fn create_vec_table(conn: &Connection, embedding_dims: usize) -> Result<(), IcmE Ok(()) } -/// Initialize the database schema. 
`embedding_dims` controls the sqlite-vec vector size. -/// Pass `None` to skip vector table creation (no embeddings feature). -pub fn init_db(conn: &Connection) -> Result<(), IcmError> { +/// Initialize the database schema using the default embedding dimensions. +/// Returns a [`MigrationStatus`] describing whether a dim-change migration ran. +pub fn init_db(conn: &Connection) -> Result { init_db_with_dims(conn, icm_core::DEFAULT_EMBEDDING_DIMS) } -pub fn init_db_with_dims(conn: &Connection, embedding_dims: usize) -> Result<(), IcmError> { +/// Initialize the database schema. `embedding_dims` controls the sqlite-vec vector size. +/// Returns a [`MigrationStatus`] describing whether a dim-change migration ran. +pub fn init_db_with_dims(conn: &Connection, embedding_dims: usize) -> Result { conn.execute_batch( " CREATE TABLE IF NOT EXISTS memories ( @@ -361,15 +364,22 @@ pub fn init_db_with_dims(conn: &Connection, embedding_dims: usize) -> Result<(), // Model changed — drop vec table and clear embeddings conn.execute_batch("DROP TABLE IF EXISTS vec_memories") .map_err(db_err)?; - conn.execute("UPDATE memories SET embedding = NULL", []) + let affected_rows = conn + .execute("UPDATE memories SET embedding = NULL", []) .map_err(db_err)?; create_vec_table(conn, embedding_dims)?; + return Ok(MigrationStatus { + dim_changed: true, + old_dim: stored, + new_dim: embedding_dims, + affected_rows, + }); } } else { create_vec_table(conn, embedding_dims)?; } - Ok(()) + Ok(MigrationStatus::default()) } /// Migrate existing DBs: replace the broad `memories_au` trigger with one diff --git a/crates/icm-store/src/store.rs b/crates/icm-store/src/store.rs index 5930a03..374e7d2 100644 --- a/crates/icm-store/src/store.rs +++ b/crates/icm-store/src/store.rs @@ -14,6 +14,7 @@ use icm_core::{ }; use crate::schema::{init_db, init_db_with_dims}; +use crate::MigrationStatus; /// Convert rusqlite::Error to IcmError::Database pub(crate) fn db_err(e: rusqlite::Error) -> IcmError { @@ -43,12 +44,14 
@@ pub struct SqliteStore { } impl SqliteStore { - pub fn new(path: &Path) -> IcmResult { + pub fn new(path: &Path) -> IcmResult<(Self, MigrationStatus)> { Self::with_dims(path, icm_core::DEFAULT_EMBEDDING_DIMS) } /// Open or create a store with a specific embedding dimension. - pub fn with_dims(path: &Path, embedding_dims: usize) -> IcmResult { + /// Returns the store and a [`MigrationStatus`] indicating whether a + /// dim-change migration was performed (embeddings NULLed, vec table recreated). + pub fn with_dims(path: &Path, embedding_dims: usize) -> IcmResult<(Self, MigrationStatus)> { ensure_sqlite_vec(); if let Some(parent) = path.parent() { std::fs::create_dir_all(parent) @@ -60,8 +63,8 @@ impl SqliteStore { "PRAGMA journal_mode=WAL; PRAGMA foreign_keys=ON; PRAGMA busy_timeout=5000;", ) .map_err(db_err)?; - init_db_with_dims(&conn, embedding_dims)?; - Ok(Self { conn }) + let migration = init_db_with_dims(&conn, embedding_dims)?; + Ok((Self { conn }, migration)) } /// Apply decay if more than 24 hours since last decay. diff --git a/docs/issues/jina-v5-fastembed-rs-F1.patch b/docs/issues/jina-v5-fastembed-rs-F1.patch new file mode 100644 index 0000000..4e54b5c --- /dev/null +++ b/docs/issues/jina-v5-fastembed-rs-F1.patch @@ -0,0 +1,137 @@ +From 57fc1ade67af596f51f42586459da06ba2510162 Mon Sep 17 00:00:00 2001 +From: METAeuPHORIC +Date: Wed, 29 Apr 2026 14:38:44 +0000 +Subject: [PATCH] feat: register jina-embeddings-v5-text-nano-retrieval +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Adds Jina AI's v5-text-nano retrieval embedder as a built-in +EmbeddingModel variant. 
+ +Model: jinaai/jina-embeddings-v5-text-nano-retrieval +- 239M params, EuroBERT-style encoder +- Output dim: 768 (full); Matryoshka-truncatable to {32, 64, 128, 256, 512, 768} +- 8K-token context window +- Pooling: mean over last_hidden_state with attention-mask masking +- License: CC BY-NC 4.0 (non-commercial) + +Implements the registration-only path: enum variant, ModelInfo entry +in init_models_map(), and pooling registration in +get_default_pooling_method(). Inference uses the existing +TextEmbedding ONNX runner; no architecture changes required. + +The retrieval variant supports asymmetric prompts via instruction +prefixes ("retrieval.query: " / "retrieval.passage: ") in +production but this PR does not yet wire that into a typed API — +users may prepend prefixes manually. A follow-up could lift PR #236's +embed_query() / with_query_prefix() builders, but that is out of +scope here. + +Snapshot test value in tests/text-embeddings.rs is intentionally +omitted; the existing _ => panic!() arm signals that the variant +needs an expected-sum entry. CI has ORT_LIB_LOCATION set and can +capture this on a follow-up commit, or a maintainer with the +necessary toolchain can fill it in. + +Refs upstream feedback request: jina v5 was tested against fastembed +in PR #236 (closed) and downstream in icm (rtk-ai/icm). 
+--- + src/models/text_embedding.rs | 19 +++++++++++++++++++ + src/text_embedding/impl.rs | 1 + + 2 files changed, 20 insertions(+) + +diff --git a/src/models/text_embedding.rs b/src/models/text_embedding.rs +index 6807b08..e66950e 100644 +--- a/src/models/text_embedding.rs ++++ b/src/models/text_embedding.rs +@@ -67,20 +67,28 @@ pub enum EmbeddingModel { + /// Alibaba-NLP/gte-large-en-v1.5 + GTELargeENV15, + /// Quantized Alibaba-NLP/gte-large-en-v1.5 + GTELargeENV15Q, + /// Qdrant/clip-ViT-B-32-text + ClipVitB32, + /// jinaai/jina-embeddings-v2-base-code + JinaEmbeddingsV2BaseCode, + /// jinaai/jina-embeddings-v2-base-en + JinaEmbeddingsV2BaseEN, ++ /// jinaai/jina-embeddings-v5-text-nano-retrieval ++ /// ++ /// License: CC BY-NC 4.0 (non-commercial). Commercial use requires a ++ /// commercial license from Jina AI. The retrieval variant supports ++ /// asymmetric prompts via instruction prefixes ++ /// (`retrieval.query: ` / `retrieval.passage: `) and Matryoshka ++ /// truncation in {32, 64, 128, 256, 512, 768}. 
++ JinaEmbeddingsV5TextNano, + /// onnx-community/embeddinggemma-300m-ONNX + EmbeddingGemma300M, + /// snowflake/snowflake-arctic-embed-xs + SnowflakeArcticEmbedXS, + /// Quantized snowflake/snowflake-arctic-embed-xs + SnowflakeArcticEmbedXSQ, + /// snowflake/snowflake-arctic-embed-s + SnowflakeArcticEmbedS, + /// Quantized snowflake/snowflake-arctic-embed-s + SnowflakeArcticEmbedSQ, +@@ -402,20 +410,31 @@ fn init_models_map() -> HashMap> { + }, + ModelInfo { + model: EmbeddingModel::JinaEmbeddingsV2BaseEN, + dim: 768, + description: String::from("Jina embeddings v2 base English"), + model_code: String::from("jinaai/jina-embeddings-v2-base-en"), + model_file: String::from("model.onnx"), + additional_files: Vec::new(), + output_key: None, + }, ++ ModelInfo { ++ model: EmbeddingModel::JinaEmbeddingsV5TextNano, ++ dim: 768, ++ description: String::from( ++ "Jina embeddings v5 text nano retrieval (CC BY-NC 4.0, non-commercial)", ++ ), ++ model_code: String::from("jinaai/jina-embeddings-v5-text-nano-retrieval"), ++ model_file: String::from("onnx/model.onnx"), ++ additional_files: Vec::new(), ++ output_key: None, ++ }, + ModelInfo { + model: EmbeddingModel::EmbeddingGemma300M, + dim: 768, + description: String::from("EmbeddingGemma is a 300M parameter from Google"), + model_code: String::from("onnx-community/embeddinggemma-300m-ONNX"), + model_file: String::from("onnx/model.onnx"), + additional_files: vec!["onnx/model.onnx_data".to_string()], + output_key: Some(crate::OutputKey::ByName("sentence_embedding")), + }, + ModelInfo { +diff --git a/src/text_embedding/impl.rs b/src/text_embedding/impl.rs +index 37b846d..fd48994 100644 +--- a/src/text_embedding/impl.rs ++++ b/src/text_embedding/impl.rs +@@ -241,20 +241,21 @@ impl TextEmbedding { + + EmbeddingModel::GTEBaseENV15 => Some(Pooling::Cls), + EmbeddingModel::GTEBaseENV15Q => Some(Pooling::Cls), + EmbeddingModel::GTELargeENV15 => Some(Pooling::Cls), + EmbeddingModel::GTELargeENV15Q => Some(Pooling::Cls), + + 
 EmbeddingModel::ClipVitB32 => Some(Pooling::Mean), + + EmbeddingModel::JinaEmbeddingsV2BaseCode => Some(Pooling::Mean), + EmbeddingModel::JinaEmbeddingsV2BaseEN => Some(Pooling::Mean), ++ EmbeddingModel::JinaEmbeddingsV5TextNano => Some(Pooling::Mean), + + EmbeddingModel::EmbeddingGemma300M => Some(Pooling::Mean), + + EmbeddingModel::SnowflakeArcticEmbedXS => Some(Pooling::Cls), + EmbeddingModel::SnowflakeArcticEmbedXSQ => Some(Pooling::Cls), + EmbeddingModel::SnowflakeArcticEmbedS => Some(Pooling::Cls), + EmbeddingModel::SnowflakeArcticEmbedSQ => Some(Pooling::Cls), + EmbeddingModel::SnowflakeArcticEmbedM => Some(Pooling::Cls), + EmbeddingModel::SnowflakeArcticEmbedMQ => Some(Pooling::Cls), + EmbeddingModel::SnowflakeArcticEmbedMLong => Some(Pooling::Cls), +-- +2.54.0 + diff --git a/docs/issues/jina-v5-slices.md b/docs/issues/jina-v5-slices.md new file mode 100644 index 0000000..94d01cd --- /dev/null +++ b/docs/issues/jina-v5-slices.md @@ -0,0 +1,232 @@ +# Jina v5 Matryoshka Embedding Support — Issue Slices + +Generated from plan: `.claude/plans/support-latest-jina-ai-s-fuzzy-crystal.md` +License note: Jina v5 weights are **CC BY-NC 4.0 (non-commercial)**. Commercial use, including production redistribution, requires a commercial license from Jina AI. + +--- + +## Track 1 — ICM repo + +### S-store — Refactor schema migration to return MigrationStatus [DONE: 36b4030] + +**Parent:** PRD plan file + +**What to build:** +Add `MigrationStatus { dim_changed, old_dim, new_dim, affected_rows }` to `icm-store/src/lib.rs`. Change `init_db_with_dims` to return `IcmResult<MigrationStatus>` instead of `IcmResult<()>`. Propagate through `SqliteStore::with_dims` and CLI `open_store`. Store remains data-only — no embedder reference. 
+ +**Acceptance criteria:** +- [x] `MigrationStatus` exported from `icm-store` public API with no embedder dependency +- [x] `init_db_with_dims` returns `IcmResult<MigrationStatus>`; dim-change path returns all four fields; no-change path returns `default()` +- [x] `open_store` in CLI propagates `MigrationStatus` +- [x] All 125 existing tests in `crates/icm-store` pass +- [x] Zero compile-time reference to `Embedder` inside `icm-store` + +**Blocked by:** None +**User stories covered:** US-arch (store stays data-only) + +--- + +### S-1 — Add embedder factory + Jina v5-text-nano backend end-to-end [DONE: 4cf9462, 1995d50, ee977fa] + +**Parent:** S-store + +**What to build:** +Extend `Embedder` trait with `embed_query`, `embed_document`, `model_name`, `license` default methods. Add `EmbedderBackend` enum to `EmbeddingsConfig`. Implement `JinaV5NanoEmbedder` via `ort` + `tokenizers` + `hf-hub`. Wire asymmetric recall path (`embed_query` in recall, `embed` for store). Add `truncate_and_renorm` Matryoshka helper. Log dim-change warning. + +**Acceptance criteria:** +- [x] `EmbedderBackend::Fastembed` and `EmbedderBackend::JinaV5Nano` selectable from config +- [x] First run downloads ONNX from HuggingFace, prints status message +- [x] Trait has `embed_query`, `embed_document`, `model_name`, `license` with defaults; `FastEmbedder` unchanged +- [x] Recall path uses `embed_query`; store path uses `embed` +- [x] Dim-change log message surfaced +- [x] `truncate_and_renorm` unit tests: shape, unit-norm, element-wise correctness, zero-norm edge case +- [x] 291 tests pass + +**Blocked by:** S-store +**User stories covered:** US-1, US-2 + +--- + +### S-2 — Add Jina v5-text-small (Qwen3) backend + +**Parent:** S-1 + +**What to build:** +Add `EmbedderBackend::JinaV5Small` variant. 
Implement `JinaV5SmallEmbedder` using `ort` + `tokenizers` + `hf-hub` with Qwen3 architecture: load `jinaai/jina-embeddings-v5-text-small-retrieval` ONNX, apply mean-pool over `last_hidden_state` with attention-mask masking, L2 normalize, Matryoshka truncation. Default dim = 1024; valid truncate dims = {32, 64, 128, 256, 512, 768, 1024}. Reuse `truncate_and_renorm` from S-1. + +**Acceptance criteria:** +- [ ] `EmbedderBackend::JinaV5Small` selectable; downloads `jinaai/jina-embeddings-v5-text-small-retrieval` ONNX on first run +- [ ] Mean-pool head applied over `last_hidden_state` with attention-mask masking; output L2-normalized +- [ ] `truncate_and_renorm` from S-1 reused (not duplicated) +- [ ] Round-trip test against an 8K-token document succeeds +- [ ] `cosine(small_full, small_truncated_512) >= 0.93` on canned text (informational, not gated) +- [ ] Dim 1024 (default) stored in schema; migration works from 384→1024 and 768→1024 + +**Blocked by:** S-1 +**User stories covered:** US-4 + +--- + +### S-3 — Auto re-embed on dim change (CLI/MCP orchestration) + +**Parent:** S-store, S-1 + +**What to build:** +In `icm-cli/src/main.rs`, after `open_store` returns `MigrationStatus { dim_changed: true }`, call the existing `cmd_embed` batch loop for all NULL-embedding rows. Add `--no-auto-reembed` flag. Wire same check into MCP server startup. Per-row errors log and continue; summary printed always. 
+ +**Acceptance criteria:** +- [ ] Prerequisite (from S-store, already done): when the active embedder's `dimensions()` differs from the stored dim in `icm_metadata`, `schema.rs` drops `vec_memories`, sets all `memories.embedding` to NULL, recreates the table, and returns `MigrationStatus { dim_changed: true, affected_rows: N }` — S-3 reads `affected_rows` as the count of rows that need re-embedding +- [ ] CLI detects `MigrationStatus::dim_changed == true` and auto-invokes embed loop +- [ ] MCP server startup path has same check +- [ ] Progress bar identical visual to `cmd_embed` +- [ ] `--no-auto-reembed` flag: skips, warns, exits clean +- [ ] Per-row errors do not abort the pass; summary line always printed +- [ ] Integration test: fresh DB (fastembed 384 dims), 20 memories, switch to jina-v5-nano, run `icm recall`, verify all 20 rows have 768-dim embeddings +- [ ] Test: `--no-auto-reembed` leaves vec_memories empty and prints warning + +**Blocked by:** S-store, S-1 +**User stories covered:** US-5 + +--- + +### S-4 — Enforce asymmetric retrieval paths in v5 backends + +**Parent:** S-1, S-2 + +**What to build:** +Override `embed_query` and `embed_document` in `JinaV5NanoEmbedder` and `JinaV5SmallEmbedder`. `embed_query` prepends `"retrieval.query: "`, `embed_document` prepends `"retrieval.passage: "`, `embed` delegates to `embed_document`. Unit test via `MockEncoder` that captures exact prefix strings. 
+ +**Acceptance criteria:** +- [ ] `JinaV5NanoEmbedder::embed_query` passes `"retrieval.query: {text}"` to encoder +- [ ] `JinaV5NanoEmbedder::embed_document` passes `"retrieval.passage: {text}"` to encoder +- [ ] `JinaV5NanoEmbedder::embed` delegates to `embed_document` +- [ ] Same three impl points in `JinaV5SmallEmbedder` +- [ ] `FastEmbedder`: no changes +- [ ] Unit test via `MockEncoder`: captures exact string passed; asserts prefix for query and passage +- [ ] Existing round-trip recall test from S-1 still passes + +**Blocked by:** S-1, S-2 +**User stories covered:** US-3 + +--- + +### S-5 — Docs / UX / license disclosure + +**Parent:** S-1..S-4 + +**What to build:** +Update `README.md` with "Embedder backends" section. Add license warning comment in `config/default.toml`. Make `icm config show` print active embedder type + license tag. Make `icm recall` output header include model name. Add CHANGELOG entry. + +**Acceptance criteria:** +- [ ] README has clear non-commercial warning for Jina v5 +- [ ] `config/default.toml` has license comment +- [ ] `icm config show` prints `embedder: jina-v5-nano (CC-BY-NC-4.0, non-commercial)` when active +- [ ] `icm recall` output header includes `model: jina-v5-nano` +- [ ] CHANGELOG.md entry under Unreleased +- [ ] Snapshot test of `icm config show` output covers new fields + +**Blocked by:** S-1, S-2, S-4 +**User stories covered:** US-7 + +--- + +## Track 2 — upstream Anush008/fastembed-rs + +Note: the actual Rust crate `fastembed` consumed by ICM is published from `Anush008/fastembed-rs`, not `qdrant/fastembed` (which is the Python upstream). PRs target the Rust repo. Issue qdrant/fastembed#607 was filed against the Python upstream; the Rust port has no equivalent issue at the time of writing. 
+ +### F-1 — Register `jina-embeddings-v5-text-nano-retrieval` as built-in fastembed model [PATCH READY: docs/issues/jina-v5-fastembed-rs-F1.patch] + +**Parent:** qdrant/fastembed#607 (Python upstream) + +**What to build:** +Add `EmbeddingModel::JinaEmbeddingsV5TextNano` to fastembed's enum in `src/models/text_embedding.rs`. Add `ModelInfo` entry in `init_models_map()` with HF path `jinaai/jina-embeddings-v5-text-nano-retrieval`, dim = 768, max tokens = 8192, license = "CC BY-NC 4.0". Add pooling mode entry. Add snapshot test. + +**Draft patch:** +```rust +// In EmbeddingModel enum (~line 4490): +/// jinaai/jina-embeddings-v5-text-nano-retrieval +JinaEmbeddingsV5TextNano, + +// In init_models_map() (~line 4930): +ModelInfo { + model: EmbeddingModel::JinaEmbeddingsV5TextNano, + dim: 768, + description: String::from("Jina embeddings v5 text nano (CC BY-NC 4.0)"), + model_code: String::from("jinaai/jina-embeddings-v5-text-nano-retrieval"), + model_file: String::from("onnx/model.onnx"), + additional_files: Vec::new(), + output_key: None, +}, + +// In get_default_pooling_method() (~line 6230): +EmbeddingModel::JinaEmbeddingsV5TextNano => Some(Pooling::Mean), + +// In verify_embeddings() snapshot test (~line 7850): +EmbeddingModel::JinaEmbeddingsV5TextNano => [a, b, c, d], // run test to capture values +``` + +**Acceptance criteria:** +- [x] Enum variant added with rustdoc covering license + Matryoshka dim list +- [x] `ModelInfo` entry registered in `init_models_map()` (dim=768, model_file=`onnx/model.onnx`) +- [x] Pooling registered as `Pooling::Mean` in `get_default_pooling_method()` +- [x] `cargo build` clean against Anush008/fastembed-rs main (verified 2026-04-29) +- [ ] Snapshot test entry in `tests/text-embeddings.rs` — DEFERRED. The catch-all `_ => panic!()` arm signals to CI / maintainer that real expected sums must be captured by running the test once with `ORT_LIB_LOCATION` set. 
Snapshot capture requires ~50MB ONNX Runtime + ~250MB model download; out of scope for the registration-only patch. +- [ ] PR opened against Anush008/fastembed-rs (HITL — user decides when to fork+push the prepared patch at `docs/issues/jina-v5-fastembed-rs-F1.patch`) + +**Apply via:** `git -C <path-to-fastembed-rs-checkout> am < docs/issues/jina-v5-fastembed-rs-F1.patch` + +**Blocked by:** None (patch applies cleanly to Anush008/fastembed-rs main) +**User stories covered:** US-8 + +--- + +### F-2 — Register `jina-embeddings-v5-text-small-retrieval` [BLOCKED upstream] + +**Parent:** F-1 + +**Status update (2026-04-29):** v5-text-small is Qwen3-decoder-based with mean-pool head. Anush008/fastembed-rs main does NOT support decoder-style ONNX exports (no `Pooling::LastToken`, no `position_ids` injection, no KV-cache injection). Closed PR #236 (`feat: decoder/quantized model support`) attempted this work but was rejected. F-2 cannot be a registration-only patch like F-1; it requires the architectural prerequisites of #236 to land first. Current path: ICM uses its own ort+tokenizers integration in `icm-core` (already DONE in S-2), which is sufficient for the local-only consumer use case. Track upstream re-attempt separately. + +**What to build (when unblocked):** +Same pattern as F-1 but for small (Qwen3-based). Coordinate with fastembed maintainers on whether a new `ModelArchitecture::Qwen3Pooled` variant is needed. If yes, that is a separate commit before F-2. + +**Acceptance criteria:** +- [ ] Variant + ModelInfo + pooling + snapshot test +- [ ] Test produces 1024-dim vector for "hello world" +- [ ] If Qwen3 arch needed: clean separate commit + +**Blocked by:** F-1 +**User stories covered:** US-9 + +--- + +### F-3 (optional) — Add `truncate_dim` parameter to fastembed `InitOptions` + +**Parent:** F-1 + +**What to build:** +Add `InitOptions::truncate_dim: Option<usize>` that slices + L2-renormalizes output. No-op when `None` or model doesn't support it. 
+ +**Acceptance criteria:** +- [ ] `InitOptions::truncate_dim: Option<usize>` +- [ ] When set: output sliced + L2-renormalized +- [ ] No-op for non-Matryoshka models (or warning) + +**Blocked by:** F-1 +**User stories covered:** US-10 + +--- + +## To create GitHub issues + +Run in dependency order (Track 1 first, then Track 2): + +```bash +# Track 1 — ICM repo (skip S-store and S-1, already done) +gh issue create --title "feat: Jina v5-text-small (Qwen3) embedder backend" --body "$(cat docs/issues/jina-v5-slices.md | sed -n '/### S-2/,/### S-3/p')" +gh issue create --title "feat: Auto re-embed on embedder dim change" --body "..." +gh issue create --title "feat: Asymmetric retrieval prefixes for Jina v5 backends" --body "..." +gh issue create --title "feat: Docs/UX — license disclosure and embedder surfacing" --body "..." + +# Track 2 — upstream Anush008/fastembed-rs +gh issue create --title "feat: register jina-embeddings-v5-text-nano as built-in model" --repo Anush008/fastembed-rs --body "..." +```