@@ -13040,46 +13040,94 @@ struct llm_build_bailingmoe : public llm_graph_context {
     }
 };
 
-llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
+llama_memory_i * llama_model::create_memory(
+        const llama_memory_params & params,
+        llama_cparams & cparams,
+        const llama_hparams & hparams) const {
     llama_memory_i * res;
 
     switch (arch) {
+        // Models that need specific instantiation should be handled in the
+        // switch statement
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 res = nullptr;
             } break;
-        case LLM_ARCH_MAMBA:
-        case LLM_ARCH_RWKV6:
-        case LLM_ARCH_RWKV6QWEN2:
-        case LLM_ARCH_RWKV7:
-        case LLM_ARCH_ARWKV7:
-            {
-                res = new llama_kv_cache_recurrent(
-                        *this,
-                        GGML_TYPE_F32,
-                        GGML_TYPE_F32,
-                        cparams.offload_kqv,
-                        std::max((uint32_t) 1, cparams.n_seq_max));
-            } break;
+        // Models that need standard caching should rely on recurrent/hybrid
+        // checks
         default:
             {
-                const auto padding = llama_kv_cache_unified::get_padding(cparams);
+                if (llm_arch_is_hybrid(arch)) {
+                    // make vectors of recurrent and non-recurrent layer indices
+                    std::vector<size_t> recurrent_layers;
+                    std::vector<size_t> unified_layers;
+                    for (auto il = 0u; il < hparams.n_layer; ++il) {
+                        if (hparams.recurrent_layer(il)) {
+                            recurrent_layers.push_back(il);
+                        } else {
+                            unified_layers.push_back(il);
+                        }
+                    }
+
+                    const auto padding = llama_kv_cache_unified::get_padding(cparams);
+                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
+                    LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+
+                    // initialize the children
+                    std::vector<llama_kv_cache_hybrid::child_cache> children;
+                    children.emplace_back(
+                        std::unique_ptr<llama_kv_cache>(
+                            new llama_kv_cache_recurrent(
+                                *this,
+                                GGML_TYPE_F32,
+                                GGML_TYPE_F32,
+                                cparams.offload_kqv,
+                                std::max((uint32_t) 1, cparams.n_seq_max))
+                        ),
+                        std::move(recurrent_layers)
+                    );
+                    children.emplace_back(
+                        std::unique_ptr<llama_kv_cache>(
+                            new llama_kv_cache_unified(
+                                *this,
+                                params.type_k,
+                                params.type_v,
+                                !cparams.flash_attn,
+                                cparams.offload_kqv,
+                                cparams.n_ctx,
+                                padding)
+                        ),
+                        std::move(unified_layers)
+                    );
+
+                    // initialize the hybrid cache with both children
+                    res = new llama_kv_cache_hybrid(hparams, std::move(children));
+                } else if (llm_arch_is_recurrent(arch)) {
+                    res = new llama_kv_cache_recurrent(
+                        *this,
+                        GGML_TYPE_F32,
+                        GGML_TYPE_F32,
+                        cparams.offload_kqv,
+                        std::max((uint32_t) 1, cparams.n_seq_max));
+                } else {
+                    const auto padding = llama_kv_cache_unified::get_padding(cparams);
 
-                cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
+                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
 
-                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+                    LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
 
-                res = new llama_kv_cache_unified(
-                        *this,
-                        params.type_k,
-                        params.type_v,
-                        !cparams.flash_attn,
-                        cparams.offload_kqv,
-                        cparams.n_ctx,
-                        padding);
+                    res = new llama_kv_cache_unified(
+                        *this,
+                        params.type_k,
+                        params.type_v,
+                        !cparams.flash_attn,
+                        cparams.offload_kqv,
+                        cparams.n_ctx,
+                        padding);
+                }
             }
     }
 
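The heart of the new default path is the layer partition: each layer index is assigned to either the recurrent child or the unified (attention) child before the hybrid cache is constructed over both. Below is a minimal standalone sketch of that dispatch pattern; `kv_cache`, `recurrent_cache`, `unified_cache`, `child_cache`, and `is_recurrent_layer` are hypothetical stand-ins for the llama.cpp types and for `hparams.recurrent_layer(il)`, not the real API.

```cpp
#include <cstdint>
#include <cstdio>
#include <memory>
#include <utility>
#include <vector>

// Hypothetical stand-ins for the cache classes; only the dispatch shape matters here.
struct kv_cache {
    virtual ~kv_cache() = default;
    virtual const char * name() const = 0;
};

struct recurrent_cache : kv_cache {
    const char * name() const override { return "recurrent"; }
};

struct unified_cache : kv_cache {
    const char * name() const override { return "unified"; }
};

// Mirrors the idea of a hybrid child cache: one cache plus the layer indices it owns.
struct child_cache {
    std::unique_ptr<kv_cache> cache;
    std::vector<size_t>       layers;
};

// Hypothetical stand-in for hparams.recurrent_layer(il): mark every other layer recurrent.
static bool is_recurrent_layer(uint32_t il) {
    return il % 2 == 0;
}

int main() {
    const uint32_t n_layer = 6;

    // partition layer indices, as the hybrid branch does
    std::vector<size_t> recurrent_layers;
    std::vector<size_t> unified_layers;
    for (uint32_t il = 0; il < n_layer; ++il) {
        (is_recurrent_layer(il) ? recurrent_layers : unified_layers).push_back(il);
    }

    // build the two children, each bound to its own layer set
    std::vector<child_cache> children;
    children.push_back({std::make_unique<recurrent_cache>(), std::move(recurrent_layers)});
    children.push_back({std::make_unique<unified_cache>(),   std::move(unified_layers)});

    for (const auto & c : children) {
        printf("%s layers:", c.cache->name());
        for (size_t il : c.layers) {
            printf(" %zu", il);
        }
        printf("\n");
    }

    return 0;
}
```

A hybrid cache built this way can route per-layer reads and writes to the child that owns that layer, which is what lets recurrent-state layers and attention KV layers coexist in one model.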