File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -2470,10 +2470,26 @@ struct GGMLRunner {
24702470 *effective_budget_out = effective_budget;
24712471 }
24722472
2473+ // When streaming and the parameters exceed 3/4 of the budget, cap the
2474+ // planner at 1/4 of the budget so it builds smaller merged segments and
2475+ // chunk-K can fit alongside. Without streaming the cap only adds dispatch overhead.
2476+ size_t planner_budget = effective_budget;
2477+ if (stream_layers_enabled) {
2478+ size_t total_params_bytes = 0 ;
2479+ for (const ggml_tensor* t : params_tensor_set_) {
2480+ if (t != nullptr ) {
2481+ total_params_bytes += ggml_nbytes (t);
2482+ }
2483+ }
2484+ if (total_params_bytes * 4 > effective_budget * 3 ) {
2485+ planner_budget = effective_budget / 4 ;
2486+ }
2487+ }
2488+
24732489 *plan_out = sd::ggml_graph_cut::resolve_plan (runtime_backend,
24742490 gf,
24752491 &graph_cut_plan_cache_,
2476- effective_budget ,
2492+ planner_budget ,
24772493 params_tensor_set_,
24782494 get_desc ().c_str ());
24792495 if (stream_layers_enabled) {
You can’t perform that action at this time.
0 commit comments