Skip to content

Commit 88a5ee4

Browse files
committed
perf: gate planner budget cap on stream_layers_enabled
1 parent 33a84ba commit 88a5ee4

1 file changed

Lines changed: 13 additions & 9 deletions

File tree

src/ggml_extend.hpp

Lines changed: 13 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2470,17 +2470,21 @@ struct GGMLRunner {
2470 2470
*effective_budget_out = effective_budget;
2471 2471
}
2472 2472

2473-
// When the model dwarfs the budget, cap the planner at a quarter so
2474-
// it builds smaller merged segments and chunk-K can fit alongside.
2475-
// Otherwise leave the planner free to merge into one large segment.
2476-
size_t total_params_bytes = 0;
2477-
for (const ggml_tensor* t : params_tensor_set_) {
2478-
if (t != nullptr) {
2479-
total_params_bytes += ggml_nbytes(t);
2473+
// When streaming and the model dwarfs the budget, cap the planner at
2474+
// a quarter so it builds smaller merged segments and chunk-K can fit
2475+
// alongside. Without streaming the cap only adds dispatch overhead.
2476+
size_t planner_budget = effective_budget;
2477+
if (stream_layers_enabled) {
2478+
size_t total_params_bytes = 0;
2479+
for (const ggml_tensor* t : params_tensor_set_) {
2480+
if (t != nullptr) {
2481+
total_params_bytes += ggml_nbytes(t);
2482+
}
2483+
}
2484+
if (total_params_bytes * 4 > effective_budget * 3) {
2485+
planner_budget = effective_budget / 4;
2480 2486
}
2481 2487
}
2482-
const size_t planner_budget =
2483-
(total_params_bytes * 4 > effective_budget * 3) ? effective_budget / 4 : effective_budget;
2484 2488

2485 2489
*plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend,
2486 2490
gf,

0 commit comments

Comments
 (0)