forked from ggml-org/llama.cpp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathresource-tracking-integration.patch
More file actions
102 lines (89 loc) · 4.25 KB
/
resource-tracking-integration.patch
File metadata and controls
102 lines (89 loc) · 4.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 1234567..8901234 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -26,6 +26,7 @@
#include "llama-model.h"
#include "llama-vocab.h"
#include "llama-hparams.h"
+#include "llama-resource-integration.h" // resource tracking hooks
#include <algorithm>
#include <array>
@@ -10343,17 +10344,32 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
for (int il = 0; il < n_layer; ++il) {
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ // RESOURCE TRACKING: Start layer processing
+ llama_resource_hook_layer_start(il);
// norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
+
+ // RESOURCE TRACKING: Start attention component
+ llama_resource_hook_attention_start(il);
// self-attention
{
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
+
+ // RESOURCE TRACKING: Track Q matrix computation
+ llama_resource_hook_qkv("Q", cur, model.layers[il].wq, Qcur, il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
+
+ // RESOURCE TRACKING: Track K matrix computation
+ llama_resource_hook_qkv("K", cur, model.layers[il].wk, Kcur, il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
+
+ // RESOURCE TRACKING: Track V matrix computation
+ llama_resource_hook_qkv("V", cur, model.layers[il].wv, Vcur, il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
@@ -10379,6 +10388,9 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+
+ // RESOURCE TRACKING: Track full attention computation
+ llama_resource_hook_attention(Qcur, Kcur, Vcur, cur, il);
}
if (il == n_layer - 1 && inp_out_ids) {
@@ -10392,6 +10404,12 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
LLM_NORM_RMS, il);
cb(cur, "attn_post_norm", il);
+ // RESOURCE TRACKING: End attention component
+ llama_resource_hook_attention_end(il);
+
+ // RESOURCE TRACKING: Handoff from attention to MLP
+ llama_resource_hook_attention_to_mlp_handoff(il);
+
ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
cb(sa_out, "sa_out", il);
@@ -10402,12 +10420,16 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
// feed-forward network
{
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
+
+ // RESOURCE TRACKING: Track MLP computation
+ llama_resource_hook_mlp(sa_out, model.layers[il].ffn_gate,
+ model.layers[il].ffn_up, model.layers[il].ffn_down, cur, il);
}
cur = build_norm(cur,
@@ -10420,6 +10442,9 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
+ // RESOURCE TRACKING: End layer processing
+ llama_resource_hook_layer_end(il);
+
// input for next layer
inpL = cur;
}