forked from ggml-org/llama.cpp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_inference_instrumentation.cpp
More file actions
317 lines (253 loc) · 13.4 KB
/
test_inference_instrumentation.cpp
File metadata and controls
317 lines (253 loc) · 13.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
#include "llama.h"
#include "llama-instrumentation.h"
#include "common.h"

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <exception>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>
int main(int argc, char ** argv) {
// Initialize dynamic backends
ggml_backend_load_all();
try {
// 1. Initialize instrumentation with DETAILED level
std::cout << "π§ Initializing llama backend..." << std::endl;
// 1. Initialize the instrumentation system
llama_instrumentation instr(llama_instr_level::VERBOSE, "gemma_inference_trace.log");
instr.enable();
// 2. Load the Gemma-3 1B model
std::cout << "π Loading Gemma-3 1B model..." << std::endl;
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = 0; // Use CPU for this test
llama_model * model = llama_model_load_from_file("downloads/gemma-3-1b-it-Q4_K_M.gguf", model_params);
if (!model) {
std::cerr << "β Failed to load model!" << std::endl;
return 1;
}
// 3. Get vocab
const llama_vocab * vocab = llama_model_get_vocab(model);
if (!vocab) {
std::cerr << "β Failed to get vocabulary from model!" << std::endl;
return 1;
}
std::cout << "π Vocabulary loaded successfully" << std::endl;
// 4. Create context
std::cout << "βοΈ Creating inference context..." << std::endl;
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = 512; // Context length
ctx_params.n_batch = 32; // Batch size
ctx_params.n_threads = 4; // CPU threads
llama_context * ctx = llama_init_from_model(model, ctx_params);
if (!ctx) {
std::cerr << "β Failed to create context!" << std::endl;
llama_model_free(model);
return 1;
}
// 5. Prepare the prompt
std::string prompt = "what is the roadmap i can follow to learn AI/ML and get a decent job in it?";
std::cout << "π Prompt: " << prompt << std::endl;
// 6. Begin instrumented session
std::cout << "π Starting instrumented inference test..." << std::endl;
instr.begin_session(prompt, model);
// 7. Tokenize the prompt
const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.length(), NULL, 0, true, true);
std::vector<llama_token> prompt_tokens(n_prompt);
llama_tokenize(vocab, prompt.c_str(), prompt.length(), prompt_tokens.data(), n_prompt, true, true);
std::cout << "π€ Tokenized prompt: " << n_prompt << " tokens" << std::endl;
// Debug: print first few tokens
std::cout << "π First few tokens: ";
for (int i = 0; i < std::min(5, n_prompt); ++i) {
std::cout << prompt_tokens[i] << " ";
}
std::cout << std::endl;
std::cout << "π¦ Creating batch..." << std::endl;
// 8. Create batch with all prompt tokens
llama_batch batch = llama_batch_init(n_prompt, 0, 1);
// Add tokens to batch manually
for (int i = 0; i < n_prompt; ++i) {
batch.token[i] = prompt_tokens[i];
batch.pos[i] = i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = 0;
batch.logits[i] = (i == n_prompt - 1); // Only compute logits for last token
}
batch.n_tokens = n_prompt;
std::cout << "π¦ Batch created successfully" << std::endl;
std::cout << "π Batch info: n_tokens=" << batch.n_tokens << ", logits=" << (batch.logits ? "yes" : "no") << std::endl;
// Make sure logits are computed for the last token
batch.logits[batch.n_tokens - 1] = true;
std::cout << "π Set logits flag for last token" << std::endl;
// 9. Process the prompt (this will trigger our instrumentation)
instr.begin_step("prompt_processing", 0);
std::cout << "π§ Processing prompt..." << std::endl;
if (llama_decode(ctx, batch) != 0) {
std::cerr << "β Failed to decode prompt!" << std::endl;
return 1;
}
instr.end_step("Prompt processed successfully");
std::cout << "β
Prompt processed successfully!" << std::endl;
// 10. Generate a few tokens
std::cout << "π€ Generated response: ";
std::cout.flush();
int max_tokens = 50;
std::vector<llama_token> all_tokens = prompt_tokens;
for (int i = 0; i < max_tokens; i++) {
std::cout << "π― Generation step " << i << std::endl;
// Get logits and sample next token - use the LAST token's logits (index -1)
float * logits = llama_get_logits_ith(ctx, -1); // -1 means last token with logits=true
if (logits == nullptr) {
std::cout << "β Failed to get logits!" << std::endl;
break;
}
std::cout << "β
Got logits successfully!" << std::endl;
int vocab_size = llama_vocab_n_tokens(vocab);
// Enhanced probability distribution analysis
std::vector<std::pair<llama_token, float>> token_logits;
for (llama_token token_id = 0; token_id < vocab_size; token_id++) {
token_logits.push_back({token_id, logits[token_id]});
}
// Sort by logits (descending)
std::sort(token_logits.begin(), token_logits.end(),
[](const auto& a, const auto& b) { return a.second > b.second; });
// Calculate softmax probabilities for top tokens
std::vector<std::pair<llama_token, float>> top_tokens_with_probs;
const int top_k = 10; // Show top 10 tokens
// First, calculate softmax for top tokens
float max_logit = token_logits[0].second;
float sum_exp = 0.0f;
// Calculate exp(logit - max_logit) for numerical stability
for (int k = 0; k < std::min(top_k, (int)token_logits.size()); k++) {
float exp_val = std::exp(token_logits[k].second - max_logit);
sum_exp += exp_val;
top_tokens_with_probs.push_back({token_logits[k].first, exp_val});
}
// Convert to probabilities
for (auto& pair : top_tokens_with_probs) {
pair.second /= sum_exp;
}
// Create sampling state for instrumentation
llama_sampling_state sampling_state;
if (!top_tokens_with_probs.empty()) {
sampling_state.selected_token = top_tokens_with_probs[0].first;
sampling_state.selected_prob = top_tokens_with_probs[0].second;
} else {
sampling_state.selected_token = 0;
sampling_state.selected_prob = 0.0f;
}
sampling_state.sampling_method = "greedy";
// Fill top tokens and probabilities for instrumentation
for (int k = 0; k < std::min(top_k, (int)top_tokens_with_probs.size()); k++) {
sampling_state.top_tokens.push_back(top_tokens_with_probs[k].first);
sampling_state.top_probs.push_back(top_tokens_with_probs[k].second);
// Convert token to readable text
char token_str[256];
int n_chars = 0;
// Safe token to piece conversion
if (vocab) {
n_chars = llama_token_to_piece(vocab, top_tokens_with_probs[k].first, token_str, sizeof(token_str), 0, true);
}
std::string token_text;
if (n_chars > 0 && n_chars < sizeof(token_str)) {
token_text = std::string(token_str, n_chars);
} else {
token_text = "<unk>";
}
sampling_state.top_token_texts.push_back(token_text);
// Find the corresponding logit value
bool found_logit = false;
for (int j = 0; j < (int)token_logits.size(); j++) {
if (token_logits[j].first == top_tokens_with_probs[k].first) {
sampling_state.logits_sample.push_back(token_logits[j].second);
found_logit = true;
break;
}
}
if (!found_logit) {
sampling_state.logits_sample.push_back(0.0f); // Default value
}
}
// Add layer classification information (simulated for educational purposes)
int total_layers = llama_model_n_layer(model);
// Simulate layer-by-layer processing time (for educational visualization)
for (int layer = 0; layer < total_layers; layer++) {
llama_layer_info layer_info;
layer_info.layer_id = layer;
// Classify layer types based on typical transformer architecture
if (layer % 2 == 0) {
layer_info.layer_type = "attention";
layer_info.operation = "multi_head_self_attention";
} else {
layer_info.layer_type = "feed_forward";
layer_info.operation = "mlp_projection";
}
// Simulate layer execution time (educational approximation)
layer_info.execution_time = std::chrono::microseconds(1000 + (layer * 50));
// Add layer-specific metrics
layer_info.layer_metrics["attention_heads"] = (layer_info.layer_type == "attention") ? 4.0 : 0.0;
layer_info.layer_metrics["hidden_dim"] = 1152.0;
layer_info.layer_metrics["intermediate_dim"] = (layer_info.layer_type == "feed_forward") ? 6912.0 : 0.0;
sampling_state.layer_details.push_back(layer_info);
}
// Log the sampling state
instr.log_sampling_state(sampling_state);
// Select the greedy token (highest probability)
llama_token next_token = top_tokens_with_probs[0].first;
// Check for end of sequence
if (next_token == llama_vocab_eos(vocab)) {
std::cout << "π End of sequence reached!" << std::endl;
break;
}
// Convert token to text and print
char token_str[256];
int n_chars = llama_token_to_piece(vocab, next_token, token_str, sizeof(token_str), 0, true);
if (n_chars > 0) {
std::cout << std::string(token_str, n_chars);
std::cout.flush();
}
// Instrument the token generation step with layer details
std::string step_name = "token_generation_" + std::to_string(i);
instr.begin_step(step_name, 0);
// Log detailed layer information
int n_layer = llama_model_n_layer(model);
// Performance metric for this step
instr.log_performance_metric("token_probability", top_tokens_with_probs[0].second, "probability");
instr.log_performance_metric("token_logit", token_logits[0].second, "raw_logit");
// Add custom metrics for layer count and model info
instr.log_performance_metric("model_layers", n_layer, "count");
instr.log_performance_metric("vocab_size", vocab_size, "tokens");
// Prepare batch for next token
llama_batch next_batch = llama_batch_init(1, 0, 1);
next_batch.token[0] = next_token;
next_batch.pos[0] = batch.n_tokens + i;
next_batch.n_seq_id[0] = 1;
next_batch.seq_id[0][0] = 0;
next_batch.logits[0] = true;
next_batch.n_tokens = 1;
std::cout << "βοΈ Processing token " << i << std::endl;
// Decode next token (this triggers more instrumentation)
if (llama_decode(ctx, next_batch) != 0) {
instr.end_step("Decode failed");
std::cout << "β Decode failed!" << std::endl;
break;
}
instr.end_step("Token generated: " + std::string(token_str, n_chars > 0 ? n_chars : 0));
all_tokens.push_back(next_token);
// Free the batch
llama_batch_free(next_batch);
}
std::cout << std::endl;
// 11. End instrumented session
instr.end_session();
// 12. Free the allocated batches
llama_batch_free(batch);
// 13. Cleanup
llama_free(ctx);
llama_model_free(model);
std::cout << "β
Basic inference test complete!" << std::endl;
std::cout << "π Total tokens processed: " << all_tokens.size() << std::endl;
} catch (const std::exception& e) {
std::cerr << "β Error: " << e.what() << std::endl;
return 1;
}
return 0;
}