forked from ggml-org/llama.cpp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_resource_instrumentation.cpp
More file actions
163 lines (125 loc) · 6.64 KB
/
test_resource_instrumentation.cpp
File metadata and controls
163 lines (125 loc) · 6.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#include "include/llama.h"
#include "src/llama-instrumentation.h"
#include "include/llama-resource-instrumentation.h"
#include "common/common.h"

#include <algorithm>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
int main(int argc, char ** argv) {
// Initialize dynamic backends
ggml_backend_load_all();
try {
std::cout << "๐ง Testing Resource Instrumentation System..." << std::endl;
// 1. Initialize BOTH instrumentation systems
std::cout << "๐ Initializing instrumentation systems..." << std::endl;
// Original instrumentation
llama_instrumentation instr(llama_instr_level::DETAILED, "test_token_trace.log");
instr.enable();
// NEW Resource instrumentation
llama_resource_instrumentation_init(llama_resource_level::DETAILED, "test_resource_trace.jsonl");
// 2. Load the model
std::cout << "๐ Loading model..." << std::endl;
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = 0;
llama_model * model = llama_model_load_from_file("downloads/gemma-3-1b-it-Q4_K_M.gguf", model_params);
if (!model) {
std::cerr << "โ Failed to load model!" << std::endl;
return 1;
}
// 3. Create context
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = 256;
ctx_params.n_batch = 16;
ctx_params.n_threads = 2;
llama_context * ctx = llama_init_from_model(model, ctx_params);
if (!ctx) {
std::cerr << "โ Failed to create context!" << std::endl;
llama_model_free(model);
return 1;
}
// 4. Begin instrumented session for BOTH systems
std::string prompt = "Hello what is deep learning?";
std::cout << "๐ญ Prompt: " << prompt << std::endl;
// Token-level session
instr.begin_session(prompt, model);
// Resource-level session
LLAMA_RESOURCE_BEGIN_SESSION("test_resource_session_20240823_140000_123456");
// 5. Tokenize
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.length(), NULL, 0, true, true);
std::vector<llama_token> prompt_tokens(n_prompt);
llama_tokenize(vocab, prompt.c_str(), prompt.length(), prompt_tokens.data(), n_prompt, true, true);
std::cout << "๐ค Tokenized: " << n_prompt << " tokens" << std::endl;
// 6. Process with BOTH instrumentations active
int n_layers = llama_model_n_layer(model);
std::cout << "๐ง Processing through " << n_layers << " layers..." << std::endl;
// Simulate layer-by-layer processing with resource tracking
for (int layer = 0; layer < std::min(3, n_layers); layer++) { // Just first 3 layers for demo
std::cout << "๐ Processing layer " << layer << std::endl;
// Begin resource tracking for this layer
LLAMA_RESOURCE_BEGIN_LAYER(layer);
// Simulate attention component
LLAMA_RESOURCE_BEGIN_COMPONENT("attention");
// Log simulated memory allocation for attention weights
// NOTE: In a real integration, you'd pass actual tensor pointers
// For this test, we'll create fake tensor data to demonstrate API usage
// Simulate Q, K, V weight loading
std::cout << " ๐พ Simulating QKV weight allocation..." << std::endl;
// LLAMA_RESOURCE_LOG_MEMORY_ALLOCATION(qkv_tensor, "qkv_weights");
// Simulate attention computation
std::cout << " โก Simulating attention computation..." << std::endl;
// LLAMA_RESOURCE_LOG_COMPUTE_OPERATION("mul_mat", "attention", inputs, output);
LLAMA_RESOURCE_END_COMPONENT("attention");
// Simulate MLP component
LLAMA_RESOURCE_BEGIN_COMPONENT("mlp");
std::cout << " ๐งฎ Simulating MLP operations..." << std::endl;
// LLAMA_RESOURCE_LOG_MLP_OPERATION("gate_proj", gate_weights, gate_activations);
// LLAMA_RESOURCE_LOG_MLP_OPERATION("up_proj", up_weights, up_activations);
// LLAMA_RESOURCE_LOG_MLP_OPERATION("down_proj", down_weights, down_activations);
// Log component data transfer
LLAMA_RESOURCE_LOG_COMPONENT_HANDOFF("attention", "mlp");
LLAMA_RESOURCE_END_COMPONENT("mlp");
// End layer processing
LLAMA_RESOURCE_END_LAYER(layer);
}
// 7. Create and process batch (minimal for demo)
llama_batch batch = llama_batch_init(std::min(8, n_prompt), 0, 1);
int tokens_to_process = std::min(8, n_prompt);
for (int i = 0; i < tokens_to_process; ++i) {
batch.token[i] = prompt_tokens[i];
batch.pos[i] = i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = 0;
batch.logits[i] = (i == tokens_to_process - 1);
}
batch.n_tokens = tokens_to_process;
// Process with token-level instrumentation
instr.begin_step("batch_processing", 0);
std::cout << "๐ง Processing batch..." << std::endl;
if (llama_decode(ctx, batch) == 0) {
std::cout << "โ
Batch processed successfully!" << std::endl;
// Log some performance metrics
instr.log_performance_metric("batch_size", tokens_to_process, "tokens");
instr.log_performance_metric("layers_processed", n_layers, "count");
} else {
std::cout << "โ Batch processing failed!" << std::endl;
}
instr.end_step("Batch processing complete");
// 8. End both instrumentation sessions
instr.end_session();
LLAMA_RESOURCE_END_SESSION();
// 9. Cleanup
llama_batch_free(batch);
llama_free(ctx);
llama_model_free(model);
llama_resource_instrumentation_free();
std::cout << "โ
Resource Instrumentation Test Complete!" << std::endl;
std::cout << "๐ Check logs:" << std::endl;
std::cout << " - Token-level: test_token_trace.log" << std::endl;
std::cout << " - Resource-level: test_resource_trace.jsonl" << std::endl;
} catch (const std::exception& e) {
std::cerr << "โ Error: " << e.what() << std::endl;
return 1;
}
return 0;
}