Commit f2751be

isort & black for scripts/benchmark.py
1 parent 29ddbb4 commit f2751be

4 files changed: +363 -58 lines changed


README.md (+44 -9)

@@ -20,6 +20,42 @@ The implemented architecture is based on the original Vision Transformer from:
 ViT architecture. Taken from the <a href="https://arxiv.org/abs/2010.11929">original paper</a>.
 </p>
 
+## Example
+
+<p align="center">
+    <img src="assets/magpie.jpeg" alt="example input" width="30%" height="auto">
+</p>
+
+<pre>$ ./bin/vit -t 4 -m ../ggml-model-f16.gguf -i ../assets/magpie.jpeg -k 5
+main: seed = 1701176263
+main: n_threads = 4 / 8
+vit_model_load: loading model from '../ggml-model-f16.gguf' - please wait
+vit_model_load: hidden_size = 192
+vit_model_load: num_hidden_layers = 12
+vit_model_load: num_attention_heads = 3
+vit_model_load: patch_size = 16
+vit_model_load: img_size = 224
+vit_model_load: num_classes = 1000
+vit_model_load: ftype = 1
+vit_model_load: qntvr = 0
+operator(): ggml ctx size = 11.13 MB
+vit_model_load: ................... done
+vit_model_load: model size = 11.04 MB / num tensors = 152
+main: loaded image '../assets/magpie.jpeg' (500 x 470)
+vit_image_preprocess: scale = 2.232143
+processed, out dims : (224 x 224)
+
+> magpie : 0.87
+> goose : 0.02
+> toucan : 0.01
+> drake : 0.01
+> king penguin, Aptenodytes patagonica : 0.01
+
+
+main: model load time = 17.92 ms
+main: processing time = 146.96 ms
+main: total time = 164.88 ms
+</pre>
 
 ## Convert PyTorch to GGUF
 
@@ -77,15 +113,14 @@ allowing multithreaded runs. Make sure to also enable multiple threads when runn
 usage: ./bin/vit [options]
 
 options:
-  -h, --help show this help message and exit
-  -s SEED, --seed SEED RNG seed (default: -1)
-  -t N, --threads N number of threads to use during computation (default: 4)
-  -m FNAME, --model FNAME
-        model path (default: ../ggml-model-f16.bin)
-  -i FNAME, --inp FNAME
-        input file (default: ../assets/tench.jpg)
-  -e FLOAT, --epsilon
-        epsilon (default: 0.000001)
+  -h, --help               show this help message and exit
+  -s SEED, --seed SEED     RNG seed (default: -1)
+  -t N, --threads N        number of threads to use during computation (default: 4)
+  -m FNAME, --model FNAME  model path (default: ../ggml-model-f16.bin)
+  -i FNAME, --inp FNAME    input file (default: ../assets/tench.jpg)
+  -k N, --topk N           top k classes to print (default: 5)
+  -e FLOAT, --epsilon      epsilon (default: 0.000001)
+
 
 ## Benchmark against PyTorch
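
A quick sanity check on the preprocessing log in the example above: for the 500 x 470 input, the reported scale of 2.232143 matches the longer image side divided by the 224-pixel model resolution. The snippet below is illustrative only; it assumes that ratio is how vit_image_preprocess derives the scale, and it is not code from the repository.

```python
# Illustrative arithmetic for the README example, not repository code.
# Assumption: scale = longer input side / model input resolution.
img_w, img_h = 500, 470  # "loaded image '../assets/magpie.jpeg' (500 x 470)"
model_res = 224          # vit_model_load: img_size = 224

scale = max(img_w, img_h) / model_res
print(f"scale = {scale:.6f}")  # prints 2.232143, matching vit_image_preprocess
```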

quantize.cpp (+236 -1)

@@ -1 +1,236 @@
-// add simple quantization strategies
+// add simple quantization strategies
+
+#include "ggml/ggml.h"
+#include "ggml/ggml-alloc.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include <thread>
+#include <cinttypes>
+#include <algorithm>
+#include <regex> // required for std::regex / std::regex_match used below
+
+#if defined(_MSC_VER)
+#pragma warning(disable : 4244 4267) // possible loss of data
+#endif
+
+bool vit_model_quantize(const char *fname_inp, const char *fname_out, const int itype)
+{
+
+    ggml_type type = GGML_TYPE_Q4_1;
+
+    switch (itype)
+    {
+    case 2:
+        type = GGML_TYPE_Q4_0;
+        break;
+    case 3:
+        type = GGML_TYPE_Q4_1;
+        break;
+    case 6:
+        type = GGML_TYPE_Q5_0;
+        break;
+    case 7:
+        type = GGML_TYPE_Q5_1;
+        break;
+    case 8:
+        type = GGML_TYPE_Q8_0;
+        break;
+    default:
+        fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
+        return false;
+    };
+
+    auto ctx_clip = vit_model_load(fname_inp, 2);
+    const auto &ctx_src = ctx_clip->ctx_gguf;
+    const auto &ctx_data = ctx_clip->ctx;
+
+    auto ctx_out = gguf_init_empty();
+    gguf_set_kv(ctx_out, ctx_src);
+    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
+    gguf_set_val_u32(ctx_out, "general.file_type", itype);
+
+    auto fout = std::ofstream(fname_out, std::ios::binary);
+
+    const int n_tensors = gguf_get_n_tensors(ctx_src);
+
+    for (int i = 0; i < n_tensors; ++i)
+    {
+        const char *name = gguf_get_tensor_name(ctx_src, i);
+        struct ggml_tensor *cur = ggml_get_tensor(ctx_data, name);
+        gguf_add_tensor(ctx_out, cur);
+    }
+
+    const size_t meta_size = gguf_get_meta_size(ctx_out);
+    for (size_t i = 0; i < meta_size; ++i)
+    {
+        fout.put(0);
+    }
+
+    // regexes of tensor names to be quantized
+    const std::vector<std::string> k_names = {
+        ".*weight",
+    };
+
+    std::vector<uint8_t> read_data(512);
+    std::vector<uint8_t> work(512);
+    std::vector<float> conv_buf(512);
+    std::vector<int64_t> hist_all(1 << 4, 0);
+    size_t total_size_org = 0;
+    size_t total_size_new = 0;
+
+    for (int i = 0; i < n_tensors; ++i)
+    {
+        const std::string name = gguf_get_tensor_name(ctx_src, i);
+        struct ggml_tensor *cur = ggml_get_tensor(ctx_data, name.c_str());
+
+        enum ggml_type new_type;
+        void *new_data;
+        size_t new_size;
+
+        bool quantize = false;
+        for (const auto &s : k_names)
+        {
+            if (std::regex_match(name, std::regex(s)))
+            {
+                quantize = true;
+                break;
+            }
+        }
+
+        // quantize only 2D tensors
+        quantize &= (cur->n_dims == 2);
+
+        if (quantize)
+        {
+            new_type = type;
+            const size_t n_elms = ggml_nelements(cur);
+            float *f32_data;
+
+            switch (cur->type)
+            {
+            case GGML_TYPE_F32:
+                f32_data = (float *)cur->data;
+                break;
+            case GGML_TYPE_F16:
+                if (conv_buf.size() < n_elms)
+                {
+                    conv_buf.resize(n_elms);
+                }
+                for (int j = 0; j < n_elms; ++j)
+                {
+                    conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]);
+                }
+                f32_data = (float *)conv_buf.data();
+                break;
+            default:
+                printf("Please use an input file in f32 or f16\n");
+                return false;
+            }
+
+            if (work.size() < n_elms * 4)
+            {
+                work.resize(n_elms * 4);
+            }
+            new_data = work.data();
+
+            std::vector<int64_t> hist_cur(1 << 4, 0);
+
+            switch (new_type)
+            {
+            case GGML_TYPE_Q4_0:
+            {
+                new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+            }
+            break;
+            case GGML_TYPE_Q4_1:
+            {
+                new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+            }
+            break;
+            case GGML_TYPE_Q5_0:
+            {
+                new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+            }
+            break;
+            case GGML_TYPE_Q5_1:
+            {
+                new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+            }
+            break;
+            case GGML_TYPE_Q8_0:
+            {
+                new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+            }
+            break;
+            default:
+            {
+                fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
+                return false;
+            }
+            }
+
+            for (int j = 0; j < hist_cur.size(); ++j)
+            {
+                hist_all[j] += hist_cur[j];
+            }
+        }
+        else
+        {
+            new_type = cur->type;
+            new_data = cur->data;
+            new_size = ggml_nbytes(cur);
+        }
+        const size_t orig_size = ggml_nbytes(cur);
+        total_size_org += orig_size;
+        total_size_new += new_size;
+        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+        fout.write((const char *)new_data, new_size);
+        size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
+        for (int j = 0; j < pad; ++j)
+        {
+            fout.put(0);
+        }
+
+        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize,
+               orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+    }
+
+    // go back to beginning of file and write the updated metadata
+    fout.seekp(0, std::ios::beg);
+    std::vector<uint8_t> meta(meta_size);
+    gguf_get_meta_data(ctx_out, meta.data());
+    fout.write((const char *)meta.data(), meta_size);
+
+    fout.close();
+
+    clip_free(ctx_clip);
+    gguf_free(ctx_out);
+
+    {
+        printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+
+        int64_t sum_all = 0;
+        for (size_t i = 0; i < hist_all.size(); ++i)
+        {
+            sum_all += hist_all[i];
+        }
+
+        printf("%s: hist: ", __func__);
+        for (size_t i = 0; i < hist_all.size(); ++i)
+        {
+            printf("%5.3f ", hist_all[i] / (float)sum_all);
+        }
+        printf("\n");
+    }
+
+    return true;
+}
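
To summarize the flow above: the quantizer maps the numeric itype to a ggml type (2 -> Q4_0, 3 -> Q4_1, 6 -> Q5_0, 7 -> Q5_1, 8 -> Q8_0), quantizes only 2-D tensors whose names match ".*weight", and pads every written tensor to the GGUF alignment before seeking back to write the metadata. The Python sketch below mirrors just the selection and padding logic for illustration; the helper names are made up, and the 32-byte alignment is an assumption rather than a value read from the file as the C++ does with gguf_get_alignment.

```python
import re

# itype -> ggml quantization type, mirroring the switch in quantize.cpp
ITYPE_TO_QTYPE = {2: "Q4_0", 3: "Q4_1", 6: "Q5_0", 7: "Q5_1", 8: "Q8_0"}

# tensor-name regexes that are eligible for quantization
QUANTIZE_PATTERNS = [r".*weight"]


def should_quantize(name: str, n_dims: int) -> bool:
    """Same filter as the C++ loop: name must match a pattern and the tensor must be 2-D."""
    return n_dims == 2 and any(re.fullmatch(p, name) for p in QUANTIZE_PATTERNS)


def padding_bytes(size: int, alignment: int = 32) -> int:
    """Zero bytes written after a tensor, i.e. GGML_PAD(size, alignment) - size."""
    return (alignment - size % alignment) % alignment


# tiny usage example with made-up tensor names and shapes
for name, n_dims in [("blocks.0.attn.qkv.weight", 2), ("blocks.0.attn.qkv.bias", 1)]:
    action = ITYPE_TO_QTYPE[2] if should_quantize(name, n_dims) else "copy as-is"
    print(f"{name}: {action}")
print(f"padding after a 150-byte tensor: {padding_bytes(150)} bytes")  # -> 10
```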

scripts/benchmark.py (+28 -20)

@@ -1,20 +1,24 @@
+import time
+
+import timm
 import torch
 import torchvision.transforms as transforms
+from memory_profiler import memory_usage
 from PIL import Image
-import timm
-import time
 from threadpoolctl import threadpool_limits
-from memory_profiler import memory_usage
+
 
 def process_and_predict(image_path, model_path):
     model = timm.create_model(model_path, pretrained=True)
-    preprocess = transforms.Compose([
-        transforms.Resize((224, 224)),
-        transforms.ToTensor(),
-        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-    ])
+    preprocess = transforms.Compose(
+        [
+            transforms.Resize((224, 224)),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+        ]
+    )
 
-    image = Image.open(image_path).convert('RGB')
+    image = Image.open(image_path).convert("RGB")
     image = preprocess(image)
     image = image.unsqueeze(0)
 
@@ -24,21 +28,24 @@ def process_and_predict(image_path, model_path):
 
     return probabilities
 
+
 def benchmark_model(image_path, model_name, N=10):
     times = []
     peak_memory_usages = []
 
     for _ in range(N):
         start_time = time.time()
-
+
         # Measure peak memory usage
-        peak_memory_usage = memory_usage((process_and_predict, (image_path, model_name)),
-                                         interval=0.01,
-                                         max_usage=True,
-                                         include_children=True)
+        peak_memory_usage = memory_usage(
+            (process_and_predict, (image_path, model_name)),
+            interval=0.01,
+            max_usage=True,
+            include_children=True,
+        )
 
         end_time = time.time()
-
+
         time_taken = end_time - start_time
         times.append(time_taken)
         peak_memory_usages.append(peak_memory_usage)
@@ -47,12 +54,13 @@ def benchmark_model(image_path, model_name, N=10):
     max_peak_memory = sum(peak_memory_usages) / N
     return avg_time, max_peak_memory
 
+
 # model variants
 model_variants = {
-    'tiny': 'vit_tiny_patch16_224.augreg_in21k_ft_in1k',
-    'small': 'vit_small_patch16_224.augreg_in21k_ft_in1k',
-    'base': 'vit_base_patch16_224.augreg_in21k_ft_in1k',
-    'large': 'vit_large_patch16_224.augreg_in21k_ft_in1k'
+    "tiny": "vit_tiny_patch16_224.augreg_in21k_ft_in1k",
+    "small": "vit_small_patch16_224.augreg_in21k_ft_in1k",
+    "base": "vit_base_patch16_224.augreg_in21k_ft_in1k",
+    "large": "vit_large_patch16_224.augreg_in21k_ft_in1k",
 }
 
 # an image
@@ -65,4 +73,4 @@ def benchmark_model(image_path, model_name, N=10):
 
 for name, model_name in model_variants.items():
     avg_time, peak_memory = benchmark_model(image_path, model_name)
-    print(f"| {name} | {avg_time:.0f} | {peak_memory:.0f} |")
+    print(f"| {name} | {avg_time:.0f} | {peak_memory:.0f} |")
