+ // add simple quantization strategies
+
+ #include "ggml/ggml.h"
+ #include "ggml/ggml-alloc.h"
+
+ #include <cassert>
+ #include <cmath>
+ #include <cstddef>
+ #include <cstdio>
+ #include <cstring>
+ #include <fstream>
+ #include <map>
+ #include <regex> // needed for std::regex / std::regex_match below
+ #include <string>
+ #include <vector>
+ #include <thread>
+ #include <cinttypes>
+ #include <algorithm>
+
+ #if defined(_MSC_VER)
+ #pragma warning(disable : 4244 4267) // possible loss of data
+ #endif
+
+ bool vit_model_quantize(const char *fname_inp, const char *fname_out, const int itype)
+ {
+     ggml_type type = GGML_TYPE_Q4_1;
+
+     switch (itype)
+     {
+     case 2:
+         type = GGML_TYPE_Q4_0;
+         break;
+     case 3:
+         type = GGML_TYPE_Q4_1;
+         break;
+     case 6:
+         type = GGML_TYPE_Q5_0;
+         break;
+     case 7:
+         type = GGML_TYPE_Q5_1;
+         break;
+     case 8:
+         type = GGML_TYPE_Q8_0;
+         break;
+     default:
+         fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
+         return false;
+     }
+
+     auto ctx_clip = vit_model_load(fname_inp, 2);
+     const auto &ctx_src = ctx_clip->ctx_gguf;
+     const auto &ctx_data = ctx_clip->ctx;
+
+     auto ctx_out = gguf_init_empty();
+     gguf_set_kv(ctx_out, ctx_src);
+     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
+     gguf_set_val_u32(ctx_out, "general.file_type", itype);
+
+     auto fout = std::ofstream(fname_out, std::ios::binary);
+
+     const int n_tensors = gguf_get_n_tensors(ctx_src);
+
+     for (int i = 0; i < n_tensors; ++i)
+     {
+         const char *name = gguf_get_tensor_name(ctx_src, i);
+         struct ggml_tensor *cur = ggml_get_tensor(ctx_data, name);
+         gguf_add_tensor(ctx_out, cur);
+     }
+
+     // reserve space for the metadata: the final tensor types and sizes are only
+     // known after quantization, so zeros are written now and patched at the end
+     const size_t meta_size = gguf_get_meta_size(ctx_out);
+     for (size_t i = 0; i < meta_size; ++i)
+     {
+         fout.put(0);
+     }
+
+     // regexes of tensor names to be quantized
+     const std::vector<std::string> k_names = {
+         ".*weight",
+     };
+
+     std::vector<uint8_t> read_data(512);
+     std::vector<uint8_t> work(512);
+     std::vector<float> conv_buf(512);
+     std::vector<int64_t> hist_all(1 << 4, 0); // 16 bins, one per 4-bit quant value
+     size_t total_size_org = 0;
+     size_t total_size_new = 0;
+
+     for (int i = 0; i < n_tensors; ++i)
+     {
+         const std::string name = gguf_get_tensor_name(ctx_src, i);
+         struct ggml_tensor *cur = ggml_get_tensor(ctx_data, name.c_str());
+
+         enum ggml_type new_type;
+         void *new_data;
+         size_t new_size;
+
+         bool quantize = false;
+         for (const auto &s : k_names)
+         {
+             if (std::regex_match(name, std::regex(s)))
+             {
+                 quantize = true;
+                 break;
+             }
+         }
+
+         // quantize only 2D tensors
+         quantize &= (cur->n_dims == 2);
+
+         if (quantize)
+         {
+             new_type = type;
+             const size_t n_elms = ggml_nelements(cur);
+             float *f32_data;
+
+             // quantization expects f32 input: use the data directly, or convert f16 first
+             switch (cur->type)
+             {
+             case GGML_TYPE_F32:
+                 f32_data = (float *)cur->data;
+                 break;
+             case GGML_TYPE_F16:
+                 if (conv_buf.size() < n_elms)
+                 {
+                     conv_buf.resize(n_elms);
+                 }
+                 for (size_t j = 0; j < n_elms; ++j)
+                 {
+                     conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]);
+                 }
+                 f32_data = (float *)conv_buf.data();
+                 break;
+             default:
+                 printf("Please use an input file in f32 or f16\n");
+                 return false;
+             }
+
+             if (work.size() < n_elms * 4)
+             {
+                 work.resize(n_elms * 4);
+             }
+             new_data = work.data();
+
+             std::vector<int64_t> hist_cur(1 << 4, 0);
+
+             switch (new_type)
+             {
+             case GGML_TYPE_Q4_0:
+             {
+                 new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+             }
+             break;
+             case GGML_TYPE_Q4_1:
+             {
+                 new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+             }
+             break;
+             case GGML_TYPE_Q5_0:
+             {
+                 new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+             }
+             break;
+             case GGML_TYPE_Q5_1:
+             {
+                 new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+             }
+             break;
+             case GGML_TYPE_Q8_0:
+             {
+                 new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+             }
+             break;
+             default:
+             {
+                 fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
+                 return false;
+             }
+             }
+
+             for (size_t j = 0; j < hist_cur.size(); ++j)
+             {
+                 hist_all[j] += hist_cur[j];
+             }
+         }
+         else
+         {
+             new_type = cur->type;
+             new_data = cur->data;
+             new_size = ggml_nbytes(cur);
+         }
+         const size_t orig_size = ggml_nbytes(cur);
+         total_size_org += orig_size;
+         total_size_new += new_size;
+         gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
+         gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+         fout.write((const char *)new_data, new_size);
+         size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
+         for (size_t j = 0; j < pad; ++j)
+         {
+             fout.put(0);
+         }
+
+         printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize,
+                orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+     }
+
+     // go back to beginning of file and write the updated metadata
+     fout.seekp(0, std::ios::beg);
+     std::vector<uint8_t> meta(meta_size);
+     gguf_get_meta_data(ctx_out, meta.data());
+     fout.write((const char *)meta.data(), meta_size);
+
+     fout.close();
+
+     clip_free(ctx_clip);
+     gguf_free(ctx_out);
+
+     {
+         printf("%s: original size  = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+         printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+
+         int64_t sum_all = 0;
+         for (size_t i = 0; i < hist_all.size(); ++i)
+         {
+             sum_all += hist_all[i];
+         }
+
+         printf("%s: hist: ", __func__);
+         for (size_t i = 0; i < hist_all.size(); ++i)
+         {
+             printf("%5.3f ", hist_all[i] / (float)sum_all);
+         }
+         printf("\n");
+     }
+
+     return true;
+ }
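
For context, a function like this is typically driven by a small command-line tool. The sketch below is illustrative only, not part of this commit: it assumes the surrounding project declares vit_model_quantize as above, and the itype argument follows the mapping from the switch at the top (2 = q4_0, 3 = q4_1, 6 = q5_0, 7 = q5_1, 8 = q8_0).

    // hypothetical usage sketch, not part of this commit
    #include <cstdio>
    #include <string>

    bool vit_model_quantize(const char *fname_inp, const char *fname_out, const int itype);

    int main(int argc, char **argv)
    {
        if (argc != 4)
        {
            fprintf(stderr, "usage: %s model-f16.gguf model-quant.gguf itype\n", argv[0]);
            return 1;
        }
        const int itype = std::stoi(argv[3]);
        return vit_model_quantize(argv[1], argv[2], itype) ? 0 : 1;
    }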