+ // add simple quantization strategies
+
+ #include "ggml/ggml.h"
+ #include "ggml/ggml-alloc.h"
+
+ #include <cassert>
+ #include <cmath>
+ #include <cstddef>
+ #include <cstdio>
+ #include <cstring>
+ #include <fstream>
+ #include <map>
+ #include <regex> // needed for std::regex / std::regex_match below
+ #include <string>
+ #include <vector>
+ #include <thread>
+ #include <cinttypes>
+ #include <algorithm>
+
+ #if defined(_MSC_VER)
+ #pragma warning(disable : 4244 4267) // possible loss of data
+ #endif
+
+ bool vit_model_quantize(const char *fname_inp, const char *fname_out, const int itype)
+ {
+     ggml_type type = GGML_TYPE_Q4_1;
+
+     switch (itype)
+     {
+     case 2:
+         type = GGML_TYPE_Q4_0;
+         break;
+     case 3:
+         type = GGML_TYPE_Q4_1;
+         break;
+     case 6:
+         type = GGML_TYPE_Q5_0;
+         break;
+     case 7:
+         type = GGML_TYPE_Q5_1;
+         break;
+     case 8:
+         type = GGML_TYPE_Q8_0;
+         break;
+     default:
+         fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
+         return false;
+     }
+
+     auto ctx_clip = vit_model_load(fname_inp, 2);
+     const auto &ctx_src = ctx_clip->ctx_gguf;
+     const auto &ctx_data = ctx_clip->ctx;
+
+     auto ctx_out = gguf_init_empty();
+     gguf_set_kv(ctx_out, ctx_src);
+     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
+     gguf_set_val_u32(ctx_out, "general.file_type", itype);
+
+     auto fout = std::ofstream(fname_out, std::ios::binary);
+
+     const int n_tensors = gguf_get_n_tensors(ctx_src);
+
+     for (int i = 0; i < n_tensors; ++i)
+     {
+         const char *name = gguf_get_tensor_name(ctx_src, i);
+         struct ggml_tensor *cur = ggml_get_tensor(ctx_data, name);
+         gguf_add_tensor(ctx_out, cur);
+     }
+
+     // reserve space for the metadata: the final tensor types and sizes are only
+     // known after quantization, so zeros are written now and patched at the end
+     const size_t meta_size = gguf_get_meta_size(ctx_out);
+     for (size_t i = 0; i < meta_size; ++i)
+     {
+         fout.put(0);
+     }
+
+     // regexes of tensor names to be quantized
+     const std::vector<std::string> k_names = {
+         ".*weight",
+     };
+
+     std::vector<uint8_t> read_data(512);
+     std::vector<uint8_t> work(512);
+     std::vector<float> conv_buf(512);
+     std::vector<int64_t> hist_all(1 << 4, 0); // 16 bins, one per 4-bit quant value
+     size_t total_size_org = 0;
+     size_t total_size_new = 0;
+
+     for (int i = 0; i < n_tensors; ++i)
+     {
+         const std::string name = gguf_get_tensor_name(ctx_src, i);
+         struct ggml_tensor *cur = ggml_get_tensor(ctx_data, name.c_str());
+
+         enum ggml_type new_type;
+         void *new_data;
+         size_t new_size;
+
+         bool quantize = false;
+         for (const auto &s : k_names)
+         {
+             if (std::regex_match(name, std::regex(s)))
+             {
+                 quantize = true;
+                 break;
+             }
+         }
+
+         // quantize only 2D tensors
+         quantize &= (cur->n_dims == 2);
+
+         if (quantize)
+         {
+             new_type = type;
+             const size_t n_elms = ggml_nelements(cur);
+             float *f32_data;
+
+             // quantization expects f32 input: use the data directly, or convert f16 first
+             switch (cur->type)
+             {
+             case GGML_TYPE_F32:
+                 f32_data = (float *)cur->data;
+                 break;
+             case GGML_TYPE_F16:
+                 if (conv_buf.size() < n_elms)
+                 {
+                     conv_buf.resize(n_elms);
+                 }
+                 for (size_t j = 0; j < n_elms; ++j)
+                 {
+                     conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]);
+                 }
+                 f32_data = (float *)conv_buf.data();
+                 break;
+             default:
+                 printf("Please use an input file in f32 or f16\n");
+                 return false;
+             }
+
+             if (work.size() < n_elms * 4)
+             {
+                 work.resize(n_elms * 4);
+             }
+             new_data = work.data();
+
+             std::vector<int64_t> hist_cur(1 << 4, 0);
+
+             switch (new_type)
+             {
+             case GGML_TYPE_Q4_0:
+             {
+                 new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+             }
+             break;
+             case GGML_TYPE_Q4_1:
+             {
+                 new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+             }
+             break;
+             case GGML_TYPE_Q5_0:
+             {
+                 new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+             }
+             break;
+             case GGML_TYPE_Q5_1:
+             {
+                 new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+             }
+             break;
+             case GGML_TYPE_Q8_0:
+             {
+                 new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+             }
+             break;
+             default:
+             {
+                 fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
+                 return false;
+             }
+             }
+
+             for (size_t j = 0; j < hist_cur.size(); ++j)
+             {
+                 hist_all[j] += hist_cur[j];
+             }
+         }
+         else
+         {
+             new_type = cur->type;
+             new_data = cur->data;
+             new_size = ggml_nbytes(cur);
+         }
+         const size_t orig_size = ggml_nbytes(cur);
+         total_size_org += orig_size;
+         total_size_new += new_size;
+         gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
+         gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+         fout.write((const char *)new_data, new_size);
+         size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
+         for (size_t j = 0; j < pad; ++j)
+         {
+             fout.put(0);
+         }
+
+         printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize,
+                orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+     }
+
+     // go back to beginning of file and write the updated metadata
+     fout.seekp(0, std::ios::beg);
+     std::vector<uint8_t> meta(meta_size);
+     gguf_get_meta_data(ctx_out, meta.data());
+     fout.write((const char *)meta.data(), meta_size);
+
+     fout.close();
+
+     clip_free(ctx_clip);
+     gguf_free(ctx_out);
+
+     {
+         printf("%s: original size  = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+         printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+
+         int64_t sum_all = 0;
+         for (size_t i = 0; i < hist_all.size(); ++i)
+         {
+             sum_all += hist_all[i];
+         }
+
+         printf("%s: hist: ", __func__);
+         for (size_t i = 0; i < hist_all.size(); ++i)
+         {
+             printf("%5.3f ", hist_all[i] / (float)sum_all);
+         }
+         printf("\n");
+     }
+
+     return true;
+ }
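
For context, a function like this is typically driven by a small command-line tool. The sketch below is illustrative only, not part of this commit: it assumes the surrounding project declares vit_model_quantize as above, and the itype argument follows the mapping from the switch at the top (2 = q4_0, 3 = q4_1, 6 = q5_0, 7 = q5_1, 8 = q8_0).

    // hypothetical usage sketch, not part of this commit
    #include <cstdio>
    #include <string>

    bool vit_model_quantize(const char *fname_inp, const char *fname_out, const int itype);

    int main(int argc, char **argv)
    {
        if (argc != 4)
        {
            fprintf(stderr, "usage: %s model-f16.gguf model-quant.gguf itype\n", argv[0]);
            return 1;
        }
        const int itype = std::stoi(argv[3]);
        return vit_model_quantize(argv[1], argv[2], itype) ? 0 : 1;
    }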