Merge branch 'master' into HEAD
ggerganov committed Jan 29, 2024
2 parents 21815c6 + 53558f9 commit 3296ebd
Showing 51 changed files with 28,151 additions and 4,606 deletions.
11 changes: 6 additions & 5 deletions CMakeLists.txt
@@ -105,11 +105,12 @@ if (GGML_ALL_WARNINGS)
endif()

if (NOT MSVC)
add_compile_options(
"$<$<COMPILE_LANGUAGE:C>:-Werror=vla>"
"$<$<COMPILE_LANGUAGE:CXX>:-Werror=vla>"
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler;-Werror=vla>"
)
# TODO: temporary disabled until we figure out ggml-metal.m
#add_compile_options(
# "$<$<COMPILE_LANGUAGE:C>:-Werror=vla>"
# "$<$<COMPILE_LANGUAGE:CXX>:-Werror=vla>"
# "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler;-Werror=vla>"
#)
endif()

#
2 changes: 1 addition & 1 deletion Package.swift
@@ -28,7 +28,7 @@ let package = Package(
resources: [
.process("src/ggml-metal.metal")
],
publicHeadersPath: "include/ggml",
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.define("GGML_USE_ACCELERATE"),
59 changes: 59 additions & 0 deletions README.md
@@ -49,6 +49,7 @@ Some of the development is currently happening in the [llama.cpp](https://github
- [X] Example of Qwen inference [QwenLM/qwen.cpp](https://github.com/QwenLM/qwen.cpp)
- [X] Example of YOLO inference [examples/yolo](https://github.com/ggerganov/ggml/tree/master/examples/yolo)
- [X] Example of ViT inference [staghado/vit.cpp](https://github.com/staghado/vit.cpp)
- [X] Example of multiple LLMs inference [foldl/chatllm.cpp](https://github.com/foldl/chatllm.cpp)
- [X] SeamlessM4T inference *(in development)* https://github.com/facebookresearch/seamless_communication/tree/main/ggml

## Whisper inference (example)
@@ -171,6 +172,64 @@ export LD_LIBRARY_PATH=/data/local/tmp
./bin/gpt-2-backend -m models/ggml-model.bin -p "this is an example"
```

### CLBlast for Android

Build CLBlast.

```bash
# In CLBlast/build
$ANDROID_SDK_PATH/cmake/3.22.1/bin/cmake .. \
-DCMAKE_SYSTEM_NAME=Android \
-DCMAKE_SYSTEM_VERSION=33 \
-DCMAKE_ANDROID_ARCH_ABI=arm64-v8a \
-DCMAKE_ANDROID_NDK=$ANDROID_NDK_PATH \
-DCMAKE_ANDROID_STL_TYPE=c++_static \
-DOPENCL_ROOT=$(readlink -f ../../OpenCL-Headers) \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

# Build libclblast.so
make -j4
```

Pull `libGLES_mali.so` from the device and use it as `libOpenCL.so`.

```bash
# In ggml project root.
mkdir arm64-v8a
adb pull /system/vendor/lib64/egl/libGLES_mali.so arm64-v8a/libOpenCL.so
```

Build ggml with CLBlast.

```bash
# In ggml/build
cd build
$ANDROID_SDK_PATH/cmake/3.22.1/bin/cmake .. \
-DGGML_CLBLAST=ON \
-DCMAKE_SYSTEM_NAME=Android \
-DCMAKE_SYSTEM_VERSION=33 \
-DCMAKE_ANDROID_ARCH_ABI=arm64-v8a \
-DCMAKE_ANDROID_NDK=$ANDROID_NDK_PATH \
-DCMAKE_ANDROID_STL_TYPE=c++_shared \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \
-DCLBLAST_HOME=$(readlink -f ../../CLBlast) \
-DOPENCL_LIB=$(readlink -f ../arm64-v8a/libOpenCL.so)

# Run make, adb push, etc.
```

Then in `adb shell`...

```bash
cd /data/local/tmp
export LD_LIBRARY_PATH=/system/vendor/lib64/egl:/data/local/tmp
./bin/gpt-2-backend -m models/ggml-model.bin -n 64 -p "Pepperoni pizza"
```

OpenCL does not have the same level of support in `ggml-backend` as CUDA or Metal. In the `gpt-2-backend` example, OpenCL will only be used for the matrix multiplications when evaluating large prompts.

## Resources

- [GGML - Large Language Models for Everyone](https://github.com/rustformers/llm/blob/main/crates/ggml/README.md): a description of the GGML format provided by the maintainers of the `llm` Rust crate, which provides Rust bindings for GGML
6 changes: 5 additions & 1 deletion examples/common-ggml.cpp
@@ -62,6 +62,8 @@ bool ggml_common_quantize_0(
case GGML_FTYPE_ALL_F32:
case GGML_FTYPE_MOSTLY_F16:
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
case GGML_FTYPE_MOSTLY_IQ2_XXS:
case GGML_FTYPE_MOSTLY_IQ2_XS:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
@@ -182,7 +184,7 @@ bool ggml_common_quantize_0(
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
{
cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
@@ -191,6 +193,8 @@
case GGML_TYPE_I32:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
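
The `ggml_quantize_chunk` call above now splits the element count into a row count and a row length, and gains a trailing pointer argument (passed as `nullptr` here). Below is a minimal sketch of the new call pattern; the parameter roles are inferred from this diff rather than from `ggml.h`, and the trailing argument is assumed to be an optional importance matrix.

```cpp
#include <cstdint>
#include <vector>

#include "ggml.h"

// Hypothetical helper illustrating the call pattern above. Assumed signature
// (inferred from the call site, not from ggml.h):
//   ggml_quantize_chunk(type, src, dst, start, nrows, n_per_row, hist, imatrix)
static size_t quantize_tensor_2d(ggml_type ttype,
                                 const std::vector<float> & data_f32, // ne[0]*ne[1] values
                                 std::vector<char>        & work,     // destination buffer
                                 const int64_t              ne[2],    // ne[0] = row length
                                 std::vector<int64_t>     & hist) {
    const int64_t nelements = ne[0] * ne[1];

    // Row count and row length are now passed separately; the trailing pointer
    // (assumed to be an optional importance matrix) is left empty.
    return ggml_quantize_chunk(ttype, data_f32.data(), work.data(),
                               /*start*/ 0,
                               /*nrows*/ nelements / ne[0],
                               /*n_per_row*/ ne[0],
                               hist.data(), /*imatrix*/ nullptr);
}
```
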
6 changes: 6 additions & 0 deletions examples/common.cpp
@@ -639,6 +639,12 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector

fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
}
else if (fname.size() > 256 && (fname.substr(0, 4) == "RIFF" || fname.substr(8, 4) == "WAVE")) {
if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
return false;
}
}
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
return false;
1 change: 1 addition & 0 deletions examples/common.h
@@ -136,6 +136,7 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
//

// Read WAV audio file and store the PCM data into pcmf32
// fname can be a buffer of WAV data instead of a filename
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
bool read_wav(
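
The comment above documents that `read_wav` now also accepts a buffer of WAV bytes in place of a filename. A hedged usage sketch follows; it assumes the full parameter list matches the declaration excerpted in the `common.cpp` hunk header (`pcmf32`, `pcmf32s`, and a stereo flag), and slurping a file into memory is only one way to obtain such a buffer.

```cpp
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

#include "common.h"

// Hypothetical caller exercising the new in-memory path of read_wav().
// Assumed signature:
//   bool read_wav(const std::string & fname, std::vector<float> & pcmf32,
//                 std::vector<std::vector<float>> & pcmf32s, bool stereo);
static bool load_pcm_from_memory(const std::string & path, std::vector<float> & pcmf32) {
    // Slurp the whole WAV file into a string; a valid file starts with "RIFF....WAVE".
    std::ifstream f(path, std::ios::binary);
    std::ostringstream ss;
    ss << f.rdbuf();
    const std::string wav_bytes = ss.str();

    // Passing the raw bytes instead of a filename takes the buffer branch added
    // in common.cpp (size > 256 plus a RIFF/WAVE header).
    std::vector<std::vector<float>> pcmf32s;
    return read_wav(wav_bytes, pcmf32, pcmf32s, /*stereo=*/ false);
}
```
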
2 changes: 1 addition & 1 deletion examples/gpt-2/main-backend.cpp
@@ -209,7 +209,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
#ifdef GGML_USE_METAL
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: using Metal backend\n", __func__);
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
model.backend = ggml_backend_metal_init();
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
2 changes: 1 addition & 1 deletion examples/gpt-2/main-batched.cpp
@@ -298,7 +298,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
#ifdef GGML_USE_METAL
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: using Metal backend\n", __func__);
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
model.backend = ggml_backend_metal_init();
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
4 changes: 2 additions & 2 deletions examples/gpt-2/main.cpp
@@ -118,7 +118,7 @@ void init_backends(gpt2_model & model, const gpt_params & params) {
#ifdef GGML_USE_METAL
if (params.n_gpu_layers > 0) {
fprintf(stderr, "%s: using Metal backend\n", __func__);
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
gpu_backend = ggml_backend_metal_init();
if (!gpu_backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
@@ -947,7 +947,7 @@ int main(int argc, char ** argv) {
ggml_backend_sched_t sched;
{
// initialize the scheduler
sched = ggml_backend_sched_new(model.backends.data(), model.backends.size());
sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES);

// create the worst case graph for memory usage estimation
int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
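
The scheduler construction in `main.cpp` gains two arguments. For context, a sketch of the updated call is shown below; the roles of the new arguments (a per-backend buffer-type array, here `NULL`, and a graph-size bound such as `GPT2_MAX_NODES`) are inferred from the call site rather than from `ggml-backend.h`, and the constant's value is a placeholder.

```cpp
#include <vector>

#include "ggml-backend.h"

// Placeholder value; the gpt-2 example defines its own constant.
#define GPT2_MAX_NODES 4096

static ggml_backend_sched_t make_sched(std::vector<ggml_backend_t> & backends) {
    // Old call:  ggml_backend_sched_new(backends.data(), backends.size());
    // New call:  adds a buffer-type array (NULL -> each backend's default) and
    //            an upper bound on the number of graph nodes to plan for.
    return ggml_backend_sched_new(backends.data(), NULL,
                                  backends.size(), GPT2_MAX_NODES);
}
```
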
7 changes: 6 additions & 1 deletion examples/mnist/README.md
@@ -3,11 +3,16 @@
These are simple examples of how to use GGML for inference.
The first example uses a convolutional neural network (CNN); the second uses a fully connected neural network.

## Building the examples
## Set up the Python environment and build the examples

```bash
git clone https://github.com/ggerganov/ggml
cd ggml
# Install python dependencies in a virtual environment
python3 -m venv ggml_env
source ./ggml_env/bin/activate
pip install -r requirements.txt
# Build the examples
mkdir build && cd build
cmake ..
make -j4 mnist-cnn mnist
23 changes: 2 additions & 21 deletions examples/python/ggml/__init__.pyi
@@ -506,15 +506,6 @@ class lib:
struct ggml_tensor * a);
"""
...
def ggml_cont_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
"""
make contiguous, in-place
GGML_API struct ggml_tensor * ggml_cont_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
"""
...
def ggml_conv_1d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s0: int, p0: int, d0: int) -> ffi.CData:
"""
GGML_API struct ggml_tensor * ggml_conv_1d(
@@ -614,16 +605,6 @@ class lib:
struct ggml_tensor * b);
"""
...
def ggml_cpy_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
"""
a -> b, in-place, return view(b)
GGML_API struct ggml_tensor * ggml_cpy_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
"""
...
def ggml_cross_entropy_loss(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
"""
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
@@ -1202,7 +1183,7 @@ class lib:
- you don't need to keep the host memory buffer allocated as it is never accessed by Metal
- max_size specifies the maximum size of a tensor and is used to create shared views such
that it is guaranteed that the tensor will fit in at least one of the views
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
@@ -2428,4 +2409,4 @@ class lib:
...
def quantize_row_q8_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None:
"""void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);"""
...
2 changes: 1 addition & 1 deletion examples/python/ggml/cffi.py

Large diffs are not rendered by default.

7 changes: 0 additions & 7 deletions examples/starcoder/CMakeLists.txt
@@ -5,13 +5,6 @@ set(TEST_TARGET starcoder)
add_executable(${TEST_TARGET} main.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

#
# starcoder-mmap

set(TEST_TARGET starcoder-mmap)
add_executable(${TEST_TARGET} starcoder-mmap.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

#
# starcoder-quantize

4 changes: 2 additions & 2 deletions examples/starcoder/README.md
@@ -34,7 +34,7 @@ options:
-m FNAME, --model FNAME
model path (default: models/starcoder-117M/ggml-model.bin)
$ ./bin/starcoder -m ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin -p "def fibonnaci(" -t 4 --top_k 0 --top_p 0.95 --temp 0.2
main: seed = 1683881276
starcoder_model_load: loading model from '../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin'
starcoder_model_load: n_vocab = 49280
@@ -47,7 +47,7 @@ starcoder_model_load: ggml ctx size = 1794.90 MB
starcoder_model_load: memory size = 768.00 MB, n_mem = 49152
starcoder_model_load: model size = 1026.83 MB
main: prompt: 'def fibonnaci('
main: number of tokens in prompt = 7, first 8 tokens: 563 24240 78 2658 64 2819 7
def fibonnaci(n):
if n == 0: