Merge branch 'master' into HEAD
ggerganov committed Jan 29, 2024
2 parents 21815c6 + 53558f9 commit 3296ebd
Showing 51 changed files with 28,151 additions and 4,606 deletions.
11 changes: 6 additions & 5 deletions CMakeLists.txt
@@ -105,11 +105,12 @@ if (GGML_ALL_WARNINGS)
endif()

if (NOT MSVC)
add_compile_options(
"$<$<COMPILE_LANGUAGE:C>:-Werror=vla>"
"$<$<COMPILE_LANGUAGE:CXX>:-Werror=vla>"
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler;-Werror=vla>"
)
# TODO: temporary disabled until we figure out ggml-metal.m
#add_compile_options(
# "$<$<COMPILE_LANGUAGE:C>:-Werror=vla>"
# "$<$<COMPILE_LANGUAGE:CXX>:-Werror=vla>"
# "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler;-Werror=vla>"
#)
endif()

#
2 changes: 1 addition & 1 deletion Package.swift
@@ -28,7 +28,7 @@ let package = Package(
resources: [
.process("src/ggml-metal.metal")
],
publicHeadersPath: "include/ggml",
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.define("GGML_USE_ACCELERATE"),
59 changes: 59 additions & 0 deletions README.md
@@ -49,6 +49,7 @@ Some of the development is currently happening in the [llama.cpp](https://github
- [X] Example of Qwen inference [QwenLM/qwen.cpp](https://github.com/QwenLM/qwen.cpp)
- [X] Example of YOLO inference [examples/yolo](https://github.com/ggerganov/ggml/tree/master/examples/yolo)
- [X] Example of ViT inference [staghado/vit.cpp](https://github.com/staghado/vit.cpp)
- [X] Example of multiple LLMs inference [foldl/chatllm.cpp](https://github.com/foldl/chatllm.cpp)
- [X] SeamlessM4T inference *(in development)* https://github.com/facebookresearch/seamless_communication/tree/main/ggml

## Whisper inference (example)
@@ -171,6 +172,64 @@ export LD_LIBRARY_PATH=/data/local/tmp
./bin/gpt-2-backend -m models/ggml-model.bin -p "this is an example"
```

### CLBlast for Android

Build CLBlast.

```bash
# In CLBlast/build
$ANDROID_SDK_PATH/cmake/3.22.1/bin/cmake .. \
-DCMAKE_SYSTEM_NAME=Android \
-DCMAKE_SYSTEM_VERSION=33 \
-DCMAKE_ANDROID_ARCH_ABI=arm64-v8a \
-DCMAKE_ANDROID_NDK=$ANDROID_NDK_PATH \
-DCMAKE_ANDROID_STL_TYPE=c++_static \
-DOPENCL_ROOT=$(readlink -f ../../OpenCL-Headers) \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

# Build libclblast.so
make -j4
```

Pull `libGLES_mali.so` from the device and use it as `libOpenCL.so`.

```bash
# In ggml project root.
mkdir arm64-v8a
adb pull /system/vendor/lib64/egl/libGLES_mali.so arm64-v8a/libOpenCL.so
```

Build ggml with CLBlast.

```bash
# In ggml/build
cd build
$ANDROID_SDK_PATH/cmake/3.22.1/bin/cmake .. \
-DGGML_CLBLAST=ON \
-DCMAKE_SYSTEM_NAME=Android \
-DCMAKE_SYSTEM_VERSION=33 \
-DCMAKE_ANDROID_ARCH_ABI=arm64-v8a \
-DCMAKE_ANDROID_NDK=$ANDROID_NDK_PATH \
-DCMAKE_ANDROID_STL_TYPE=c++_shared \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \
-DCLBLAST_HOME=$(readlink -f ../../CLBlast) \
-DOPENCL_LIB=$(readlink -f ../arm64-v8a/libOpenCL.so)

# Run make, adb push, etc.
```

Then in `adb shell`...

```bash
cd /data/local/tmp
export LD_LIBRARY_PATH=/system/vendor/lib64/egl:/data/local/tmp
./bin/gpt-2-backend -m models/ggml-model.bin -n 64 -p "Pepperoni pizza"
```

OpenCL does not have the same level of support in `ggml-backend` as CUDA or Metal. In the `gpt-2-backend` example, OpenCL will only be used for the matrix multiplications when evaluating large prompts.

## Resources

- [GGML - Large Language Models for Everyone](https://github.com/rustformers/llm/blob/main/crates/ggml/README.md): a description of the GGML format provided by the maintainers of the `llm` Rust crate, which provides Rust bindings for GGML
6 changes: 5 additions & 1 deletion examples/common-ggml.cpp
@@ -62,6 +62,8 @@ bool ggml_common_quantize_0(
case GGML_FTYPE_ALL_F32:
case GGML_FTYPE_MOSTLY_F16:
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
case GGML_FTYPE_MOSTLY_IQ2_XXS:
case GGML_FTYPE_MOSTLY_IQ2_XS:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
@@ -182,7 +184,7 @@ bool ggml_common_quantize_0(
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
{
cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
@@ -191,6 +193,8 @@
case GGML_TYPE_I32:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
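
The `ggml_quantize_chunk` call above now splits the element count into a row count and a row length, and gains a trailing pointer argument (passed as `nullptr` here). Below is a minimal sketch of the new call pattern; the parameter roles are inferred from this diff rather than from `ggml.h`, and the trailing argument is assumed to be an optional importance matrix.

```cpp
#include <cstdint>
#include <vector>

#include "ggml.h"

// Hypothetical helper illustrating the call pattern above. Assumed signature
// (inferred from the call site, not from ggml.h):
//   ggml_quantize_chunk(type, src, dst, start, nrows, n_per_row, hist, imatrix)
static size_t quantize_tensor_2d(ggml_type ttype,
                                 const std::vector<float> & data_f32, // ne[0]*ne[1] values
                                 std::vector<char>        & work,     // destination buffer
                                 const int64_t              ne[2],    // ne[0] = row length
                                 std::vector<int64_t>     & hist) {
    const int64_t nelements = ne[0] * ne[1];

    // Row count and row length are now passed separately; the trailing pointer
    // (assumed to be an optional importance matrix) is left empty.
    return ggml_quantize_chunk(ttype, data_f32.data(), work.data(),
                               /*start*/ 0,
                               /*nrows*/ nelements / ne[0],
                               /*n_per_row*/ ne[0],
                               hist.data(), /*imatrix*/ nullptr);
}
```
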
6 changes: 6 additions & 0 deletions examples/common.cpp
@@ -639,6 +639,12 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector

fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
}
else if (fname.size() > 256 && (fname.substr(0, 4) == "RIFF" || fname.substr(8, 4) == "WAVE")) {
if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
return false;
}
}
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
return false;
1 change: 1 addition & 0 deletions examples/common.h
@@ -136,6 +136,7 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
//

// Read WAV audio file and store the PCM data into pcmf32
// fname can be a buffer of WAV data instead of a filename
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
bool read_wav(
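
The comment above documents that `read_wav` now also accepts a buffer of WAV bytes in place of a filename. A hedged usage sketch follows; it assumes the full parameter list matches the declaration excerpted in the `common.cpp` hunk header (`pcmf32`, `pcmf32s`, and a stereo flag), and slurping a file into memory is only one way to obtain such a buffer.

```cpp
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

#include "common.h"

// Hypothetical caller exercising the new in-memory path of read_wav().
// Assumed signature:
//   bool read_wav(const std::string & fname, std::vector<float> & pcmf32,
//                 std::vector<std::vector<float>> & pcmf32s, bool stereo);
static bool load_pcm_from_memory(const std::string & path, std::vector<float> & pcmf32) {
    // Slurp the whole WAV file into a string; a valid file starts with "RIFF....WAVE".
    std::ifstream f(path, std::ios::binary);
    std::ostringstream ss;
    ss << f.rdbuf();
    const std::string wav_bytes = ss.str();

    // Passing the raw bytes instead of a filename takes the buffer branch added
    // in common.cpp (size > 256 plus a RIFF/WAVE header).
    std::vector<std::vector<float>> pcmf32s;
    return read_wav(wav_bytes, pcmf32, pcmf32s, /*stereo=*/ false);
}
```
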
2 changes: 1 addition & 1 deletion examples/gpt-2/main-backend.cpp
@@ -209,7 +209,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
#ifdef GGML_USE_METAL
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: using Metal backend\n", __func__);
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
model.backend = ggml_backend_metal_init();
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
2 changes: 1 addition & 1 deletion examples/gpt-2/main-batched.cpp
@@ -298,7 +298,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
#ifdef GGML_USE_METAL
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: using Metal backend\n", __func__);
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
model.backend = ggml_backend_metal_init();
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
4 changes: 2 additions & 2 deletions examples/gpt-2/main.cpp
@@ -118,7 +118,7 @@ void init_backends(gpt2_model & model, const gpt_params & params) {
#ifdef GGML_USE_METAL
if (params.n_gpu_layers > 0) {
fprintf(stderr, "%s: using Metal backend\n", __func__);
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
gpu_backend = ggml_backend_metal_init();
if (!gpu_backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
@@ -947,7 +947,7 @@ int main(int argc, char ** argv) {
ggml_backend_sched_t sched;
{
// initialize the scheduler
sched = ggml_backend_sched_new(model.backends.data(), model.backends.size());
sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES);

// create the worst case graph for memory usage estimation
int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
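
The scheduler construction in `main.cpp` gains two arguments. For context, a sketch of the updated call is shown below; the roles of the new arguments (a per-backend buffer-type array, here `NULL`, and a graph-size bound such as `GPT2_MAX_NODES`) are inferred from the call site rather than from `ggml-backend.h`, and the constant's value is a placeholder.

```cpp
#include <vector>

#include "ggml-backend.h"

// Placeholder value; the gpt-2 example defines its own constant.
#define GPT2_MAX_NODES 4096

static ggml_backend_sched_t make_sched(std::vector<ggml_backend_t> & backends) {
    // Old call:  ggml_backend_sched_new(backends.data(), backends.size());
    // New call:  adds a buffer-type array (NULL -> each backend's default) and
    //            an upper bound on the number of graph nodes to plan for.
    return ggml_backend_sched_new(backends.data(), NULL,
                                  backends.size(), GPT2_MAX_NODES);
}
```
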
7 changes: 6 additions & 1 deletion examples/mnist/README.md
@@ -3,11 +3,16 @@
These are simple examples of how to use GGML for inference.
The first example uses a convolutional neural network (CNN); the second uses a fully connected neural network.

## Building the examples
## Set up the Python environment and build the examples

```bash
git clone https://github.com/ggerganov/ggml
cd ggml
# Install python dependencies in a virtual environment
python3 -m venv ggml_env
source ./ggml_env/bin/activate
pip install -r requirements.txt
# Build the examples
mkdir build && cd build
cmake ..
make -j4 mnist-cnn mnist
23 changes: 2 additions & 21 deletions examples/python/ggml/__init__.pyi
@@ -506,15 +506,6 @@ class lib:
struct ggml_tensor * a);
"""
...
def ggml_cont_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
"""
make contiguous, in-place
GGML_API struct ggml_tensor * ggml_cont_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
"""
...
def ggml_conv_1d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s0: int, p0: int, d0: int) -> ffi.CData:
"""
GGML_API struct ggml_tensor * ggml_conv_1d(
@@ -614,16 +605,6 @@ class lib:
struct ggml_tensor * b);
"""
...
def ggml_cpy_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
"""
a -> b, in-place, return view(b)
GGML_API struct ggml_tensor * ggml_cpy_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
"""
...
def ggml_cross_entropy_loss(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
"""
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
@@ -1202,7 +1183,7 @@ class lib:
- you don't need to keep the host memory buffer allocated as it is never accessed by Metal
- max_size specifies the maximum size of a tensor and is used to create shared views such
that it is guaranteed that the tensor will fit in at least one of the views
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
@@ -2428,4 +2409,4 @@ class lib:
...
def quantize_row_q8_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None:
"""void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);"""
...
2 changes: 1 addition & 1 deletion examples/python/ggml/cffi.py

Large diffs are not rendered by default.

7 changes: 0 additions & 7 deletions examples/starcoder/CMakeLists.txt
@@ -5,13 +5,6 @@ set(TEST_TARGET starcoder)
add_executable(${TEST_TARGET} main.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

#
# starcoder-mmap

set(TEST_TARGET starcoder-mmap)
add_executable(${TEST_TARGET} starcoder-mmap.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

#
# starcoder-quantize

4 changes: 2 additions & 2 deletions examples/starcoder/README.md
@@ -34,7 +34,7 @@ options:
-m FNAME, --model FNAME
model path (default: models/starcoder-117M/ggml-model.bin)
$ ./bin/starcoder -m ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin -p "def fibonnaci(" -t 4 --top_k 0 --top_p 0.95 --temp 0.2
main: seed = 1683881276
starcoder_model_load: loading model from '../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin'
starcoder_model_load: n_vocab = 49280
@@ -47,7 +47,7 @@ starcoder_model_load: ggml ctx size = 1794.90 MB
starcoder_model_load: memory size = 768.00 MB, n_mem = 49152
starcoder_model_load: model size = 1026.83 MB
main: prompt: 'def fibonnaci('
main: number of tokens in prompt = 7, first 8 tokens: 563 24240 78 2658 64 2819 7
def fibonnaci(n):
if n == 0: