diff --git a/ci/run.sh b/ci/run.sh
index a5d0344a3..cd9435d9c 100644
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -319,52 +319,6 @@ function gg_sum_yolo {
     gg_printf '```\n'
 }
 
-# mpt
-
-function gg_run_mpt {
-    cd ${SRC}
-
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/config.json
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/warnings.py
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/fc.py
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/attention.py
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/blocks.py
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/ffn.py
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/norm.py
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/tokenizer.json
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/tokenizer_config.json
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/pytorch_model.bin.index.json
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/configuration_mpt.py
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/resolve/main/pytorch_model-00001-of-00002.bin
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/resolve/main/pytorch_model-00002-of-00002.bin
-
-    cd build-ci-release
-
-    set -e
-
-    path_models="../models-mnt/mpt/7B"
-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-
-    python3 ../examples/mpt/convert-h5-to-ggml.py ${path_models} 1
-    ./bin/mpt-quantize ${model_f16} ${model_q4_0} q4_0
-
-    (time ./bin/mpt --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
-    (time ./bin/mpt --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
-
-    set +e
-}
-
-function gg_sum_mpt {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs short MPT text generation\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-tg.log)"
-    gg_printf '```\n'
-}
-
 ## main
 
 if [ -z $GG_BUILD_LOW_PERF ]; then
@@ -394,7 +348,8 @@ test $ret -eq 0 && gg_run yolo
 
 if [ -z $GG_BUILD_LOW_PERF ]; then
     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 16 ]; then
-        test $ret -eq 0 && gg_run mpt
+        # run tests that require GPU with at least 16GB of VRAM
+        date
     fi
 fi
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 340f6470d..5a268dca1 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -22,10 +22,5 @@ add_subdirectory(gpt-2)
 add_subdirectory(gpt-j)
 add_subdirectory(whisper)
 add_subdirectory(mnist)
-add_subdirectory(gpt-neox)
-add_subdirectory(dolly-v2)
-add_subdirectory(replit)
-add_subdirectory(mpt)
-add_subdirectory(starcoder)
 add_subdirectory(sam)
 add_subdirectory(yolo)
diff --git a/examples/dolly-v2/CMakeLists.txt b/examples/dolly-v2/CMakeLists.txt
deleted file mode 100644
index b2d555637..000000000
--- a/examples/dolly-v2/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# dollyv2
-
-set(TEST_TARGET dollyv2)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# dollyv2-quantize
-
-set(TEST_TARGET dollyv2-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
diff --git 
a/examples/dolly-v2/README.md b/examples/dolly-v2/README.md deleted file mode 100644 index add97385a..000000000 --- a/examples/dolly-v2/README.md +++ /dev/null @@ -1,187 +0,0 @@ -# Dolly-V2 - -Transformer architecture: GPT-NeoX - -Modeled from examples/stablelm - -Ref: https://github.com/databrickslabs/dolly - -Ref: https://github.com/stability-AI/stableLM/#stablelm-alpha - -## Usage - -```bash -# get the repo and build it -git clone https://github.com/ggerganov/ggml -cd ggml -mkdir build && cd build -cmake .. -make -j - -# get the Dolly-V2 3B model -git clone https://huggingface.co/databricks/dolly-v2-3b - -# install Python dependencies -python3 -m pip install -r ../requirements.txt - -# convert model to FP16 -python3 ../examples/dolly-v2/convert-h5-to-ggml.py ./dolly-v2-3b/ 1 - -# run inference using FP16 precision -./bin/dollyv2 -m ./dolly-v2-3b/ggml-model-f16.bin -p "State the meaning of life." -t 6 -n 64 - -main: seed = 1683218142 -dollyv2_model_load: loading model from './dolly-v2-3b/ggml-model-f16.bin' - please wait ... -dollyv2_model_load: n_vocab = 50280 -dollyv2_model_load: n_ctx = 2048 -dollyv2_model_load: n_embd = 2560 -dollyv2_model_load: n_head = 32 -dollyv2_model_load: n_layer = 32 -dollyv2_model_load: n_rot = 20 -dollyv2_model_load: ftype = 1 -dollyv2_model_load: ggml ctx size = 7374.91 MB -dollyv2_model_load: memory_size = 640.00 MB, n_mem = 65536 -dollyv2_model_load: ................................................ done -dollyv2_model_load: model size = 5295.10 MB / num tensors = 388 -main: number of tokens in prompt = 32 -main: token[0] = 30003, Below -main: token[1] = 310, is -main: token[2] = 271, an -main: token[3] = 9775, instruction -main: token[4] = 326, that -main: token[5] = 8631, describes -main: token[6] = 247, a -main: token[7] = 4836, task -main: token[8] = 964, . -main: token[9] = 19566, Write -main: token[10] = 247, a -main: token[11] = 2380, response -main: token[12] = 326, that -main: token[13] = 20420, appropriately -main: token[14] = 29141, completes -main: token[15] = 253, the -main: token[16] = 2748, request -main: token[17] = 964, . -main: token[18] = 187, - -main: token[19] = 187, - -main: token[20] = 50278, ### Instruction: -main: token[21] = 187, - -main: token[22] = 5443, State -main: token[23] = 253, the -main: token[24] = 4495, meaning -main: token[25] = 273, of -main: token[26] = 1495, life -main: token[27] = 964, . -main: token[28] = 187, - -main: token[29] = 187, - -main: token[30] = 50279, ### Response: -main: token[31] = 187, - - -Below is an instruction that describes a task. Write a response that appropriately completes the request. - -### Instruction: -State the meaning of life. - -### Response: -The meaning of life is to love and be loved. - -### End - -main: mem per token = 16136720 bytes -main: load time = 2202.58 ms -main: sample time = 2.57 ms -main: predict time = 1497.14 ms / 33.27 ms per token -main: total time = 6187.27 ms -``` - -## 5-bit integer quantization mode - -```bash -# quantize the model to 5-bits using Q5_0 quantization -./bin/dollyv2-quantize ./dolly-v2-3b/ggml-model-f16.bin ./dolly-v2-3b/ggml-model-q5_0.bin q5_0 - -# run the quantized model -./bin/dollyv2 -m ./dolly-v2-3b/ggml-model-q5_0.bin -p "State the meaning of life." -t 6 -n 64 - -main: seed = 1683218518 -dollyv2_model_load: loading model from './dolly-v2-3b/ggml-model-q5_0.bin' - please wait ... 
-dollyv2_model_load: n_vocab = 50280 -dollyv2_model_load: n_ctx = 2048 -dollyv2_model_load: n_embd = 2560 -dollyv2_model_load: n_head = 32 -dollyv2_model_load: n_layer = 32 -dollyv2_model_load: n_rot = 20 -dollyv2_model_load: ftype = 8 -dollyv2_model_load: ggml ctx size = 3902.68 MB -dollyv2_model_load: memory_size = 640.00 MB, n_mem = 65536 -dollyv2_model_load: ................................................ done -dollyv2_model_load: model size = 1822.87 MB / num tensors = 388 -main: number of tokens in prompt = 32 -main: token[0] = 30003, Below -main: token[1] = 310, is -main: token[2] = 271, an -main: token[3] = 9775, instruction -main: token[4] = 326, that -main: token[5] = 8631, describes -main: token[6] = 247, a -main: token[7] = 4836, task -main: token[8] = 964, . -main: token[9] = 19566, Write -main: token[10] = 247, a -main: token[11] = 2380, response -main: token[12] = 326, that -main: token[13] = 20420, appropriately -main: token[14] = 29141, completes -main: token[15] = 253, the -main: token[16] = 2748, request -main: token[17] = 964, . -main: token[18] = 187, - -main: token[19] = 187, - -main: token[20] = 50278, ### Instruction: -main: token[21] = 187, - -main: token[22] = 5443, State -main: token[23] = 253, the -main: token[24] = 4495, meaning -main: token[25] = 273, of -main: token[26] = 1495, life -main: token[27] = 964, . -main: token[28] = 187, - -main: token[29] = 187, - -main: token[30] = 50279, ### Response: -main: token[31] = 187, - - -Below is an instruction that describes a task. Write a response that appropriately completes the request. - -### Instruction: -State the meaning of life. - -### Response: -The meaning of life is the discovery of the true self. - -### End - -main: mem per token = 16127760 bytes -main: load time = 1011.09 ms -main: sample time = 2.79 ms -main: predict time = 1271.62 ms / 27.64 ms per token -main: total time = 2802.51 ms -``` - -## Notes - -- No guarantees for correctness -- The tokenizer is currently hacked - probably works only for English -- Non-parallel residual is not supported -- Contributions and improvements are welcome diff --git a/examples/dolly-v2/convert-h5-to-ggml.py b/examples/dolly-v2/convert-h5-to-ggml.py deleted file mode 100644 index 0019810e2..000000000 --- a/examples/dolly-v2/convert-h5-to-ggml.py +++ /dev/null @@ -1,116 +0,0 @@ -import sys -import struct -import json -import numpy as np - -from transformers import AutoModelForCausalLM, AutoTokenizer - -if len(sys.argv) < 3: - print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n") - print(" ftype == 0 -> float32") - print(" ftype == 1 -> float16") - sys.exit(1) - -# output in the same directory as the model -dir_model = sys.argv[1] -fname_out = sys.argv[1] + "/ggml-model.bin" - -with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f: - encoder = json.load(f) - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -# possible data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 -# -# map from ftype to string -ftype_str = ["f32", "f16"] - -ftype = 1 -if len(sys.argv) > 2: - ftype = int(sys.argv[2]) - if ftype < 0 or ftype > 1: - print("Invalid ftype: " + str(ftype)) - sys.exit(1) - fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" - - -tokenizer = AutoTokenizer.from_pretrained(dir_model) -model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) -#print (model) - -#print(tokenizer.encode('I believe the meaning of life is')) - -list_vars = model.state_dict() -for 
name in list_vars.keys(): - print(name, list_vars[name].shape, list_vars[name].dtype) - -fout = open(fname_out, "wb") - -print(hparams) - -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex -fout.write(struct.pack("i", hparams["vocab_size"])) -fout.write(struct.pack("i", hparams["max_position_embeddings"])) -fout.write(struct.pack("i", hparams["hidden_size"])) -fout.write(struct.pack("i", hparams["num_attention_heads"])) -fout.write(struct.pack("i", hparams["num_hidden_layers"])) -fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))) -fout.write(struct.pack("i", hparams["use_parallel_residual"])) -fout.write(struct.pack("i", ftype)) - -# TODO: temporary hack to not deal with implementing the tokenizer -dot_token = tokenizer.encode('.')[0] -for i in range(hparams["vocab_size"]): - text = tokenizer.decode([dot_token, i]).encode('utf-8') - # remove the first byte (it's always '.') - text = text[1:] - fout.write(struct.pack("i", len(text))) - fout.write(text) - -for name in list_vars.keys(): - data = list_vars[name].squeeze().numpy() - print("Processing variable: " + name + " with shape: ", data.shape) - - # we don't need these - if name.endswith(".attention.masked_bias") or \ - name.endswith(".attention.bias") or \ - name.endswith(".attention.rotary_emb.inv_freq"): - print(" Skipping variable: " + name) - continue - - n_dims = len(data.shape); - - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype_cur = 0; - if ftype != 0: - if name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype_cur = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - else: - if data.dtype != np.float32: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - - # header - str = name.encode('utf-8') - fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) - for i in range(n_dims): - fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - fout.write(str); - - # data - data.tofile(fout) - -fout.close() - -print("Done. Output file: " + fname_out) -print("") diff --git a/examples/dolly-v2/main.cpp b/examples/dolly-v2/main.cpp deleted file mode 100644 index 9e3599608..000000000 --- a/examples/dolly-v2/main.cpp +++ /dev/null @@ -1,968 +0,0 @@ -#include "ggml/ggml.h" - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if !defined(_WIN32) -#define DOLLY_INTERACTIVE_PORT -#endif - -#if defined(DOLLY_INTERACTIVE_PORT) -#include -#include -#include -#include -#endif - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// default hparams (Dolly-V2 3B) -struct dollyv2_hparams { - int32_t n_vocab = 50254; // tokenizer.vocab_size - int32_t n_ctx = 2048; // model.config.max_position_embeddings - int32_t n_embd = 2560; // model.config.hidden_size - int32_t n_head = 32; // model.config.num_attention_heads - int32_t n_layer = 32; // model.config.num_hidden_layers - int32_t n_rot = 20; // rotary_pct[25%] * (n_embd / n_head) - int32_t par_res = 1; // 1 = true, 0 = false - int32_t ftype = GGML_FTYPE_MOSTLY_F16; - float eps = 1e-5f; -}; - -const std::string INSTRUCTION_KEY = "### Instruction:"; -const std::string RESPONSE_KEY = "### Response:"; -const std::string END_KEY = "### End"; -const std::string INTRO_BLURB = "Below is an instruction that describes a task. 
Write a response that appropriately completes the request."; - -// dollyv2 prompt format -std::string prompt_for_generation(const std::string& instruction) { - return INTRO_BLURB + "\n\n" + INSTRUCTION_KEY + "\n" + instruction + "\n\n" + RESPONSE_KEY + "\n"; -} - -struct dollyv2_layer { - // pre normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // post normalization - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // ff - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; - -struct dollyv2_model { - dollyv2_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - - struct ggml_tensor * lmh_g; // language model head - //struct ggml_tensor * lmh_b; // language model bias - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -// load the model's weights from a file -bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vocab & vocab) { - printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fin.read((char *) &hparams.par_res, sizeof(hparams.par_res)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: n_rot = %d\n", __func__, hparams.n_rot); - printf("%s: par_res = %d\n", __func__, hparams.par_res); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - const int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - - vocab.add_special_token("### End"); - vocab.add_special_token("### Instruction:"); - vocab.add_special_token("### 
Response:"); - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g - ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b - - ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte - - ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // lmh_g - //ctx_size += ggml_row_size(GGML_TYPE_F32, n_vocab); // lmh_b - - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b - - ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b - - ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd*n_embd)); // c_attn_proj_b - - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b - - ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b - - ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b - - ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k - ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v - - ctx_size += (6 + 16*n_layer)*512; // object overhead - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - //model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); - - // map by name - model.tensors["gpt_neox.embed_in.weight"] = model.wte; - - model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g; - model.tensors["gpt_neox.final_layer_norm.bias"] = model.ln_f_b; - - model.tensors["embed_out.weight"] = model.lmh_g; - //model.tensors["lm_head.bias"] = model.lmh_b; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - - layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - 
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - - layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // map by name - - // unmapped: attention.rotary_emb, mlp.act - - model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g; - model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"] = layer.ln_1_b; - - model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w; - model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"] = layer.c_attn_attn_b; - - model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w; - model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"] = layer.c_attn_proj_b; - - model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g; - model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"] = layer.ln_2_b; - - model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w; - model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"] = layer.c_mlp_fc_b; - - model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w; - model.tensors["gpt_neox.layers." 
+ std::to_string(i) + ".mlp.dense_4h_to_h.bias"] = layer.c_mlp_proj_b; - } - } - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int64_t n_mem = n_layer*n_ctx; - const int64_t n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - // load weights - { - int n_tensors = 0; - size_t total_size = 0; - - printf("%s: ", __func__); - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - - auto tensor = model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - // for debugging - if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - - total_size += ggml_nbytes(tensor); - if (++n_tensors % 8 == 0) { - printf("."); - fflush(stdout); - } - } - - printf(" done\n"); - - printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); - } - - fin.close(); - - return true; -} - -// feed-forward network -ggml_tensor * gpt_neox_ff( - const dollyv2_layer & layer, - ggml_context * ctx0, - ggml_tensor * inp, - float eps) { - ggml_tensor * cur = ggml_norm(ctx0, inp, eps); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, layer.ln_2_g, cur), - cur), - ggml_repeat(ctx0, layer.ln_2_b, cur)); - - cur = ggml_mul_mat(ctx0, - layer.c_mlp_fc_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, layer.c_mlp_fc_b, cur), - cur); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - - // projection - // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, - layer.c_mlp_proj_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, layer.c_mlp_proj_b, cur), - cur); - return cur; -} - -// evaluate the transformer 
-// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool dollyv2_eval( - const dollyv2_model & model, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w, - size_t & mem_per_token) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_vocab = hparams.n_vocab; - const int n_rot = hparams.n_rot; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - - // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // self-attention - { - { - cur = ggml_norm(ctx0, inpL, hparams.eps); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), - cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); - } - - // compute QKV - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_attn_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), - cur); - } - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); - - // using mode = 2 for GPT-NeoX mode - Qcur = ggml_rope_inplace(ctx0, Qcur, KQ_pos, n_rot, 2, 0); - Kcur = ggml_rope_inplace(ctx0, Kcur, KQ_pos, n_rot, 2, 0); - - // store key and value to memory - { - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, 
Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - 1.0f/sqrt(float(n_embd)/n_head)); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, - il*n_ctx*ggml_element_size(model.memory_v)*n_embd); - - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur); - } - } - - if (hparams.par_res == 0) { - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); - - cur = gpt_neox_ff(model.layers[il], ctx0, inpFF, hparams.eps); - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - } else { - struct ggml_tensor * inpFF = cur; - - // this is independent of the self-attention result, so it could be done in parallel to the self-attention - // note here we pass inpL instead of cur - cur = gpt_neox_ff(model.layers[il], ctx0, inpL, hparams.eps); - - // layer input + FF - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - inpL = ggml_add(ctx0, cur, inpL); - } - - } - - // norm - { - inpL = ggml_norm(ctx0, inpL, hparams.eps); - - // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), - inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); - } - - // lm_head - { - inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); - - //inpL = ggml_add(ctx0, - // ggml_repeat(ctx0, model.lmh_b, inpL), - // inpL); - } - - // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(gf, inpL); - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); - - //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - 
//printf("used_mem = %zu\n", ggml_used_mem(ctx0)); - - ggml_free(ctx0); - - return true; -} - -std::string execute_prompt( - const dollyv2_model &model, - gpt_vocab &vocab, - const std::string &prompt, - gpt_params ¶ms, - std::mt19937 &rng, - int64_t t_load_us, - int64_t t_sample_us, - int64_t t_predict_us, - size_t mem_per_token, - int n_past, - bool stream_response_to_cout = false) { - std::string output = ""; - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int)embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (size_t i = 0; i < embd_inp.size(); i++) { - printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); - } - printf("\n"); - - std::vector embd; - - dollyv2_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); - - const int32_t end_token = vocab.token_to_id["### End"]; - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!dollyv2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - printf("Failed to predict\n"); - return output; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - output += vocab.id_to_token[id]; - if (stream_response_to_cout) { - printf("%s", vocab.id_to_token[id].c_str()); - } - } - if (stream_response_to_cout) { - fflush(stdout); - } - - // end of text token - if (embd.back() == 0 || (end_token > 0 && embd.back() == end_token)) { - return output; - } - } - return output; -} - -#if defined(DOLLY_INTERACTIVE_PORT) -int setup_port(const int port) { - int sockfd = socket(AF_INET, SOCK_STREAM, 0); - if (sockfd < 0) { - fprintf(stderr, "%s: Failed to create new socket\n", __func__); - return -1; - } - - sockaddr_in servaddr; - std::memset(&servaddr, 0, sizeof(servaddr)); - - servaddr.sin_family = AF_INET; - servaddr.sin_addr.s_addr = htonl(INADDR_ANY); - servaddr.sin_port = htons(port); - - if (bind(sockfd, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0) { - fprintf(stderr, "%s: Failed to bind to port %i\n", __func__, port); - return -1; - } - - if (listen(sockfd, 10) < 0) { - fprintf(stderr, "%s: Failed to listen to socket on port %i\n", __func__, port); - return -1; - } - return sockfd; -} - -std::string read_from_port(int sockfd, int clientfd) { - if (clientfd < 0) { - fprintf(stderr, "%s: Failed to accept new connection\n", __func__); - return ""; - } - - char buffer[4096]; - std::memset(buffer, 0, sizeof(buffer)); - - if (read(clientfd, buffer, 
sizeof(buffer)) < 0) { - fprintf(stderr, "%s: Failed to read from client\n", __func__); - } else { - std::cout << "Received: " << buffer; - return std::string(buffer); - } - return std::string(""); -} -#endif - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/dolly-v2-3b/ggml-model-f16.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - - int64_t t_load_us = 0; - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - - int n_past = 0; - - gpt_vocab vocab; - dollyv2_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!dollyv2_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, params.token_test); - } - -#if defined(DOLLY_INTERACTIVE_PORT) - int sockfd = -1; - if (params.interactive_port != -1) { - sockfd = setup_port(params.interactive_port); - if (sockfd == -1) { - return 1; - } - fprintf(stdout, "Model is ready on port %i\n", params.interactive_port); - fflush(stdout); - } -#endif - - if (params.interactive || params.interactive_port != -1) { - while (true) { - std::string prompt_input; -#if defined(DOLLY_INTERACTIVE_PORT) - int clientfd = -1; - if (params.interactive_port != -1) { - sockaddr_in clientaddr; - socklen_t clientaddrlen = sizeof(clientaddr); - clientfd = accept(sockfd, (struct sockaddr *)&clientaddr, &clientaddrlen); - prompt_input = read_from_port(sockfd, clientfd); - } else -#endif - { - printf("Please enter your quesiton:\n>"); - fflush(stdout); - - std::getline(std::cin, prompt_input); - } - - if (strcmp(prompt_input.c_str(), "exit") == 0) { - break; - } - - const std::string prompt = prompt_for_generation(prompt_input); - // call the model - const std::string response = execute_prompt(model, vocab, prompt, params, rng, t_load_us, t_sample_us, t_predict_us, mem_per_token, n_past, true); - -#if defined(DOLLY_INTERACTIVE_PORT) - if (params.interactive_port != -1) { - if (write(clientfd, response.c_str(), response.size()) < 0) { - fprintf(stderr, "%s: Failed to write answer '%s' to client\n", __func__, response.c_str()); - } - - if (close(clientfd) < 0) { - fprintf(stderr, "%s: Failed to close client socket\n", __func__); - } - } else -#endif - { - printf("%s\n\n", response.c_str()); - } - fflush(stdout); - } - } else { - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - const std::string prompt = prompt_for_generation(params.prompt); - execute_prompt(model, vocab, prompt, params, rng, t_load_us, t_sample_us, t_predict_us, mem_per_token, n_past, true); - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us / 1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 
1000.0f); - } - - ggml_free(model.ctx); - -#if defined(DOLLY_INTERACTIVE_PORT) - if (params.interactive_port != -1 && close(sockfd) < 0) { - fprintf(stderr, "%s: Failed to close server socket\n", __func__); - } -#endif - - return 0; -} diff --git a/examples/dolly-v2/quantize.cpp b/examples/dolly-v2/quantize.cpp deleted file mode 100644 index 0c0d24ccf..000000000 --- a/examples/dolly-v2/quantize.cpp +++ /dev/null @@ -1,178 +0,0 @@ -#include "ggml/ggml.h" - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// default hparams (dollyv2 3B) -struct dollyv2_hparams { - int32_t n_vocab = 50254; // tokenizer.vocab_size - int32_t n_ctx = 2048; // model.config.max_position_embeddings - int32_t n_embd = 2560; // model.config.hidden_size - int32_t n_head = 32; // model.config.num_attention_heads - int32_t n_layer = 32; // model.config.num_hidden_layers - int32_t n_rot = 20; // rotary_pct[25%] * (n_embd / n_head) - int32_t par_res = 1; // 1 = true, 0 = false - int32_t ftype = GGML_FTYPE_MOSTLY_F16; -}; - -// quantize a model -bool dollyv2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { - gpt_vocab vocab; - - printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); - - auto finp = std::ifstream(fname_inp, std::ios::binary); - if (!finp) { - fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); - return false; - } - - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - finp.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); - return false; - } - - fout.write((char *) &magic, sizeof(magic)); - } - - dollyv2_hparams hparams; - - // load hparams - { - finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - finp.read((char *) &hparams.par_res, sizeof(hparams.par_res)); - finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; - const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: par_res = %d\n", __func__, hparams.par_res); - printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); - printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); - printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); - printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); - - fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); - fout.write((char *) 
&hparams.n_layer, sizeof(hparams.n_layer)); - fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fout.write((char *) &hparams.par_res, sizeof(hparams.par_res)); - fout.write((char *) &ftype_dst, sizeof(ftype_dst)); - } - - // load vocab - { - const int32_t n_vocab = hparams.n_vocab; - - std::string word; - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - finp.read ((char *) &len, sizeof(len)); - fout.write((char *) &len, sizeof(len)); - - word.resize(len); - finp.read ((char *) word.data(), len); - fout.write((char *) word.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // regexes of tensor names to be quantized - const std::vector to_quant = { - ".*weight", - }; - - if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { - fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); - return false; - } - - finp.close(); - fout.close(); - - return true; -} - -// usage: -// ./dollyv2-quantize models/dolly-v2-3B/ggml-model.bin models/dolly-v2-3B/ggml-model-quant.bin type -// -int main(int argc, char ** argv) { - if (argc != 4) { - fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); - ggml_print_ftypes(stderr); - return 1; - } - - // needed to initialize f16 tables - { - struct ggml_init_params params = { 0, NULL, false }; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); - } - - const std::string fname_inp = argv[1]; - const std::string fname_out = argv[2]; - - const ggml_ftype ftype = ggml_parse_ftype(argv[3]); - - const int64_t t_main_start_us = ggml_time_us(); - - int64_t t_quantize_us = 0; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!dollyv2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { - fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); - return 1; - } - - t_quantize_us = ggml_time_us() - t_start_us; - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n"); - printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - return 0; -} diff --git a/examples/gpt-neox/CMakeLists.txt b/examples/gpt-neox/CMakeLists.txt deleted file mode 100644 index 21a319b33..000000000 --- a/examples/gpt-neox/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -# -# gpt-neox - -set(TEST_TARGET gpt-neox) -add_executable(${TEST_TARGET} main.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) - -# -# gpt-neox-quantize - -set(TEST_TARGET gpt-neox-quantize) -add_executable(${TEST_TARGET} quantize.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) diff --git a/examples/gpt-neox/README.md b/examples/gpt-neox/README.md deleted file mode 100644 index 64c6d7c62..000000000 --- a/examples/gpt-neox/README.md +++ /dev/null @@ -1,110 +0,0 @@ -# GPT-NeoX - -Transformer architecture: GPT-NeoX - -Ref: https://github.com/stability-AI/stableLM/#stablelm-alpha - -## Usage - -```bash -# get the repo and build it -git clone https://github.com/ggerganov/ggml -cd ggml -mkdir build && cd build -cmake .. 
-make -j - -# get the StableLM 3B Alpha model -git clone https://huggingface.co/stabilityai/gpt_neox-base-alpha-3b - -# install Python dependencies -python3 -m pip install -r ../requirements.txt - -# convert model to FP16 -python3 ../examples/gpt-neox/convert-h5-to-ggml.py ./stablelm-base-alpha-3b/ 1 - -# run inference using FP16 precision -make -j && ./bin/gpt-neox -m ./stablelm-base-alpha-3b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64 - -main: seed = 1681940611 -gpt_neox_model_load: loading model from 'models/stablelm-base-alpha-3b/ggml-model-f16.bin' - please wait ... -gpt_neox_model_load: n_vocab = 50688 -gpt_neox_model_load: n_ctx = 4096 -gpt_neox_model_load: n_embd = 4096 -gpt_neox_model_load: n_head = 32 -gpt_neox_model_load: n_layer = 16 -gpt_neox_model_load: n_rot = 32 -gpt_neox_model_load: ftype = 1 -gpt_neox_model_load: ggml ctx size = 10011.10 MB -gpt_neox_model_load: memory_size = 2048.00 MB, n_mem = 65536 -gpt_neox_model_load: ................................ done -gpt_neox_model_load: model size = 6939.28 MB / num tensors = 260 -main: number of tokens in prompt = 7 -main: token[0] = 42, I -main: token[1] = 2868, believe -main: token[2] = 253, the -main: token[3] = 4495, meaning -main: token[4] = 273, of -main: token[5] = 1495, life -main: token[6] = 310, is - -I believe the meaning of life is to grow, to find a way, to love, to find an appreciation for life, and to live it with all of its beauty. - -For I am the child of God. I am the offspring of God's love. I am the offspring of the light of the world. I am the offspring of the - -main: mem per token = 12186760 bytes -main: load time = 2118.55 ms -main: sample time = 9.59 ms -main: predict time = 4474.07 ms / 63.92 ms per token -main: total time = 6911.26 ms -``` - -## 5-bit integer quantization mode - -```bash -# quantize the model to 5-bits using Q5_0 quantization -./bin/gpt-neox-quantize ./stablelm-base-alpha-3b/ggml-model-f16.bin ./stablelm-base-alpha-3b/ggml-model-q5_0.bin q5_0 - -# run the quantized model -./bin/gpt-neox -m ./stablelm-base-alpha-3b/ggml-model-q5_0.bin -p "I believe the meaning of life is" -t 8 -n 64 - -main: seed = 1682021489 -gpt_neox_model_load: loading model from 'models/stablelm-base-alpha-3b/ggml-model-q5_0.bin' - please wait ... -gpt_neox_model_load: n_vocab = 50688 -gpt_neox_model_load: n_ctx = 4096 -gpt_neox_model_load: n_embd = 4096 -gpt_neox_model_load: n_head = 32 -gpt_neox_model_load: n_layer = 16 -gpt_neox_model_load: n_rot = 32 -gpt_neox_model_load: ftype = 6 -gpt_neox_model_load: ggml ctx size = 5676.10 MB -gpt_neox_model_load: memory_size = 1024.00 MB, n_mem = 65536 -gpt_neox_model_load: ........................ done -gpt_neox_model_load: model size = 2604.28 MB / num tensors = 196 -main: number of tokens in prompt = 7 -main: token[0] = 42, I -main: token[1] = 2868, believe -main: token[2] = 253, the -main: token[3] = 4495, meaning -main: token[4] = 273, of -main: token[5] = 1495, life -main: token[6] = 310, is - -I believe the meaning of life is to love and be loved. The last three verses were enough to tie us all together. If you love someone you love them all. There are some things in this world that are just not equal in Heaven. - Be here in this moment. - -This world is not what is outside of us. 
It is what - -main: mem per token = 12958024 bytes -main: load time = 850.51 ms -main: sample time = 9.95 ms -main: predict time = 3103.81 ms / 44.34 ms per token -main: total time = 4177.68 ms - -``` - -## Notes - -- No guarantees for correctness -- The tokenizer is currently hacked - probably works only for English -- Non-parallel residual is not supported -- Contributions and improvements are welcome diff --git a/examples/gpt-neox/convert-h5-to-ggml.py b/examples/gpt-neox/convert-h5-to-ggml.py deleted file mode 100644 index f11a4cbc4..000000000 --- a/examples/gpt-neox/convert-h5-to-ggml.py +++ /dev/null @@ -1,107 +0,0 @@ -import sys -import struct -import json -import numpy as np - -from transformers import AutoModelForCausalLM, AutoTokenizer - -if len(sys.argv) < 3: - print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n") - print(" ftype == 0 -> float32") - print(" ftype == 1 -> float16") - sys.exit(1) - -# output in the same directory as the model -dir_model = sys.argv[1] -fname_out = sys.argv[1] + "/ggml-model.bin" - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -# possible data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 -# -# map from ftype to string -ftype_str = ["f32", "f16"] - -ftype = 1 -if len(sys.argv) > 2: - ftype = int(sys.argv[2]) - if ftype < 0 or ftype > 1: - print("Invalid ftype: " + str(ftype)) - sys.exit(1) - fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" - - -tokenizer = AutoTokenizer.from_pretrained(dir_model) -model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) - -list_vars = model.state_dict() -for name in list_vars.keys(): - print(name, list_vars[name].shape, list_vars[name].dtype) - -fout = open(fname_out, "wb") - -print(hparams) - -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex -fout.write(struct.pack("i", hparams["vocab_size"])) -fout.write(struct.pack("i", hparams["max_position_embeddings"])) -fout.write(struct.pack("i", hparams["hidden_size"])) -fout.write(struct.pack("i", hparams["num_attention_heads"])) -fout.write(struct.pack("i", hparams["num_hidden_layers"])) -fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))) -fout.write(struct.pack("i", hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)) -fout.write(struct.pack("i", ftype)) - -# TODO: temporary hack to not deal with implementing the tokenizer -for i in range(hparams["vocab_size"]): - text = tokenizer.decode([i]).encode('utf-8') - fout.write(struct.pack("i", len(text))) - fout.write(text) - -for name in list_vars.keys(): - data = list_vars[name].squeeze().numpy() - print("Processing variable: " + name + " with shape: ", data.shape) - - # we don't need these - if name.endswith(".attention.masked_bias") or \ - name.endswith(".attention.bias") or \ - name.endswith(".attention.rotary_emb.inv_freq"): - print(" Skipping variable: " + name) - continue - - n_dims = len(data.shape) - - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype_cur = 0 - if ftype != 0: - if name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype_cur = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - else: - if data.dtype != np.float32: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - - # header - str = name.encode('utf-8') - fout.write(struct.pack("iii", n_dims, len(str), 
ftype_cur)) - for i in range(n_dims): - fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - fout.write(str) - - # data - data.tofile(fout) - -fout.close() - -print("Done. Output file: " + fname_out) -print("") diff --git a/examples/gpt-neox/main.cpp b/examples/gpt-neox/main.cpp deleted file mode 100644 index 37f3c61b3..000000000 --- a/examples/gpt-neox/main.cpp +++ /dev/null @@ -1,820 +0,0 @@ -#include "ggml/ggml.h" - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// default hparams (StableLM 3B) -struct gpt_neox_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 4096; - int32_t n_embd = 4096; - int32_t n_head = 32; - int32_t n_layer = 16; - int32_t n_rot = 32; // rotary_pct * (n_embd / n_head) - int32_t par_res = 1; // 1 = true, 0 = false - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct gpt_neox_layer { - // pre normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // post normalization - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // ff - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; - -struct gpt_neox_model { - gpt_neox_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - - struct ggml_tensor * lmh_g; // language model head - //struct ggml_tensor * lmh_b; // language model bias - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -// load the model's weights from a file -bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab) { - printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fin.read((char *) &hparams.par_res, sizeof(hparams.par_res)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: 
n_rot = %d\n", __func__, hparams.n_rot); - printf("%s: par_res = %d\n", __func__, hparams.par_res); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - const int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const size_t n_embd = hparams.n_embd; - const size_t n_layer = hparams.n_layer; - const size_t n_ctx = hparams.n_ctx; - const size_t n_vocab = hparams.n_vocab; - - ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g - ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b - - ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte - - ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // lmh_g - //ctx_size += ggml_row_size(GGML_TYPE_F32, n_vocab); // lmh_b - - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b - - ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b - - ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd*n_embd)); // c_attn_proj_b - - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b - - ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b - - ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b - - ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k - ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v - - ctx_size += (6 + 16*n_layer)*1024; // object overhead - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.ln_f_b = 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - //model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); - - // map by name - model.tensors["gpt_neox.embed_in.weight"] = model.wte; - - model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g; - model.tensors["gpt_neox.final_layer_norm.bias"] = model.ln_f_b; - - model.tensors["embed_out.weight"] = model.lmh_g; - //model.tensors["lm_head.bias"] = model.lmh_b; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - - layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - - layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // map by name - model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g; - model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"] = layer.ln_1_b; - - model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w; - model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"] = layer.c_attn_attn_b; - - model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w; - model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"] = layer.c_attn_proj_b; - - model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g; - model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"] = layer.ln_2_b; - - model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w; - model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"] = layer.c_mlp_fc_b; - - model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w; - model.tensors["gpt_neox.layers." 
+ std::to_string(i) + ".mlp.dense_4h_to_h.bias"] = layer.c_mlp_proj_b; - } - } - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int64_t n_mem = n_layer*n_ctx; - const int64_t n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - // load weights - { - int n_tensors = 0; - size_t total_size = 0; - - printf("%s: ", __func__); - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - - auto tensor = model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - // for debugging - if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - - total_size += ggml_nbytes(tensor); - if (++n_tensors % 8 == 0) { - printf("."); - fflush(stdout); - } - } - - printf(" done\n"); - - printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); - } - - fin.close(); - - return true; -} - - -// feed-forward network -ggml_tensor * gpt_neox_ff( - const gpt_neox_layer & layer, - ggml_context * ctx0, - ggml_tensor * inp, - float eps) { - ggml_tensor * cur = ggml_norm(ctx0, inp, eps); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, layer.ln_2_g, cur), - cur), - ggml_repeat(ctx0, layer.ln_2_b, cur)); - - cur = ggml_mul_mat(ctx0, - layer.c_mlp_fc_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, layer.c_mlp_fc_b, cur), - cur); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - - // projection - // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, - layer.c_mlp_proj_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, layer.c_mlp_proj_b, cur), - cur); - return cur; -} - -// evaluate the 
transformer -// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt_neox_eval( - const gpt_neox_model & model, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w, - size_t & mem_per_token) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_vocab = hparams.n_vocab; - const int n_rot = hparams.n_rot; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - // use 2 scratch buffers - // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 256u*1024*1024; - static void * scr0 = malloc(scr0_size); - - static size_t scr1_size = 256u*1024*1024; - static void * scr1 = malloc(scr1_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - - // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // self-attention - { - { - cur = ggml_norm(ctx0, inpL, hparams.eps); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), - cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); - } - - // compute QKV - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_attn_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), - cur); - } - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); - - // using mode = 2 for GPT-NeoX mode - Qcur = ggml_rope_inplace(ctx0, Qcur, KQ_pos, n_rot, 2, 0); - Kcur = ggml_rope_inplace(ctx0, Kcur, KQ_pos, n_rot, 2, 0); - - // store key and value to memory - { - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, 
N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - 1.0f/sqrt(float(n_embd)/n_head)); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, - il*n_ctx*ggml_element_size(model.memory_v)*n_embd); - - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur); - } - } - - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); - - if (hparams.par_res == 0) { - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); - - cur = gpt_neox_ff(model.layers[il], ctx0, inpFF, hparams.eps); - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - } else { - struct ggml_tensor * inpFF = cur; - - // this is independent of the self-attention result, so it could be done in parallel to the self-attention - // note here we pass inpL instead of cur - cur = gpt_neox_ff(model.layers[il], ctx0, inpL, hparams.eps); - - // layer input + FF - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - inpL = ggml_add(ctx0, cur, inpL); - } - } - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // norm - { - inpL = ggml_norm(ctx0, inpL, hparams.eps); - - // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), - inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); - } - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - // lm_head - { - inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); - - //inpL = ggml_add(ctx0, - // ggml_repeat(ctx0, model.lmh_b, inpL), - // inpL); - } - - // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(gf, inpL); - 
ggml_graph_compute_with_ctx(ctx0, gf, n_threads); - - //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); - - ggml_free(ctx0); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/stablelm-base-alpha-3b/ggml-model-f16.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gpt_neox_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt_neox_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, params.token_test); - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (size_t i = 0; i < embd_inp.size(); i++) { - printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); - } - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - gpt_neox_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt_neox_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if 
(embd.back() == 0) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gpt-neox/quantize.cpp b/examples/gpt-neox/quantize.cpp deleted file mode 100644 index 96208c1e8..000000000 --- a/examples/gpt-neox/quantize.cpp +++ /dev/null @@ -1,178 +0,0 @@ -#include "ggml/ggml.h" - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// default hparams (StableLM 3B) -struct gpt_neox_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 4096; - int32_t n_embd = 4096; - int32_t n_head = 32; - int32_t n_layer = 16; - int32_t n_rot = 32; // 0.25 * (n_embd / n_head) - int32_t par_res = 1; // 1 = true, 0 = false - int32_t ftype = 1; -}; - -// quantize a model -bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { - gpt_vocab vocab; - - printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); - - auto finp = std::ifstream(fname_inp, std::ios::binary); - if (!finp) { - fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); - return false; - } - - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - finp.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); - return false; - } - - fout.write((char *) &magic, sizeof(magic)); - } - - gpt_neox_hparams hparams; - - // load hparams - { - finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - finp.read((char *) &hparams.par_res, sizeof(hparams.par_res)); - finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; - const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: par_res = %d\n", __func__, hparams.par_res); - printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); - printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); - printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); - printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); - - fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fout.write((char *) 
&hparams.n_ctx, sizeof(hparams.n_ctx)); - fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); - fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fout.write((char *) &hparams.par_res, sizeof(hparams.par_res)); - fout.write((char *) &ftype_dst, sizeof(ftype_dst)); - } - - // load vocab - { - const int32_t n_vocab = hparams.n_vocab; - - std::string word; - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - finp.read ((char *) &len, sizeof(len)); - fout.write((char *) &len, sizeof(len)); - - word.resize(len); - finp.read ((char *) word.data(), len); - fout.write((char *) word.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // regexes of tensor names to be quantized - const std::vector to_quant = { - ".*weight", - }; - - if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { - fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); - return false; - } - - finp.close(); - fout.close(); - - return true; -} - -// usage: -// ./gpt-neox-quantize models/stalellm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type -// -int main(int argc, char ** argv) { - if (argc != 4) { - fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); - ggml_print_ftypes(stderr); - return 1; - } - - // needed to initialize f16 tables - { - struct ggml_init_params params = { 0, NULL, false }; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); - } - - const std::string fname_inp = argv[1]; - const std::string fname_out = argv[2]; - - const ggml_ftype ftype = ggml_parse_ftype(argv[3]); - - const int64_t t_main_start_us = ggml_time_us(); - - int64_t t_quantize_us = 0; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { - fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); - return 1; - } - - t_quantize_us = ggml_time_us() - t_start_us; - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n"); - printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - return 0; -} diff --git a/examples/mpt/CMakeLists.txt b/examples/mpt/CMakeLists.txt deleted file mode 100644 index 09408f9fc..000000000 --- a/examples/mpt/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -# -# mpt - -set(TEST_TARGET mpt) -add_executable(${TEST_TARGET} main.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) - -# -# mpt-quantize - -set(TEST_TARGET mpt-quantize) -add_executable(${TEST_TARGET} quantize.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) diff --git a/examples/mpt/README.md b/examples/mpt/README.md deleted file mode 100644 index 39f46bae3..000000000 --- a/examples/mpt/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# MPT - -Ref: https://github.com/mosaicml/llm-foundry#mpt - -## Usage - -```bash -# get the repo and build it -git clone https://github.com/ggerganov/ggml -cd ggml -mkdir build && cd build -cmake .. 
-make -j - -# get the model from HuggingFace -# be sure to have git-lfs installed -git clone https://huggingface.co/mosaicml/mpt-30b - -# convert model to FP16 -python3 ../examples/mpt/convert-h5-to-ggml.py ./mpt-30b 1 - -# run inference using FP16 precision -./bin/mpt -m ./mpt-30b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64 - -# quantize the model to 5-bits using Q5_0 quantization -./bin/mpt-quantize ./mpt-30b/ggml-model-f16.bin ./mpt-30b/ggml-model-q5_0.bin q5_0 -``` diff --git a/examples/mpt/convert-h5-to-ggml.py b/examples/mpt/convert-h5-to-ggml.py deleted file mode 100755 index ccd6459fe..000000000 --- a/examples/mpt/convert-h5-to-ggml.py +++ /dev/null @@ -1,169 +0,0 @@ -import os -import struct -import sys - -import torch -from transformers import AutoConfig, AutoTokenizer - - -# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) - + list(range(ord("¡"), ord("¬") + 1)) - + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - - cs = [chr(n) for n in cs] - - return dict(zip(bs, cs)) - - -def count_model_parts(dir_model: str) -> int: - """Returns the number of model parts in the model directory.""" - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print(f"Found {num_parts} model parts in {dir_model}") - return num_parts - - -if len(sys.argv) < 3: - print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n") - print(" ftype == 0 -> float32") - print(" ftype == 1 -> float16") - sys.exit(1) - - -# output in the same directory as the model -dir_model = sys.argv[1] -# get number of model parts -num_parts = count_model_parts(dir_model) - -# possible data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 -# -# map from ftype to string -ftype_str = ["f32", "f16"] - -ftype = 1 -if len(sys.argv) > 2: - ftype = int(sys.argv[2]) - if ftype < 0 or ftype > 1: - print("Invalid ftype: " + str(ftype)) - sys.exit(1) - fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".bin" - - -tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) -config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True) -hparams = config.to_dict() - -fout = open(fname_out, "wb") - -fout.write(struct.pack("i", 0x67676D6C)) # magic: ggml in hex -fout.write(struct.pack("i", hparams["d_model"])) -fout.write(struct.pack("i", hparams["max_seq_len"])) -fout.write(struct.pack("i", hparams["n_heads"])) -fout.write(struct.pack("i", hparams["n_layers"])) -fout.write(struct.pack("i", hparams["vocab_size"])) -fout.write(struct.pack("f", hparams["attn_config"]["alibi_bias_max"])) -fout.write(struct.pack("f", hparams["attn_config"]["clip_qkv"] or 0.0)) -fout.write(struct.pack("i", ftype)) - -vocab_size = hparams["vocab_size"] - -encoder = tokenizer.vocab 
-# Add added_tokens (special tokens) to the encoder -encoder.update(tokenizer.get_added_vocab()) - -byte_encoder = bytes_to_unicode() -byte_decoder = {v: k for k, v in byte_encoder.items()} - -counter = 0 -# sort by value -for key in sorted(encoder, key=encoder.get): - # workaround for key error when c not found - text = "" - for c in key: - if c not in byte_decoder: - text += c - else: - text += chr(byte_decoder[c]) - text = bytearray(text, encoding="utf-8") - fout.write(struct.pack("i", len(text))) - fout.write(text) - counter += 1 - -# Repeat last token until vocab_size -while counter < vocab_size: - fout.write(struct.pack("i", len(text))) - fout.write(text) - counter += 1 - -if num_parts == 0: - part_names = ("pytorch_model.bin",) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - print(f"\n* Loading part: {part_name}") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name].squeeze() - n_dims = len(data.shape) - - # ftype == 0 -> float32, ftype == 1 -> float16 - # default type is fp32 - ftype_cur = 0 - if ftype == 1 and name[-7:] == ".weight" and n_dims > 1: - ftype_cur = 1 - data = data.to(dtype=torch.float16 if ftype_cur == 1 else torch.float32).numpy() - - print( - "Processing variable: " + name + " with shape: ", - data.shape, - "->", - data.dtype, - ) - - # header - str = name.encode("utf-8") - fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) - for i in range(n_dims): - fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - fout.write(str) - - # data - data.tofile(fout) - - # release memory - del model_part - -fout.close() - -print("Done. Output file: " + fname_out) -print("") diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp deleted file mode 100644 index a16367cc1..000000000 --- a/examples/mpt/main.cpp +++ /dev/null @@ -1,1042 +0,0 @@ -#include "ggml/ggml.h" - -#include "common-ggml.h" -#include "common.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// no defaults for now -struct mpt_hparams { - int32_t d_model = 0; - int32_t max_seq_len = 0; - int32_t n_heads = 0; - int32_t n_layers = 0; - int32_t n_vocab = 0; - float alibi_bias_max = 0; - float clip_qkv = 0; - int32_t ftype = 0; - int32_t n_ctx = 0; - -}; - -struct mpt_layer { - // pre normalization - struct ggml_tensor * norm_1_weight; - - // attention - struct ggml_tensor * c_attn_wqkv_weight; - struct ggml_tensor * c_attn_out_proj_weight; - - // post normalization - struct ggml_tensor * norm_2_weight; - - // ff - struct ggml_tensor * ffn_up_proj; - struct ggml_tensor * ffn_down_proj; -}; - -struct mpt_model { - mpt_hparams hparams; - - struct ggml_tensor * wte_weight; // position embedding - struct ggml_tensor * norm_f_weight; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - struct ggml_context * ctx; - std::map tensors; -}; - -struct mpt_params { - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - - int32_t seed = -1; // RNG seed - int32_t n_predict = 200; // new tokens to predict - int32_t n_batch = 8; // batch size for prompt processing - int32_t n_ctx = 512; - - std::string model = ""; // model path - std::string prompt = ""; - std::string token_test 
= ""; - - bool perplexity = false; - - // sampling parameters - int32_t top_k = 0; - float top_p = 1.0f; - float temp = 0.8f; - int32_t repeat_last_n = 64; - float repeat_penalty = 1.02f; - -}; - -void mpt_print_usage(int /*argc*/, char ** argv, const mpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -f FNAME, --file FNAME\n"); - fprintf(stderr, " load prompt from a file\n"); - fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); - fprintf(stderr, " test tokenization\n"); - fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); - fprintf(stderr, " --top_k N top-k sampling (default: %d, 0 = n_vocab)\n", params.top_k); - fprintf(stderr, " --top_p N top-p sampling (default: %.2f)\n", params.top_p); - fprintf(stderr, " --temp N temperature (default: %.2f)\n", params.temp); - fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); - fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); - fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); - fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, "\n"); -} - -bool mpt_params_parse(int argc, char ** argv, mpt_params & params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(argv[++i]); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(argv[++i]); - } else if (arg == "-p" || arg == "--prompt") { - params.prompt = argv[++i]; - } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(argv[++i]); - } else if (arg == "--top_k") { - params.top_k = std::max(1, std::stoi(argv[++i])); - } else if (arg == "--top_p") { - params.top_p = std::stof(argv[++i]); - } else if (arg == "--temp") { - params.temp = std::stof(argv[++i]); - } else if (arg == "--repeat-last-n") { - params.repeat_last_n = std::stof(argv[++i]); - } else if (arg == "--repeat-penalty") { - params.repeat_penalty = std::stof(argv[++i]); - } else if (arg == "--perplexity") { - params.perplexity = true; - } else if (arg == "-c" || arg == "--ctx-size") { - params.n_ctx = std::stoi(argv[++i]); - } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch = std::stoi(argv[++i]); - } else if (arg == "-m" || arg == "--model") { - params.model = argv[++i]; - } else if (arg == "-h" || arg == "--help") { - mpt_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-f" || arg == "--file") { - if (++i > argc) { - fprintf(stderr, "Invalid file param"); - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed 
to open file '%s'\n", argv[i]); - break; - } - params.prompt.clear(); - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } else if (arg == "-tt" || arg == "--token_test") { - params.token_test = argv[++i]; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - mpt_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -// load the model's weights from a file -bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) { - printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *)&magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.d_model, sizeof(hparams.d_model)); - fin.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len)); - fin.read((char *) &hparams.n_heads, sizeof(hparams.n_heads)); - fin.read((char *) &hparams.n_layers, sizeof(hparams.n_layers)); - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max)); - fin.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - hparams.n_ctx = std::min(hparams.max_seq_len, hparams.n_ctx); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: d_model = %d\n", __func__, hparams.d_model); - printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_heads = %d\n", __func__, hparams.n_heads); - printf("%s: n_layers = %d\n", __func__, hparams.n_layers); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max); - printf("%s: clip_qkv = %f\n", __func__, hparams.clip_qkv); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - const int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - // Convert token from utf-8 - std::wstring word_multibytes = convert_to_wstring(word); - word.resize(word_multibytes.size()); - for (size_t w = 0; w < word_multibytes.size(); w++) { - word[w] = uint8_t(word_multibytes[w]); - } - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit - // floats or quantized in order to save memory and also to speed up the - // computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), - model.hparams.ftype); - return false; - } - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - 
const auto & hparams = model.hparams; - const size_t n_ctx = hparams.n_ctx; - - { - const size_t n_embd = hparams.d_model; - const size_t n_layer = hparams.n_layers; - const size_t n_vocab = hparams.n_vocab; - - ctx_size += ggml_row_size(wtype, n_embd * n_vocab); // wte_weight - ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // norm_f_weight - - ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_weight - - ctx_size += n_layer * (ggml_row_size(wtype, 3 * n_embd * n_embd)); // attn_Wqkv_weight - ctx_size += n_layer * (ggml_row_size(wtype, n_embd * n_embd)); // attn_out_proj_weight - - ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_weight - - ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_up_weight - ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_down_weight - - ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_k - ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_v - - ctx_size += (1 + 6 * n_layer) * 512; // object overhead - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const size_t n_embd = hparams.d_model; - const size_t n_layer = hparams.n_layers; - const size_t n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - model.wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // map by name - model.tensors["transformer.wte.weight"] = model.wte_weight; - model.tensors["transformer.norm_f.weight"] = model.norm_f_weight; - - for (int i = 0; i < (int) n_layer; ++i) { - auto & layer = model.layers[i]; - - layer.norm_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.c_attn_wqkv_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd); - layer.c_attn_out_proj_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.norm_2_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ffn_up_proj = ggml_new_tensor_2d(ctx, wtype, n_embd, 4 * n_embd); - layer.ffn_down_proj = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd); - - // map by name - model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight; - model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.c_attn_wqkv_weight; - model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_out_proj_weight; - model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_weight; - model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj; - model.tensors["transformer.blocks." 
+ std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj; - } - } - - // key + value memory - { - const auto & hparams = model.hparams; - - const size_t n_embd = hparams.d_model; - const size_t n_layer = hparams.n_layers; - - const int64_t n_mem = n_layer * n_ctx; - const int64_t n_elements = n_embd * n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem); - } - - // load weights - { - int n_tensors = 0; - size_t total_size = 0; - - printf("%s: ", __func__); - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = {1, 1}; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - - auto tensor = model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, - "%s: tensor '%s' has wrong shape in model file: got [%5d, " - "%5d], expected [%5d, %5d]\n", - __func__, name.c_str(), (int)tensor->ne[0], (int)tensor->ne[1], ne[0], ne[1]); - return false; - } - - // for debugging - if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], - ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, - "%s: tensor '%s' has wrong size in model file: got %zu, " - "expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements * bpe); - return false; - } - - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - - total_size += ggml_nbytes(tensor); - if (++n_tensors % 8 == 0) { - printf("."); - fflush(stdout); - } - } - - printf(" done\n"); - - printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors); - } - - fin.close(); - - return true; -} - -// evaluate the transformer -// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, - const std::vector & embd_inp, std::vector & embd_w, bool logits_all, size_t & mem_per_token) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.d_model; - const int n_layer = hparams.n_layers; - const int n_head = hparams.n_heads; - const int n_vocab = hparams.n_vocab; - const int n_ctx = hparams.n_ctx; - const float eps = 1e-5f; - - static 
size_t buf_size = 256u * 1024 * 1024; - static void * buf = malloc(buf_size); - - // use 2 scratch buffers - // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 256u*1024*1024; - static void * scr0 = malloc(scr0_size); - - static size_t scr1_size = 256u*1024*1024; - static void * scr1 = malloc(scr1_size); - - if (mem_per_token > 0 && mem_per_token * N > buf_size) { - const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead - // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, - // buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd)); - - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte_weight, embd); - - for (int il = 0; il < n_layer; ++il) { - - struct ggml_tensor * cur; - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // a = self.ln_1(x) - { - cur = ggml_norm(ctx0, inpL, eps); - - cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur); - } - - // self-attention - // b, _, past_key_value = self.attn(a, past_key_value=past_key_value, - // attn_bias=attn_bias, attention_mask=attention_mask, - // is_causal=is_causal) - { - // compute QKV - cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur); - - if (model.hparams.clip_qkv > 0.0f) { - cur = ggml_clamp(ctx0, cur, -model.hparams.clip_qkv, model.hparams.clip_qkv); - } - - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0 * sizeof(float) * n_embd); - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1 * sizeof(float) * n_embd); - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2 * sizeof(float) * n_embd); - - // store key and value to memory - { - struct ggml_tensor * k = - ggml_view_1d(ctx0, model.memory_k, N * n_embd, - (ggml_element_size(model.memory_k) * n_embd) * (il * n_ctx + n_past)); - struct ggml_tensor * v = - ggml_view_1d(ctx0, model.memory_v, N * n_embd, - (ggml_element_size(model.memory_v) * n_embd) * (il * n_ctx + n_past)); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, - // 2, 1, 3) [64, N, 12] - struct ggml_tensor * Q = ggml_permute( - ctx0, ggml_cpy(ctx0, Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2, - 1, 3); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, - // 3) [64, n_past + N, 12] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_embd, - il * n_ctx * ggml_element_size(model.memory_k) * n_embd), - n_embd / n_head, n_head, n_past + N), - 0, 2, 1, 3); - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, KQ, 1.0f / sqrt(float(n_embd) / n_head)); - - struct ggml_tensor * 
KQ_scaled_alibi = - ggml_alibi(ctx0, KQ_scaled, n_past, n_head, model.hparams.alibi_bias_max); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, - // 2, 0, 3).contiguous() [n_past + N, 64, 12] - struct ggml_tensor * V_trans = ggml_cpy( - ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_embd, - il * n_ctx * ggml_element_size(model.memory_v) * n_embd), - n_embd / n_head, n_head, n_past + N), - 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd / n_head, n_head)); - - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection - { cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_out_proj_weight, cur); } - } - - inpL = ggml_add(ctx0, inpL, cur); - - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); - - // m = self.ln_2(x) - { - cur = ggml_norm(ctx0, inpL, eps); - - cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur); - } - - // n = self.mlp(m) - { - - cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_proj, cur); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - - // projection - // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_proj, cur); - } - - // x = x + n - inpL = ggml_add(ctx0, inpL, cur); - } - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // norm - { - inpL = ggml_norm(ctx0, inpL, eps); - // inpL = ln_f_g*inpL - inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL); - } - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - // output embedding weight tied to input embedding - inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL); - - // logits -> probs - // inpL = ggml_soft_max(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(gf, inpL); - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); - - // std::cout << "Qcur" << std::endl; - // print_tensor(Qcur); - - // if (n_past%100 == 0) { - // ggml_graph_print(&gf); - // ggml_graph_dump_dot(&gf, NULL, "mpt-model.dot"); - // } - - if (logits_all) { - // return result for all tokens - embd_w.resize(n_vocab *N); - memcpy(embd_w.data(), (float *)ggml_get_data(inpL) , sizeof(float) * n_vocab * N); - } else { - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); - } - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0) / N; - } - // printf("used_mem = %zu\n", ggml_used_mem(ctx0)); - - ggml_free(ctx0); - - return true; -} - -std::vector softmax(const std::vector & logits) { - std::vector probs(logits.size()); - float max_logit = logits[0]; - for (float v : logits) max_logit = std::max(max_logit, v); - double sum_exp = 0.0; - for (size_t i = 0; i < logits.size(); i++) { - // Subtract the maximum logit value from the current logit value for numerical stability - const float logit = logits[i] - max_logit; - const float exp_logit = expf(logit); - sum_exp += exp_logit; - 
probs[i] = exp_logit; - } - for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; - return probs; -} - -int perplexity(const mpt_params & params) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - printf("%s: n_threads = %d\n", __func__, params.n_threads); - printf("%s: n_batch = %d\n", __func__, params.n_batch); - printf("%s: n_ctx = %d\n", __func__, params.n_ctx); - printf("\n"); - - int64_t t_load_us = 0; - - gpt_vocab vocab; - mpt_model model; - - model.hparams.n_ctx = params.n_ctx; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!mpt_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - } - - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - - // determine the required inference memory per token: - size_t mem_per_token = 0; - mpt_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token); - - int count = 0; - - const int n_chunk = embd_inp.size() / params.n_ctx; - - const int n_vocab = model.hparams.n_vocab; - const int n_batch = params.n_batch; - - double nll = 0.0; - fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); - - for (int i = 0; i < n_chunk; ++i) { - - const int start = i * params.n_ctx; - const int end = start + params.n_ctx; - - const int num_batches = (params.n_ctx + n_batch - 1) / n_batch; - - std::vector logits; - - const auto t_start = std::chrono::high_resolution_clock::now(); - - for (int j = 0; j < num_batches; ++j) { - - const int batch_start = start + j * n_batch; - const int batch_size = std::min(end - batch_start, n_batch); - - std::vector embd; - - for(int p=0;p batch_logits;// = llama_get_logits(ctx); - - const int64_t t_start_us = ggml_time_us(); - - if (!mpt_eval(model, params.n_threads, j * batch_size, embd, batch_logits, true, mem_per_token)) { - printf("%s: failed to evaluate model\n", __func__); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - - logits.insert(logits.end(), batch_logits.data(), batch_logits.data() + batch_size * n_vocab); - - } - - const auto t_end = std::chrono::high_resolution_clock::now(); - - if (i == 0) { - const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); - int total_seconds = (int)(t_total * n_chunk); - if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); - total_seconds = total_seconds % (60*60); - } - fprintf(stderr, "%d minutes\n", total_seconds / 60); - - printf("\nChunk\tPPL cumulative\tPPL chunk\n"); - } - - // We get the logits for all the tokens in the context window (params.n_ctx) - // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, - // calculate the perplexity over the last half of the window (so the model always has - // some context to predict the token). - // - // We rely on the fact that attention in the forward pass only looks at previous - // tokens here, so the logits returned for each token are an accurate representation - // of what the model would have predicted at that point. 
- // - // Example, we have a context window of 512, we will compute perplexity for each of the - // last 256 tokens. Then, we split the input up into context window size chunks to - // process the entire prompt. - - double nllchunk = 0.0; - int countchunk = 0; - - for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) { - // Calculate probability of next token, given the previous ones. - const std::vector tok_logits( - logits.begin() + (j + 0) * n_vocab, - logits.begin() + (j + 1) * n_vocab); - - const float prob = softmax(tok_logits)[embd_inp[ start+ j + 1]]; - - nllchunk += -std::log(prob); - ++countchunk; - } - - nll += nllchunk; - count += countchunk; - - // perplexity is e^(average negative log-likelihood) - printf("%d\t%.8lf\t%.8lf\n", i + 1, std::exp(nll / count), std::exp(nllchunk/countchunk) ); - fflush(stdout); - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f); - printf("%s: eval time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / (n_chunk * params.n_ctx)); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} - -int main(int argc, char ** argv) { - mpt_params params; - - if (mpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.perplexity) { - return perplexity(params); - } - - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - if (params.seed < 0) { - params.seed = time(NULL); - } - - if (params.n_predict < 0) { - params.n_predict = 0; - } - - printf("%s: seed = %d\n", __func__, params.seed); - printf("%s: n_threads = %d\n", __func__, params.n_threads); - printf("%s: n_batch = %d\n", __func__, params.n_batch); - printf("%s: n_ctx = %d\n", __func__, params.n_ctx); - printf("%s: n_predict = %d\n\n", __func__, params.n_predict); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - mpt_model model; - - model.hparams.n_ctx = params.n_ctx; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!mpt_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, params.token_test); - } - - if (params.top_k == 0) { - params.top_k = model.hparams.n_vocab; - } - - if (params.repeat_last_n == -1) { - params.repeat_last_n = params.n_ctx; - } - - printf("\n"); - printf("%s: temp = %.3f\n", __func__, params.temp); - printf("%s: top_k = %d\n", __func__, params.top_k); - printf("%s: top_p = %.3f\n", __func__, params.top_p); - printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); - printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector last_n_tokens(params.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - printf("\n"); - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - - for (size_t i = 0; i < embd_inp.size(); i++) { - printf("%s: token[%zu] = %6d\n", __func__, i, embd_inp[i]); - } 
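The defaults chosen above (`top_k` falling back to `n_vocab`, `repeat_last_n` falling back to `n_ctx`) feed the `gpt_sample_top_k_top_p_repeat` call in the generation loop that follows. As a self-contained sketch of how those parameters interact — this is an illustration only, not the `common.h` implementation; the helper name `sample_token` and the exact penalty formula are assumptions:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <random>
#include <utility>
#include <vector>

// Illustrative repetition-penalized top-k/top-p sampler. Assumes top_k >= 1 and temp > 0;
// `last_tokens` holds the most recent `repeat_last_n` token ids.
int sample_token(const std::vector<float> & logits, const std::vector<int32_t> & last_tokens,
                 int top_k, float top_p, float temp, float repeat_penalty, std::mt19937 & rng) {
    std::vector<std::pair<float, int>> cand(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) {
        float l = logits[i];
        // penalize tokens that appeared in the recent window (O(V*W) scan; fine for a sketch)
        if (std::find(last_tokens.begin(), last_tokens.end(), (int32_t) i) != last_tokens.end()) {
            l = l > 0.0f ? l / repeat_penalty : l * repeat_penalty;
        }
        cand[i] = { l / temp, (int) i };
    }

    // top-k: keep only the k highest-scoring candidates
    const size_t k = std::min<size_t>(top_k, cand.size());
    std::partial_sort(cand.begin(), cand.begin() + k, cand.end(),
        [](const std::pair<float, int> & a, const std::pair<float, int> & b) { return a.first > b.first; });
    cand.resize(k);

    // softmax over the survivors (shift by the max for numerical stability)
    std::vector<double> probs(k);
    double sum = 0.0;
    for (size_t i = 0; i < k; ++i) { probs[i] = std::exp(cand[i].first - cand[0].first); sum += probs[i]; }
    for (double & p : probs) { p /= sum; }

    // top-p: drop the tail once the cumulative probability reaches top_p
    double cum = 0.0;
    size_t n_keep = k;
    for (size_t i = 0; i < k; ++i) { cum += probs[i]; if (cum >= top_p) { n_keep = i + 1; break; } }

    std::discrete_distribution<size_t> dist(probs.begin(), probs.begin() + n_keep);
    return cand[dist(rng)].second;
}
```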
- printf("\n"); - - std::vector embd; - std::vector logits; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - mpt_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token); - - int n_past = 0; - int n_consumed = 0; - int n_sampled = 0; - - while (n_sampled < params.n_predict) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!mpt_eval(model, params.n_threads, n_past, embd, logits, false, mem_per_token)) { - printf("%s: failed to predict\n", __func__); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - - n_past += embd.size(); - embd.clear(); - } - - if ((int)embd_inp.size() <= n_consumed) { - // sample next token - - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const int repeat_last_n = params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - model.hparams.n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - ++n_sampled; - - } else { - // if here, it means we are still processing the input prompt - while ((int) embd_inp.size() > n_consumed) { - embd.push_back(embd_inp[n_consumed]); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[n_consumed]); - - ++n_consumed; - if ((int) embd.size() >= params.n_batch) { - break; - } - } - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 0) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n\n"); - printf("%s: sampled tokens = %8d\n", __func__, n_sampled); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f); - printf("%s: sample time = %8.2f ms / %.2f ms per token\n", __func__, t_sample_us / 1000.0f, t_sample_us / 1000.0f / n_sampled); - printf("%s: eval time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/mpt/quantize.cpp b/examples/mpt/quantize.cpp deleted file mode 100644 index d0c9dda82..000000000 --- a/examples/mpt/quantize.cpp +++ /dev/null @@ -1,186 +0,0 @@ -#include "ggml/ggml.h" - -#include "common-ggml.h" -#include "common.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct mpt_hparams { - int32_t d_model = 0; - int32_t max_seq_len = 0; - int32_t n_heads = 0; - int32_t n_layers = 0; - int32_t n_vocab = 0; - float alibi_bias_max = 0; - float clip_qkv = 0; - int32_t ftype = 0; -}; - -// quantize a model -bool mpt_model_quantize(const std::string & fname_inp, - const std::string & fname_out, ggml_ftype ftype) { - - printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); - - auto finp = std::ifstream(fname_inp, std::ios::binary); - if (!finp) { - fprintf(stderr, "%s: failed to open '%s' 
for reading\n", __func__, - fname_inp.c_str()); - return false; - } - - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, - fname_out.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - finp.read((char *)&magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", - __func__, fname_inp.c_str()); - return false; - } - - fout.write((char *)&magic, sizeof(magic)); - } - - mpt_hparams hparams; - - // load hparams - { - finp.read((char *) &hparams.d_model, sizeof(hparams.d_model)); - finp.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len)); - finp.read((char *) &hparams.n_heads, sizeof(hparams.n_heads)); - finp.read((char *) &hparams.n_layers, sizeof(hparams.n_layers)); - finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - finp.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max)); - finp.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv)); - finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; - const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; - - printf("%s: d_model = %d\n", __func__, hparams.d_model); - printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len); - printf("%s: n_heads = %d\n", __func__, hparams.n_heads); - printf("%s: n_layers = %d\n", __func__, hparams.n_layers); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max); - printf("%s: clip_qkv = %f\n", __func__, hparams.clip_qkv); - printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); - printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); - printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); - printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); - - fout.write((char *) &hparams.d_model, sizeof(hparams.d_model)); - fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len)); - fout.write((char *) &hparams.n_heads, sizeof(hparams.n_heads)); - fout.write((char *) &hparams.n_layers, sizeof(hparams.n_layers)); - fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fout.write((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max)); - fout.write((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv)); - fout.write((char *) &ftype_dst, sizeof(ftype_dst)); - } - - // load vocab - { - const int32_t n_vocab = hparams.n_vocab; - - std::string word; - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - finp.read((char *)&len, sizeof(len)); - fout.write((char *)&len, sizeof(len)); - - word.resize(len); - finp.read((char *)word.data(), len); - fout.write((char *)word.data(), len); - } - } - - printf("%s: quantizing tensors\n", __func__); - - // regexes of tensor names to be quantized - const std::vector to_quant = { - ".*weight", - }; - - if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { - fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, - fname_inp.c_str()); - return false; - } - - finp.close(); - fout.close(); - - return true; -} - -// usage: -// ./mpt-quantize models/mpt/ggml-model.bin -// models/mpt/ggml-model-quant.bin type -// -int main(int argc, char ** argv) { - if (argc != 4) { - fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", - argv[0]); - ggml_print_ftypes(stderr); - return 1; - } - - // needed to initialize f16 tables - { - 
struct ggml_init_params params = {0, NULL, false}; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); - } - - const std::string fname_inp = argv[1]; - const std::string fname_out = argv[2]; - - const ggml_ftype ftype = ggml_parse_ftype(argv[3]); - - const int64_t t_main_start_us = ggml_time_us(); - - int64_t t_quantize_us = 0; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { - fprintf(stderr, "%s: failed to quantize model from '%s'\n", - __func__, fname_inp.c_str()); - return 1; - } - - t_quantize_us = ggml_time_us() - t_start_us; - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n"); - printf("%s: quantize time = %8.2f ms\n", __func__, - t_quantize_us / 1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, - (t_main_end_us - t_main_start_us) / 1000.0f); - } - - return 0; -} diff --git a/examples/replit/CMakeLists.txt b/examples/replit/CMakeLists.txt deleted file mode 100644 index 696b7f988..000000000 --- a/examples/replit/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -# -# replit - -set(TEST_TARGET replit) -add_executable(${TEST_TARGET} main.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) - -# -# replit-quantize - -set(TEST_TARGET replit-quantize) -add_executable(${TEST_TARGET} quantize.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) diff --git a/examples/replit/convert-h5-to-ggml.py b/examples/replit/convert-h5-to-ggml.py deleted file mode 100644 index 4fc15a977..000000000 --- a/examples/replit/convert-h5-to-ggml.py +++ /dev/null @@ -1,117 +0,0 @@ -from pathlib import Path -import sys -import struct -import json -import numpy as np -from transformers import AutoModelForCausalLM, AutoTokenizer -import sentencepiece.sentencepiece_model_pb2 as model - -if len(sys.argv) < 3: - print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n") - print(" ftype == 0 -> float32") - print(" ftype == 1 -> float16") - sys.exit(1) - - -# output in the same directory as the model -dir_model = sys.argv[1] -fname_out = sys.argv[1] + "/ggml-model.bin" - - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -sp_proto = model.ModelProto() -sp_proto.ParseFromString(open(Path(sys.argv[1]) / "spiece.model", "rb").read()) - - -# possible data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 -# -# map from ftype to string -ftype_str = ["f32", "f16"] - -ftype = 1 -if len(sys.argv) > 2: - ftype = int(sys.argv[2]) - if ftype < 0 or ftype > 1: - print("Invalid ftype: " + str(ftype)) - sys.exit(1) - fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" - - -tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) -model = AutoModelForCausalLM.from_pretrained( - dir_model, low_cpu_mem_usage=True, trust_remote_code=True -) -# print (model) - -# print(tokenizer.encode('I believe the meaning of life is')) - -list_vars = model.state_dict() -for name in list_vars.keys(): - print(name, list_vars[name].shape, list_vars[name].dtype) - -fout = open(fname_out, "wb") - -print(hparams) - -fout.write(struct.pack("i", 0x67676D6C)) # magic: ggml in hex -fout.write(struct.pack("i", hparams["d_model"])) -fout.write(struct.pack("i", hparams["max_seq_len"])) -fout.write(struct.pack("i", hparams["n_heads"])) -fout.write(struct.pack("i", hparams["n_layers"])) -fout.write(struct.pack("i", hparams["vocab_size"])) -fout.write(struct.pack("i", ftype)) - - 
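For readers who want to cross-check the container format: the header written above is consumed field-for-field, in the same order, by `replit_model_load` in the (also removed) `examples/replit/main.cpp`. A schematic C++ reader — the helper name `read_replit_header` is illustrative, not part of the example:

```cpp
#include <cstdint>
#include <fstream>

// Mirrors the struct.pack() calls above: a magic word followed by six int32 hparams.
struct replit_header {
    uint32_t magic;        // 0x67676d6c ("ggml")
    int32_t  d_model;
    int32_t  max_seq_len;
    int32_t  n_heads;
    int32_t  n_layers;
    int32_t  n_vocab;      // written from hparams["vocab_size"]
    int32_t  ftype;        // 0 = f32, 1 = f16
};

bool read_replit_header(const char * path, replit_header & h) {
    std::ifstream fin(path, std::ios::binary);
    if (!fin) return false;
    fin.read((char *) &h.magic,       sizeof(h.magic));
    fin.read((char *) &h.d_model,     sizeof(h.d_model));
    fin.read((char *) &h.max_seq_len, sizeof(h.max_seq_len));
    fin.read((char *) &h.n_heads,     sizeof(h.n_heads));
    fin.read((char *) &h.n_layers,    sizeof(h.n_layers));
    fin.read((char *) &h.n_vocab,     sizeof(h.n_vocab));
    fin.read((char *) &h.ftype,       sizeof(h.ftype));
    return bool(fin) && h.magic == 0x67676d6cu;
}
```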
-# TODO: temporary hack to not deal with implementing the tokenizer -for piece in sp_proto.pieces: - encoded_piece = piece.piece.encode("utf-8") - fout.write(struct.pack("i", len(encoded_piece))) - fout.write(encoded_piece) - fout.write(struct.pack("f", piece.score)) - -if hparams["vocab_size"] > len(sp_proto.pieces): - for i in range(hparams["vocab_size"] - len(sp_proto.pieces)): - fout.write(struct.pack("i", 0)) - fout.write(struct.pack("f", 0)) - -for name in list_vars.keys(): - data = list_vars[name].squeeze().numpy() - print("Processing variable: " + name + " with shape: ", data.shape) - - n_dims = len(data.shape) - - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype_cur = 0 - if ftype != 0: - if name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype_cur = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - else: - if data.dtype != np.float32: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - - # header - str = name.encode("utf-8") - fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) - for i in range(n_dims): - fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - fout.write(str) - - # data - data.tofile(fout) - -fout.close() - -print("Done. Output file: " + fname_out) -print("") diff --git a/examples/replit/main.cpp b/examples/replit/main.cpp deleted file mode 100644 index acd1cbb5e..000000000 --- a/examples/replit/main.cpp +++ /dev/null @@ -1,798 +0,0 @@ -#include "ggml/ggml.h" - -#include "common-ggml.h" -#include "common.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_WIN32) -#define NOMINMAX -#include -bool is_stdin_terminal() { - auto in = GetStdHandle(STD_INPUT_HANDLE); - return GetFileType(in) == FILE_TYPE_CHAR; -} -#else -#include -bool is_stdin_terminal() { - return isatty(STDIN_FILENO); -} -#endif - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -using piece_t = std::pair; -using piece_map_t = std::unordered_map; - -struct replit_tokenizer { - gpt_vocab raw_vocab; - piece_map_t piece_map; - std::vector vocab; -}; - -std::pair, float> encode_word(const std::string & word, const piece_map_t & model) { - std::vector best_segmentations_starts(word.length() + 1, -1); - best_segmentations_starts[0] = 0; - - std::vector best_segmentations_scores(word.length() + 1, -std::numeric_limits::infinity()); - best_segmentations_scores[0] = 1.0; - - for (size_t start_idx = 0; start_idx < word.length(); ++start_idx) { - float best_score_at_start = best_segmentations_scores[start_idx]; - for (size_t end_idx = start_idx + 1; end_idx <= word.length(); ++end_idx) { - std::string token = word.substr(start_idx, end_idx - start_idx); - if (model.count(token) && best_score_at_start != -std::numeric_limits::infinity()) { - float token_score = model.at(token).second; - float score = token_score + best_score_at_start; - if (best_segmentations_scores[end_idx] == -std::numeric_limits::infinity() || - best_segmentations_scores[end_idx] > score) { - best_segmentations_starts[end_idx] = start_idx; - best_segmentations_scores[end_idx] = score; - } - } - } - } - - if (best_segmentations_scores.back() == -std::numeric_limits::infinity()) { - return std::make_pair(std::vector{0}, 0.0f); - } - - float score = best_segmentations_scores.back(); - int start = best_segmentations_starts.back(); - int end 
= word.length(); - std::vector tokens; - while (start != 0) { - const auto token_id = model.at(word.substr(start, end - start)).first; - tokens.insert(tokens.begin(), token_id); - int next_start = best_segmentations_starts[start]; - end = start; - start = next_start; - } - const auto token_id = model.at(word.substr(start, end - start)).first; - tokens.insert(tokens.begin(), token_id); - return std::make_pair(tokens, score); -} - -bool replit_tokenizer_load(replit_tokenizer & tokenizer, std::istream & fin, int max_vocab_size) { - std::string word; - std::vector buf(128); - - for (int i = 0; i < max_vocab_size; i++) { - uint32_t len; - fin.read((char *)&len, sizeof(len)); - - buf.resize(len); - fin.read((char *)buf.data(), len); - word.assign(buf.data(), len); - - float score; - fin.read((char *)&score, sizeof(score)); - - tokenizer.piece_map[word] = std::make_pair(i, -score); - tokenizer.raw_vocab.id_to_token[i] = word; - } - - return true; -} - -std::string replace_all(const std::string & str, // where to work - const std::string & find, // substitute 'find' - const std::string & replace // by 'replace' -) { - using namespace std; - string result; - size_t find_len = find.size(); - size_t pos, from = 0; - while (string::npos != (pos = str.find(find, from))) { - result.append(str, from, pos - from); - result.append(replace); - from = pos + find_len; - } - result.append(str, from, string::npos); - return result; -} - -std::string ws_symbol = "\342\226\201"; -std::vector replit_tokenizer_tokenize(replit_tokenizer & tokenizer, const std::string & text) { - std::vector tokens; - auto normalized_text = replace_all(text, " ", ws_symbol); - auto tokenized = encode_word(normalized_text, tokenizer.piece_map); - - return tokenized.first; -} - -std::string replit_tokenizer_detokenize(replit_tokenizer & tokenizer, const std::vector & tokens) { - std::string text; - for (auto token : tokens) { - text += tokenizer.raw_vocab.id_to_token[token]; - } - auto denormalized_text = replace_all(text, ws_symbol, " "); - return denormalized_text; -} - -// no defaults for now -struct replit_hparams { - int32_t d_model = 0; - int32_t max_seq_len = 0; - int32_t n_heads = 0; - int32_t n_layers = 0; - int32_t n_vocab = 0; - int32_t ftype = 0; -}; - -struct replit_layer { - // pre normalization - struct ggml_tensor * norm_1_weight; - - // attention - struct ggml_tensor * c_attn_wqkv_weight; - struct ggml_tensor * c_attn_out_proj_weight; - - // post normalization - struct ggml_tensor * norm_2_weight; - - // ff - struct ggml_tensor * ffn_up_proj; - struct ggml_tensor * ffn_down_proj; -}; - -struct replit_model { - replit_hparams hparams; - - struct ggml_tensor * wte_weight; // position embedding - struct ggml_tensor * norm_f_weight; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - struct ggml_context * ctx; - std::map tensors; -}; - -// load the model's weights from a file -bool replit_model_load(const std::string & fname, replit_model & model, replit_tokenizer & vocab) { - printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *)&magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return 
false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *)&hparams.d_model, sizeof(hparams.d_model)); - fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len)); - fin.read((char *)&hparams.n_heads, sizeof(hparams.n_heads)); - fin.read((char *)&hparams.n_layers, sizeof(hparams.n_layers)); - fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *)&hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: d_model = %d\n", __func__, hparams.d_model); - printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len); - printf("%s: n_heads = %d\n", __func__, hparams.n_heads); - printf("%s: n_layers = %d\n", __func__, hparams.n_layers); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - replit_tokenizer_load(vocab, fin, model.hparams.n_vocab); - - // for the big tensors, we have the option to store the data in 16-bit - // floats or quantized in order to save memory and also to speed up the - // computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), - model.hparams.ftype); - return false; - } - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.d_model; - const int n_layer = hparams.n_layers; - const int n_ctx = hparams.max_seq_len; - const int n_vocab = hparams.n_vocab; - - ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte_weight - ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_weight - - ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_weight - - ctx_size += n_layer * (ggml_row_size(wtype, 3 * n_embd * n_embd)); // attn_Wqkv_weight - ctx_size += n_layer * (ggml_row_size(wtype, n_embd * n_embd)); // attn_out_proj_weight - - ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_weight - - ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_up_weight - ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_down_weight - - ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_k - ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_v - - ctx_size += (1 + 6 * n_layer) * 512; // object overhead - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const size_t n_embd = hparams.d_model; - const size_t n_layer = hparams.n_layers; - const size_t n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - model.wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // map by name - model.tensors["transformer.wte.weight"] = model.wte_weight; - model.tensors["transformer.norm_f.weight"] = model.norm_f_weight; - - 
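The ctx_size estimate above has to account for every tensor before `ggml_init` reserves a single arena. A condensed restatement of that accounting — same quantities, just grouped; the per-object overhead constant is the rough heuristic these examples use, not an exact figure:

```cpp
#include "ggml/ggml.h"

// ggml_row_size(type, n) = bytes needed for n elements of `type`, including block overhead.
size_t replit_weights_ctx_size(size_t n_embd, size_t n_layer, size_t n_ctx, size_t n_vocab,
                               ggml_type wtype) {
    size_t sz = 0;
    sz += ggml_row_size(wtype, n_embd*n_vocab);                   // wte (tied to the output head)
    sz += ggml_row_size(GGML_TYPE_F32, n_embd);                   // final norm weight
    sz += n_layer*2*ggml_row_size(GGML_TYPE_F32, n_embd);         // norm_1 + norm_2
    sz += n_layer*  ggml_row_size(wtype, 3*n_embd*n_embd);        // fused Wqkv
    sz += n_layer*  ggml_row_size(wtype,   n_embd*n_embd);        // attention output projection
    sz += n_layer*2*ggml_row_size(wtype, 4*n_embd*n_embd);        // MLP up + down projections
    sz += 2*n_ctx*n_layer*ggml_row_size(GGML_TYPE_F16, n_embd);   // K and V cache (f16)
    sz += (1 + 6*n_layer)*512;                                    // ggml object overhead (heuristic)
    return sz;
}
```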
for (int i = 0; i < (int)n_layer; ++i) { - auto & layer = model.layers[i]; - - layer.norm_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.c_attn_wqkv_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd); - layer.c_attn_out_proj_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.norm_2_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ffn_up_proj = ggml_new_tensor_2d(ctx, wtype, n_embd, 4 * n_embd); - layer.ffn_down_proj = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd); - - // map by name - model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight; - model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.c_attn_wqkv_weight; - model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] = - layer.c_attn_out_proj_weight; - model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_weight; - model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj; - model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj; - } - } - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.d_model; - const int n_layer = hparams.n_layers; - const int n_ctx = hparams.max_seq_len; - - const int64_t n_mem = n_layer * n_ctx; - const int64_t n_elements = n_embd * n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory_size = %8.2f MB, n_mem = %" PRIu64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem); - } - - // load weights - { - int n_tensors = 0; - size_t total_size = 0; - - printf("%s: ", __func__); - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = {1, 1}; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - - auto tensor = model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, - "%s: tensor '%s' has wrong shape in model file: got [%5d, " - "%5d], expected [%5d, %5d]\n", - __func__, name.c_str(), (int)tensor->ne[0], (int)tensor->ne[1], ne[0], ne[1]); - return false; - } - - // for debugging - if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], - ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, - "%s: tensor '%s' has wrong size in model file: got %zu, " - "expected %zu\n", - __func__, 
name.c_str(), ggml_nbytes(tensor), nelements * bpe); - return false; - } - - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - - total_size += ggml_nbytes(tensor); - if (++n_tensors % 8 == 0) { - printf("."); - fflush(stdout); - } - } - - printf(" done\n"); - - printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors); - } - - fin.close(); - - return true; -} - -// evaluate the transformer -// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool replit_eval(const replit_model & model, const int n_threads, const int n_past, - const std::vector & embd_inp, std::vector & embd_w, bool logits_all, - size_t & mem_per_token) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.d_model; - const int n_layer = hparams.n_layers; - const int n_head = hparams.n_heads; - const int n_vocab = hparams.n_vocab; - const int n_ctx = hparams.max_seq_len; - const float eps = 1e-5f; - - static size_t buf_size = 256u * 1024 * 1024; - static void * buf = malloc(buf_size); - - if (mem_per_token > 0 && mem_per_token * N > buf_size) { - const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead - // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, - // buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd)); - - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte_weight, embd); - - for (int il = 0; il < n_layer; ++il) { - - struct ggml_tensor * cur; - - // a = self.ln_1(x) - { - cur = ggml_norm(ctx0, inpL, eps); - - cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur); - } - - // self-attention - // b, _, past_key_value = self.attn(a, past_key_value=past_key_value, - // attn_bias=attn_bias, attention_mask=attention_mask, - // is_causal=is_causal) - { - // compute QKV - cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur); - - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0 * sizeof(float) * n_embd); - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1 * sizeof(float) * n_embd); - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2 * sizeof(float) * n_embd); - - // store key and value to memory - { - struct ggml_tensor * k = - ggml_view_1d(ctx0, model.memory_k, N * n_embd, - (ggml_element_size(model.memory_k) * n_embd) * (il * n_ctx + n_past)); - struct ggml_tensor * v = - ggml_view_1d(ctx0, model.memory_v, N * n_embd, - (ggml_element_size(model.memory_v) * n_embd) * (il * n_ctx + n_past)); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, - // 2, 1, 3) [64, N, 12] - 
struct ggml_tensor * Q = ggml_permute( - ctx0, ggml_cpy(ctx0, Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2, - 1, 3); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, - // 3) [64, n_past + N, 12] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_embd, - il * n_ctx * ggml_element_size(model.memory_k) * n_embd), - n_embd / n_head, n_head, n_past + N), - 0, 2, 1, 3); - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, KQ, 1.0f / sqrt(float(n_embd) / n_head)); - - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8.0f); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, - // 2, 0, 3).contiguous() [n_past + N, 64, 12] - struct ggml_tensor * V_trans = ggml_cpy( - ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_embd, - il * n_ctx * ggml_element_size(model.memory_v) * n_embd), - n_embd / n_head, n_head, n_past + N), - 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd / n_head, n_head)); - - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection - { cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_out_proj_weight, cur); } - } - - inpL = ggml_add(ctx0, inpL, cur); - - // m = self.ln_2(x) - { - cur = ggml_norm(ctx0, inpL, eps); - - cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur); - } - - // n = self.mlp(m) - { - - cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_proj, cur); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - - // projection - // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_proj, cur); - } - - // x = x + n - inpL = ggml_add(ctx0, inpL, cur); - } - - // norm - { - inpL = ggml_norm(ctx0, inpL, eps); - // inpL = ln_f_g*inpL - inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL); - } - - // output embedding weight tied to input embedding - inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL); - - // logits -> probs - // inpL = ggml_soft_max(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(gf, inpL); - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); - - // std::cout << "Qcur" << std::endl; - // print_tensor(Qcur); - - // if (n_past%100 == 0) { - // ggml_graph_print(&gf); - // ggml_graph_dump_dot(&gf, NULL, "mpt-model.dot"); - // } - - if (logits_all) { - // return result for all tokens - embd_w.resize(n_vocab * N); - memcpy(embd_w.data(), (float *)ggml_get_data(inpL), sizeof(float) * n_vocab * N); - } else { - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); - } - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0) / N; - } 
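A note on the `8.0f` passed to `ggml_alibi` a few lines up: Replit (like MPT, which exposes the value as `hparams.alibi_bias_max`) uses ALiBi instead of positional embeddings, so each head adds a linear penalty proportional to the query-key distance to the raw attention scores. A rough sketch of that bias, assuming a power-of-two head count; the actual `ggml_alibi` kernel handles the general case and a different memory layout:

```cpp
#include <cmath>
#include <vector>

// slope for head h (0-based); with alibi_bias_max = 8 this is the schedule from the ALiBi paper
static float alibi_slope(int h, int n_head, float alibi_bias_max) {
    return std::pow(2.0f, -alibi_bias_max * float(h + 1) / float(n_head));
}

// scores: [n_head][n_q][n_k] row-major attention scores before softmax;
// query i sits at absolute position (n_k - n_q + i), key j at absolute position j
static void add_alibi_bias(std::vector<float> & scores, int n_head, int n_q, int n_k,
                           float alibi_bias_max) {
    for (int h = 0; h < n_head; ++h) {
        const float m = alibi_slope(h, n_head, alibi_bias_max);
        for (int i = 0; i < n_q; ++i) {
            for (int j = 0; j < n_k; ++j) {
                const int dist = (n_k - n_q + i) - j;  // >= 0 for causal (past) keys
                // distant keys get a larger negative bias; future keys are blanked out by
                // ggml_diag_mask_inf separately, so their sign here does not matter
                scores[((size_t) h*n_q + i)*n_k + j] += -m * float(dist);
            }
        }
    }
}
```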
- // printf("used_mem = %zu\n", ggml_used_mem(ctx0)); - - ggml_free(ctx0); - - return true; -} - -int main(int argc, char ** argv) { - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = ""; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - if (!is_stdin_terminal()) { - std::string line; - while (std::getline(std::cin, line)) { - params.prompt = params.prompt + "\n" + line; - } - } else { - params.prompt = gpt_random_prompt(rng); - } - } - - int64_t t_load_us = 0; - - replit_tokenizer vocab; - replit_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!replit_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = replit_tokenizer_tokenize(vocab, params.prompt); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - - for (size_t i = 0; i < embd_inp.size(); i++) { - printf("%s: token[%zu] = %6zu\n", __func__, i, embd_inp[i]); - // vocab.id_to_token.at(embd_inp[i]).c_str() - } - printf("\n"); - - params.n_predict = std::min(params.n_predict, model.hparams.max_seq_len - (int)embd_inp.size()); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - replit_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token); - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!replit_eval(model, params.n_threads, n_past, embd, logits, false, mem_per_token)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab.raw_vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, - temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", replit_tokenizer_detokenize(vocab, {static_cast(id)}).c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 0) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us / 1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per 
token\n", __func__, t_predict_us / 1000.0f, - t_predict_us / 1000.0f / n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/replit/quantize.cpp b/examples/replit/quantize.cpp deleted file mode 100644 index f274074bb..000000000 --- a/examples/replit/quantize.cpp +++ /dev/null @@ -1,182 +0,0 @@ -#include "ggml/ggml.h" - -#include "common-ggml.h" -#include "common.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct mpt_hparams { - int32_t d_model = 0; - int32_t max_seq_len = 0; - int32_t n_heads = 0; - int32_t n_layers = 0; - int32_t n_vocab = 0; - int32_t ftype = 0; -}; - -// quantize a model -bool mpt_model_quantize(const std::string & fname_inp, - const std::string & fname_out, ggml_ftype ftype) { - - printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); - - auto finp = std::ifstream(fname_inp, std::ios::binary); - if (!finp) { - fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, - fname_inp.c_str()); - return false; - } - - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, - fname_out.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - finp.read((char *)&magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", - __func__, fname_inp.c_str()); - return false; - } - - fout.write((char *)&magic, sizeof(magic)); - } - - mpt_hparams hparams; - - // load hparams - { - finp.read((char *) &hparams.d_model, sizeof(hparams.d_model)); - finp.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len)); - finp.read((char *) &hparams.n_heads, sizeof(hparams.n_heads)); - finp.read((char *) &hparams.n_layers, sizeof(hparams.n_layers)); - finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; - const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; - - printf("%s: d_model = %d\n", __func__, hparams.d_model); - printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len); - printf("%s: n_heads = %d\n", __func__, hparams.n_heads); - printf("%s: n_layers = %d\n", __func__, hparams.n_layers); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); - printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); - printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); - printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); - - fout.write((char *) &hparams.d_model, sizeof(hparams.d_model)); - fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len)); - fout.write((char *) &hparams.n_heads, sizeof(hparams.n_heads)); - fout.write((char *) &hparams.n_layers, sizeof(hparams.n_layers)); - fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fout.write((char *) &ftype_dst, sizeof(ftype_dst)); - } - - // load vocab - { - const int32_t n_vocab = hparams.n_vocab; - - std::string word; - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - finp.read((char *)&len, sizeof(len)); - fout.write((char *)&len, sizeof(len)); - - word.resize(len); - finp.read((char *)word.data(), len); - fout.write((char *)word.data(), len); - - float prob; - finp.read((char *)&prob, sizeof(prob)); - fout.write((char 
*)&prob, sizeof(prob)); - } - } - - printf("%s: quantizing tensors\n", __func__); - - // regexes of tensor names to be quantized - const std::vector to_quant = { - ".*weight", - }; - - if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { - fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, - fname_inp.c_str()); - return false; - } - - finp.close(); - fout.close(); - - return true; -} - -// usage: -// ./replit-quantize models/replit/ggml-model.bin -// models/replit/ggml-model-quant.bin type -// -int main(int argc, char ** argv) { - if (argc != 4) { - fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", - argv[0]); - ggml_print_ftypes(stderr); - return 1; - } - - // needed to initialize f16 tables - { - struct ggml_init_params params = {0, NULL, false}; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); - } - - const std::string fname_inp = argv[1]; - const std::string fname_out = argv[2]; - - const ggml_ftype ftype = ggml_parse_ftype(argv[3]); - - const int64_t t_main_start_us = ggml_time_us(); - - int64_t t_quantize_us = 0; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { - fprintf(stderr, "%s: failed to quantize model from '%s'\n", - __func__, fname_inp.c_str()); - return 1; - } - - t_quantize_us = ggml_time_us() - t_start_us; - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n"); - printf("%s: quantize time = %8.2f ms\n", __func__, - t_quantize_us / 1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, - (t_main_end_us - t_main_start_us) / 1000.0f); - } - - return 0; -} diff --git a/examples/starcoder/CMakeLists.txt b/examples/starcoder/CMakeLists.txt deleted file mode 100644 index f7b849e37..000000000 --- a/examples/starcoder/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -# -# starcoder - -set(TEST_TARGET starcoder) -add_executable(${TEST_TARGET} main.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) - -# -# starcoder-quantize - -set(TEST_TARGET starcoder-quantize) -add_executable(${TEST_TARGET} quantize.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) - -# -# For GPU offloading - -if (GGML_CUBLAS) - add_compile_definitions(GGML_USE_CUBLAS) -endif() -if (GGML_CLBLAST) - add_compile_definitions(GGML_USE_CLBLAST) -endif() - diff --git a/examples/starcoder/README.md b/examples/starcoder/README.md deleted file mode 100644 index ea64c4d28..000000000 --- a/examples/starcoder/README.md +++ /dev/null @@ -1,115 +0,0 @@ -# 💫 StarCoder - -This is a C++ example running 💫 StarCoder inference using the [ggml](https://github.com/ggerganov/ggml) library. - -The program runs on the CPU - no video card is required. 
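Despite the CPU-only note, the removed `examples/starcoder/CMakeLists.txt` above also wired up optional GPU offloading: building ggml with `GGML_CUBLAS` or `GGML_CLBLAST` defined `GGML_USE_CUBLAS` / `GGML_USE_CLBLAST` for this example, which at the time mainly accelerated the large matrix multiplications during prompt processing. A trivial illustration of how such compile-time switches are typically consumed (the `print_blas_backend` helper is hypothetical, not part of the example):

```cpp
#include <cstdio>

void print_blas_backend() {
#if defined(GGML_USE_CUBLAS)
    std::printf("offload: cuBLAS (GPU-accelerated mat-mul)\n");
#elif defined(GGML_USE_CLBLAST)
    std::printf("offload: CLBlast (OpenCL-accelerated mat-mul)\n");
#else
    std::printf("offload: none (pure CPU)\n");
#endif
}
```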
- -The example supports the following 💫 StarCoder models: - -- `bigcode/starcoder` -- `bigcode/gpt_bigcode-santacoder` aka the smol StarCoder - -Sample performance on MacBook M1 Pro: - -TODO - - -Sample output: - -``` -$ ./bin/starcoder -h -usage: ./bin/starcoder [options] - -options: - -h, --help show this help message and exit - -s SEED, --seed SEED RNG seed (default: -1) - -t N, --threads N number of threads to use during computation (default: 8) - -p PROMPT, --prompt PROMPT - prompt to start generation with (default: random) - -n N, --n_predict N number of tokens to predict (default: 200) - --top_k N top-k sampling (default: 40) - --top_p N top-p sampling (default: 0.9) - --temp N temperature (default: 1.0) - -b N, --batch_size N batch size for prompt processing (default: 8) - -m FNAME, --model FNAME - model path (default: models/starcoder-117M/ggml-model.bin) - -$ ./bin/starcoder -m ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin -p "def fibonnaci(" -t 4 --top_k 0 --top_p 0.95 --temp 0.2 -main: seed = 1683881276 -starcoder_model_load: loading model from '../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin' -starcoder_model_load: n_vocab = 49280 -starcoder_model_load: n_ctx = 2048 -starcoder_model_load: n_embd = 2048 -starcoder_model_load: n_head = 16 -starcoder_model_load: n_layer = 24 -starcoder_model_load: ftype = 3 -starcoder_model_load: ggml ctx size = 1794.90 MB -starcoder_model_load: memory size = 768.00 MB, n_mem = 49152 -starcoder_model_load: model size = 1026.83 MB -main: prompt: 'def fibonnaci(' -main: number of tokens in prompt = 7, first 8 tokens: 563 24240 78 2658 64 2819 7 - -def fibonnaci(n): - if n == 0: - return 0 - elif n == 1: - return 1 - else: - return fibonacci(n-1) + fibonacci(n-2) - -print(fibo(10)) - -main: mem per token = 9597928 bytes -main: load time = 480.43 ms -main: sample time = 26.21 ms -main: predict time = 3987.95 ms / 19.36 ms per token -main: total time = 4580.56 ms -``` - -## Quick start -```bash -git clone https://github.com/ggerganov/ggml -cd ggml - -# Install Python dependencies -python3 -m pip install -r requirements.txt - -# Convert HF model to ggml -python examples/starcoder/convert-hf-to-ggml.py bigcode/gpt_bigcode-santacoder - -# Build ggml + examples -mkdir build && cd build -cmake .. && make -j4 starcoder starcoder-quantize - -# quantize the model -./bin/starcoder-quantize ../models/bigcode/gpt_bigcode-santacoder-ggml.bin ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin 3 - -# run inference -./bin/starcoder -m ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin -p "def fibonnaci(" --top_k 0 --top_p 0.95 --temp 0.2 -``` - - -## Downloading and converting the original models (💫 StarCoder) - -You can download the original model and convert it to `ggml` format using the script `convert-hf-to-ggml.py`: - -``` -# Convert HF model to ggml -python examples/starcoder/convert-hf-to-ggml.py bigcode/gpt_bigcode-santacoder -``` - -This conversion requires that you have python and Transformers installed on your computer. - -## Quantizing the models - -You can also try to quantize the `ggml` models via 4-bit integer quantization. 
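For context on what `q4_1` actually does before running the command below: ggml groups weights into small blocks (32 values per block for `q4_1`) and stores a per-block scale and minimum, so every weight is reconstructed as `min + scale * q` from a 4-bit index `q`. A simplified sketch of the idea — not the exact ggml block layout, which packs two indices per byte:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

struct q4_block {
    float d;                  // scale
    float m;                  // minimum
    std::vector<uint8_t> qs;  // one 4-bit index per weight (left unpacked for clarity)
};

q4_block quantize_block(const std::vector<float> & w) {
    const float lo = *std::min_element(w.begin(), w.end());
    const float hi = *std::max_element(w.begin(), w.end());
    q4_block b;
    b.m = lo;
    b.d = (hi - lo)/15.0f;    // 4 bits -> 16 levels
    for (float x : w) {
        const int q = b.d > 0.0f ? int((x - lo)/b.d + 0.5f) : 0;
        b.qs.push_back((uint8_t) std::min(15, std::max(0, q)));
    }
    return b;
}

float dequantize(const q4_block & b, size_t i) {
    return b.m + b.d*b.qs[i]; // w ≈ min + scale * q
}
```

The quantize command below rewrites the model's tensors in this format, producing the smaller files listed in the table further down.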
- -``` -# quantize the model -./bin/starcoder-quantize ../models/bigcode/gpt_bigcode-santacoder-ggml.bin ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin 3 -``` - -| Model | Original size | Quantized size | Quantization type | -| --- | --- | --- | --- | -| `bigcode/gpt_bigcode-santacoder` | 5396.45 MB | 1026.83 MB | 4-bit integer (q4_1) | -| `bigcode/starcoder` | 71628.23 MB | 13596.23 MB | 4-bit integer (q4_1) | diff --git a/examples/starcoder/convert-hf-to-ggml.py b/examples/starcoder/convert-hf-to-ggml.py deleted file mode 100644 index 30af75cb9..000000000 --- a/examples/starcoder/convert-hf-to-ggml.py +++ /dev/null @@ -1,208 +0,0 @@ -# Convert HF models to ggml format -# - -import sys -import struct -import json -import torch -import numpy as np -import re -import os -import argparse - -from transformers import AutoModelForCausalLM -from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BloomForCausalLM - -# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8+n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - -parser = argparse.ArgumentParser(description='Convert starcoder HF model to GGML') -parser.add_argument('model_name_or_path', type=str, help='Name of model on HF hub, or local model folder') -parser.add_argument('--outfile', type=str, default='ggml-model.bin', help='Path of GGML file to write.') -parser.add_argument('--use_f32', action="store_true", help='Save GGML file in fp32') - -args = parser.parse_args() - -# use 16-bit or 32-bit floats -use_f16 = not args.use_f32 - -fname_out = args.outfile -fname_dir = os.path.dirname(fname_out) -if fname_dir: - os.makedirs(fname_dir, exist_ok=True) - -print("Loading model: ", args.model_name_or_path) -tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) -config = AutoConfig.from_pretrained(args.model_name_or_path, trust_remote_code=True) -hparams = config.to_dict() -model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, config=config, torch_dtype=torch.float16 if use_f16 else torch.float32, low_cpu_mem_usage=True, trust_remote_code=True, offload_state_dict=True) -print("Model loaded: ", args.model_name_or_path) - -list_vars = model.state_dict() - -encoder = tokenizer.vocab -# Add added_tokens (special tokens) to the encoder -encoder.update(tokenizer.get_added_vocab()) -print(hparams) - -print("Saving ggml model to: ", fname_out) -fout = open(fname_out, "wb") - -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex -vocab_size = hparams["vocab_size"] -fout.write(struct.pack("i", vocab_size)) -# fout.write(struct.pack("i", len(encoder))) -fout.write(struct.pack("i", hparams["n_positions"])) -fout.write(struct.pack("i", hparams["n_embd"])) -fout.write(struct.pack("i", 
hparams["n_head"])) -fout.write(struct.pack("i", hparams["n_layer"])) -fout.write(struct.pack("i", use_f16)) - -byte_encoder = bytes_to_unicode() -byte_decoder = {v:k for k, v in byte_encoder.items()} - -fout.write(struct.pack("i", vocab_size)) - -counter = 0 -# sort by value -for key in sorted(encoder, key=encoder.get): - text = bytearray([byte_decoder[c] for c in key]) - fout.write(struct.pack("i", len(text))) - fout.write(text) - counter += 1 - -# TODO: Repeat last token until vocab_size -while counter < vocab_size: - fout.write(struct.pack("i", len(text))) - fout.write(text) - counter += 1 -# assert counter == config.vocab_size - -for name in list_vars.keys(): - data = list_vars[name].squeeze().numpy() - print("Processing variable: " + name + " with shape: ", data.shape) - - # rename headers to keep compatibility - if name == "transformer.ln_f.weight": - name = "model/ln_f/g" - elif name == "transformer.ln_f.bias": - name = "model/ln_f/b" - elif name == "transformer.wte.weight": - name = "model/wte" - elif name == "transformer.wpe.weight": - name = "model/wpe" - elif name == "lm_head.weight": - name = "model/lm_head" - elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_1/g" - elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_1/b" - elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_attn/w" - elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_attn/b" - elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_proj/w" - elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_proj/b" - elif re.match(r"transformer.h.\d+.ln_2.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_2/g" - elif re.match(r"transformer.h.\d+.ln_2.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_2/b" - elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_fc/w" - elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_fc/b" - elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_proj/w" - elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_proj/b" - else: - print("Unrecognized variable name. 
%s", name) - - # we don't need these - if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): - print(" Skipping variable: " + name) - continue - - n_dims = len(data.shape); - - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype = 0; - if use_f16: - if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype = 0 - - "model/h.*/attn/c_attn/w" - "model/h.*/attn/c_proj/w" - "model/h.*/mlp/c_fc/w" - "model/h.*/mlp/c_proj/w" - if name[-14:] == "/attn/c_attn/w" or name[-14:] == "/attn/c_attn/b": - print(" Duplicate K,V heads to use MHA instead of MQA") - - embed_dim = hparams["n_embd"] - head_dim = embed_dim // hparams["n_head"] - - # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim) - q, k ,v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0) - # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim) - if len(k.shape) == 2: - k = np.tile(k, (hparams["n_head"], 1)) - v = np.tile(v, (hparams["n_head"], 1)) - elif len(k.shape) == 1: - k = np.tile(k, (hparams["n_head"])) - v = np.tile(v, (hparams["n_head"])) - # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim) - data = np.concatenate((q, k, v), axis=0) - - # header - str = name.encode('utf-8') - fout.write(struct.pack("iii", n_dims, len(str), ftype)) - for i in range(n_dims): - fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - fout.write(str); - - # data - data.tofile(fout) - -fout.close() - -print("Done. 
Output file: " + fname_out) -print("") diff --git a/examples/starcoder/main.cpp b/examples/starcoder/main.cpp deleted file mode 100644 index b11cbb709..000000000 --- a/examples/starcoder/main.cpp +++ /dev/null @@ -1,924 +0,0 @@ -#include "ggml/ggml.h" - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// default hparams (GPT-2 117M) -// https://huggingface.co/bigcode/gpt_bigcode-santacoder/blob/main/config.json -struct starcoder_hparams { - int32_t n_vocab = 49280; - int32_t n_ctx = 2048; - int32_t n_embd = 2048; - int32_t n_head = 16; - int32_t n_layer = 24; - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct starcoder_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; - -struct starcoder_model { - starcoder_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -// load the model's weights from a file -bool starcoder_model_load(const std::string & fname, starcoder_model & model, gpt_vocab & vocab) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - int32_t n_vocab = 0; - fin.read((char *) &n_vocab, sizeof(n_vocab)); - - if (n_vocab != model.hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", - __func__, fname.c_str(), 
n_vocab, model.hparams.n_vocab); - return false; - } - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - - // if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); - } - - // Add StarChat special tokens. - for (std::string token : { - "<|system|>", - "<|user|>", - "<|assistant|>", - "<|end|>", - "", - "", - "", - "", - "<|end_of_turn|>" - }) { - if (vocab.token_to_id.find(token) != vocab.token_to_id.end()) { - vocab.add_special_token(token); - } - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - const int head_dim = n_embd / hparams.n_head; - const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head - const int kv_dim = kv_heads * head_dim; - - ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g - ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b - - ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // wte - ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe - ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // lm_head - - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b - - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b - - ctx_size += n_layer*((n_embd + 2*kv_dim)*ggml_row_size(wtype, n_embd)); // c_attn_attn_w // TODO: - ctx_size += n_layer*((n_embd + 2*kv_dim)*ggml_row_size(GGML_TYPE_F32, 1)); // c_attn_attn_b - - ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b - - ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b - - ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w - ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b - - ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k - ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v - - ctx_size += (6 + 12*n_layer)*512; // object overhead - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = 
hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - const int head_dim = n_embd / hparams.n_head; - const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head - const int kv_dim = kv_heads * head_dim; - - model.layers.resize(n_layer); - - model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); - model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - // map by name - model.tensors["model/ln_f/g"] = model.ln_f_g; - model.tensors["model/ln_f/b"] = model.ln_f_b; - - model.tensors["model/wte"] = model.wte; - model.tensors["model/wpe"] = model.wpe; - model.tensors["model/lm_head"] = model.lm_head; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - - layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd + 2*kv_dim); - layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd + 2*kv_dim); - - layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); //TODO: 4*n_embd = config.n_inner - layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // map by name - model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; - model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; - - model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; - model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; - - model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; - model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; - - model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; - model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; - - model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; - model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; - - model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; - model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; - } - } - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - // load weights - { - size_t 
total_size = 0; - - bool has_lm_head = false; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - - auto tensor = model.tensors[name]; - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n", - __func__, name.c_str(), (int) ggml_nelements(tensor), nelements); - return false; - } - - // for debugging - if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - - // GPT-2 models share the WTE tensor as the LM head - if (name == "model/wte" && has_lm_head == false) { - memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); - } - - if (name == "model/lm_head") { - has_lm_head = true; - } - - total_size += ggml_nbytes(tensor); - } - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - } - - fin.close(); - - return true; -} - -// evaluate the transformer -// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool starcoder_eval( - const starcoder_model & model, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w, - size_t & mem_per_token) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_vocab = hparams.n_vocab; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - // use 2 scratch buffers - // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 256u*1024*1024; - static void * scr0 = malloc(scr0_size); - - static size_t scr1_size = 256u*1024*1024; - static void * scr1 = malloc(scr1_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, 
buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i; - } - - // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), - cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_attn_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), - cur); - } - - // self-attention - { - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), - 0, 2, 1, 3); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); //TODO: need to be tiled - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // 
[n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head)); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - struct ggml_tensor * V_trans = - ggml_cpy(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - n_embd/n_head, n_head, n_past + N), - 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), - cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - - struct ggml_tensor * inpFF = cur; - - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); - - // feed-forward network - { - // norm - { - cur = ggml_norm(ctx0, inpFF, hparams.eps); - - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), - cur), - ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_fc_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), - cur); - - // GELU activation - // [3072, N] - cur = ggml_gelu(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_proj_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), - cur); - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - } - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), - inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); - } - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - // inpL = WTE * inpL 
- // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - - // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(gf, inpL); - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); - - //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result just for the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - //printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024)); - - ggml_free(ctx0); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - starcoder_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!starcoder_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, params.token_test); - } - - if (params.repeat_last_n == -1) { - params.repeat_last_n = model.hparams.n_ctx; - } - printf("\n"); - printf("%s: temp = %.3f\n", __func__, params.temp); - printf("%s: top_k = %d\n", __func__, params.top_k); - printf("%s: top_p = %.3f\n", __func__, params.top_p); - printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); - printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - std::vector last_n_tokens(model.hparams.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (size_t i = 0; i < embd_inp.size(); i++) { - printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); - } - printf("\n\n"); - - // Handle StarChat "<|end|>" and OpenCoder "<|end_of_turn>" tokens. 
- gpt_vocab::id starchat_end_token = -1; - { - const auto it = vocab.token_to_id.find("<|end|>"); - if (it != vocab.token_to_id.end()) { - starchat_end_token = it->second; - } else { - const auto eot_token_id = vocab.token_to_id.find("<|end_of_turn|>"); - if (eot_token_id != vocab.token_to_id.end()) { - starchat_end_token = eot_token_id->second; - } - } - } - - // submit the input prompt token-by-token - // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - starcoder_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!starcoder_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, params.repeat_last_n, params.repeat_penalty, rng); - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[k]); - - if (int32_t(embd.size()) >= params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // check if model is santacoder - if (model.hparams.n_layer <= 30 && embd.back() == 49152) { - break; - } - // check if model is starcoder - else if (embd.back() == 0) { //TODO: this is only for starcoder - break; - } - // Handle StarChat "<|end|>" token. 
- else if (embd.back() == starchat_end_token && i >= embd_inp.size()) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/starcoder/quantize.cpp b/examples/starcoder/quantize.cpp deleted file mode 100644 index d3aee3f26..000000000 --- a/examples/starcoder/quantize.cpp +++ /dev/null @@ -1,184 +0,0 @@ -#include "ggml/ggml.h" - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// default hparams (GPT-2 117M) -struct starcoder_hparams { - int32_t n_vocab = 49280; - int32_t n_ctx = 2048; - int32_t n_embd = 2048; - int32_t n_head = 16; - int32_t n_layer = 24; - int32_t ftype = 1; -}; - -// quantize a model -bool starcoder_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { - gpt_vocab vocab; - - printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); - - auto finp = std::ifstream(fname_inp, std::ios::binary); - if (!finp) { - fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); - return false; - } - - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - finp.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); - return false; - } - - fout.write((char *) &magic, sizeof(magic)); - } - - starcoder_hparams hparams; - - // load hparams - { - finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; - const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); - printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); - printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); - printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); - - fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); - fout.write((char *) &hparams.n_layer, 
sizeof(hparams.n_layer)); - fout.write((char *) &ftype_dst, sizeof(ftype_dst)); - } - - // load vocab - { - int32_t n_vocab = 0; - finp.read ((char *) &n_vocab, sizeof(n_vocab)); - fout.write((char *) &n_vocab, sizeof(n_vocab)); - - if (n_vocab != hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", - __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); - return false; - } - - std::string word; - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - finp.read ((char *) &len, sizeof(len)); - fout.write((char *) &len, sizeof(len)); - - word.resize(len); - finp.read ((char *) word.data(), len); - fout.write((char *) word.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // regexes of tensor names to be quantized - const std::vector to_quant = { - "model/wte", - "model/lm_head", - "model/h.*/attn/c_attn/w", - "model/h.*/attn/c_proj/w", - "model/h.*/mlp/c_fc/w", - "model/h.*/mlp/c_proj/w", - }; - - if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { - fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); - return false; - } - - finp.close(); - fout.close(); - - return true; -} - -// usage: -// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type -// -int main(int argc, char ** argv) { - if (argc != 4) { - fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); - ggml_print_ftypes(stderr); - return 1; - } - - // needed to initialize f16 tables - { - struct ggml_init_params params = { 0, NULL, false }; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); - } - - const std::string fname_inp = argv[1]; - const std::string fname_out = argv[2]; - - const ggml_ftype ftype = ggml_parse_ftype(argv[3]); - - const int64_t t_main_start_us = ggml_time_us(); - - int64_t t_quantize_us = 0; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!starcoder_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { - fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); - return 1; - } - - t_quantize_us = ggml_time_us() - t_start_us; - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n"); - printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - return 0; -}
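
Editor's note on the removed conversion step: the deleted converter earlier in this patch expands the checkpoint's multi-query attention (a single shared K/V head) into a regular multi-head layout by tiling the shared K and V rows before writing the fused QKV tensor, which is why the deleted C++ loader sets `kv_heads = hparams.n_head` rather than 1. Below is a minimal standalone sketch of that transformation, not the converter itself; the sizes (`n_head`, `head_dim`, `n_embd`) are toy values chosen purely for illustration.

```python
# Sketch only, assuming toy dimensions: expand an MQA fused QKV weight of shape
# ((n_head + 2) * head_dim, n_embd) into the MHA layout (3 * n_head * head_dim, n_embd)
# by replicating the single shared K/V head across all attention heads.
import numpy as np

n_head, head_dim, n_embd = 4, 8, 32  # hypothetical sizes for illustration
w = np.random.rand((n_head + 2) * head_dim, n_embd).astype(np.float32)

# split off Q (n_head heads), then the single K head and single V head
q, k, v = np.split(w, (n_head * head_dim, (n_head + 1) * head_dim), axis=0)

# tile the shared K/V head n_head times along the row axis
k = np.tile(k, (n_head, 1))
v = np.tile(v, (n_head, 1))

w_mha = np.concatenate((q, k, v), axis=0)
assert w_mha.shape == (3 * n_head * head_dim, n_embd)
```

With the heads duplicated at conversion time, the deleted `starcoder_eval` above could treat the model as plain multi-head attention and reuse the GPT-2 style KV-cache code path unchanged, at the cost of a larger fused QKV tensor.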