[Backend] Add Llamacpp backend (#2975)
* Add llamacpp backend

Signed-off-by: Adrien Gallouët <[email protected]>

* Get rid of llama_batch_get_one()

Signed-off-by: Adrien Gallouët <[email protected]>

* Use max_batch_total_tokens

Signed-off-by: Adrien Gallouët <[email protected]>

* Handle max_batch_size

Signed-off-by: Adrien Gallouët <[email protected]>

* Add some input validation checks

Signed-off-by: Adrien Gallouët <[email protected]>

* Handle ctx args & fix sampling

Signed-off-by: Adrien Gallouët <[email protected]>

* Add GPU args

Signed-off-by: Adrien Gallouët <[email protected]>

* Add --defrag-threshold

Signed-off-by: Adrien Gallouët <[email protected]>

* Add a stupid batch mechanism

Signed-off-by: Adrien Gallouët <[email protected]>

* Cleanup

Signed-off-by: Adrien Gallouët <[email protected]>

* Add --numa

Signed-off-by: Adrien Gallouët <[email protected]>

* Fix args

Signed-off-by: Adrien Gallouët <[email protected]>

* Enable flash attention by default

Signed-off-by: Adrien Gallouët <[email protected]>

* Add --offload-kqv

Signed-off-by: Adrien Gallouët <[email protected]>

* Fix batch_pos

Signed-off-by: Adrien Gallouët <[email protected]>

* backend(llama): add CUDA Dockerfile_llamacpp for now

* Only export the latest logits

Signed-off-by: Adrien Gallouët <[email protected]>

* Output real logprobs

Signed-off-by: Adrien Gallouët <[email protected]>

* Fix batching

Signed-off-by: Adrien Gallouët <[email protected]>

* Fix seq iterations

Signed-off-by: Adrien Gallouët <[email protected]>

* Auto-detect n_threads when not provided

Signed-off-by: Adrien Gallouët <[email protected]>

* Clear request cache after completion

Signed-off-by: Adrien Gallouët <[email protected]>

* Remove warmup

Signed-off-by: Adrien Gallouët <[email protected]>

* Cleanup

Signed-off-by: Adrien Gallouët <[email protected]>

* backend(llama): add CUDA architectures build argument for Dockerfile

* Add specific args for batch

Signed-off-by: Adrien Gallouët <[email protected]>

* Add --type-v & --type-k

Signed-off-by: Adrien Gallouët <[email protected]>

* Bump llamacpp to b4623

Signed-off-by: Adrien Gallouët <[email protected]>

* Disable graceful shutdown in debug mode

Signed-off-by: Adrien Gallouët <[email protected]>

* Update Dockerfile_llamacpp

Signed-off-by: Adrien Gallouët <[email protected]>

* Cleanup Dockerfile

Signed-off-by: Adrien Gallouët <[email protected]>

* Update Cargo.lock

Signed-off-by: Adrien Gallouët <[email protected]>

* Update args

Signed-off-by: Adrien Gallouët <[email protected]>

* Simplify batching logic

Signed-off-by: Adrien Gallouët <[email protected]>

* Set TGI_LLAMA_PKG_CUDA from CUDA_VERSION

Signed-off-by: Adrien Gallouët <[email protected]>

* Rename bindings

Signed-off-by: Adrien Gallouët <[email protected]>

* Remove n_ctx

Signed-off-by: Adrien Gallouët <[email protected]>

* Make max_batch_total_tokens optional

Signed-off-by: Adrien Gallouët <[email protected]>

* Ensure all samplers are freed on error

Signed-off-by: Adrien Gallouët <[email protected]>

* Initialize penalty_last_n with llamacpp default value

Signed-off-by: Adrien Gallouët <[email protected]>

* Cleanup

Signed-off-by: Adrien Gallouët <[email protected]>

* Improve default settings

Signed-off-by: Adrien Gallouët <[email protected]>

* Add doc

Signed-off-by: Adrien Gallouët <[email protected]>

* Update docs

Signed-off-by: Adrien Gallouët <[email protected]>

* Thanks clippy

Signed-off-by: Adrien Gallouët <[email protected]>

* Thanks cargo fmt

Signed-off-by: Adrien Gallouët <[email protected]>

* Update docs

Signed-off-by: Adrien Gallouët <[email protected]>

* Do not use HOSTNAME env

Signed-off-by: Adrien Gallouët <[email protected]>

* Bump llama.cpp & cuda

Signed-off-by: Adrien Gallouët <[email protected]>

* Fix requirements.txt

Signed-off-by: Adrien Gallouët <[email protected]>

* Fix fmt

Signed-off-by: Adrien Gallouët <[email protected]>

* Enable KQV offload by default

Signed-off-by: Adrien Gallouët <[email protected]>

* Remove Ngrok tunneling

Signed-off-by: Adrien Gallouët <[email protected]>

* Remove .cargo/config.toml

Signed-off-by: Adrien Gallouët <[email protected]>

* Fix Dockerfile

Signed-off-by: Adrien Gallouët <[email protected]>

* Add missing cuda prefix

Signed-off-by: Adrien Gallouët <[email protected]>

* Handle custom llama.cpp dir

Signed-off-by: Adrien Gallouët <[email protected]>

* Cleanup

Signed-off-by: Adrien Gallouët <[email protected]>

* Add README.md

Signed-off-by: Adrien Gallouët <[email protected]>

* Add HF transfer

Signed-off-by: Adrien Gallouët <[email protected]>

* Fix bool args

Signed-off-by: Adrien Gallouët <[email protected]>

* Update doc

Signed-off-by: Adrien Gallouët <[email protected]>

* Update doc

Signed-off-by: Adrien Gallouët <[email protected]>

---------

Signed-off-by: Adrien Gallouët <[email protected]>
Co-authored-by: Morgan Funtowicz <[email protected]>
angt and mfuntowicz authored Feb 14, 2025
1 parent 6df0fc0 commit cfd4fbb
Showing 12 changed files with 1,764 additions and 421 deletions.
925 changes: 504 additions & 421 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -5,6 +5,7 @@ members = [
"backends/v3",
"backends/grpc-metadata",
"backends/trtllm",
"backends/llamacpp",
"launcher",
"router"
]
76 changes: 76 additions & 0 deletions Dockerfile_llamacpp
@@ -0,0 +1,76 @@
FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps

ARG llamacpp_version=b4651
ARG llamacpp_cuda=OFF
ARG cuda_arch=75-real;80-real;86-real;89-real;90-real

WORKDIR /opt/src

ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt install -y \
clang \
cmake \
curl \
git \
python3-dev \
libssl-dev \
pkg-config \
tar

ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
RUN tar -xzf ${llamacpp_version}.tar.gz \
&& cd llama.cpp-${llamacpp_version} \
&& cmake -B build \
-DCMAKE_INSTALL_PREFIX=/usr \
-DCMAKE_INSTALL_LIBDIR=/usr/lib \
-DCMAKE_C_COMPILER=clang \
-DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
-DGGML_CUDA=${llamacpp_cuda} \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_SERVER=OFF \
&& cmake --build build --parallel --config Release \
&& cmake --install build

WORKDIR /app
COPY rust-toolchain.toml rust-toolchain.toml
RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none
ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef --locked

FROM deps AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

FROM deps AS builder
COPY --from=planner /app/recipe.json recipe.json
RUN cargo chef cook \
--recipe-path recipe.json \
--profile release-opt \
--package text-generation-router-llamacpp
COPY . .
RUN cargo build \
--profile release-opt \
--package text-generation-router-llamacpp --frozen

FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04

RUN apt update && apt install -y \
python3-venv \
python3-pip

RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"

COPY backends/llamacpp/requirements.txt requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

COPY --from=builder /usr/lib/libllama.so /usr/lib/
COPY --from=builder /usr/lib/libggml*.so /usr/lib/
COPY --from=builder /app/target/release-opt/text-generation-router-llamacpp /usr/bin/

ENV HF_HUB_ENABLE_HF_TRANSFER=1

ENTRYPOINT ["text-generation-router-llamacpp"]
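
Not part of the commit itself, but for orientation: a minimal sketch of building the image from the repository root (the tag is an arbitrary example; CUDA kernels stay disabled unless the llamacpp_cuda build argument is overridden):

docker build \
    -t tgi-llamacpp \
    -f Dockerfile_llamacpp \
    --build-arg llamacpp_cuda=ON \
    --build-arg cuda_arch="86-real" \
    .

Since the image's entrypoint is text-generation-router-llamacpp, any arguments appended to docker run (plus --gpus all for CUDA-enabled images) are forwarded directly to the router binary.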
21 changes: 21 additions & 0 deletions backends/llamacpp/Cargo.toml
@@ -0,0 +1,21 @@
[package]
name = "text-generation-router-llamacpp"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[build-dependencies]
bindgen = "0.71.1"
pkg-config = "0.3.31"

[dependencies]
async-trait = "0.1.85"
clap = "4.5.27"
num_cpus = "1.16.0"
text-generation-router = { path = "../../router" }
thiserror = "2.0.11"
tokenizers.workspace = true
tokio = "1.43.0"
tokio-stream = "0.1.17"
tracing = "0.1.41"
24 changes: 24 additions & 0 deletions backends/llamacpp/README.md
@@ -0,0 +1,24 @@
# Llamacpp backend

If all your dependencies are installed at the system level, running
cargo build should be sufficient. However, if you want to experiment
with different versions of llama.cpp, some additional setup is required.

## Install llama.cpp

LLAMACPP_PREFIX=$(pwd)/llama.cpp.out

git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build \
-DCMAKE_INSTALL_PREFIX="$LLAMACPP_PREFIX" \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_SERVER=OFF
cmake --build build --config Release -j
cmake --install build

## Build TGI

PKG_CONFIG_PATH="$LLAMACPP_PREFIX/lib/pkgconfig" cargo build
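
As a usage note (not part of the committed README): the same custom-prefix build can also target the optimized profile that the Dockerfile uses, for example:

PKG_CONFIG_PATH="$LLAMACPP_PREFIX/lib/pkgconfig" \
    cargo build --profile release-opt --package text-generation-router-llamacpp

Because the backend's build.rs (shown below) emits -Wl,-rpath link arguments for the llama.cpp library paths, the resulting binary locates the custom libllama.so at runtime without extra environment variables.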
48 changes: 48 additions & 0 deletions backends/llamacpp/build.rs
@@ -0,0 +1,48 @@
use bindgen::callbacks::{ItemInfo, ParseCallbacks};
use std::env;
use std::path::PathBuf;

#[derive(Debug)]
struct PrefixStripper;

impl ParseCallbacks for PrefixStripper {
fn generated_name_override(&self, item_info: ItemInfo<'_>) -> Option<String> {
item_info.name.strip_prefix("llama_").map(str::to_string)
}
}

fn main() {
if let Some(cuda_version) = option_env!("CUDA_VERSION") {
let mut version: Vec<&str> = cuda_version.split('.').collect();
if version.len() > 2 {
version.pop();
}
let cuda_version = format!("cuda-{}", version.join("."));
pkg_config::Config::new().probe(&cuda_version).unwrap();
}
let llama = pkg_config::Config::new().probe("llama").unwrap();

for path in &llama.link_paths {
println!("cargo:rustc-link-arg=-Wl,-rpath,{}", path.display());
}
println!("cargo:rustc-link-arg=-Wl,--disable-new-dtags");

let bindings = bindgen::Builder::default()
.clang_args(
llama
.include_paths
.iter()
.map(|p| format!("-I{}", p.display())),
)
.header_contents("llama_bindings.h", "#include <llama.h>")
.prepend_enum_name(false)
.parse_callbacks(Box::new(PrefixStripper))
.parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
.generate()
.expect("Unable to generate bindings");

let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
bindings
.write_to_file(out_path.join("llamacpp.rs"))
.expect("Couldn't write bindings!");
}
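
When linking against a CUDA-enabled llama.cpp, the build script above picks up the CUDA libraries through pkg-config based on the CUDA_VERSION environment variable (the nvidia/cuda base image in Dockerfile_llamacpp already exports it). A minimal sketch for a host build, assuming a cuda-12.8 pkg-config package is discoverable:

CUDA_VERSION=12.8.0 \
    PKG_CONFIG_PATH="$LLAMACPP_PREFIX/lib/pkgconfig" \
    cargo build --package text-generation-router-llamacpp

The patch component is dropped before the probe (12.8.0 becomes cuda-12.8), matching the version.pop() call in main().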
3 changes: 3 additions & 0 deletions backends/llamacpp/requirements.txt
@@ -0,0 +1,3 @@
transformers==4.48.2
huggingface-hub==0.28.1
hf-transfer==0.1.9
