From 67e00652a80c9b861697809c91f1a23015266bea Mon Sep 17 00:00:00 2001 From: Reda Noureddine Date: Mon, 22 Apr 2024 10:16:28 +0200 Subject: [PATCH 01/11] feat(containers): inference with hugging face models --- containers/hugging-face-inference/Dockerfile | 19 ++++++++++ containers/hugging-face-inference/README.md | 8 +++++ containers/hugging-face-inference/main.py | 36 +++++++++++++++++++ containers/hugging-face-inference/prompt.py | 4 +++ .../hugging-face-inference/requirements.txt | 2 ++ .../terraform/container.tf | 28 +++++++++++++++ .../terraform/images.tf | 20 +++++++++++ .../terraform/providers.tf | 16 +++++++++ .../hugging-face-inference/terraform/utils.tf | 5 +++ .../terraform/variables.tf | 36 +++++++++++++++++++ .../terraform/versions.tf | 13 +++++++ 11 files changed, 187 insertions(+) create mode 100644 containers/hugging-face-inference/Dockerfile create mode 100644 containers/hugging-face-inference/README.md create mode 100644 containers/hugging-face-inference/main.py create mode 100644 containers/hugging-face-inference/prompt.py create mode 100644 containers/hugging-face-inference/requirements.txt create mode 100644 containers/hugging-face-inference/terraform/container.tf create mode 100644 containers/hugging-face-inference/terraform/images.tf create mode 100644 containers/hugging-face-inference/terraform/providers.tf create mode 100644 containers/hugging-face-inference/terraform/utils.tf create mode 100644 containers/hugging-face-inference/terraform/variables.tf create mode 100644 containers/hugging-face-inference/terraform/versions.tf diff --git a/containers/hugging-face-inference/Dockerfile b/containers/hugging-face-inference/Dockerfile new file mode 100644 index 0000000..61de72f --- /dev/null +++ b/containers/hugging-face-inference/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.12-slim-bookworm + +ARG MODEL_DOWNLOAD_SOURCE + +RUN apt-get update && apt-get install -y wget + +WORKDIR /app + +RUN pip install --upgrade pip +COPY requirements.txt . +RUN pip install -r requirements.txt + +RUN pip install llama-cpp-python==0.2.62 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu + +RUN wget $MODEL_DOWNLOAD_SOURCE + +COPY . . 
+ +CMD ["uvicorn", "main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "80"] diff --git a/containers/hugging-face-inference/README.md b/containers/hugging-face-inference/README.md new file mode 100644 index 0000000..53a897d --- /dev/null +++ b/containers/hugging-face-inference/README.md @@ -0,0 +1,8 @@ +## Deploy Hugging Face Models in Serverless Containers + +### Example of public Hugging Face Models to test + +- [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) +- [llama-2-7b.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_K_M.gguf) +- [phi-2.Q8_0.gguf](https://huggingface.co/TheBloke/phi-2-GGUF/blob/main/phi-2.Q8_0.gguf) + diff --git a/containers/hugging-face-inference/main.py b/containers/hugging-face-inference/main.py new file mode 100644 index 0000000..753308e --- /dev/null +++ b/containers/hugging-face-inference/main.py @@ -0,0 +1,36 @@ +from fastapi import FastAPI +from llama_cpp import Llama +import os +import prompt + +MODEL_FILE_NAME=os.environ["MODEL_FILE_NAME"] + +app = FastAPI() + +print("loading model from memory starts", flush=True) + +llm = Llama(model_path=MODEL_FILE_NAME) + +print("loading model from memory successfully ends", flush=True) + +@app.get("/") +def hello(): + """Get Inference Server Info""" + + return { + "message": "Hello, this is the inference server! Serving model {model_name}" + .format(model_name=MODEL_FILE_NAME) + } + +@app.post("/") +def infer(prompt: prompt.Prompt): + + print("inference endpoint is called", flush=True) + + output = llm(prompt=prompt.message, max_tokens=200) + + print("output is successfully inferred", flush=True) + + print(output, flush=True) + + return output diff --git a/containers/hugging-face-inference/prompt.py b/containers/hugging-face-inference/prompt.py new file mode 100644 index 0000000..5dc7363 --- /dev/null +++ b/containers/hugging-face-inference/prompt.py @@ -0,0 +1,4 @@ +from pydantic import BaseModel + +class Prompt(BaseModel): + message: str \ No newline at end of file diff --git a/containers/hugging-face-inference/requirements.txt b/containers/hugging-face-inference/requirements.txt new file mode 100644 index 0000000..3b33077 --- /dev/null +++ b/containers/hugging-face-inference/requirements.txt @@ -0,0 +1,2 @@ +fastapi==0.104.1 +uvicorn==0.24.0.post1 \ No newline at end of file diff --git a/containers/hugging-face-inference/terraform/container.tf b/containers/hugging-face-inference/terraform/container.tf new file mode 100644 index 0000000..3b98f07 --- /dev/null +++ b/containers/hugging-face-inference/terraform/container.tf @@ -0,0 +1,28 @@ +resource "scaleway_container_namespace" "main" { + name = "ifr-${lower(replace(var.hf_model_file_name, "/[.]|[_]/", "-"))}-${random_string.random_suffix.result}" + description = "Inference using Hugging Face models" +} + +resource "scaleway_container" "inference-hugging-face" { + name = "inference" + description = "Inference serving API using a Hugging Face model" + namespace_id = scaleway_container_namespace.main.id + registry_image = docker_image.inference.name + environment_variables = { + "MODEL_FILE_NAME" = var.hf_model_file_name + } + port = 80 + cpu_limit = 2240 + memory_limit = 4096 + min_scale = 1 + max_scale = 1 + deploy = true +} + +resource scaleway_container_cron "inference_cron" { + container_id = scaleway_container.inference-hugging-face.id + schedule = var.inference_cron_schedule + args = jsonencode({ + "message" : "Hello! It's sunny today. How are you doing?" 
+ }) +} \ No newline at end of file diff --git a/containers/hugging-face-inference/terraform/images.tf b/containers/hugging-face-inference/terraform/images.tf new file mode 100644 index 0000000..6c857e6 --- /dev/null +++ b/containers/hugging-face-inference/terraform/images.tf @@ -0,0 +1,20 @@ +resource "scaleway_registry_namespace" "main" { + name = "ifr-${lower(replace(var.hf_model_file_name, "/[.]|[_]/", "-"))}-${random_string.random_suffix.result}" + region = var.region + project_id = var.project_id +} + +resource "docker_image" "inference" { + name = "${scaleway_registry_namespace.main.endpoint}/inference-with-huggingface:${var.image_version}" + build { + context = "${path.cwd}/../" + no_cache = true + build_args = { + MODEL_DOWNLOAD_SOURCE : var.hf_model_download_source + } + } + + provisioner "local-exec" { + command = "docker push ${docker_image.inference.name}" + } +} diff --git a/containers/hugging-face-inference/terraform/providers.tf b/containers/hugging-face-inference/terraform/providers.tf new file mode 100644 index 0000000..439df4d --- /dev/null +++ b/containers/hugging-face-inference/terraform/providers.tf @@ -0,0 +1,16 @@ +provider "scaleway" { + region = var.region + access_key = var.access_key + secret_key = var.secret_key + project_id = var.project_id +} + +provider "docker" { + host = "unix:///var/run/docker.sock" + + registry_auth { + address = scaleway_registry_namespace.main.endpoint + username = "nologin" + password = var.secret_key + } +} diff --git a/containers/hugging-face-inference/terraform/utils.tf b/containers/hugging-face-inference/terraform/utils.tf new file mode 100644 index 0000000..15d52ab --- /dev/null +++ b/containers/hugging-face-inference/terraform/utils.tf @@ -0,0 +1,5 @@ +resource "random_string" "random_suffix" { + length = 3 + upper = false + special = false +} diff --git a/containers/hugging-face-inference/terraform/variables.tf b/containers/hugging-face-inference/terraform/variables.tf new file mode 100644 index 0000000..a9b0e68 --- /dev/null +++ b/containers/hugging-face-inference/terraform/variables.tf @@ -0,0 +1,36 @@ +variable "access_key" { + type = string +} + +variable "secret_key" { + type = string +} + +variable "project_id" { + type = string +} + +variable "image_version" { + type = string + default = "0.0.3" +} + +variable "region" { + type = string + default = "fr-par" +} + +variable "inference_cron_schedule" { + type = string + default = "*/15 * * * *" +} + +variable "hf_model_file_name" { + type = string + default = "llama-2-7b.Q4_0.gguf" +} + +variable "hf_model_download_source" { + type = string + default = "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf" +} diff --git a/containers/hugging-face-inference/terraform/versions.tf b/containers/hugging-face-inference/terraform/versions.tf new file mode 100644 index 0000000..b186193 --- /dev/null +++ b/containers/hugging-face-inference/terraform/versions.tf @@ -0,0 +1,13 @@ +terraform { + required_providers { + scaleway = { + source = "scaleway/scaleway" + version = ">= 2.39" + } + docker = { + source = "kreuzwerker/docker" + version = "3.0.2" + } + } + required_version = ">= 0.13" +} From 273615d3f8ed54a0c6e8a5b5d673241d064f38ee Mon Sep 17 00:00:00 2001 From: Reda Noureddine Date: Mon, 22 Apr 2024 18:31:34 +0200 Subject: [PATCH 02/11] feat: deploy multiple models using terraform workpaces --- containers/hugging-face-inference/README.md | 13 +++++++--- .../terraform/deploy-models.sh | 26 +++++++++++++++++++ .../terraform/variables.tf | 2 -- 3 files 
changed, 35 insertions(+), 6 deletions(-) create mode 100644 containers/hugging-face-inference/terraform/deploy-models.sh diff --git a/containers/hugging-face-inference/README.md b/containers/hugging-face-inference/README.md index 53a897d..3e55d13 100644 --- a/containers/hugging-face-inference/README.md +++ b/containers/hugging-face-inference/README.md @@ -1,8 +1,13 @@ ## Deploy Hugging Face Models in Serverless Containers -### Example of public Hugging Face Models to test +- Export these variables: -- [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) -- [llama-2-7b.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_K_M.gguf) -- [phi-2.Q8_0.gguf](https://huggingface.co/TheBloke/phi-2-GGUF/blob/main/phi-2.Q8_0.gguf) +```bash +export SCW_ACCESS_KEY="access-key" SCW_SECRET_KEY="secret-key" SCW_PROJECT_ID="project-id" +``` +- Run script to deploy multiple hugging face models using terraform workspaces: + +```bash +bash ./deploy-models.sh +``` \ No newline at end of file diff --git a/containers/hugging-face-inference/terraform/deploy-models.sh b/containers/hugging-face-inference/terraform/deploy-models.sh new file mode 100644 index 0000000..6655dc5 --- /dev/null +++ b/containers/hugging-face-inference/terraform/deploy-models.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -e + +export SCW_ACCESS_KEY=${SCW_ACCESS_KEY} \ + SCW_SECRET_KEY=${SCW_SECRET_KEY} \ + SCW_PROJECT_ID=${SCW_PROJECT_ID} + +declare -A hf_models + +hf_models["llama-2-7b.Q2_K.gguf"]="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q8_0.gguf" +hf_models["mistral-7b-instruct-v0.2.Q2_K.gguf"]="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q8_0.gguf" + +terraform init + +for model_file_name in "${!hf_models[@]}"; +do + terraform workspace new $model_file_name + export TF_VAR_hf_model_file_name=$model_file_name \ + TF_VAR_hf_model_download_source=${hf_models[$model_file_name]} \ + TF_VAR_access_key=$SCW_ACCESS_KEY \ + TF_VAR_secret_key=$SCW_SECRET_KEY \ + TF_VAR_project_id=$SCW_PROJECT_ID + terraform plan -var-file=testing.tfvars + terraform apply -var-file=testing.tfvars -auto-approve +done diff --git a/containers/hugging-face-inference/terraform/variables.tf b/containers/hugging-face-inference/terraform/variables.tf index a9b0e68..623c911 100644 --- a/containers/hugging-face-inference/terraform/variables.tf +++ b/containers/hugging-face-inference/terraform/variables.tf @@ -27,10 +27,8 @@ variable "inference_cron_schedule" { variable "hf_model_file_name" { type = string - default = "llama-2-7b.Q4_0.gguf" } variable "hf_model_download_source" { type = string - default = "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf" } From ff810088df0158c6d2099f58e2b1792a890ed8f8 Mon Sep 17 00:00:00 2001 From: Reda Noureddine Date: Tue, 23 Apr 2024 18:19:19 +0200 Subject: [PATCH 03/11] feat: deploy models using json file info --- containers/hugging-face-inference/Dockerfile | 4 +- .../terraform/deploy-models.sh | 23 ++++++----- .../terraform/hf-models.json | 40 +++++++++++++++++++ 3 files changed, 55 insertions(+), 12 deletions(-) create mode 100644 containers/hugging-face-inference/terraform/hf-models.json diff --git a/containers/hugging-face-inference/Dockerfile b/containers/hugging-face-inference/Dockerfile index 61de72f..5f3088a 100644 --- a/containers/hugging-face-inference/Dockerfile +++ 
b/containers/hugging-face-inference/Dockerfile @@ -10,7 +10,9 @@ RUN pip install --upgrade pip COPY requirements.txt . RUN pip install -r requirements.txt -RUN pip install llama-cpp-python==0.2.62 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu +RUN pip install llama-cpp-python==0.2.62 \ + --no-cache-dir \ + --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu RUN wget $MODEL_DOWNLOAD_SOURCE diff --git a/containers/hugging-face-inference/terraform/deploy-models.sh b/containers/hugging-face-inference/terraform/deploy-models.sh index 6655dc5..b92fbc3 100644 --- a/containers/hugging-face-inference/terraform/deploy-models.sh +++ b/containers/hugging-face-inference/terraform/deploy-models.sh @@ -1,15 +1,19 @@ #!/bin/bash +# Setup + set -e -export SCW_ACCESS_KEY=${SCW_ACCESS_KEY} \ - SCW_SECRET_KEY=${SCW_SECRET_KEY} \ - SCW_PROJECT_ID=${SCW_PROJECT_ID} +export TF_VAR_access_key=${SCW_ACCESS_KEY} \ + TF_VAR_secret_key=${SCW_SECRET_KEY} \ + TF_VAR_project_id=${SCW_PROJECT_ID} + +# Associative list of models to deploy -declare -A hf_models +declare -A hf_models +eval "$(jq -r '.[]|.[]|"hf_models[\(.file)]=\(.source)"' hf-models.json)" -hf_models["llama-2-7b.Q2_K.gguf"]="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q8_0.gguf" -hf_models["mistral-7b-instruct-v0.2.Q2_K.gguf"]="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q8_0.gguf" +# Initialize, plan, and deploy each model in a Terraform workspace terraform init @@ -18,9 +22,6 @@ do terraform workspace new $model_file_name export TF_VAR_hf_model_file_name=$model_file_name \ TF_VAR_hf_model_download_source=${hf_models[$model_file_name]} \ - TF_VAR_access_key=$SCW_ACCESS_KEY \ - TF_VAR_secret_key=$SCW_SECRET_KEY \ - TF_VAR_project_id=$SCW_PROJECT_ID - terraform plan -var-file=testing.tfvars - terraform apply -var-file=testing.tfvars -auto-approve + terraform plan + terraform apply -auto-approve done diff --git a/containers/hugging-face-inference/terraform/hf-models.json b/containers/hugging-face-inference/terraform/hf-models.json new file mode 100644 index 0000000..1c5da8b --- /dev/null +++ b/containers/hugging-face-inference/terraform/hf-models.json @@ -0,0 +1,40 @@ +{ + "llama" : [ + { + "file": "llama-2-7b.Q2_K.gguf", + "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/llama-2-7b.Q2_K.gguf", + "size_gb": "2.83" + }, + { + "file": "llama-2-7b.Q3_K_L.gguf", + "source" : "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q3_K_L.gguf", + "size_gb": "3.6" + } + ], + + "mistral" : [ + { + "file": "mistral-7b-instruct-v0.2.Q2_K.gguf", + "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q2_K.gguf", + "size_gb": "3.08" + }, + { + "file": "mistral-7b-instruct-v0.2.Q3_K_L.gguf", + "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q3_K_L.gguf", + "size_gb": "3.82" + } + ], + + "phi" : [ + { + "file": "phi-2.Q2_K.gguf", + "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf", + "size_gb": "1.17" + }, + { + "file": "phi-2.Q5_K_M.gguf", + "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q5_K_M.gguf", + "size_gb": "2.07" + } + ] +} \ No newline at end of file From 1c3fce3708078ab161dd1b3fe75703d6eb9967f7 Mon Sep 17 00:00:00 2001 From: Reda Noureddine Date: Tue, 23 Apr 2024 18:38:29 +0200 Subject: [PATCH 04/11] fix: model sources --- 
containers/hugging-face-inference/terraform/hf-models.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/containers/hugging-face-inference/terraform/hf-models.json b/containers/hugging-face-inference/terraform/hf-models.json index 1c5da8b..5b7e94a 100644 --- a/containers/hugging-face-inference/terraform/hf-models.json +++ b/containers/hugging-face-inference/terraform/hf-models.json @@ -2,7 +2,7 @@ "llama" : [ { "file": "llama-2-7b.Q2_K.gguf", - "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/llama-2-7b.Q2_K.gguf", + "source" : "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q2_K.gguf", "size_gb": "2.83" }, { @@ -15,12 +15,12 @@ "mistral" : [ { "file": "mistral-7b-instruct-v0.2.Q2_K.gguf", - "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q2_K.gguf", + "source" : "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q2_K.gguf", "size_gb": "3.08" }, { "file": "mistral-7b-instruct-v0.2.Q3_K_L.gguf", - "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q3_K_L.gguf", + "source" : "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q3_K_L.gguf", "size_gb": "3.82" } ], From 7c8a7429c285d09fbaa71b3ee2ce53b44469cbd6 Mon Sep 17 00:00:00 2001 From: Reda Noureddine Date: Wed, 24 Apr 2024 10:56:31 +0200 Subject: [PATCH 05/11] feat: docker login + terraform select with create flag --- containers/hugging-face-inference/README.md | 2 +- .../hugging-face-inference/terraform/deploy-models.sh | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/containers/hugging-face-inference/README.md b/containers/hugging-face-inference/README.md index 3e55d13..e3fa747 100644 --- a/containers/hugging-face-inference/README.md +++ b/containers/hugging-face-inference/README.md @@ -3,7 +3,7 @@ - Export these variables: ```bash -export SCW_ACCESS_KEY="access-key" SCW_SECRET_KEY="secret-key" SCW_PROJECT_ID="project-id" +export SCW_ACCESS_KEY="access-key" SCW_SECRET_KEY="secret-key" SCW_PROJECT_ID="project-id" REGION="fr-par" ``` - Run script to deploy multiple hugging face models using terraform workspaces: diff --git a/containers/hugging-face-inference/terraform/deploy-models.sh b/containers/hugging-face-inference/terraform/deploy-models.sh index b92fbc3..cda787e 100644 --- a/containers/hugging-face-inference/terraform/deploy-models.sh +++ b/containers/hugging-face-inference/terraform/deploy-models.sh @@ -13,13 +13,17 @@ export TF_VAR_access_key=${SCW_ACCESS_KEY} \ declare -A hf_models eval "$(jq -r '.[]|.[]|"hf_models[\(.file)]=\(.source)"' hf-models.json)" +# Login to docker Scaleway's registry on fr-par + +docker login "rg.$REGION.scw.cloud" -u nologin --password-stdin <<< "$SCW_SECRET_KEY" + # Initialize, plan, and deploy each model in a Terraform workspace terraform init for model_file_name in "${!hf_models[@]}"; do - terraform workspace new $model_file_name + terraform workspace select -or-create $model_file_name export TF_VAR_hf_model_file_name=$model_file_name \ TF_VAR_hf_model_download_source=${hf_models[$model_file_name]} \ terraform plan From 975f3890682ee2d385a4832d18a0cc5abc0cbcdc Mon Sep 17 00:00:00 2001 From: Reda Noureddine Date: Tue, 7 May 2024 11:51:05 +0200 Subject: [PATCH 06/11] feat: benchmark script --- .../terraform/benchmark-models.py | 77 +++++++++++++++++++ .../terraform/hf-models.json | 18 +++-- 2 files changed, 89 insertions(+), 6 
deletions(-) create mode 100644 containers/hugging-face-inference/terraform/benchmark-models.py diff --git a/containers/hugging-face-inference/terraform/benchmark-models.py b/containers/hugging-face-inference/terraform/benchmark-models.py new file mode 100644 index 0000000..7ffe2af --- /dev/null +++ b/containers/hugging-face-inference/terraform/benchmark-models.py @@ -0,0 +1,77 @@ +import json, requests, csv, pandas +import matplotlib.pyplot as plt + +class Benchmark: + _model_families = ["llama", "mistral", "phi"] + _endpoints = {} + + def __init__(self, models_file: str, benchmark_file: str, results_figure: str, message: str) -> None: + self.models_file = models_file + self.benchmark_file = benchmark_file + self.message = message + self.results_figure = results_figure + + def get_container_endpoints_from_json_file(self)-> None: + if self.models_file == "": + raise Exception("file name is empty") + + with open(self.models_file, 'r') as models_file: + json_data = json.load(models_file) + + for family in self._model_families: + self._endpoints[family] = [] + for model in json_data[family]: + self._endpoints[family].append({"model": model["file"], "endpoint": model["ctn_endpoint"]}) + + def analyze_results(self) -> None: + benchmark_results = pandas.read_csv(self.benchmark_file) + benchmark_results.boxplot(column="Total Response Time", by="Family").plot() + plt.ylabel("Total Response Time in seconds") + plt.savefig(self.results_figure) + + def benchmark_models(self, num_samples: int) -> None: + self.get_container_endpoints_from_json_file() + + fields = ['Model', 'Family', 'Total Response Time', 'Response Message'] + benchmark_data = [] + + for family in self._model_families: + for endpoint in self._endpoints[family]: + if endpoint["endpoint"] == "": + raise Exception("model endpoint is empty") + + for _ in range(num_samples): + try: + print("Calling model {model} on endpoint {endpoint} with message {message}" + .format(model=endpoint["model"], endpoint=endpoint["endpoint"], message=self.message) + ) + + rsp = requests.post(endpoint["endpoint"], json={"message": self.message}) + + response_text = rsp.json()["choices"][0]["text"] + + print("The model {model} responded with: {response_text}" + .format(model=endpoint["model"], response_text=response_text) + ) + + benchmark_data.append([endpoint["model"], family, rsp.elapsed.total_seconds(), response_text]) + except: + pass + + with open(self.benchmark_file, 'w') as results_file: + wrt = csv.writer(results_file) + wrt.writerow(fields) + wrt.writerows(benchmark_data) + + self.analyze_results() + +if __name__ == "__main__": + + benchmark = Benchmark( + models_file="hf-models.json", + benchmark_file="benchmark-results.csv", + results_figure="results-plot.png", + message="What the difference between an elephant and an ant?", + ) + + benchmark.benchmark_models(num_samples=50) diff --git a/containers/hugging-face-inference/terraform/hf-models.json b/containers/hugging-face-inference/terraform/hf-models.json index 5b7e94a..a4f44f5 100644 --- a/containers/hugging-face-inference/terraform/hf-models.json +++ b/containers/hugging-face-inference/terraform/hf-models.json @@ -3,12 +3,14 @@ { "file": "llama-2-7b.Q2_K.gguf", "source" : "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q2_K.gguf", - "size_gb": "2.83" + "size_gb": "2.83", + "ctn_endpoint": "paste container endpoint here" }, { "file": "llama-2-7b.Q3_K_L.gguf", "source" : "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q3_K_L.gguf", - "size_gb": "3.6" 
+ "size_gb": "3.6", + "ctn_endpoint": "paste container endpoint here" } ], @@ -16,12 +18,14 @@ { "file": "mistral-7b-instruct-v0.2.Q2_K.gguf", "source" : "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q2_K.gguf", - "size_gb": "3.08" + "size_gb": "3.08", + "ctn_endpoint": "paste container endpoint here" }, { "file": "mistral-7b-instruct-v0.2.Q3_K_L.gguf", "source" : "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q3_K_L.gguf", - "size_gb": "3.82" + "size_gb": "3.82", + "ctn_endpoint": "paste container endpoint here" } ], @@ -29,12 +33,14 @@ { "file": "phi-2.Q2_K.gguf", "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf", - "size_gb": "1.17" + "size_gb": "1.17", + "ctn_endpoint": "paste container endpoint here" }, { "file": "phi-2.Q5_K_M.gguf", "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q5_K_M.gguf", - "size_gb": "2.07" + "size_gb": "2.07", + "ctn_endpoint": "paste container endpoint here" } ] } \ No newline at end of file From 162a433bd082d8da4eea3c9fd223392c146677de Mon Sep 17 00:00:00 2001 From: Reda Noureddine Date: Tue, 7 May 2024 11:53:53 +0200 Subject: [PATCH 07/11] feat: rename script + add flags --- .../terraform/deploy-models.sh | 31 ---------- .../terraform/terraform.sh | 56 +++++++++++++++++++ 2 files changed, 56 insertions(+), 31 deletions(-) delete mode 100644 containers/hugging-face-inference/terraform/deploy-models.sh create mode 100755 containers/hugging-face-inference/terraform/terraform.sh diff --git a/containers/hugging-face-inference/terraform/deploy-models.sh b/containers/hugging-face-inference/terraform/deploy-models.sh deleted file mode 100644 index cda787e..0000000 --- a/containers/hugging-face-inference/terraform/deploy-models.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# Setup - -set -e - -export TF_VAR_access_key=${SCW_ACCESS_KEY} \ - TF_VAR_secret_key=${SCW_SECRET_KEY} \ - TF_VAR_project_id=${SCW_PROJECT_ID} - -# Associative list of models to deploy - -declare -A hf_models -eval "$(jq -r '.[]|.[]|"hf_models[\(.file)]=\(.source)"' hf-models.json)" - -# Login to docker Scaleway's registry on fr-par - -docker login "rg.$REGION.scw.cloud" -u nologin --password-stdin <<< "$SCW_SECRET_KEY" - -# Initialize, plan, and deploy each model in a Terraform workspace - -terraform init - -for model_file_name in "${!hf_models[@]}"; -do - terraform workspace select -or-create $model_file_name - export TF_VAR_hf_model_file_name=$model_file_name \ - TF_VAR_hf_model_download_source=${hf_models[$model_file_name]} \ - terraform plan - terraform apply -auto-approve -done diff --git a/containers/hugging-face-inference/terraform/terraform.sh b/containers/hugging-face-inference/terraform/terraform.sh new file mode 100755 index 0000000..1bf68af --- /dev/null +++ b/containers/hugging-face-inference/terraform/terraform.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +set -e + +# Common environment variables +export TF_VAR_access_key=${SCW_ACCESS_KEY} \ + TF_VAR_secret_key=${SCW_SECRET_KEY} \ + TF_VAR_project_id=${SCW_PROJECT_ID} + +# Associative list of models to deploy using json data +declare -A hf_models +eval "$(jq -r '.[]|.[]|"hf_models[\(.file)]=\(.source)"' hf-models.json)" + +# Login to docker Scaleway's registry +docker login "rg.$REGION.scw.cloud" -u nologin --password-stdin <<< "$SCW_SECRET_KEY" + +# Initialize, plan, and deploy each model in a Terraform workspace +apply() { + terraform init + for model_file_name in 
"${!hf_models[@]}"; + do + terraform workspace select -or-create $model_file_name + export TF_VAR_hf_model_file_name=$model_file_name \ + TF_VAR_hf_model_download_source=${hf_models[$model_file_name]} + terraform plan + terraform apply -auto-approve + done +} + +# Destroy resources of each Terraform workspace +destroy(){ + for model_file_name in "${!hf_models[@]}"; + do + terraform workspace select $model_file_name + export TF_VAR_hf_model_file_name=$model_file_name \ + TF_VAR_hf_model_download_source=${hf_models[$model_file_name]} + terraform destroy -auto-approve + done +} + +# Script actions per flag +while getopts "ad" option; do + case $option in + a) + echo "deploying models" + apply + ;; + d) + echo "destroying models" + destroy + ;; + *) + echo "flag is not provided" + exit 1 + esac +done \ No newline at end of file From 3640c3d8030a2595ba429edc23c3418062922604 Mon Sep 17 00:00:00 2001 From: Reda Noureddine Date: Tue, 7 May 2024 11:55:01 +0200 Subject: [PATCH 08/11] refactor: fastapi app --- containers/hugging-face-inference/Dockerfile | 4 +++- containers/hugging-face-inference/main.py | 11 +++++++---- containers/hugging-face-inference/prompt.py | 4 ---- 3 files changed, 10 insertions(+), 9 deletions(-) delete mode 100644 containers/hugging-face-inference/prompt.py diff --git a/containers/hugging-face-inference/Dockerfile b/containers/hugging-face-inference/Dockerfile index 5f3088a..f3ce032 100644 --- a/containers/hugging-face-inference/Dockerfile +++ b/containers/hugging-face-inference/Dockerfile @@ -7,7 +7,9 @@ RUN apt-get update && apt-get install -y wget WORKDIR /app RUN pip install --upgrade pip + COPY requirements.txt . + RUN pip install -r requirements.txt RUN pip install llama-cpp-python==0.2.62 \ @@ -16,6 +18,6 @@ RUN pip install llama-cpp-python==0.2.62 \ RUN wget $MODEL_DOWNLOAD_SOURCE -COPY . . +COPY main.py . 
CMD ["uvicorn", "main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "80"] diff --git a/containers/hugging-face-inference/main.py b/containers/hugging-face-inference/main.py index 753308e..aa1be39 100644 --- a/containers/hugging-face-inference/main.py +++ b/containers/hugging-face-inference/main.py @@ -1,7 +1,10 @@ from fastapi import FastAPI from llama_cpp import Llama +from pydantic import BaseModel import os -import prompt + +class Message(BaseModel): + content: str MODEL_FILE_NAME=os.environ["MODEL_FILE_NAME"] @@ -23,11 +26,11 @@ def hello(): } @app.post("/") -def infer(prompt: prompt.Prompt): - +def infer(message: Message): + """Post a message and receive a response""" print("inference endpoint is called", flush=True) - output = llm(prompt=prompt.message, max_tokens=200) + output = llm(prompt=message.content, max_tokens=200) print("output is successfully inferred", flush=True) diff --git a/containers/hugging-face-inference/prompt.py b/containers/hugging-face-inference/prompt.py deleted file mode 100644 index 5dc7363..0000000 --- a/containers/hugging-face-inference/prompt.py +++ /dev/null @@ -1,4 +0,0 @@ -from pydantic import BaseModel - -class Prompt(BaseModel): - message: str \ No newline at end of file From 1a01f21f60d29831a933b05d584b3b994ba06064 Mon Sep 17 00:00:00 2001 From: Reda Noureddine Date: Tue, 7 May 2024 12:02:44 +0200 Subject: [PATCH 09/11] feat: remove cron schedule as used for observability purposes only --- containers/hugging-face-inference/terraform/container.tf | 8 -------- containers/hugging-face-inference/terraform/variables.tf | 5 ----- 2 files changed, 13 deletions(-) diff --git a/containers/hugging-face-inference/terraform/container.tf b/containers/hugging-face-inference/terraform/container.tf index 3b98f07..3502223 100644 --- a/containers/hugging-face-inference/terraform/container.tf +++ b/containers/hugging-face-inference/terraform/container.tf @@ -18,11 +18,3 @@ resource "scaleway_container" "inference-hugging-face" { max_scale = 1 deploy = true } - -resource scaleway_container_cron "inference_cron" { - container_id = scaleway_container.inference-hugging-face.id - schedule = var.inference_cron_schedule - args = jsonencode({ - "message" : "Hello! It's sunny today. How are you doing?" - }) -} \ No newline at end of file diff --git a/containers/hugging-face-inference/terraform/variables.tf b/containers/hugging-face-inference/terraform/variables.tf index 623c911..afc799c 100644 --- a/containers/hugging-face-inference/terraform/variables.tf +++ b/containers/hugging-face-inference/terraform/variables.tf @@ -20,11 +20,6 @@ variable "region" { default = "fr-par" } -variable "inference_cron_schedule" { - type = string - default = "*/15 * * * *" -} - variable "hf_model_file_name" { type = string } From 6990297c235570f256228d2807bfd75076717f71 Mon Sep 17 00:00:00 2001 From: Reda Noureddine Date: Tue, 7 May 2024 12:07:50 +0200 Subject: [PATCH 10/11] docs: readme --- README.md | 1 + containers/hugging-face-inference/README.md | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 05757c7..248d251 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ Table of Contents: | **[Python S3 upload](containers/python-s3-upload/README.md)**
A Python + Flask HTTP server that receives file uploads and writes them to S3. | Python | [Terraform] |
 | **[Terraform NGINX hello world](containers/terraform-nginx-hello-world/README.md)**<br/>A minimal example running the base NGINX image in a serverless container deployed with Terraform. | N/A | [Terraform] |
 | **[Triggers with Terraform](containers/terraform-triggers/README.md)**<br/>Configuring two SQS triggers, used to trigger two containers, one public, one private. | N/A | [Terraform] |
+| **[Inference with Hugging Face Models](containers/hugging-face-inference/README.md)**<br/>
Experimentation to deploy and benchmark some lightweight Hugging Face Models in Serverless Containers. | N/A | [Terraform] | ### ⚙️ Jobs diff --git a/containers/hugging-face-inference/README.md b/containers/hugging-face-inference/README.md index e3fa747..de80b2d 100644 --- a/containers/hugging-face-inference/README.md +++ b/containers/hugging-face-inference/README.md @@ -1,4 +1,6 @@ -## Deploy Hugging Face Models in Serverless Containers +# Hugging Face Models + +### Deploy models in Serverless Containers - Export these variables: @@ -9,5 +11,19 @@ export SCW_ACCESS_KEY="access-key" SCW_SECRET_KEY="secret-key" SCW_PROJECT_ID="p - Run script to deploy multiple hugging face models using terraform workspaces: ```bash -bash ./deploy-models.sh +cd terraform && bash terraform.sh -a +``` + +### Benchmark models + +Check your models were deployed on the console and copy your container endpoints to the `terraform/hf-models.json` file, then perform the following command: + +```bash +python benchmark-models.py +``` + +### Destroy terraform resources for all models + +```bash +bash terraform.sh -d ``` \ No newline at end of file From 72d4289b16342fa2df11c51a56d05c0bcd091eb2 Mon Sep 17 00:00:00 2001 From: Reda Noureddine Date: Tue, 7 May 2024 12:50:56 +0200 Subject: [PATCH 11/11] refactor: style and rewording --- containers/hugging-face-inference/README.md | 4 ++ containers/hugging-face-inference/main.py | 23 +++++--- .../terraform/benchmark-models.py | 57 +++++++++++++------ .../terraform/terraform.sh | 4 +- 4 files changed, 62 insertions(+), 26 deletions(-) diff --git a/containers/hugging-face-inference/README.md b/containers/hugging-face-inference/README.md index de80b2d..2f0a040 100644 --- a/containers/hugging-face-inference/README.md +++ b/containers/hugging-face-inference/README.md @@ -8,6 +8,8 @@ export SCW_ACCESS_KEY="access-key" SCW_SECRET_KEY="secret-key" SCW_PROJECT_ID="project-id" REGION="fr-par" ``` +- Add/remove Hugging Face models (with `.gguf` extension) in `terraform/hf-models.json` file. + - Run script to deploy multiple hugging face models using terraform workspaces: ```bash @@ -22,6 +24,8 @@ Check your models were deployed on the console and copy your container endpoints python benchmark-models.py ``` +This will generate a box plot to analyze response time per model family, and a `csv` file containing textual responses per each model. + ### Destroy terraform resources for all models ```bash diff --git a/containers/hugging-face-inference/main.py b/containers/hugging-face-inference/main.py index aa1be39..a473554 100644 --- a/containers/hugging-face-inference/main.py +++ b/containers/hugging-face-inference/main.py @@ -1,33 +1,40 @@ +import os + from fastapi import FastAPI from llama_cpp import Llama from pydantic import BaseModel -import os + class Message(BaseModel): content: str -MODEL_FILE_NAME=os.environ["MODEL_FILE_NAME"] + +MODEL_FILE_NAME = os.environ["MODEL_FILE_NAME"] app = FastAPI() -print("loading model from memory starts", flush=True) +print("loading model starts", flush=True) llm = Llama(model_path=MODEL_FILE_NAME) -print("loading model from memory successfully ends", flush=True) +print("loading model successfully ends", flush=True) + @app.get("/") def hello(): - """Get Inference Server Info""" + """Get info of inference server""" return { - "message": "Hello, this is the inference server! Serving model {model_name}" - .format(model_name=MODEL_FILE_NAME) + "message": "Hello, this is the inference server! 
Serving model {model_name}".format( + model_name=MODEL_FILE_NAME + ) } + @app.post("/") def infer(message: Message): - """Post a message and receive a response""" + """Post a message and receive a response from inference server""" + print("inference endpoint is called", flush=True) output = llm(prompt=message.content, max_tokens=200) diff --git a/containers/hugging-face-inference/terraform/benchmark-models.py b/containers/hugging-face-inference/terraform/benchmark-models.py index 7ffe2af..17fddaa 100644 --- a/containers/hugging-face-inference/terraform/benchmark-models.py +++ b/containers/hugging-face-inference/terraform/benchmark-models.py @@ -1,27 +1,36 @@ -import json, requests, csv, pandas +import csv +import json + import matplotlib.pyplot as plt +import pandas +import requests + class Benchmark: _model_families = ["llama", "mistral", "phi"] _endpoints = {} - def __init__(self, models_file: str, benchmark_file: str, results_figure: str, message: str) -> None: + def __init__( + self, models_file: str, benchmark_file: str, results_figure: str, message: str + ) -> None: self.models_file = models_file self.benchmark_file = benchmark_file self.message = message self.results_figure = results_figure - def get_container_endpoints_from_json_file(self)-> None: + def get_container_endpoints_from_json_file(self) -> None: if self.models_file == "": raise Exception("file name is empty") - with open(self.models_file, 'r') as models_file: + with open(self.models_file, "r") as models_file: json_data = json.load(models_file) for family in self._model_families: self._endpoints[family] = [] for model in json_data[family]: - self._endpoints[family].append({"model": model["file"], "endpoint": model["ctn_endpoint"]}) + self._endpoints[family].append( + {"model": model["file"], "endpoint": model["ctn_endpoint"]} + ) def analyze_results(self) -> None: benchmark_results = pandas.read_csv(self.benchmark_file) @@ -32,7 +41,7 @@ def analyze_results(self) -> None: def benchmark_models(self, num_samples: int) -> None: self.get_container_endpoints_from_json_file() - fields = ['Model', 'Family', 'Total Response Time', 'Response Message'] + fields = ["Model", "Family", "Total Response Time", "Response Message"] benchmark_data = [] for family in self._model_families: @@ -42,35 +51,51 @@ def benchmark_models(self, num_samples: int) -> None: for _ in range(num_samples): try: - print("Calling model {model} on endpoint {endpoint} with message {message}" - .format(model=endpoint["model"], endpoint=endpoint["endpoint"], message=self.message) + print( + "Calling model {model} on endpoint {endpoint} with message {message}".format( + model=endpoint["model"], + endpoint=endpoint["endpoint"], + message=self.message, + ) ) - rsp = requests.post(endpoint["endpoint"], json={"message": self.message}) + rsp = requests.post( + endpoint["endpoint"], json={"message": self.message} + ) response_text = rsp.json()["choices"][0]["text"] - print("The model {model} responded with: {response_text}" - .format(model=endpoint["model"], response_text=response_text) + print( + "The model {model} responded with: {response_text}".format( + model=endpoint["model"], response_text=response_text + ) ) - benchmark_data.append([endpoint["model"], family, rsp.elapsed.total_seconds(), response_text]) + benchmark_data.append( + [ + endpoint["model"], + family, + rsp.elapsed.total_seconds(), + response_text, + ] + ) except: pass - with open(self.benchmark_file, 'w') as results_file: + with open(self.benchmark_file, "w") as results_file: wrt = 
csv.writer(results_file) wrt.writerow(fields) wrt.writerows(benchmark_data) self.analyze_results() + if __name__ == "__main__": benchmark = Benchmark( - models_file="hf-models.json", - benchmark_file="benchmark-results.csv", - results_figure="results-plot.png", + models_file="hf-models.json", + benchmark_file="benchmark-results.csv", + results_figure="results-plot.png", message="What the difference between an elephant and an ant?", ) diff --git a/containers/hugging-face-inference/terraform/terraform.sh b/containers/hugging-face-inference/terraform/terraform.sh index 1bf68af..5bbd07a 100755 --- a/containers/hugging-face-inference/terraform/terraform.sh +++ b/containers/hugging-face-inference/terraform/terraform.sh @@ -28,7 +28,7 @@ apply() { } # Destroy resources of each Terraform workspace -destroy(){ +destroy() { for model_file_name in "${!hf_models[@]}"; do terraform workspace select $model_file_name @@ -38,7 +38,7 @@ destroy(){ done } -# Script actions per flag +# Script actions while getopts "ad" option; do case $option in a)