From 397f7b80bd86a1e7da78e94f7e0f75b21436ac56 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Thu, 19 Dec 2024 21:35:50 -0600 Subject: [PATCH 01/31] First commit of llamacpp Opea component Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../llms/text-generation/llamacpp/Dockerfile | 27 ++++++ comps/llms/text-generation/llamacpp/README.md | 84 +++++++++++++++++++ .../llms/text-generation/llamacpp/__init__.py | 2 + .../llamacpp/docker_compose_llm.yaml | 39 +++++++++ .../text-generation/llamacpp/entrypoint.sh | 8 ++ comps/llms/text-generation/llamacpp/llm.py | 65 ++++++++++++++ .../llamacpp/requirements-runtime.txt | 1 + .../text-generation/llamacpp/requirements.txt | 12 +++ 8 files changed, 238 insertions(+) create mode 100644 comps/llms/text-generation/llamacpp/Dockerfile create mode 100644 comps/llms/text-generation/llamacpp/README.md create mode 100644 comps/llms/text-generation/llamacpp/__init__.py create mode 100644 comps/llms/text-generation/llamacpp/docker_compose_llm.yaml create mode 100644 comps/llms/text-generation/llamacpp/entrypoint.sh create mode 100644 comps/llms/text-generation/llamacpp/llm.py create mode 100644 comps/llms/text-generation/llamacpp/requirements-runtime.txt create mode 100644 comps/llms/text-generation/llamacpp/requirements.txt diff --git a/comps/llms/text-generation/llamacpp/Dockerfile b/comps/llms/text-generation/llamacpp/Dockerfile new file mode 100644 index 000000000..a362c3bf6 --- /dev/null +++ b/comps/llms/text-generation/llamacpp/Dockerfile @@ -0,0 +1,27 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + curl \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +# Assumes we're building from the GenAIComps directory. +COPY ../../../comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/llms/text-generation/llamacpp/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/llms/text-generation/llamacpp/ + +ENTRYPOINT ["bash", "entrypoint.sh"] diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md new file mode 100644 index 000000000..b8f64aac0 --- /dev/null +++ b/comps/llms/text-generation/llamacpp/README.md @@ -0,0 +1,84 @@ +# Introduction + +[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++, and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud". + +This OPEA component wraps llama.cpp server so that it can interface with other OPEA components, or for creating OPEA Megaservices. + +## TLDR + +```bash +cd GenAIComps/ +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yml up +``` + +Please note it's instructive to run and validate each the llama.cpp server and OPEA component below. + +## 1. Run the llama.cpp server + +```bash +cd GenAIComps +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-server --force-recreate +``` + +Notes: + +i) If you prefer to run above in the background without screen output use `up -d` . The `--force-recreate` clears cache. 
+ +ii) To tear down the llama.cpp server and remove the container: + +`docker compose -f comps/llms/text-generation/llamacpp/langchain/docker_compose_llm.yaml llamacpp-server down` + +iii) For [llama.cpp settings](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md) please specify them in the docker_compose_llm.yaml file. + +#### Verify the llama.cpp Service: + +```bash +curl --request POST \ + --url http://localhost:8080/completion \ + --header "Content-Type: application/json" \ + --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' +``` + +## 2. Run the llama.cpp OPEA Service + +This is essentially a wrapper component of Llama.cpp server. OPEA nicely standardizes and verifies LLM inputs with LLMParamsDoc class (see llm.py). + +### 2.1 Build the llama.cpp OPEA image: + +```bash +cd GenAIComps/ +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yml up llama-opea-llm +``` + +Equivalently, the above can be achieved with `build` and `run` from the Dockerfile. Build: + +```bash +cd GenAIComps/ +docker build --no-cache -t opea/llm-llamacpp:latest \ + --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy \ + -f comps/llms/text-generation/llamacpp/Dockerfile . +``` + +And run: + +```bash +docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy \ + opea/llm-llamacpp:latest +``` + +### 2.3 Consume the llama.cpp Microservice: + +```bash +curl http://127.0.0.1:9000/v1/chat/completions -X POST \ + -d '{"query":"What is Deep Learning?","max_tokens":32,"top_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \ + -H 'Content-Type: application/json' +``` + +### Notes + +Tearing down services and removing containers: + +```bash +cd GenAIComps/comps/llms/text-generation/llamacpp/ +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml down +``` diff --git a/comps/llms/text-generation/llamacpp/__init__.py b/comps/llms/text-generation/llamacpp/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/llms/text-generation/llamacpp/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml new file mode 100644 index 000000000..88937ff0d --- /dev/null +++ b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml @@ -0,0 +1,39 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + llamacpp-server: + image: ghcr.io/ggerganov/llama.cpp:server + ports: + - 8080:8080 + environment: + # Refer to settings here: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md + # Llama.cpp is based on .gguf format, and Hugging Face offers many .gguf format models. + LLAMA_ARG_MODEL_URL: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf + LLAMA_ARG_CTX_SIZE: 4096 + LLAMA_ARG_N_PARALLEL: 2 + LLAMA_ARG_ENDPOINT_METRICS: 1 + LLAMA_ARG_PORT: 8080 + + llamacpp-opea-llm: + image: opea/llm-llamacpp:latest + build: + # Set this to allow COPY comps in the Dockerfile. + # When using docker compose with -f, the comps context is 4 levels down from docker_compose_llm.yaml. 
+ context: ../../../../ + dockerfile: ./comps/llms/text-generation/llamacpp/Dockerfile + depends_on: + - llamacpp-server + ports: + - "9000:9000" + network_mode: "host" # equivalent to: docker run --network host ... + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + # LLAMACPP_ENDPOINT: ${LLAMACPP_ENDPOINT} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/llms/text-generation/llamacpp/entrypoint.sh b/comps/llms/text-generation/llamacpp/entrypoint.sh new file mode 100644 index 000000000..c9a5a3d07 --- /dev/null +++ b/comps/llms/text-generation/llamacpp/entrypoint.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# pip --no-cache-dir install -r requirements-runtime.txt + +python llm.py diff --git a/comps/llms/text-generation/llamacpp/llm.py b/comps/llms/text-generation/llamacpp/llm.py new file mode 100644 index 000000000..5612199eb --- /dev/null +++ b/comps/llms/text-generation/llamacpp/llm.py @@ -0,0 +1,65 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +import openai +from fastapi.responses import StreamingResponse + +from comps import CustomLogger, LLMParamsDoc, ServiceType, opea_microservices, register_microservice + +logger = CustomLogger("llm_llamacpp") +logflag = os.getenv("LOGFLAG", False) +llamacpp_endpoint = os.getenv("LLAMACPP_ENDPOINT", "http://localhost:8080/") + + +# OPEA microservice wrapper of llama.cpp +# llama.cpp server uses openai API format: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md +@register_microservice( + name="opea_service@llm_llamacpp", + service_type=ServiceType.LLM, + endpoint="/v1/chat/completions", + host="0.0.0.0", + port=9000, +) +async def llm_generate(input: LLMParamsDoc): + if logflag: + logger.info(input) + logger.info(llamacpp_endpoint) + + client = openai.OpenAI( + base_url=llamacpp_endpoint, api_key="sk-no-key-required" # "http://:port" + ) + + # Llama.cpp works with openai API format + # The openai api doesn't have top_k parameter + # https://community.openai.com/t/which-openai-gpt-models-if-any-allow-specifying-top-k/777982/2 + chat_completion = client.chat.completions.create( + model=input.model, + messages=[{"role": "user", "content": input.query}], + max_tokens=input.max_tokens, + temperature=input.temperature, + top_p=input.top_p, + frequency_penalty=input.frequency_penalty, + presence_penalty=input.presence_penalty, + stream=input.streaming, + ) + + if input.streaming: + + def stream_generator(): + for c in chat_completion: + if logflag: + logger.info(c) + yield f"data: {c.model_dump_json()}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(stream_generator(), media_type="text/event-stream") + else: + if logflag: + logger.info(chat_completion) + return chat_completion + + +if __name__ == "__main__": + opea_microservices["opea_service@llm_llamacpp"].start() diff --git a/comps/llms/text-generation/llamacpp/requirements-runtime.txt b/comps/llms/text-generation/llamacpp/requirements-runtime.txt new file mode 100644 index 000000000..225adde27 --- /dev/null +++ b/comps/llms/text-generation/llamacpp/requirements-runtime.txt @@ -0,0 +1 @@ +langserve diff --git a/comps/llms/text-generation/llamacpp/requirements.txt b/comps/llms/text-generation/llamacpp/requirements.txt new file mode 100644 index 000000000..fdb5f5a01 --- /dev/null +++ b/comps/llms/text-generation/llamacpp/requirements.txt @@ -0,0 +1,12 @@ 
+aiohttp +docarray[full] +fastapi +huggingface_hub +openai +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +shortuuid +transformers +uvicorn From cb4f5e59a53161ea893dc6fa38ee49266d7a3f69 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Thu, 19 Dec 2024 21:50:26 -0600 Subject: [PATCH 02/31] Removed unneeded requirements file Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/text-generation/llamacpp/requirements-runtime.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 comps/llms/text-generation/llamacpp/requirements-runtime.txt diff --git a/comps/llms/text-generation/llamacpp/requirements-runtime.txt b/comps/llms/text-generation/llamacpp/requirements-runtime.txt deleted file mode 100644 index 225adde27..000000000 --- a/comps/llms/text-generation/llamacpp/requirements-runtime.txt +++ /dev/null @@ -1 +0,0 @@ -langserve From 2a48bae8e3231c82a370b18ae681968997ed36b7 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Mon, 6 Jan 2025 15:38:25 -0600 Subject: [PATCH 03/31] Pin the llama.cpp server version, and fix small typo Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/text-generation/llamacpp/README.md | 2 +- comps/llms/text-generation/llamacpp/docker_compose_llm.yaml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md index b8f64aac0..7b7ffa7d5 100644 --- a/comps/llms/text-generation/llamacpp/README.md +++ b/comps/llms/text-generation/llamacpp/README.md @@ -8,7 +8,7 @@ This OPEA component wraps llama.cpp server so that it can interface with other O ```bash cd GenAIComps/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yml up +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up ``` Please note it's instructive to run and validate each the llama.cpp server and OPEA component below. diff --git a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml index 88937ff0d..9a718661b 100644 --- a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml +++ b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml @@ -3,7 +3,8 @@ services: llamacpp-server: - image: ghcr.io/ggerganov/llama.cpp:server + # image: ghcr.io/ggerganov/llama.cpp:server + image: ghcr.io/ggerganov/llama.cpp:server-b4419 ports: - 8080:8080 environment: From 4e8215225a2afb528137dc0598731af59e42e1bb Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Mon, 6 Jan 2025 15:55:50 -0600 Subject: [PATCH 04/31] Update README.md to describe hardware support, and provide reference. Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/text-generation/llamacpp/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md index 7b7ffa7d5..d1e5054a2 100644 --- a/comps/llms/text-generation/llamacpp/README.md +++ b/comps/llms/text-generation/llamacpp/README.md @@ -4,6 +4,10 @@ This OPEA component wraps llama.cpp server so that it can interface with other OPEA components, or for creating OPEA Megaservices. 
+llama.cpp supports this [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends), and has only been tested on CPU. + +To use a CUDA server please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly. + ## TLDR ```bash @@ -47,7 +51,7 @@ This is essentially a wrapper component of Llama.cpp server. OPEA nicely standar ```bash cd GenAIComps/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yml up llama-opea-llm +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llama-opea-llm ``` Equivalently, the above can be achieved with `build` and `run` from the Dockerfile. Build: From baf381dca98ae237347db41fb0fcdd4b64943f86 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Mon, 6 Jan 2025 16:03:42 -0600 Subject: [PATCH 05/31] Updated docker_compose_llm.yaml so that the llamacpp-server so the pulled image has specific tag. Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/text-generation/llamacpp/docker_compose_llm.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml index 9a718661b..dd220b6f1 100644 --- a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml +++ b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml @@ -3,7 +3,6 @@ services: llamacpp-server: - # image: ghcr.io/ggerganov/llama.cpp:server image: ghcr.io/ggerganov/llama.cpp:server-b4419 ports: - 8080:8080 From 9d7539dd213b017879c607b94c4336520b8fc64e Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Tue, 7 Jan 2025 11:04:43 -0600 Subject: [PATCH 06/31] Small adjustments to README.md Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/text-generation/llamacpp/README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md index d1e5054a2..15a96ca1f 100644 --- a/comps/llms/text-generation/llamacpp/README.md +++ b/comps/llms/text-generation/llamacpp/README.md @@ -28,9 +28,9 @@ Notes: i) If you prefer to run above in the background without screen output use `up -d` . The `--force-recreate` clears cache. -ii) To tear down the llama.cpp server and remove the container: +ii) To stop the llama.cpp server: -`docker compose -f comps/llms/text-generation/llamacpp/langchain/docker_compose_llm.yaml llamacpp-server down` +`docker compose -f comps/llms/text-generation/llamacpp/langchain/docker_compose_llm.yaml llamacpp-server stop` iii) For [llama.cpp settings](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md) please specify them in the docker_compose_llm.yaml file. @@ -80,9 +80,11 @@ curl http://127.0.0.1:9000/v1/chat/completions -X POST \ ### Notes -Tearing down services and removing containers: +Stopping services: ```bash cd GenAIComps/comps/llms/text-generation/llamacpp/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml down +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml stop ``` + +`down` may be used instead of 'stop' if you'd like to stop and delete the containers. 
\ No newline at end of file From fd15ee7529e98ae81c8d4b04483e6f2f1209215c Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 10 Jan 2025 13:13:47 -0600 Subject: [PATCH 07/31] This removes unneeded dependencies in the Dockerfile, unneeded entrypoint.sh Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/text-generation/llamacpp/Dockerfile | 9 ++------- comps/llms/text-generation/llamacpp/README.md | 8 ++++---- .../text-generation/llamacpp/docker_compose_llm.yaml | 3 +-- comps/llms/text-generation/llamacpp/entrypoint.sh | 8 -------- 4 files changed, 7 insertions(+), 21 deletions(-) delete mode 100644 comps/llms/text-generation/llamacpp/entrypoint.sh diff --git a/comps/llms/text-generation/llamacpp/Dockerfile b/comps/llms/text-generation/llamacpp/Dockerfile index a362c3bf6..70500e35d 100644 --- a/comps/llms/text-generation/llamacpp/Dockerfile +++ b/comps/llms/text-generation/llamacpp/Dockerfile @@ -3,18 +3,13 @@ FROM python:3.11-slim -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - curl \ - libgl1-mesa-glx \ - libjemalloc-dev - RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ chown -R user /home/user/ USER user -# Assumes we're building from the GenAIComps directory. +# Assumes we're building from the GenAIComps directory, and docker file is in comps/llms/text-generation/llamacpp COPY ../../../comps /home/user/comps RUN pip install --no-cache-dir --upgrade pip setuptools && \ @@ -24,4 +19,4 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/llms/text-generation/llamacpp/ -ENTRYPOINT ["bash", "entrypoint.sh"] +ENTRYPOINT ["python", "llm.py"] \ No newline at end of file diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md index 15a96ca1f..e03fd7c36 100644 --- a/comps/llms/text-generation/llamacpp/README.md +++ b/comps/llms/text-generation/llamacpp/README.md @@ -21,12 +21,12 @@ Please note it's instructive to run and validate each the llama.cpp server and O ```bash cd GenAIComps -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-server --force-recreate +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-server ``` Notes: -i) If you prefer to run above in the background without screen output use `up -d` . The `--force-recreate` clears cache. +i) If you prefer to run above in the background without screen output use `up -d`. ii) To stop the llama.cpp server: @@ -51,7 +51,7 @@ This is essentially a wrapper component of Llama.cpp server. OPEA nicely standar ```bash cd GenAIComps/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llama-opea-llm +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-opea-llm --force-recreate ``` Equivalently, the above can be achieved with `build` and `run` from the Dockerfile. Build: @@ -87,4 +87,4 @@ cd GenAIComps/comps/llms/text-generation/llamacpp/ docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml stop ``` -`down` may be used instead of 'stop' if you'd like to stop and delete the containers. \ No newline at end of file +`down` may be used instead of 'stop' if you'd like to stop, and delete the containers and networks. 
\ No newline at end of file diff --git a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml index dd220b6f1..d66d93afd 100644 --- a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml +++ b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml @@ -18,7 +18,7 @@ services: llamacpp-opea-llm: image: opea/llm-llamacpp:latest build: - # Set this to allow COPY comps in the Dockerfile. + # This context is to allow the 'COPY comps' command in the Dockerfile. # When using docker compose with -f, the comps context is 4 levels down from docker_compose_llm.yaml. context: ../../../../ dockerfile: ./comps/llms/text-generation/llamacpp/Dockerfile @@ -31,7 +31,6 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - # LLAMACPP_ENDPOINT: ${LLAMACPP_ENDPOINT} restart: unless-stopped networks: diff --git a/comps/llms/text-generation/llamacpp/entrypoint.sh b/comps/llms/text-generation/llamacpp/entrypoint.sh deleted file mode 100644 index c9a5a3d07..000000000 --- a/comps/llms/text-generation/llamacpp/entrypoint.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# pip --no-cache-dir install -r requirements-runtime.txt - -python llm.py From c931902d616cb692fc3ffb54af4ad165adcdde4d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 19:18:51 +0000 Subject: [PATCH 08/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/llms/text-generation/llamacpp/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md index e03fd7c36..00b8e0b77 100644 --- a/comps/llms/text-generation/llamacpp/README.md +++ b/comps/llms/text-generation/llamacpp/README.md @@ -87,4 +87,4 @@ cd GenAIComps/comps/llms/text-generation/llamacpp/ docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml stop ``` -`down` may be used instead of 'stop' if you'd like to stop, and delete the containers and networks. \ No newline at end of file +`down` may be used instead of 'stop' if you'd like to stop, and delete the containers and networks. 
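As the series stands at this point, the wrapper accepts the LLMParamsDoc-style payload shown in the README's curl example (a `query` string plus `max_tokens`, `streaming`, and sampling fields) served on port 9000. A minimal Python sketch of the same call follows; the URL, port, and field names simply mirror that curl example and llm.py, so treat them as assumptions to adapt rather than a fixed contract.

```python
# Sketch of a Python client for the wrapper microservice built in the patches
# above. Endpoint, port, and payload fields mirror the README curl example and
# the LLMParamsDoc fields handled by llm.py; adjust them to your deployment.
import json

import requests

OPEA_LLM_URL = "http://127.0.0.1:9000/v1/chat/completions"  # port published by docker_compose_llm.yaml

payload = {
    "query": "What is Deep Learning?",
    "max_tokens": 32,
    "top_p": 0.95,
    "temperature": 0.01,
    "repetition_penalty": 1.03,
    "streaming": False,
}

response = requests.post(OPEA_LLM_URL, json=payload, timeout=120)
response.raise_for_status()
# With streaming disabled, llm.py returns the chat completion, serialized here as JSON.
print(json.dumps(response.json(), indent=2))
```

With `"streaming": true`, llm.py instead emits server-sent events of the form `data: {...}` terminated by `data: [DONE]`, as implemented in its `stream_generator`.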
From a75d28dc2d2f0d1d11803ff5a509cc91a3fc9951 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 14 Feb 2025 15:50:33 -0600 Subject: [PATCH 09/31] Refactored llama cpp and text-generation README_llamacpp.md Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../src/text-generation/README_llamacpp.md | 50 +++++++++++++++++++ comps/third_parties/llamacpp/README.md | 30 +++++++++++ .../deployment/docker_compose/compose.yaml | 38 ++++++++++++++ 3 files changed, 118 insertions(+) create mode 100644 comps/llms/src/text-generation/README_llamacpp.md create mode 100644 comps/third_parties/llamacpp/README.md create mode 100644 comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml diff --git a/comps/llms/src/text-generation/README_llamacpp.md b/comps/llms/src/text-generation/README_llamacpp.md new file mode 100644 index 000000000..06680a98f --- /dev/null +++ b/comps/llms/src/text-generation/README_llamacpp.md @@ -0,0 +1,50 @@ +# Prediction Guard Introduction + +[Prediction Guard](https://docs.predictionguard.com) allows you to utilize hosted open access LLMs, LVMs, and embedding functionality with seamlessly integrated safeguards. In addition to providing a scalable access to open models, Prediction Guard allows you to configure factual consistency checks, toxicity filters, PII filters, and prompt injection blocking. Join the [Prediction Guard Discord channel](https://discord.gg/TFHgnhAFKd) and request an API key to get started. + +## Get Started + +### Run the Predictionguard Microservice + +```bash +export service_name="textgen-predictionguard" + +cd comps/llms/deployment/docker_compose/ +docker compose -f compose_text-generation.yaml up ${service_name} -d +``` + +## Consume the Prediction Guard Microservice + +See the [Prediction Guard docs](https://docs.predictionguard.com/) for available model options. + +### Without stream + +```bash +curl -X POST http://localhost:9000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Hermes-2-Pro-Llama-3-8B", + "messages": "Tell me a joke.", + "max_tokens": 100, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50, + "stream": false + }' +``` + +### With stream + +```bash +curl -N -X POST http://localhost:9000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Hermes-2-Pro-Llama-3-8B", + "messages": "Tell me a joke.", + "max_tokens": 100, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50, + "stream": true + }' +``` diff --git a/comps/third_parties/llamacpp/README.md b/comps/third_parties/llamacpp/README.md new file mode 100644 index 000000000..e12f6d34d --- /dev/null +++ b/comps/third_parties/llamacpp/README.md @@ -0,0 +1,30 @@ +# TGI LLM Microservice + +[Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more. + +## Start TGI with docker compose + +Set up environment. + +```bash +export LLM_ENDPOINT_PORT=8008 +export host_ip=${host_ip} +export HF_TOKEN=${HF_TOKEN} +export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export MAX_INPUT_TOKENS=1024 +export MAX_TOTAL_TOKENS=2048 +``` + +Run tgi on xeon. + +```bash +cd deplopyment/docker_compose +docker compose -f compose.yaml tgi-server up -d +``` + +Run tgi on gaudi. 
+ +```bash +cd deplopyment/docker_compose +docker compose -f compose.yaml tgi-gaudi-server up -d +``` diff --git a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml new file mode 100644 index 000000000..d66d93afd --- /dev/null +++ b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + llamacpp-server: + image: ghcr.io/ggerganov/llama.cpp:server-b4419 + ports: + - 8080:8080 + environment: + # Refer to settings here: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md + # Llama.cpp is based on .gguf format, and Hugging Face offers many .gguf format models. + LLAMA_ARG_MODEL_URL: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf + LLAMA_ARG_CTX_SIZE: 4096 + LLAMA_ARG_N_PARALLEL: 2 + LLAMA_ARG_ENDPOINT_METRICS: 1 + LLAMA_ARG_PORT: 8080 + + llamacpp-opea-llm: + image: opea/llm-llamacpp:latest + build: + # This context is to allow the 'COPY comps' command in the Dockerfile. + # When using docker compose with -f, the comps context is 4 levels down from docker_compose_llm.yaml. + context: ../../../../ + dockerfile: ./comps/llms/text-generation/llamacpp/Dockerfile + depends_on: + - llamacpp-server + ports: + - "9000:9000" + network_mode: "host" # equivalent to: docker run --network host ... + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + restart: unless-stopped + +networks: + default: + driver: bridge From 830da586ebdadfd4b4835a67d975747cbbb1b275 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 14 Feb 2025 16:05:35 -0600 Subject: [PATCH 10/31] Delete unrefactored files Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../llms/text-generation/llamacpp/Dockerfile | 22 ----- comps/llms/text-generation/llamacpp/README.md | 90 ------------------- .../llms/text-generation/llamacpp/__init__.py | 2 - .../llamacpp/docker_compose_llm.yaml | 38 -------- comps/llms/text-generation/llamacpp/llm.py | 65 -------------- .../text-generation/llamacpp/requirements.txt | 12 --- 6 files changed, 229 deletions(-) delete mode 100644 comps/llms/text-generation/llamacpp/Dockerfile delete mode 100644 comps/llms/text-generation/llamacpp/README.md delete mode 100644 comps/llms/text-generation/llamacpp/__init__.py delete mode 100644 comps/llms/text-generation/llamacpp/docker_compose_llm.yaml delete mode 100644 comps/llms/text-generation/llamacpp/llm.py delete mode 100644 comps/llms/text-generation/llamacpp/requirements.txt diff --git a/comps/llms/text-generation/llamacpp/Dockerfile b/comps/llms/text-generation/llamacpp/Dockerfile deleted file mode 100644 index 70500e35d..000000000 --- a/comps/llms/text-generation/llamacpp/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM python:3.11-slim - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -USER user - -# Assumes we're building from the GenAIComps directory, and docker file is in comps/llms/text-generation/llamacpp -COPY ../../../comps /home/user/comps - -RUN pip install --no-cache-dir --upgrade pip setuptools && \ - pip install --no-cache-dir -r /home/user/comps/llms/text-generation/llamacpp/requirements.txt - -ENV PYTHONPATH=$PYTHONPATH:/home/user - 
-WORKDIR /home/user/comps/llms/text-generation/llamacpp/ - -ENTRYPOINT ["python", "llm.py"] \ No newline at end of file diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md deleted file mode 100644 index 00b8e0b77..000000000 --- a/comps/llms/text-generation/llamacpp/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# Introduction - -[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++, and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud". - -This OPEA component wraps llama.cpp server so that it can interface with other OPEA components, or for creating OPEA Megaservices. - -llama.cpp supports this [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends), and has only been tested on CPU. - -To use a CUDA server please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly. - -## TLDR - -```bash -cd GenAIComps/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up -``` - -Please note it's instructive to run and validate each the llama.cpp server and OPEA component below. - -## 1. Run the llama.cpp server - -```bash -cd GenAIComps -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-server -``` - -Notes: - -i) If you prefer to run above in the background without screen output use `up -d`. - -ii) To stop the llama.cpp server: - -`docker compose -f comps/llms/text-generation/llamacpp/langchain/docker_compose_llm.yaml llamacpp-server stop` - -iii) For [llama.cpp settings](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md) please specify them in the docker_compose_llm.yaml file. - -#### Verify the llama.cpp Service: - -```bash -curl --request POST \ - --url http://localhost:8080/completion \ - --header "Content-Type: application/json" \ - --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' -``` - -## 2. Run the llama.cpp OPEA Service - -This is essentially a wrapper component of Llama.cpp server. OPEA nicely standardizes and verifies LLM inputs with LLMParamsDoc class (see llm.py). - -### 2.1 Build the llama.cpp OPEA image: - -```bash -cd GenAIComps/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-opea-llm --force-recreate -``` - -Equivalently, the above can be achieved with `build` and `run` from the Dockerfile. Build: - -```bash -cd GenAIComps/ -docker build --no-cache -t opea/llm-llamacpp:latest \ - --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy \ - -f comps/llms/text-generation/llamacpp/Dockerfile . 
-``` - -And run: - -```bash -docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy \ - opea/llm-llamacpp:latest -``` - -### 2.3 Consume the llama.cpp Microservice: - -```bash -curl http://127.0.0.1:9000/v1/chat/completions -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":32,"top_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \ - -H 'Content-Type: application/json' -``` - -### Notes - -Stopping services: - -```bash -cd GenAIComps/comps/llms/text-generation/llamacpp/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml stop -``` - -`down` may be used instead of 'stop' if you'd like to stop, and delete the containers and networks. diff --git a/comps/llms/text-generation/llamacpp/__init__.py b/comps/llms/text-generation/llamacpp/__init__.py deleted file mode 100644 index 916f3a44b..000000000 --- a/comps/llms/text-generation/llamacpp/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml deleted file mode 100644 index d66d93afd..000000000 --- a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -services: - llamacpp-server: - image: ghcr.io/ggerganov/llama.cpp:server-b4419 - ports: - - 8080:8080 - environment: - # Refer to settings here: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md - # Llama.cpp is based on .gguf format, and Hugging Face offers many .gguf format models. - LLAMA_ARG_MODEL_URL: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf - LLAMA_ARG_CTX_SIZE: 4096 - LLAMA_ARG_N_PARALLEL: 2 - LLAMA_ARG_ENDPOINT_METRICS: 1 - LLAMA_ARG_PORT: 8080 - - llamacpp-opea-llm: - image: opea/llm-llamacpp:latest - build: - # This context is to allow the 'COPY comps' command in the Dockerfile. - # When using docker compose with -f, the comps context is 4 levels down from docker_compose_llm.yaml. - context: ../../../../ - dockerfile: ./comps/llms/text-generation/llamacpp/Dockerfile - depends_on: - - llamacpp-server - ports: - - "9000:9000" - network_mode: "host" # equivalent to: docker run --network host ... 
- environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - restart: unless-stopped - -networks: - default: - driver: bridge diff --git a/comps/llms/text-generation/llamacpp/llm.py b/comps/llms/text-generation/llamacpp/llm.py deleted file mode 100644 index 5612199eb..000000000 --- a/comps/llms/text-generation/llamacpp/llm.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os - -import openai -from fastapi.responses import StreamingResponse - -from comps import CustomLogger, LLMParamsDoc, ServiceType, opea_microservices, register_microservice - -logger = CustomLogger("llm_llamacpp") -logflag = os.getenv("LOGFLAG", False) -llamacpp_endpoint = os.getenv("LLAMACPP_ENDPOINT", "http://localhost:8080/") - - -# OPEA microservice wrapper of llama.cpp -# llama.cpp server uses openai API format: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md -@register_microservice( - name="opea_service@llm_llamacpp", - service_type=ServiceType.LLM, - endpoint="/v1/chat/completions", - host="0.0.0.0", - port=9000, -) -async def llm_generate(input: LLMParamsDoc): - if logflag: - logger.info(input) - logger.info(llamacpp_endpoint) - - client = openai.OpenAI( - base_url=llamacpp_endpoint, api_key="sk-no-key-required" # "http://:port" - ) - - # Llama.cpp works with openai API format - # The openai api doesn't have top_k parameter - # https://community.openai.com/t/which-openai-gpt-models-if-any-allow-specifying-top-k/777982/2 - chat_completion = client.chat.completions.create( - model=input.model, - messages=[{"role": "user", "content": input.query}], - max_tokens=input.max_tokens, - temperature=input.temperature, - top_p=input.top_p, - frequency_penalty=input.frequency_penalty, - presence_penalty=input.presence_penalty, - stream=input.streaming, - ) - - if input.streaming: - - def stream_generator(): - for c in chat_completion: - if logflag: - logger.info(c) - yield f"data: {c.model_dump_json()}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(stream_generator(), media_type="text/event-stream") - else: - if logflag: - logger.info(chat_completion) - return chat_completion - - -if __name__ == "__main__": - opea_microservices["opea_service@llm_llamacpp"].start() diff --git a/comps/llms/text-generation/llamacpp/requirements.txt b/comps/llms/text-generation/llamacpp/requirements.txt deleted file mode 100644 index fdb5f5a01..000000000 --- a/comps/llms/text-generation/llamacpp/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -aiohttp -docarray[full] -fastapi -huggingface_hub -openai -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -prometheus-fastapi-instrumentator -shortuuid -transformers -uvicorn From 8d058bbd851ec9363ca7ed86ede0a637795c2658 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 14 Feb 2025 16:12:08 -0600 Subject: [PATCH 11/31] Adding llama.cpp backend include in the compose_text-genearation.yaml Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../compose_text-generation.yaml | 12 +++ .../src/text-generation/README_llamacpp.md | 76 ++++++++++++++----- comps/third_parties/llamacpp/README.md | 58 ++++++++++---- .../deployment/docker_compose/compose.yaml | 46 ++++++----- 4 files changed, 131 insertions(+), 61 deletions(-) diff --git a/comps/llms/deployment/docker_compose/compose_text-generation.yaml b/comps/llms/deployment/docker_compose/compose_text-generation.yaml index 
fbf503ed6..d1a2b3975 100644 --- a/comps/llms/deployment/docker_compose/compose_text-generation.yaml +++ b/comps/llms/deployment/docker_compose/compose_text-generation.yaml @@ -5,6 +5,8 @@ include: - ../../../third_parties/tgi/deployment/docker_compose/compose.yaml - ../../../third_parties/vllm/deployment/docker_compose/compose.yaml - ../../../third_parties/ollama/deployment/docker_compose/compose.yaml + - ../../../third_parties/llamacpp/deployment/docker_compose/compose.yaml + services: textgen: @@ -100,6 +102,16 @@ services: environment: LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative} + textgen-llamacpp: + extends: textgen + container_name: textgen-service-llamacpp + environment: + LLM_ENDPOINT: http://llamacpp-server + LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenService} + depends_on: + llamacpp-server: + condition: service_healthy + networks: default: driver: bridge diff --git a/comps/llms/src/text-generation/README_llamacpp.md b/comps/llms/src/text-generation/README_llamacpp.md index 06680a98f..f6197c150 100644 --- a/comps/llms/src/text-generation/README_llamacpp.md +++ b/comps/llms/src/text-generation/README_llamacpp.md @@ -1,50 +1,84 @@ -# Prediction Guard Introduction +# llama.cpp Introduction -[Prediction Guard](https://docs.predictionguard.com) allows you to utilize hosted open access LLMs, LVMs, and embedding functionality with seamlessly integrated safeguards. In addition to providing a scalable access to open models, Prediction Guard allows you to configure factual consistency checks, toxicity filters, PII filters, and prompt injection blocking. Join the [Prediction Guard Discord channel](https://discord.gg/TFHgnhAFKd) and request an API key to get started. +[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++, and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud". + +This OPEA component wraps llama.cpp server so that it can interface with other OPEA components, or for creating OPEA Megaservices. + +llama.cpp supports this [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends), and has only been tested on CPU. + +To use a CUDA server please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly. ## Get Started -### Run the Predictionguard Microservice +### 1. Download a gguf model to serve + +To download an example .gguf model to a model path: + +```bash +export MODEL_PATH=~/models +mkdir $MODEL_PATH +cd $MODEL_PATH +wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf +```` + +### 2. Set Environment Variables ```bash -export service_name="textgen-predictionguard" +export MODEL_PATH=~/models +export host_ip=$(hostname -I | awk '{print $1}') +export TEXTGEN_PORT=9000 +export LLM_ENDPOINT_PORT=8008 +export LLM_ENDPOINT="http://${host_ip}:80" +export LLM_MODEL_ID="models/Phi-3-mini-4k-instruct-q4.gguf" +export LLAMA_ARG_CTX_SIZE=4096 +``` +### 3. Run the llama.cpp OPEA Microservice +```bash +export service_name="textgen-llamacpp" cd comps/llms/deployment/docker_compose/ docker compose -f compose_text-generation.yaml up ${service_name} -d ``` -## Consume the Prediction Guard Microservice +The server output can be observed in a terminal with `docker log `. 
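The curl examples below only exercise the non-streaming path. A streamed request can be read from Python as in the rough sketch below; it assumes the refactored service keeps emitting OpenAI-style `data: ...` server-sent events ending with `data: [DONE]`, as the earlier llm.py wrapper did, and it reuses the port and message format from the curl examples in this README.

```python
# Sketch of reading a streamed completion from the textgen service. Assumes
# OpenAI-style "data: ..." SSE chunks terminated by "data: [DONE]", matching
# the earlier llm.py wrapper; port and payload follow this README's curl calls.
import json

import requests

url = "http://localhost:9000/v1/chat/completions"
payload = {
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 100,
    "stream": True,
}

with requests.post(url, json=payload, stream=True, timeout=300) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        # Each chunk carries an incremental delta; print it as it arrives.
        print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)
print()
```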
-See the [Prediction Guard docs](https://docs.predictionguard.com/) for available model options. +## Consume the Service -### Without stream +Verify the backend llama.cpp backend server: ```bash -curl -X POST http://localhost:9000/v1/chat/completions \ +curl http://0.0.0.0:8008/v1/chat/completions \ -H "Content-Type: application/json" \ + -H "Authorization: Bearer no-key" \ -d '{ - "model": "Hermes-2-Pro-Llama-3-8B", - "messages": "Tell me a joke.", - "max_tokens": 100, - "temperature": 0.7, - "top_p": 0.9, - "top_k": 50, - "stream": false + "model": "models/Phi-3-mini-4k-instruct-q4.gguf", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "What is deep learning?" + } + ] }' ``` -### With stream +Consume the service: + +This component is based on openAI API convention: ```bash -curl -N -X POST http://localhost:9000/v1/chat/completions \ +curl -X POST http://localhost:9000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "Hermes-2-Pro-Llama-3-8B", - "messages": "Tell me a joke.", + "model": "models/Phi-3-mini-4k-instruct-q4.gguf", + "messages": [{"role": "user", "content": "Write a limerick about python exceptions"}], "max_tokens": 100, "temperature": 0.7, "top_p": 0.9, "top_k": 50, - "stream": true + "stream": false }' -``` +``` \ No newline at end of file diff --git a/comps/third_parties/llamacpp/README.md b/comps/third_parties/llamacpp/README.md index e12f6d34d..8f9b7e627 100644 --- a/comps/third_parties/llamacpp/README.md +++ b/comps/third_parties/llamacpp/README.md @@ -1,30 +1,56 @@ -# TGI LLM Microservice +# Introduction -[Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more. +[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++, and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud". -## Start TGI with docker compose +This OPEA component wraps llama.cpp server so that it can interface with other OPEA components, or for creating OPEA Megaservices. -Set up environment. +llama.cpp supports this [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends), and has only been tested on CPU. + +To use a CUDA server please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly. + + +## Get Started + +### 1. Download a gguf Model + +To download an example .gguf model to a model path: + +```bash +export MODEL_PATH=~/models +mkdir $MODEL_PATH +cd $MODEL_PATH +wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf +```` + +### 2. Set Environment Variables ```bash +export MODEL_PATH=~/models +export host_ip=$(hostname -I | awk '{print $1}') export LLM_ENDPOINT_PORT=8008 -export host_ip=${host_ip} -export HF_TOKEN=${HF_TOKEN} -export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export MAX_INPUT_TOKENS=1024 -export MAX_TOTAL_TOKENS=2048 +export LLM_MODEL_ID="models/Phi-3-mini-4k-instruct-q4.gguf" +export LLAMA_ARG_CTX_SIZE=4096 ``` -Run tgi on xeon. +### 3. 
Run the llama.cpp Backend Microservice ```bash -cd deplopyment/docker_compose -docker compose -f compose.yaml tgi-server up -d +cd deployment/docker_compose +docker compose -f compose.yaml up llamacpp-server -d ``` -Run tgi on gaudi. +To use this in an OPEA text generation component please see [llama.cpp text-generation]( +../../llms/src/text-generation/README_llamacpp.md) + +Note: can use docker logs to observe server. + +## Consume the service + +Llama cpp supports openai style API: ```bash -cd deplopyment/docker_compose -docker compose -f compose.yaml tgi-gaudi-server up -d -``` +curl http://${host_ip}:8008/v1/chat/completions \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' +``` \ No newline at end of file diff --git a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml index d66d93afd..a1058de2b 100644 --- a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml +++ b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml @@ -4,34 +4,32 @@ services: llamacpp-server: image: ghcr.io/ggerganov/llama.cpp:server-b4419 + container_name: llamacpp-server ports: - - 8080:8080 + - ${LLM_ENDPOINT_PORT:-8008}:80 + volumes: + # Download the .gguf models to this path. + - ${MODEL_PATH:-~/models}:/models environment: - # Refer to settings here: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md - # Llama.cpp is based on .gguf format, and Hugging Face offers many .gguf format models. - LLAMA_ARG_MODEL_URL: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf - LLAMA_ARG_CTX_SIZE: 4096 + LOGFLAG: False + HTTPS_PROXY: ${http_proxy} + HTTP_PROXY: ${https_proxy} + LLM_MODEL_ID: ${LLM_MODEL_ID} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + host_ip: ${host_ip} + # llama.cpp env variables. Please refer to reference: + # https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md + LLAMA_ARG_PORT: 80 + LLAMA_ARG_MODEL: /$LLM_MODEL_ID + LLAMA_ARG_CTX_SIZE: ${LLAMA_ARG_CTX_SIZE:-4096} LLAMA_ARG_N_PARALLEL: 2 LLAMA_ARG_ENDPOINT_METRICS: 1 - LLAMA_ARG_PORT: 8080 - - llamacpp-opea-llm: - image: opea/llm-llamacpp:latest - build: - # This context is to allow the 'COPY comps' command in the Dockerfile. - # When using docker compose with -f, the comps context is 4 levels down from docker_compose_llm.yaml. - context: ../../../../ - dockerfile: ./comps/llms/text-generation/llamacpp/Dockerfile - depends_on: - - llamacpp-server - ports: - - "9000:9000" - network_mode: "host" # equivalent to: docker run --network host ... 
- environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - restart: unless-stopped + ipc: host + healthcheck: + test: [ "CMD-SHELL", "curl -f http://${host_ip}:80/health || exit 1" ] + interval: 10s + timeout: 10s + retries: 150 networks: default: From a6740b62455f619ef55413da633048c150098ee2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Feb 2025 22:12:52 +0000 Subject: [PATCH 12/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/llms/src/text-generation/README_llamacpp.md | 5 +++-- comps/third_parties/llamacpp/README.md | 10 ++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/comps/llms/src/text-generation/README_llamacpp.md b/comps/llms/src/text-generation/README_llamacpp.md index f6197c150..a2ae32cbe 100644 --- a/comps/llms/src/text-generation/README_llamacpp.md +++ b/comps/llms/src/text-generation/README_llamacpp.md @@ -19,7 +19,7 @@ export MODEL_PATH=~/models mkdir $MODEL_PATH cd $MODEL_PATH wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf -```` +``` ### 2. Set Environment Variables @@ -32,6 +32,7 @@ export LLM_ENDPOINT="http://${host_ip}:80" export LLM_MODEL_ID="models/Phi-3-mini-4k-instruct-q4.gguf" export LLAMA_ARG_CTX_SIZE=4096 ``` + ### 3. Run the llama.cpp OPEA Microservice ```bash @@ -81,4 +82,4 @@ curl -X POST http://localhost:9000/v1/chat/completions \ "top_k": 50, "stream": false }' -``` \ No newline at end of file +``` diff --git a/comps/third_parties/llamacpp/README.md b/comps/third_parties/llamacpp/README.md index 8f9b7e627..00363a784 100644 --- a/comps/third_parties/llamacpp/README.md +++ b/comps/third_parties/llamacpp/README.md @@ -8,7 +8,6 @@ llama.cpp supports this [hardware](https://github.com/ggerganov/llama.cpp?tab=re To use a CUDA server please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly. - ## Get Started ### 1. Download a gguf Model @@ -20,7 +19,7 @@ export MODEL_PATH=~/models mkdir $MODEL_PATH cd $MODEL_PATH wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf -```` +``` ### 2. Set Environment Variables @@ -36,11 +35,10 @@ export LLAMA_ARG_CTX_SIZE=4096 ```bash cd deployment/docker_compose -docker compose -f compose.yaml up llamacpp-server -d +docker compose -f compose.yaml up llamacpp-server -d ``` -To use this in an OPEA text generation component please see [llama.cpp text-generation]( -../../llms/src/text-generation/README_llamacpp.md) +To use this in an OPEA text generation component please see [llama.cpp text-generation](../../llms/src/text-generation/README_llamacpp.md) Note: can use docker logs to observe server. 
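Because the llama.cpp server on port 8008 exposes the OpenAI chat-completions API directly, the same request can also be issued from Python with the openai client, much as llm.py does earlier in this series. This is a sketch under stated assumptions: the host, port, and `/v1` base path follow the curl example in this README, the API key is the placeholder convention llm.py uses, and the model name is illustrative since llama.cpp answers with whichever .gguf it was started with.

```python
# Sketch of calling the llama.cpp backend directly with the openai client,
# mirroring how llm.py constructs its client earlier in this patch series.
# Host/port and the /v1 base path follow this README's curl example; the key
# is a placeholder because llama.cpp does not require one here.
import openai

client = openai.OpenAI(
    base_url="http://localhost:8008/v1",  # ${host_ip}:${LLM_ENDPOINT_PORT} in the compose file
    api_key="sk-no-key-required",
)

completion = client.chat.completions.create(
    # Illustrative name; the server answers with the model it loaded at startup.
    model="qwen2.5-1.5b-instruct-q4_k_m.gguf",
    messages=[{"role": "user", "content": "What is Deep Learning?"}],
    max_tokens=128,
)
print(completion.choices[0].message.content)
```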
@@ -53,4 +51,4 @@ curl http://${host_ip}:8008/v1/chat/completions \ -X POST \ -H "Content-Type: application/json" \ -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -``` \ No newline at end of file +``` From d0e27bf6a1baeb682443d495fe8669fdfc68e6e2 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:19:13 -0600 Subject: [PATCH 13/31] Fix service name Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../llms/deployment/docker_compose/compose_text-generation.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/llms/deployment/docker_compose/compose_text-generation.yaml b/comps/llms/deployment/docker_compose/compose_text-generation.yaml index d1a2b3975..1bb58d0c1 100644 --- a/comps/llms/deployment/docker_compose/compose_text-generation.yaml +++ b/comps/llms/deployment/docker_compose/compose_text-generation.yaml @@ -102,7 +102,7 @@ services: environment: LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative} - textgen-llamacpp: + textgen-service-llamacpp: extends: textgen container_name: textgen-service-llamacpp environment: From 91324af5ca945f1087c59a1efbf384e8110541ee Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:22:25 -0600 Subject: [PATCH 14/31] Revise llamacpp, using smaller Qwen model and remove unnecessary curl model argument Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/src/text-generation/README_llamacpp.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/comps/llms/src/text-generation/README_llamacpp.md b/comps/llms/src/text-generation/README_llamacpp.md index f6197c150..91ec27416 100644 --- a/comps/llms/src/text-generation/README_llamacpp.md +++ b/comps/llms/src/text-generation/README_llamacpp.md @@ -16,9 +16,9 @@ To download an example .gguf model to a model path: ```bash export MODEL_PATH=~/models -mkdir $MODEL_PATH +mkdir -p $MODEL_PATH # -p means make only if doesn't exist cd $MODEL_PATH -wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf +wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf ```` ### 2. Set Environment Variables @@ -29,13 +29,13 @@ export host_ip=$(hostname -I | awk '{print $1}') export TEXTGEN_PORT=9000 export LLM_ENDPOINT_PORT=8008 export LLM_ENDPOINT="http://${host_ip}:80" -export LLM_MODEL_ID="models/Phi-3-mini-4k-instruct-q4.gguf" +export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf" export LLAMA_ARG_CTX_SIZE=4096 ``` ### 3. 
Run the llama.cpp OPEA Microservice ```bash -export service_name="textgen-llamacpp" +export service_name="textgen-service-llamacpp" cd comps/llms/deployment/docker_compose/ docker compose -f compose_text-generation.yaml up ${service_name} -d ``` @@ -51,7 +51,6 @@ curl http://0.0.0.0:8008/v1/chat/completions \ -H "Content-Type: application/json" \ -H "Authorization: Bearer no-key" \ -d '{ - "model": "models/Phi-3-mini-4k-instruct-q4.gguf", "messages": [ { "role": "system", @@ -73,7 +72,6 @@ This component is based on openAI API convention: curl -X POST http://localhost:9000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "models/Phi-3-mini-4k-instruct-q4.gguf", "messages": [{"role": "user", "content": "Write a limerick about python exceptions"}], "max_tokens": 100, "temperature": 0.7, From f295e29e246f7d6e73574f52eb6261e4291b2e2e Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:23:35 -0600 Subject: [PATCH 15/31] Update llamacpp thirdparty readme to use smaller model Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/third_parties/llamacpp/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/comps/third_parties/llamacpp/README.md b/comps/third_parties/llamacpp/README.md index 8f9b7e627..d08e3afb8 100644 --- a/comps/third_parties/llamacpp/README.md +++ b/comps/third_parties/llamacpp/README.md @@ -17,9 +17,10 @@ To download an example .gguf model to a model path: ```bash export MODEL_PATH=~/models -mkdir $MODEL_PATH +mkdir -p $MODEL_PATH # -p means make only if doesn't exist cd $MODEL_PATH -wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf + +wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf ```` ### 2. 
Set Environment Variables @@ -28,7 +29,7 @@ wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/P export MODEL_PATH=~/models export host_ip=$(hostname -I | awk '{print $1}') export LLM_ENDPOINT_PORT=8008 -export LLM_MODEL_ID="models/Phi-3-mini-4k-instruct-q4.gguf" +export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf" export LLAMA_ARG_CTX_SIZE=4096 ``` From 480cb6900d76cb3bbb6fa4238c2073ef11893eb0 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:25:18 -0600 Subject: [PATCH 16/31] Fix healthcheck in llamacpp deployment compose.yaml Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../llamacpp/deployment/docker_compose/compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml index a1058de2b..49ca5a8c4 100644 --- a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml +++ b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml @@ -26,7 +26,7 @@ services: LLAMA_ARG_ENDPOINT_METRICS: 1 ipc: host healthcheck: - test: [ "CMD-SHELL", "curl -f http://${host_ip}:80/health || exit 1" ] + test: [ "CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1" ] interval: 10s timeout: 10s retries: 150 From 2c9f877e8584a49ed1b011515db1188f47ec1e7b Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:26:21 -0600 Subject: [PATCH 17/31] Wrote a test and tested for llamacpp text gen service Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- ...t_llms_text-generation_service_llamacpp.sh | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 tests/llms/test_llms_text-generation_service_llamacpp.sh diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh new file mode 100644 index 000000000..d51399bf6 --- /dev/null +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# Copyright (C) 2024 Prediction Guard, Inc. +# SPDX-License-Identifier: Apache-2.0 + +set -x + +IMAGE_REPO=${IMAGE_REPO:-"opea"} +export REGISTRY=${IMAGE_REPO} +export TAG="comps" +echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" +echo "TAG=${TAG}" + +WORKPATH=$(dirname "$PWD") # Assumes the script is called from GenAIComps/comps +host_ip=$(hostname -I | awk '{print $1}') # Adjust to a more reliable command +if [ -z "$host_ip" ]; then + host_ip="localhost" # Default to localhost if IP address is empty +fi +LOG_PATH="$WORKPATH/tests" +service_name="textgen-service-llamacpp" + + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t ${REGISTRY:-opea}/llm-textgen:${TAG:-latest} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/llm-textgen built fail" + exit 1 + else + echo "opea/llm-textgen built successful" + fi +} + +function start_service() { + export LLM_ENDPOINT_PORT=8008 + export LLM_ENDPOINT="http://${host_ip}:80" + export TEXTGEN_PORT=9000 + export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf" + export LLAMA_ARG_CTX_SIZE=4096 + export LOGFLAG=True + + export MODEL_PATH=~/models + mkdir -p $MODEL_PATH + cd $MODELPATH + wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 60 # Sleep for 1 minute to allow the service to start +} + +function validate_microservice() { + result=$(http_proxy="" curl -X POST http://${host_ip}:${TEXTGEN_PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [{"role": "user", "content": "What is AI?"}], + "max_tokens": 100, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50, + "stream": false + }') + + if [[ $result == *"content"* ]]; then + echo "Service response is correct." + else + echo "Result wrong. Received was $result" + docker logs ${service_name} + exit 1 + fi +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans +} + +function main() { + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune +} + +main +set +x From 7310d6a8aa22f0210e432fcb363be282ccb29194 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Feb 2025 21:29:49 +0000 Subject: [PATCH 18/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/llms/src/text-generation/README_llamacpp.md | 5 +++-- comps/third_parties/llamacpp/README.md | 10 ++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/comps/llms/src/text-generation/README_llamacpp.md b/comps/llms/src/text-generation/README_llamacpp.md index 91ec27416..237d515f2 100644 --- a/comps/llms/src/text-generation/README_llamacpp.md +++ b/comps/llms/src/text-generation/README_llamacpp.md @@ -19,7 +19,7 @@ export MODEL_PATH=~/models mkdir -p $MODEL_PATH # -p means make only if doesn't exist cd $MODEL_PATH wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf -```` +``` ### 2. Set Environment Variables @@ -32,6 +32,7 @@ export LLM_ENDPOINT="http://${host_ip}:80" export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf" export LLAMA_ARG_CTX_SIZE=4096 ``` + ### 3. Run the llama.cpp OPEA Microservice ```bash @@ -79,4 +80,4 @@ curl -X POST http://localhost:9000/v1/chat/completions \ "top_k": 50, "stream": false }' -``` \ No newline at end of file +``` diff --git a/comps/third_parties/llamacpp/README.md b/comps/third_parties/llamacpp/README.md index d08e3afb8..3f051ca32 100644 --- a/comps/third_parties/llamacpp/README.md +++ b/comps/third_parties/llamacpp/README.md @@ -8,7 +8,6 @@ llama.cpp supports this [hardware](https://github.com/ggerganov/llama.cpp?tab=re To use a CUDA server please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly. 
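As a rough sanity check before adapting the compose file for GPUs, the CUDA build of the server can be run directly. This is only a sketch: the `server-cuda` image tag, the model filename, and the flag values are assumptions to verify against the llama.cpp server documentation, and it presumes the NVIDIA container toolkit is installed.

```bash
# Sketch: run the CUDA variant of the llama.cpp server against a local .gguf model.
docker run --rm --gpus all -v ~/models:/models -p 8008:8080 \
  ghcr.io/ggerganov/llama.cpp:server-cuda \
  -m /models/qwen2.5-1.5b-instruct-q4_k_m.gguf --n-gpu-layers 99 --host 0.0.0.0 --port 8080
```

If this starts and `curl http://localhost:8008/health` responds, the same image and GPU settings can then be carried into docker_compose_llm.yaml.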
- ## Get Started ### 1. Download a gguf Model @@ -21,7 +20,7 @@ mkdir -p $MODEL_PATH # -p means make only if doesn't exist cd $MODEL_PATH wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf -```` +``` ### 2. Set Environment Variables @@ -37,11 +36,10 @@ export LLAMA_ARG_CTX_SIZE=4096 ```bash cd deployment/docker_compose -docker compose -f compose.yaml up llamacpp-server -d +docker compose -f compose.yaml up llamacpp-server -d ``` -To use this in an OPEA text generation component please see [llama.cpp text-generation]( -../../llms/src/text-generation/README_llamacpp.md) +To use this in an OPEA text generation component please see [llama.cpp text-generation](../../llms/src/text-generation/README_llamacpp.md) Note: can use docker logs to observe server. @@ -54,4 +52,4 @@ curl http://${host_ip}:8008/v1/chat/completions \ -X POST \ -H "Content-Type: application/json" \ -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -``` \ No newline at end of file +``` From efde309de6bbf8dd0eb454dbdd19c4d5fb1a14bf Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 16:11:27 -0600 Subject: [PATCH 19/31] Increase the llamacpp-server wait time Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- tests/llms/test_llms_text-generation_service_llamacpp.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index d51399bf6..63d3dda6d 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -43,9 +43,8 @@ function start_service() { cd $MODELPATH wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf cd $WORKPATH/comps/llms/deployment/docker_compose - docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log - - sleep 60 # Sleep for 1 minute to allow the service to start + docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose_llama.log + sleep 120 # Allow the service to start } function validate_microservice() { From c474a643b737c7940a92347ca0f5aad247179210 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 16:34:59 -0600 Subject: [PATCH 20/31] Fixed typos on http environment variables, and volumes Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../llamacpp/deployment/docker_compose/compose.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml index 49ca5a8c4..e1544d3a8 100644 --- a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml +++ b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml @@ -9,11 +9,12 @@ services: - ${LLM_ENDPOINT_PORT:-8008}:80 volumes: # Download the .gguf models to this path. 
- - ${MODEL_PATH:-~/models}:/models + - "${MODEL_PATH:-~/models}:/models" environment: LOGFLAG: False - HTTPS_PROXY: ${http_proxy} - HTTP_PROXY: ${https_proxy} + no_proxy: ${no_proxy} + https_proxy: ${http_proxy} + http_proxy: ${https_proxy} LLM_MODEL_ID: ${LLM_MODEL_ID} LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} host_ip: ${host_ip} @@ -29,7 +30,7 @@ services: test: [ "CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1" ] interval: 10s timeout: 10s - retries: 150 + retries: 100 networks: default: From 712f575a1dec18ff4d352fc2cd863e9347f713e1 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 17:06:17 -0600 Subject: [PATCH 21/31] Splitting the llama.cpp test to use compose up on the llama.cpp third-party service first. Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../test_llms_text-generation_service_llamacpp.sh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index 63d3dda6d..4032c377e 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -41,10 +41,18 @@ function start_service() { export MODEL_PATH=~/models mkdir -p $MODEL_PATH cd $MODELPATH - wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf + wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf \ + -q --show-progress --progress=bar + + # Spin up the third party service first before compose_text-generation.yaml, + # otherwise there's a dependency error. Doesn't have this error when running locally. + cd $WORKPATH/comps/third_parties/llamacpp/deployment/docker_compose/ + docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose_llamacpp.log + sleep 20s + cd $WORKPATH/comps/llms/deployment/docker_compose - docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose_llama.log - sleep 120 # Allow the service to start + docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log + sleep 60s # Allow the service to start } function validate_microservice() { From 68cc00f4d6ac18a43eb57db4aa2e7d7e7d23d802 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 18:28:03 -0600 Subject: [PATCH 22/31] add alternate command to stop and remove docker containers from previous tests Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../test_llms_text-generation_service_llamacpp.sh | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index 4032c377e..84fd4b8ad 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -46,13 +46,15 @@ function start_service() { # Spin up the third party service first before compose_text-generation.yaml, # otherwise there's a dependency error. Doesn't have this error when running locally. 
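  # A possible alternative to ordering the compose invocations by hand (a sketch, assuming
  # compose_text-generation.yaml defines the llamacpp-server service): give the textgen service a
  # "depends_on: llamacpp-server: condition: service_healthy" entry so Docker Compose waits on the
  # server healthcheck instead of relying on start order and sleeps.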
- cd $WORKPATH/comps/third_parties/llamacpp/deployment/docker_compose/ - docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose_llamacpp.log - sleep 20s +# cd $WORKPATH/comps/third_parties/llamacpp/deployment/docker_compose/ +# docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose_llamacpp.log +# sleep 20s cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log + docker ps -a sleep 60s # Allow the service to start + docker ps -a } function validate_microservice() { @@ -82,14 +84,16 @@ function stop_docker() { } function main() { - stop_docker + # stop_docker + # Trying this because stop_docker may not stop and remove containers from previous run tests and may block ports. + docker stop $(docker ps -a -q) && docker rm $(docker ps -a -q) build_docker_images start_service validate_microservice - stop_docker + stop_dockerllm-textgen echo y | docker system prune } From 2dd20646fe41e6dae47995bc8e6a2cf66299df7d Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 18:34:24 -0600 Subject: [PATCH 23/31] Modifying tear down of stop_docker in llamacpp tests to try to remove all containers. Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- tests/llms/test_llms_text-generation_service_llamacpp.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index 84fd4b8ad..acfe9f15c 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -80,20 +80,19 @@ function validate_microservice() { function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose - docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans + # docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans + docker compose -f compose_text-generation.yaml down --remove-orphans } function main() { - # stop_docker - # Trying this because stop_docker may not stop and remove containers from previous run tests and may block ports. 
- docker stop $(docker ps -a -q) && docker rm $(docker ps -a -q) + stop_docker build_docker_images start_service validate_microservice - stop_dockerllm-textgen + stop_docker echo y | docker system prune } From dbff6fcfd1102fcef0b675190565610a0ab19863 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 18:46:05 -0600 Subject: [PATCH 24/31] Adding some logs output to debug llamacpp test Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- tests/llms/test_llms_text-generation_service_llamacpp.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index acfe9f15c..c500b041f 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -53,8 +53,10 @@ function start_service() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log docker ps -a + docker logs llamacpp-server sleep 60s # Allow the service to start docker ps -a + docker logs llamacpp-server } function validate_microservice() { From f184897fce78469e0b672e2210dd0006da38b214 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 19:04:14 -0600 Subject: [PATCH 25/31] Found model path bug and fixed it to run llama.cpp test Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../llamacpp/deployment/docker_compose/compose.yaml | 2 +- tests/llms/test_llms_text-generation_service_llamacpp.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml index e1544d3a8..c352db8e3 100644 --- a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml +++ b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml @@ -9,7 +9,7 @@ services: - ${LLM_ENDPOINT_PORT:-8008}:80 volumes: # Download the .gguf models to this path. - - "${MODEL_PATH:-~/models}:/models" + - ${MODEL_PATH:-~/models}:/models environment: LOGFLAG: False no_proxy: ${no_proxy} diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index c500b041f..d08b42bd8 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -40,9 +40,9 @@ function start_service() { export MODEL_PATH=~/models mkdir -p $MODEL_PATH - cd $MODELPATH + cd $MODEL_PATH wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf \ - -q --show-progress --progress=bar + --show-progress --progress=bar # Spin up the third party service first before compose_text-generation.yaml, # otherwise there's a dependency error. Doesn't have this error when running locally. 
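The test above waits on fixed `sleep` values after `docker compose up`. A readiness poll against the llama.cpp `/health` endpoint (the same endpoint the compose healthcheck uses) is one possible refinement; the sketch below assumes `host_ip` and `LLM_ENDPOINT_PORT` are exported as in `start_service`:

```bash
# Poll the llama.cpp server until /health responds, instead of sleeping a fixed time.
for _ in $(seq 1 60); do
  if curl -sf "http://${host_ip}:${LLM_ENDPOINT_PORT}/health" > /dev/null; then
    echo "llama.cpp server is ready"
    break
  fi
  sleep 5
done
```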
From ea4ea388ad6a5bc753da3e86530e54994e7b2dbb Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 23:00:57 -0600 Subject: [PATCH 26/31] Adjusted LLM_ENDPOINT env variable Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- tests/llms/test_llms_text-generation_service_llamacpp.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index d08b42bd8..fc1f619e6 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -32,7 +32,7 @@ function build_docker_images() { function start_service() { export LLM_ENDPOINT_PORT=8008 - export LLM_ENDPOINT="http://${host_ip}:80" + export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" export TEXTGEN_PORT=9000 export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf" export LLAMA_ARG_CTX_SIZE=4096 @@ -55,8 +55,8 @@ function start_service() { docker ps -a docker logs llamacpp-server sleep 60s # Allow the service to start - docker ps -a - docker logs llamacpp-server +# docker ps -a +# docker logs llamacpp-server } function validate_microservice() { From 01fca036a91b97edf30c1300ca0eb24c3ea5036e Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 23:23:03 -0600 Subject: [PATCH 27/31] Cleaned up test file Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../test_llms_text-generation_service_llamacpp.sh | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index fc1f619e6..1ddb2449d 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -42,21 +42,13 @@ function start_service() { mkdir -p $MODEL_PATH cd $MODEL_PATH wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf \ - --show-progress --progress=bar - - # Spin up the third party service first before compose_text-generation.yaml, - # otherwise there's a dependency error. Doesn't have this error when running locally. -# cd $WORKPATH/comps/third_parties/llamacpp/deployment/docker_compose/ -# docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose_llamacpp.log -# sleep 20s + -q --show-progress --progress=bar cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log docker ps -a docker logs llamacpp-server sleep 60s # Allow the service to start -# docker ps -a -# docker logs llamacpp-server } function validate_microservice() { @@ -82,7 +74,7 @@ function validate_microservice() { function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose - # docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans + # Using down without particular service_name since there can be containers that aren't taken down from other tests. 
docker compose -f compose_text-generation.yaml down --remove-orphans } From dfd5057cd9465318f3ea3c13316e186de2b11585 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 23:53:41 -0600 Subject: [PATCH 28/31] Adjust host_ip env variable in scope of start_service Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- tests/llms/test_llms_text-generation_service_llamacpp.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index 1ddb2449d..af43ddbbe 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -31,6 +31,7 @@ function build_docker_images() { } function start_service() { + export host_ip=${host_ip} # must be an environment variable export LLM_ENDPOINT_PORT=8008 export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" export TEXTGEN_PORT=9000 From 4a965da9b0dd4373ac7850b001f686f09e283a24 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Mon, 24 Feb 2025 10:56:15 -0600 Subject: [PATCH 29/31] Docker ps to debug orphaned containers. Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../test_llms_faq-generation_vllm_on_intel_hpu.sh | 5 ++++- .../test_llms_text-generation_service_llamacpp.sh | 14 ++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh b/tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh index 8607f2c55..f176bffc6 100644 --- a/tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh @@ -119,8 +119,11 @@ function stop_docker() { } function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a build_docker_images start_service diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index af43ddbbe..a482724ef 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -31,7 +31,7 @@ function build_docker_images() { } function start_service() { - export host_ip=${host_ip} # must be an environment variable + export host_ip=${host_ip} # must be an environment variable declared in scope of start_service export LLM_ENDPOINT_PORT=8008 export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" export TEXTGEN_PORT=9000 @@ -49,7 +49,7 @@ function start_service() { docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log docker ps -a docker logs llamacpp-server - sleep 60s # Allow the service to start + sleep 30s # Allow the service to start } function validate_microservice() { @@ -75,13 +75,19 @@ function validate_microservice() { function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose - # Using down without particular service_name since there can be containers that aren't taken down from other tests. + # Using down without particular service_name since still can have orphan containers that aren't taken down from other tests. 
docker compose -f compose_text-generation.yaml down --remove-orphans } function main() { - stop_docker + echo "Docker containers before stop_docker" + docker ps -a + stop_docker + echo "Docker containers after stop_docker" + docker ps -a + + stop_docker build_docker_images start_service From 32b06e9469c562e7421ef25f9767b140eeeeea70 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Feb 2025 16:57:38 +0000 Subject: [PATCH 30/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/llms/test_llms_text-generation_service_llamacpp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index a482724ef..1f2f4fcf1 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -86,7 +86,7 @@ function main() { stop_docker echo "Docker containers after stop_docker" docker ps -a - + stop_docker build_docker_images start_service From 33635042572934681279ce3066ac8c76747aa8fb Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Mon, 24 Feb 2025 11:29:10 -0600 Subject: [PATCH 31/31] Adding output to debug orphaned docker containers Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- tests/llms/test_llms_doc-summarization_tgi.sh | 8 +++++--- .../llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh | 8 +++++--- tests/llms/test_llms_doc-summarization_vllm.sh | 8 +++++--- .../llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh | 8 +++++--- tests/llms/test_llms_faq-generation_tgi.sh | 8 +++++--- tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh | 8 +++++--- tests/llms/test_llms_faq-generation_vllm.sh | 8 +++++--- .../llms/test_llms_text-generation_native_on_intel_hpu.sh | 8 +++++--- tests/llms/test_llms_text-generation_service_llamacpp.sh | 2 +- tests/llms/test_llms_text-generation_service_ollama.sh | 5 +++++ tests/llms/test_llms_text-generation_service_tgi.sh | 4 ++++ .../test_llms_text-generation_service_tgi_on_intel_hpu.sh | 4 ++++ ...test_llms_text-generation_service_vllm_on_intel_hpu.sh | 8 +++++--- 13 files changed, 59 insertions(+), 28 deletions(-) diff --git a/tests/llms/test_llms_doc-summarization_tgi.sh b/tests/llms/test_llms_doc-summarization_tgi.sh index 16e201854..a07000d89 100644 --- a/tests/llms/test_llms_doc-summarization_tgi.sh +++ b/tests/llms/test_llms_doc-summarization_tgi.sh @@ -140,10 +140,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh index b8c97f5b6..c14c4e1eb 100644 --- a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh @@ -141,10 +141,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a 
stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_doc-summarization_vllm.sh b/tests/llms/test_llms_doc-summarization_vllm.sh index 42e79aa1e..55d8b2ccc 100644 --- a/tests/llms/test_llms_doc-summarization_vllm.sh +++ b/tests/llms/test_llms_doc-summarization_vllm.sh @@ -155,10 +155,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh index a6096bd30..b245b57c7 100644 --- a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh @@ -154,10 +154,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_faq-generation_tgi.sh b/tests/llms/test_llms_faq-generation_tgi.sh index d0ae7aa95..b95389a27 100644 --- a/tests/llms/test_llms_faq-generation_tgi.sh +++ b/tests/llms/test_llms_faq-generation_tgi.sh @@ -102,10 +102,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_faq-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh b/tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh index 50b1524c0..bf4be175f 100644 --- a/tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh @@ -103,10 +103,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_faq-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_faq-generation_vllm.sh b/tests/llms/test_llms_faq-generation_vllm.sh index 588ed4981..43b7b1c65 100644 --- a/tests/llms/test_llms_faq-generation_vllm.sh +++ b/tests/llms/test_llms_faq-generation_vllm.sh @@ -118,10 +118,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_faq-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_text-generation_native_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_native_on_intel_hpu.sh index 0d39a8690..d348d2673 100644 --- a/tests/llms/test_llms_text-generation_native_on_intel_hpu.sh +++ 
b/tests/llms/test_llms_text-generation_native_on_intel_hpu.sh @@ -87,10 +87,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service validate_microservice diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index a482724ef..1f2f4fcf1 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -86,7 +86,7 @@ function main() { stop_docker echo "Docker containers after stop_docker" docker ps -a - + stop_docker build_docker_images start_service diff --git a/tests/llms/test_llms_text-generation_service_ollama.sh b/tests/llms/test_llms_text-generation_service_ollama.sh index d5087ce7e..dbf638e13 100644 --- a/tests/llms/test_llms_text-generation_service_ollama.sh +++ b/tests/llms/test_llms_text-generation_service_ollama.sh @@ -69,7 +69,12 @@ function stop_docker() { function main() { + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images llm_models=( diff --git a/tests/llms/test_llms_text-generation_service_tgi.sh b/tests/llms/test_llms_text-generation_service_tgi.sh index c60447025..0e691c65f 100644 --- a/tests/llms/test_llms_text-generation_service_tgi.sh +++ b/tests/llms/test_llms_text-generation_service_tgi.sh @@ -118,7 +118,11 @@ function stop_docker() { function main() { + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a build_docker_images pip install --no-cache-dir openai pydantic diff --git a/tests/llms/test_llms_text-generation_service_tgi_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_service_tgi_on_intel_hpu.sh index c91a51498..efa3809b8 100644 --- a/tests/llms/test_llms_text-generation_service_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_service_tgi_on_intel_hpu.sh @@ -119,7 +119,11 @@ function stop_docker() { function main() { + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a build_docker_images pip install --no-cache-dir openai pydantic diff --git a/tests/llms/test_llms_text-generation_service_vllm_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_service_vllm_on_intel_hpu.sh index ea8c9ee6c..63cb7f955 100644 --- a/tests/llms/test_llms_text-generation_service_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_service_vllm_on_intel_hpu.sh @@ -129,10 +129,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images pip install --no-cache-dir openai pydantic