From 397f7b80bd86a1e7da78e94f7e0f75b21436ac56 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Thu, 19 Dec 2024 21:35:50 -0600 Subject: [PATCH 01/31] First commit of llamacpp Opea component Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../llms/text-generation/llamacpp/Dockerfile | 27 ++++++ comps/llms/text-generation/llamacpp/README.md | 84 +++++++++++++++++++ .../llms/text-generation/llamacpp/__init__.py | 2 + .../llamacpp/docker_compose_llm.yaml | 39 +++++++++ .../text-generation/llamacpp/entrypoint.sh | 8 ++ comps/llms/text-generation/llamacpp/llm.py | 65 ++++++++++++++ .../llamacpp/requirements-runtime.txt | 1 + .../text-generation/llamacpp/requirements.txt | 12 +++ 8 files changed, 238 insertions(+) create mode 100644 comps/llms/text-generation/llamacpp/Dockerfile create mode 100644 comps/llms/text-generation/llamacpp/README.md create mode 100644 comps/llms/text-generation/llamacpp/__init__.py create mode 100644 comps/llms/text-generation/llamacpp/docker_compose_llm.yaml create mode 100644 comps/llms/text-generation/llamacpp/entrypoint.sh create mode 100644 comps/llms/text-generation/llamacpp/llm.py create mode 100644 comps/llms/text-generation/llamacpp/requirements-runtime.txt create mode 100644 comps/llms/text-generation/llamacpp/requirements.txt diff --git a/comps/llms/text-generation/llamacpp/Dockerfile b/comps/llms/text-generation/llamacpp/Dockerfile new file mode 100644 index 000000000..a362c3bf6 --- /dev/null +++ b/comps/llms/text-generation/llamacpp/Dockerfile @@ -0,0 +1,27 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + curl \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +# Assumes we're building from the GenAIComps directory. +COPY ../../../comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/llms/text-generation/llamacpp/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/llms/text-generation/llamacpp/ + +ENTRYPOINT ["bash", "entrypoint.sh"] diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md new file mode 100644 index 000000000..b8f64aac0 --- /dev/null +++ b/comps/llms/text-generation/llamacpp/README.md @@ -0,0 +1,84 @@ +# Introduction + +[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++, and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud". + +This OPEA component wraps llama.cpp server so that it can interface with other OPEA components, or for creating OPEA Megaservices. + +## TLDR + +```bash +cd GenAIComps/ +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yml up +``` + +Please note it's instructive to run and validate each the llama.cpp server and OPEA component below. + +## 1. Run the llama.cpp server + +```bash +cd GenAIComps +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-server --force-recreate +``` + +Notes: + +i) If you prefer to run above in the background without screen output use `up -d` . The `--force-recreate` clears cache. 
+ +ii) To tear down the llama.cpp server and remove the container: + +`docker compose -f comps/llms/text-generation/llamacpp/langchain/docker_compose_llm.yaml llamacpp-server down` + +iii) For [llama.cpp settings](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md) please specify them in the docker_compose_llm.yaml file. + +#### Verify the llama.cpp Service: + +```bash +curl --request POST \ + --url http://localhost:8080/completion \ + --header "Content-Type: application/json" \ + --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' +``` + +## 2. Run the llama.cpp OPEA Service + +This is essentially a wrapper component of Llama.cpp server. OPEA nicely standardizes and verifies LLM inputs with LLMParamsDoc class (see llm.py). + +### 2.1 Build the llama.cpp OPEA image: + +```bash +cd GenAIComps/ +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yml up llama-opea-llm +``` + +Equivalently, the above can be achieved with `build` and `run` from the Dockerfile. Build: + +```bash +cd GenAIComps/ +docker build --no-cache -t opea/llm-llamacpp:latest \ + --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy \ + -f comps/llms/text-generation/llamacpp/Dockerfile . +``` + +And run: + +```bash +docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy \ + opea/llm-llamacpp:latest +``` + +### 2.3 Consume the llama.cpp Microservice: + +```bash +curl http://127.0.0.1:9000/v1/chat/completions -X POST \ + -d '{"query":"What is Deep Learning?","max_tokens":32,"top_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \ + -H 'Content-Type: application/json' +``` + +### Notes + +Tearing down services and removing containers: + +```bash +cd GenAIComps/comps/llms/text-generation/llamacpp/ +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml down +``` diff --git a/comps/llms/text-generation/llamacpp/__init__.py b/comps/llms/text-generation/llamacpp/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/llms/text-generation/llamacpp/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml new file mode 100644 index 000000000..88937ff0d --- /dev/null +++ b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml @@ -0,0 +1,39 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + llamacpp-server: + image: ghcr.io/ggerganov/llama.cpp:server + ports: + - 8080:8080 + environment: + # Refer to settings here: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md + # Llama.cpp is based on .gguf format, and Hugging Face offers many .gguf format models. + LLAMA_ARG_MODEL_URL: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf + LLAMA_ARG_CTX_SIZE: 4096 + LLAMA_ARG_N_PARALLEL: 2 + LLAMA_ARG_ENDPOINT_METRICS: 1 + LLAMA_ARG_PORT: 8080 + + llamacpp-opea-llm: + image: opea/llm-llamacpp:latest + build: + # Set this to allow COPY comps in the Dockerfile. + # When using docker compose with -f, the comps context is 4 levels down from docker_compose_llm.yaml. 
+ context: ../../../../ + dockerfile: ./comps/llms/text-generation/llamacpp/Dockerfile + depends_on: + - llamacpp-server + ports: + - "9000:9000" + network_mode: "host" # equivalent to: docker run --network host ... + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + # LLAMACPP_ENDPOINT: ${LLAMACPP_ENDPOINT} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/llms/text-generation/llamacpp/entrypoint.sh b/comps/llms/text-generation/llamacpp/entrypoint.sh new file mode 100644 index 000000000..c9a5a3d07 --- /dev/null +++ b/comps/llms/text-generation/llamacpp/entrypoint.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# pip --no-cache-dir install -r requirements-runtime.txt + +python llm.py diff --git a/comps/llms/text-generation/llamacpp/llm.py b/comps/llms/text-generation/llamacpp/llm.py new file mode 100644 index 000000000..5612199eb --- /dev/null +++ b/comps/llms/text-generation/llamacpp/llm.py @@ -0,0 +1,65 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +import openai +from fastapi.responses import StreamingResponse + +from comps import CustomLogger, LLMParamsDoc, ServiceType, opea_microservices, register_microservice + +logger = CustomLogger("llm_llamacpp") +logflag = os.getenv("LOGFLAG", False) +llamacpp_endpoint = os.getenv("LLAMACPP_ENDPOINT", "http://localhost:8080/") + + +# OPEA microservice wrapper of llama.cpp +# llama.cpp server uses openai API format: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md +@register_microservice( + name="opea_service@llm_llamacpp", + service_type=ServiceType.LLM, + endpoint="/v1/chat/completions", + host="0.0.0.0", + port=9000, +) +async def llm_generate(input: LLMParamsDoc): + if logflag: + logger.info(input) + logger.info(llamacpp_endpoint) + + client = openai.OpenAI( + base_url=llamacpp_endpoint, api_key="sk-no-key-required" # "http://:port" + ) + + # Llama.cpp works with openai API format + # The openai api doesn't have top_k parameter + # https://community.openai.com/t/which-openai-gpt-models-if-any-allow-specifying-top-k/777982/2 + chat_completion = client.chat.completions.create( + model=input.model, + messages=[{"role": "user", "content": input.query}], + max_tokens=input.max_tokens, + temperature=input.temperature, + top_p=input.top_p, + frequency_penalty=input.frequency_penalty, + presence_penalty=input.presence_penalty, + stream=input.streaming, + ) + + if input.streaming: + + def stream_generator(): + for c in chat_completion: + if logflag: + logger.info(c) + yield f"data: {c.model_dump_json()}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(stream_generator(), media_type="text/event-stream") + else: + if logflag: + logger.info(chat_completion) + return chat_completion + + +if __name__ == "__main__": + opea_microservices["opea_service@llm_llamacpp"].start() diff --git a/comps/llms/text-generation/llamacpp/requirements-runtime.txt b/comps/llms/text-generation/llamacpp/requirements-runtime.txt new file mode 100644 index 000000000..225adde27 --- /dev/null +++ b/comps/llms/text-generation/llamacpp/requirements-runtime.txt @@ -0,0 +1 @@ +langserve diff --git a/comps/llms/text-generation/llamacpp/requirements.txt b/comps/llms/text-generation/llamacpp/requirements.txt new file mode 100644 index 000000000..fdb5f5a01 --- /dev/null +++ b/comps/llms/text-generation/llamacpp/requirements.txt @@ -0,0 +1,12 @@ 
+aiohttp +docarray[full] +fastapi +huggingface_hub +openai +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +shortuuid +transformers +uvicorn From cb4f5e59a53161ea893dc6fa38ee49266d7a3f69 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Thu, 19 Dec 2024 21:50:26 -0600 Subject: [PATCH 02/31] Removed unneeded requirements file Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/text-generation/llamacpp/requirements-runtime.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 comps/llms/text-generation/llamacpp/requirements-runtime.txt diff --git a/comps/llms/text-generation/llamacpp/requirements-runtime.txt b/comps/llms/text-generation/llamacpp/requirements-runtime.txt deleted file mode 100644 index 225adde27..000000000 --- a/comps/llms/text-generation/llamacpp/requirements-runtime.txt +++ /dev/null @@ -1 +0,0 @@ -langserve From 2a48bae8e3231c82a370b18ae681968997ed36b7 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Mon, 6 Jan 2025 15:38:25 -0600 Subject: [PATCH 03/31] Pin the llama.cpp server version, and fix small typo Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/text-generation/llamacpp/README.md | 2 +- comps/llms/text-generation/llamacpp/docker_compose_llm.yaml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md index b8f64aac0..7b7ffa7d5 100644 --- a/comps/llms/text-generation/llamacpp/README.md +++ b/comps/llms/text-generation/llamacpp/README.md @@ -8,7 +8,7 @@ This OPEA component wraps llama.cpp server so that it can interface with other O ```bash cd GenAIComps/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yml up +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up ``` Please note it's instructive to run and validate each the llama.cpp server and OPEA component below. diff --git a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml index 88937ff0d..9a718661b 100644 --- a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml +++ b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml @@ -3,7 +3,8 @@ services: llamacpp-server: - image: ghcr.io/ggerganov/llama.cpp:server + # image: ghcr.io/ggerganov/llama.cpp:server + image: ghcr.io/ggerganov/llama.cpp:server-b4419 ports: - 8080:8080 environment: From 4e8215225a2afb528137dc0598731af59e42e1bb Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Mon, 6 Jan 2025 15:55:50 -0600 Subject: [PATCH 04/31] Update README.md to describe hardware support, and provide reference. Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/text-generation/llamacpp/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md index 7b7ffa7d5..d1e5054a2 100644 --- a/comps/llms/text-generation/llamacpp/README.md +++ b/comps/llms/text-generation/llamacpp/README.md @@ -4,6 +4,10 @@ This OPEA component wraps llama.cpp server so that it can interface with other OPEA components, or for creating OPEA Megaservices. 
+llama.cpp supports this [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends), and has only been tested on CPU. + +To use a CUDA server please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly. + ## TLDR ```bash @@ -47,7 +51,7 @@ This is essentially a wrapper component of Llama.cpp server. OPEA nicely standar ```bash cd GenAIComps/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yml up llama-opea-llm +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llama-opea-llm ``` Equivalently, the above can be achieved with `build` and `run` from the Dockerfile. Build: From baf381dca98ae237347db41fb0fcdd4b64943f86 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Mon, 6 Jan 2025 16:03:42 -0600 Subject: [PATCH 05/31] Updated docker_compose_llm.yaml so that the llamacpp-server so the pulled image has specific tag. Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/text-generation/llamacpp/docker_compose_llm.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml index 9a718661b..dd220b6f1 100644 --- a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml +++ b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml @@ -3,7 +3,6 @@ services: llamacpp-server: - # image: ghcr.io/ggerganov/llama.cpp:server image: ghcr.io/ggerganov/llama.cpp:server-b4419 ports: - 8080:8080 From 9d7539dd213b017879c607b94c4336520b8fc64e Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Tue, 7 Jan 2025 11:04:43 -0600 Subject: [PATCH 06/31] Small adjustments to README.md Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/text-generation/llamacpp/README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md index d1e5054a2..15a96ca1f 100644 --- a/comps/llms/text-generation/llamacpp/README.md +++ b/comps/llms/text-generation/llamacpp/README.md @@ -28,9 +28,9 @@ Notes: i) If you prefer to run above in the background without screen output use `up -d` . The `--force-recreate` clears cache. -ii) To tear down the llama.cpp server and remove the container: +ii) To stop the llama.cpp server: -`docker compose -f comps/llms/text-generation/llamacpp/langchain/docker_compose_llm.yaml llamacpp-server down` +`docker compose -f comps/llms/text-generation/llamacpp/langchain/docker_compose_llm.yaml llamacpp-server stop` iii) For [llama.cpp settings](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md) please specify them in the docker_compose_llm.yaml file. @@ -80,9 +80,11 @@ curl http://127.0.0.1:9000/v1/chat/completions -X POST \ ### Notes -Tearing down services and removing containers: +Stopping services: ```bash cd GenAIComps/comps/llms/text-generation/llamacpp/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml down +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml stop ``` + +`down` may be used instead of 'stop' if you'd like to stop and delete the containers. 
\ No newline at end of file From fd15ee7529e98ae81c8d4b04483e6f2f1209215c Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 10 Jan 2025 13:13:47 -0600 Subject: [PATCH 07/31] This removes unneeded dependencies in the Dockerfile, unneeded entrypoint.sh Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/text-generation/llamacpp/Dockerfile | 9 ++------- comps/llms/text-generation/llamacpp/README.md | 8 ++++---- .../text-generation/llamacpp/docker_compose_llm.yaml | 3 +-- comps/llms/text-generation/llamacpp/entrypoint.sh | 8 -------- 4 files changed, 7 insertions(+), 21 deletions(-) delete mode 100644 comps/llms/text-generation/llamacpp/entrypoint.sh diff --git a/comps/llms/text-generation/llamacpp/Dockerfile b/comps/llms/text-generation/llamacpp/Dockerfile index a362c3bf6..70500e35d 100644 --- a/comps/llms/text-generation/llamacpp/Dockerfile +++ b/comps/llms/text-generation/llamacpp/Dockerfile @@ -3,18 +3,13 @@ FROM python:3.11-slim -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - curl \ - libgl1-mesa-glx \ - libjemalloc-dev - RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ chown -R user /home/user/ USER user -# Assumes we're building from the GenAIComps directory. +# Assumes we're building from the GenAIComps directory, and docker file is in comps/llms/text-generation/llamacpp COPY ../../../comps /home/user/comps RUN pip install --no-cache-dir --upgrade pip setuptools && \ @@ -24,4 +19,4 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/llms/text-generation/llamacpp/ -ENTRYPOINT ["bash", "entrypoint.sh"] +ENTRYPOINT ["python", "llm.py"] \ No newline at end of file diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md index 15a96ca1f..e03fd7c36 100644 --- a/comps/llms/text-generation/llamacpp/README.md +++ b/comps/llms/text-generation/llamacpp/README.md @@ -21,12 +21,12 @@ Please note it's instructive to run and validate each the llama.cpp server and O ```bash cd GenAIComps -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-server --force-recreate +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-server ``` Notes: -i) If you prefer to run above in the background without screen output use `up -d` . The `--force-recreate` clears cache. +i) If you prefer to run above in the background without screen output use `up -d`. ii) To stop the llama.cpp server: @@ -51,7 +51,7 @@ This is essentially a wrapper component of Llama.cpp server. OPEA nicely standar ```bash cd GenAIComps/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llama-opea-llm +docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-opea-llm --force-recreate ``` Equivalently, the above can be achieved with `build` and `run` from the Dockerfile. Build: @@ -87,4 +87,4 @@ cd GenAIComps/comps/llms/text-generation/llamacpp/ docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml stop ``` -`down` may be used instead of 'stop' if you'd like to stop and delete the containers. \ No newline at end of file +`down` may be used instead of 'stop' if you'd like to stop, and delete the containers and networks. 
\ No newline at end of file diff --git a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml index dd220b6f1..d66d93afd 100644 --- a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml +++ b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml @@ -18,7 +18,7 @@ services: llamacpp-opea-llm: image: opea/llm-llamacpp:latest build: - # Set this to allow COPY comps in the Dockerfile. + # This context is to allow the 'COPY comps' command in the Dockerfile. # When using docker compose with -f, the comps context is 4 levels down from docker_compose_llm.yaml. context: ../../../../ dockerfile: ./comps/llms/text-generation/llamacpp/Dockerfile @@ -31,7 +31,6 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - # LLAMACPP_ENDPOINT: ${LLAMACPP_ENDPOINT} restart: unless-stopped networks: diff --git a/comps/llms/text-generation/llamacpp/entrypoint.sh b/comps/llms/text-generation/llamacpp/entrypoint.sh deleted file mode 100644 index c9a5a3d07..000000000 --- a/comps/llms/text-generation/llamacpp/entrypoint.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# pip --no-cache-dir install -r requirements-runtime.txt - -python llm.py From c931902d616cb692fc3ffb54af4ad165adcdde4d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 19:18:51 +0000 Subject: [PATCH 08/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/llms/text-generation/llamacpp/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md index e03fd7c36..00b8e0b77 100644 --- a/comps/llms/text-generation/llamacpp/README.md +++ b/comps/llms/text-generation/llamacpp/README.md @@ -87,4 +87,4 @@ cd GenAIComps/comps/llms/text-generation/llamacpp/ docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml stop ``` -`down` may be used instead of 'stop' if you'd like to stop, and delete the containers and networks. \ No newline at end of file +`down` may be used instead of 'stop' if you'd like to stop, and delete the containers and networks. 
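As the series stands at this point, the wrapper accepts the LLMParamsDoc-style payload shown in the README's curl example (a `query` string plus `max_tokens`, `streaming`, and sampling fields) served on port 9000. A minimal Python sketch of the same call follows; the URL, port, and field names simply mirror that curl example and llm.py, so treat them as assumptions to adapt rather than a fixed contract.

```python
# Sketch of a Python client for the wrapper microservice built in the patches
# above. Endpoint, port, and payload fields mirror the README curl example and
# the LLMParamsDoc fields handled by llm.py; adjust them to your deployment.
import json

import requests

OPEA_LLM_URL = "http://127.0.0.1:9000/v1/chat/completions"  # port published by docker_compose_llm.yaml

payload = {
    "query": "What is Deep Learning?",
    "max_tokens": 32,
    "top_p": 0.95,
    "temperature": 0.01,
    "repetition_penalty": 1.03,
    "streaming": False,
}

response = requests.post(OPEA_LLM_URL, json=payload, timeout=120)
response.raise_for_status()
# With streaming disabled, llm.py returns the chat completion, serialized here as JSON.
print(json.dumps(response.json(), indent=2))
```

With `"streaming": true`, llm.py instead emits server-sent events of the form `data: {...}` terminated by `data: [DONE]`, as implemented in its `stream_generator`.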
From a75d28dc2d2f0d1d11803ff5a509cc91a3fc9951 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 14 Feb 2025 15:50:33 -0600 Subject: [PATCH 09/31] Refactored llama cpp and text-generation README_llamacpp.md Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../src/text-generation/README_llamacpp.md | 50 +++++++++++++++++++ comps/third_parties/llamacpp/README.md | 30 +++++++++++ .../deployment/docker_compose/compose.yaml | 38 ++++++++++++++ 3 files changed, 118 insertions(+) create mode 100644 comps/llms/src/text-generation/README_llamacpp.md create mode 100644 comps/third_parties/llamacpp/README.md create mode 100644 comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml diff --git a/comps/llms/src/text-generation/README_llamacpp.md b/comps/llms/src/text-generation/README_llamacpp.md new file mode 100644 index 000000000..06680a98f --- /dev/null +++ b/comps/llms/src/text-generation/README_llamacpp.md @@ -0,0 +1,50 @@ +# Prediction Guard Introduction + +[Prediction Guard](https://docs.predictionguard.com) allows you to utilize hosted open access LLMs, LVMs, and embedding functionality with seamlessly integrated safeguards. In addition to providing a scalable access to open models, Prediction Guard allows you to configure factual consistency checks, toxicity filters, PII filters, and prompt injection blocking. Join the [Prediction Guard Discord channel](https://discord.gg/TFHgnhAFKd) and request an API key to get started. + +## Get Started + +### Run the Predictionguard Microservice + +```bash +export service_name="textgen-predictionguard" + +cd comps/llms/deployment/docker_compose/ +docker compose -f compose_text-generation.yaml up ${service_name} -d +``` + +## Consume the Prediction Guard Microservice + +See the [Prediction Guard docs](https://docs.predictionguard.com/) for available model options. + +### Without stream + +```bash +curl -X POST http://localhost:9000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Hermes-2-Pro-Llama-3-8B", + "messages": "Tell me a joke.", + "max_tokens": 100, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50, + "stream": false + }' +``` + +### With stream + +```bash +curl -N -X POST http://localhost:9000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Hermes-2-Pro-Llama-3-8B", + "messages": "Tell me a joke.", + "max_tokens": 100, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50, + "stream": true + }' +``` diff --git a/comps/third_parties/llamacpp/README.md b/comps/third_parties/llamacpp/README.md new file mode 100644 index 000000000..e12f6d34d --- /dev/null +++ b/comps/third_parties/llamacpp/README.md @@ -0,0 +1,30 @@ +# TGI LLM Microservice + +[Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more. + +## Start TGI with docker compose + +Set up environment. + +```bash +export LLM_ENDPOINT_PORT=8008 +export host_ip=${host_ip} +export HF_TOKEN=${HF_TOKEN} +export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export MAX_INPUT_TOKENS=1024 +export MAX_TOTAL_TOKENS=2048 +``` + +Run tgi on xeon. + +```bash +cd deplopyment/docker_compose +docker compose -f compose.yaml tgi-server up -d +``` + +Run tgi on gaudi. 
+ +```bash +cd deplopyment/docker_compose +docker compose -f compose.yaml tgi-gaudi-server up -d +``` diff --git a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml new file mode 100644 index 000000000..d66d93afd --- /dev/null +++ b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + llamacpp-server: + image: ghcr.io/ggerganov/llama.cpp:server-b4419 + ports: + - 8080:8080 + environment: + # Refer to settings here: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md + # Llama.cpp is based on .gguf format, and Hugging Face offers many .gguf format models. + LLAMA_ARG_MODEL_URL: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf + LLAMA_ARG_CTX_SIZE: 4096 + LLAMA_ARG_N_PARALLEL: 2 + LLAMA_ARG_ENDPOINT_METRICS: 1 + LLAMA_ARG_PORT: 8080 + + llamacpp-opea-llm: + image: opea/llm-llamacpp:latest + build: + # This context is to allow the 'COPY comps' command in the Dockerfile. + # When using docker compose with -f, the comps context is 4 levels down from docker_compose_llm.yaml. + context: ../../../../ + dockerfile: ./comps/llms/text-generation/llamacpp/Dockerfile + depends_on: + - llamacpp-server + ports: + - "9000:9000" + network_mode: "host" # equivalent to: docker run --network host ... + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + restart: unless-stopped + +networks: + default: + driver: bridge From 830da586ebdadfd4b4835a67d975747cbbb1b275 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 14 Feb 2025 16:05:35 -0600 Subject: [PATCH 10/31] Delete unrefactored files Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../llms/text-generation/llamacpp/Dockerfile | 22 ----- comps/llms/text-generation/llamacpp/README.md | 90 ------------------- .../llms/text-generation/llamacpp/__init__.py | 2 - .../llamacpp/docker_compose_llm.yaml | 38 -------- comps/llms/text-generation/llamacpp/llm.py | 65 -------------- .../text-generation/llamacpp/requirements.txt | 12 --- 6 files changed, 229 deletions(-) delete mode 100644 comps/llms/text-generation/llamacpp/Dockerfile delete mode 100644 comps/llms/text-generation/llamacpp/README.md delete mode 100644 comps/llms/text-generation/llamacpp/__init__.py delete mode 100644 comps/llms/text-generation/llamacpp/docker_compose_llm.yaml delete mode 100644 comps/llms/text-generation/llamacpp/llm.py delete mode 100644 comps/llms/text-generation/llamacpp/requirements.txt diff --git a/comps/llms/text-generation/llamacpp/Dockerfile b/comps/llms/text-generation/llamacpp/Dockerfile deleted file mode 100644 index 70500e35d..000000000 --- a/comps/llms/text-generation/llamacpp/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM python:3.11-slim - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -USER user - -# Assumes we're building from the GenAIComps directory, and docker file is in comps/llms/text-generation/llamacpp -COPY ../../../comps /home/user/comps - -RUN pip install --no-cache-dir --upgrade pip setuptools && \ - pip install --no-cache-dir -r /home/user/comps/llms/text-generation/llamacpp/requirements.txt - -ENV PYTHONPATH=$PYTHONPATH:/home/user - 
-WORKDIR /home/user/comps/llms/text-generation/llamacpp/ - -ENTRYPOINT ["python", "llm.py"] \ No newline at end of file diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md deleted file mode 100644 index 00b8e0b77..000000000 --- a/comps/llms/text-generation/llamacpp/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# Introduction - -[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++, and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud". - -This OPEA component wraps llama.cpp server so that it can interface with other OPEA components, or for creating OPEA Megaservices. - -llama.cpp supports this [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends), and has only been tested on CPU. - -To use a CUDA server please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly. - -## TLDR - -```bash -cd GenAIComps/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up -``` - -Please note it's instructive to run and validate each the llama.cpp server and OPEA component below. - -## 1. Run the llama.cpp server - -```bash -cd GenAIComps -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-server -``` - -Notes: - -i) If you prefer to run above in the background without screen output use `up -d`. - -ii) To stop the llama.cpp server: - -`docker compose -f comps/llms/text-generation/llamacpp/langchain/docker_compose_llm.yaml llamacpp-server stop` - -iii) For [llama.cpp settings](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md) please specify them in the docker_compose_llm.yaml file. - -#### Verify the llama.cpp Service: - -```bash -curl --request POST \ - --url http://localhost:8080/completion \ - --header "Content-Type: application/json" \ - --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' -``` - -## 2. Run the llama.cpp OPEA Service - -This is essentially a wrapper component of Llama.cpp server. OPEA nicely standardizes and verifies LLM inputs with LLMParamsDoc class (see llm.py). - -### 2.1 Build the llama.cpp OPEA image: - -```bash -cd GenAIComps/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-opea-llm --force-recreate -``` - -Equivalently, the above can be achieved with `build` and `run` from the Dockerfile. Build: - -```bash -cd GenAIComps/ -docker build --no-cache -t opea/llm-llamacpp:latest \ - --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy \ - -f comps/llms/text-generation/llamacpp/Dockerfile . 
-``` - -And run: - -```bash -docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy \ - opea/llm-llamacpp:latest -``` - -### 2.3 Consume the llama.cpp Microservice: - -```bash -curl http://127.0.0.1:9000/v1/chat/completions -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":32,"top_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \ - -H 'Content-Type: application/json' -``` - -### Notes - -Stopping services: - -```bash -cd GenAIComps/comps/llms/text-generation/llamacpp/ -docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml stop -``` - -`down` may be used instead of 'stop' if you'd like to stop, and delete the containers and networks. diff --git a/comps/llms/text-generation/llamacpp/__init__.py b/comps/llms/text-generation/llamacpp/__init__.py deleted file mode 100644 index 916f3a44b..000000000 --- a/comps/llms/text-generation/llamacpp/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml deleted file mode 100644 index d66d93afd..000000000 --- a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -services: - llamacpp-server: - image: ghcr.io/ggerganov/llama.cpp:server-b4419 - ports: - - 8080:8080 - environment: - # Refer to settings here: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md - # Llama.cpp is based on .gguf format, and Hugging Face offers many .gguf format models. - LLAMA_ARG_MODEL_URL: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf - LLAMA_ARG_CTX_SIZE: 4096 - LLAMA_ARG_N_PARALLEL: 2 - LLAMA_ARG_ENDPOINT_METRICS: 1 - LLAMA_ARG_PORT: 8080 - - llamacpp-opea-llm: - image: opea/llm-llamacpp:latest - build: - # This context is to allow the 'COPY comps' command in the Dockerfile. - # When using docker compose with -f, the comps context is 4 levels down from docker_compose_llm.yaml. - context: ../../../../ - dockerfile: ./comps/llms/text-generation/llamacpp/Dockerfile - depends_on: - - llamacpp-server - ports: - - "9000:9000" - network_mode: "host" # equivalent to: docker run --network host ... 
- environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - restart: unless-stopped - -networks: - default: - driver: bridge diff --git a/comps/llms/text-generation/llamacpp/llm.py b/comps/llms/text-generation/llamacpp/llm.py deleted file mode 100644 index 5612199eb..000000000 --- a/comps/llms/text-generation/llamacpp/llm.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os - -import openai -from fastapi.responses import StreamingResponse - -from comps import CustomLogger, LLMParamsDoc, ServiceType, opea_microservices, register_microservice - -logger = CustomLogger("llm_llamacpp") -logflag = os.getenv("LOGFLAG", False) -llamacpp_endpoint = os.getenv("LLAMACPP_ENDPOINT", "http://localhost:8080/") - - -# OPEA microservice wrapper of llama.cpp -# llama.cpp server uses openai API format: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md -@register_microservice( - name="opea_service@llm_llamacpp", - service_type=ServiceType.LLM, - endpoint="/v1/chat/completions", - host="0.0.0.0", - port=9000, -) -async def llm_generate(input: LLMParamsDoc): - if logflag: - logger.info(input) - logger.info(llamacpp_endpoint) - - client = openai.OpenAI( - base_url=llamacpp_endpoint, api_key="sk-no-key-required" # "http://:port" - ) - - # Llama.cpp works with openai API format - # The openai api doesn't have top_k parameter - # https://community.openai.com/t/which-openai-gpt-models-if-any-allow-specifying-top-k/777982/2 - chat_completion = client.chat.completions.create( - model=input.model, - messages=[{"role": "user", "content": input.query}], - max_tokens=input.max_tokens, - temperature=input.temperature, - top_p=input.top_p, - frequency_penalty=input.frequency_penalty, - presence_penalty=input.presence_penalty, - stream=input.streaming, - ) - - if input.streaming: - - def stream_generator(): - for c in chat_completion: - if logflag: - logger.info(c) - yield f"data: {c.model_dump_json()}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(stream_generator(), media_type="text/event-stream") - else: - if logflag: - logger.info(chat_completion) - return chat_completion - - -if __name__ == "__main__": - opea_microservices["opea_service@llm_llamacpp"].start() diff --git a/comps/llms/text-generation/llamacpp/requirements.txt b/comps/llms/text-generation/llamacpp/requirements.txt deleted file mode 100644 index fdb5f5a01..000000000 --- a/comps/llms/text-generation/llamacpp/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -aiohttp -docarray[full] -fastapi -huggingface_hub -openai -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -prometheus-fastapi-instrumentator -shortuuid -transformers -uvicorn From 8d058bbd851ec9363ca7ed86ede0a637795c2658 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 14 Feb 2025 16:12:08 -0600 Subject: [PATCH 11/31] Adding llama.cpp backend include in the compose_text-genearation.yaml Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../compose_text-generation.yaml | 12 +++ .../src/text-generation/README_llamacpp.md | 76 ++++++++++++++----- comps/third_parties/llamacpp/README.md | 58 ++++++++++---- .../deployment/docker_compose/compose.yaml | 46 ++++++----- 4 files changed, 131 insertions(+), 61 deletions(-) diff --git a/comps/llms/deployment/docker_compose/compose_text-generation.yaml b/comps/llms/deployment/docker_compose/compose_text-generation.yaml index 
fbf503ed6..d1a2b3975 100644 --- a/comps/llms/deployment/docker_compose/compose_text-generation.yaml +++ b/comps/llms/deployment/docker_compose/compose_text-generation.yaml @@ -5,6 +5,8 @@ include: - ../../../third_parties/tgi/deployment/docker_compose/compose.yaml - ../../../third_parties/vllm/deployment/docker_compose/compose.yaml - ../../../third_parties/ollama/deployment/docker_compose/compose.yaml + - ../../../third_parties/llamacpp/deployment/docker_compose/compose.yaml + services: textgen: @@ -100,6 +102,16 @@ services: environment: LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative} + textgen-llamacpp: + extends: textgen + container_name: textgen-service-llamacpp + environment: + LLM_ENDPOINT: http://llamacpp-server + LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenService} + depends_on: + llamacpp-server: + condition: service_healthy + networks: default: driver: bridge diff --git a/comps/llms/src/text-generation/README_llamacpp.md b/comps/llms/src/text-generation/README_llamacpp.md index 06680a98f..f6197c150 100644 --- a/comps/llms/src/text-generation/README_llamacpp.md +++ b/comps/llms/src/text-generation/README_llamacpp.md @@ -1,50 +1,84 @@ -# Prediction Guard Introduction +# llama.cpp Introduction -[Prediction Guard](https://docs.predictionguard.com) allows you to utilize hosted open access LLMs, LVMs, and embedding functionality with seamlessly integrated safeguards. In addition to providing a scalable access to open models, Prediction Guard allows you to configure factual consistency checks, toxicity filters, PII filters, and prompt injection blocking. Join the [Prediction Guard Discord channel](https://discord.gg/TFHgnhAFKd) and request an API key to get started. +[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++, and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud". + +This OPEA component wraps llama.cpp server so that it can interface with other OPEA components, or for creating OPEA Megaservices. + +llama.cpp supports this [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends), and has only been tested on CPU. + +To use a CUDA server please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly. ## Get Started -### Run the Predictionguard Microservice +### 1. Download a gguf model to serve + +To download an example .gguf model to a model path: + +```bash +export MODEL_PATH=~/models +mkdir $MODEL_PATH +cd $MODEL_PATH +wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf +```` + +### 2. Set Environment Variables ```bash -export service_name="textgen-predictionguard" +export MODEL_PATH=~/models +export host_ip=$(hostname -I | awk '{print $1}') +export TEXTGEN_PORT=9000 +export LLM_ENDPOINT_PORT=8008 +export LLM_ENDPOINT="http://${host_ip}:80" +export LLM_MODEL_ID="models/Phi-3-mini-4k-instruct-q4.gguf" +export LLAMA_ARG_CTX_SIZE=4096 +``` +### 3. Run the llama.cpp OPEA Microservice +```bash +export service_name="textgen-llamacpp" cd comps/llms/deployment/docker_compose/ docker compose -f compose_text-generation.yaml up ${service_name} -d ``` -## Consume the Prediction Guard Microservice +The server output can be observed in a terminal with `docker log `. 
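The curl examples below only exercise the non-streaming path. A streamed request can be read from Python as in the rough sketch below; it assumes the refactored service keeps emitting OpenAI-style `data: ...` server-sent events ending with `data: [DONE]`, as the earlier llm.py wrapper did, and it reuses the port and message format from the curl examples in this README.

```python
# Sketch of reading a streamed completion from the textgen service. Assumes
# OpenAI-style "data: ..." SSE chunks terminated by "data: [DONE]", matching
# the earlier llm.py wrapper; port and payload follow this README's curl calls.
import json

import requests

url = "http://localhost:9000/v1/chat/completions"
payload = {
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 100,
    "stream": True,
}

with requests.post(url, json=payload, stream=True, timeout=300) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        # Each chunk carries an incremental delta; print it as it arrives.
        print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)
print()
```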
-See the [Prediction Guard docs](https://docs.predictionguard.com/) for available model options. +## Consume the Service -### Without stream +Verify the backend llama.cpp backend server: ```bash -curl -X POST http://localhost:9000/v1/chat/completions \ +curl http://0.0.0.0:8008/v1/chat/completions \ -H "Content-Type: application/json" \ + -H "Authorization: Bearer no-key" \ -d '{ - "model": "Hermes-2-Pro-Llama-3-8B", - "messages": "Tell me a joke.", - "max_tokens": 100, - "temperature": 0.7, - "top_p": 0.9, - "top_k": 50, - "stream": false + "model": "models/Phi-3-mini-4k-instruct-q4.gguf", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "What is deep learning?" + } + ] }' ``` -### With stream +Consume the service: + +This component is based on openAI API convention: ```bash -curl -N -X POST http://localhost:9000/v1/chat/completions \ +curl -X POST http://localhost:9000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "Hermes-2-Pro-Llama-3-8B", - "messages": "Tell me a joke.", + "model": "models/Phi-3-mini-4k-instruct-q4.gguf", + "messages": [{"role": "user", "content": "Write a limerick about python exceptions"}], "max_tokens": 100, "temperature": 0.7, "top_p": 0.9, "top_k": 50, - "stream": true + "stream": false }' -``` +``` \ No newline at end of file diff --git a/comps/third_parties/llamacpp/README.md b/comps/third_parties/llamacpp/README.md index e12f6d34d..8f9b7e627 100644 --- a/comps/third_parties/llamacpp/README.md +++ b/comps/third_parties/llamacpp/README.md @@ -1,30 +1,56 @@ -# TGI LLM Microservice +# Introduction -[Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more. +[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++, and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud". -## Start TGI with docker compose +This OPEA component wraps llama.cpp server so that it can interface with other OPEA components, or for creating OPEA Megaservices. -Set up environment. +llama.cpp supports this [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends), and has only been tested on CPU. + +To use a CUDA server please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly. + + +## Get Started + +### 1. Download a gguf Model + +To download an example .gguf model to a model path: + +```bash +export MODEL_PATH=~/models +mkdir $MODEL_PATH +cd $MODEL_PATH +wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf +```` + +### 2. Set Environment Variables ```bash +export MODEL_PATH=~/models +export host_ip=$(hostname -I | awk '{print $1}') export LLM_ENDPOINT_PORT=8008 -export host_ip=${host_ip} -export HF_TOKEN=${HF_TOKEN} -export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export MAX_INPUT_TOKENS=1024 -export MAX_TOTAL_TOKENS=2048 +export LLM_MODEL_ID="models/Phi-3-mini-4k-instruct-q4.gguf" +export LLAMA_ARG_CTX_SIZE=4096 ``` -Run tgi on xeon. +### 3. 
Run the llama.cpp Backend Microservice ```bash -cd deplopyment/docker_compose -docker compose -f compose.yaml tgi-server up -d +cd deployment/docker_compose +docker compose -f compose.yaml up llamacpp-server -d ``` -Run tgi on gaudi. +To use this in an OPEA text generation component please see [llama.cpp text-generation]( +../../llms/src/text-generation/README_llamacpp.md) + +Note: can use docker logs to observe server. + +## Consume the service + +Llama cpp supports openai style API: ```bash -cd deplopyment/docker_compose -docker compose -f compose.yaml tgi-gaudi-server up -d -``` +curl http://${host_ip}:8008/v1/chat/completions \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' +``` \ No newline at end of file diff --git a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml index d66d93afd..a1058de2b 100644 --- a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml +++ b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml @@ -4,34 +4,32 @@ services: llamacpp-server: image: ghcr.io/ggerganov/llama.cpp:server-b4419 + container_name: llamacpp-server ports: - - 8080:8080 + - ${LLM_ENDPOINT_PORT:-8008}:80 + volumes: + # Download the .gguf models to this path. + - ${MODEL_PATH:-~/models}:/models environment: - # Refer to settings here: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md - # Llama.cpp is based on .gguf format, and Hugging Face offers many .gguf format models. - LLAMA_ARG_MODEL_URL: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf - LLAMA_ARG_CTX_SIZE: 4096 + LOGFLAG: False + HTTPS_PROXY: ${http_proxy} + HTTP_PROXY: ${https_proxy} + LLM_MODEL_ID: ${LLM_MODEL_ID} + LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} + host_ip: ${host_ip} + # llama.cpp env variables. Please refer to reference: + # https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md + LLAMA_ARG_PORT: 80 + LLAMA_ARG_MODEL: /$LLM_MODEL_ID + LLAMA_ARG_CTX_SIZE: ${LLAMA_ARG_CTX_SIZE:-4096} LLAMA_ARG_N_PARALLEL: 2 LLAMA_ARG_ENDPOINT_METRICS: 1 - LLAMA_ARG_PORT: 8080 - - llamacpp-opea-llm: - image: opea/llm-llamacpp:latest - build: - # This context is to allow the 'COPY comps' command in the Dockerfile. - # When using docker compose with -f, the comps context is 4 levels down from docker_compose_llm.yaml. - context: ../../../../ - dockerfile: ./comps/llms/text-generation/llamacpp/Dockerfile - depends_on: - - llamacpp-server - ports: - - "9000:9000" - network_mode: "host" # equivalent to: docker run --network host ... 
- environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - restart: unless-stopped + ipc: host + healthcheck: + test: [ "CMD-SHELL", "curl -f http://${host_ip}:80/health || exit 1" ] + interval: 10s + timeout: 10s + retries: 150 networks: default: From a6740b62455f619ef55413da633048c150098ee2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Feb 2025 22:12:52 +0000 Subject: [PATCH 12/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/llms/src/text-generation/README_llamacpp.md | 5 +++-- comps/third_parties/llamacpp/README.md | 10 ++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/comps/llms/src/text-generation/README_llamacpp.md b/comps/llms/src/text-generation/README_llamacpp.md index f6197c150..a2ae32cbe 100644 --- a/comps/llms/src/text-generation/README_llamacpp.md +++ b/comps/llms/src/text-generation/README_llamacpp.md @@ -19,7 +19,7 @@ export MODEL_PATH=~/models mkdir $MODEL_PATH cd $MODEL_PATH wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf -```` +``` ### 2. Set Environment Variables @@ -32,6 +32,7 @@ export LLM_ENDPOINT="http://${host_ip}:80" export LLM_MODEL_ID="models/Phi-3-mini-4k-instruct-q4.gguf" export LLAMA_ARG_CTX_SIZE=4096 ``` + ### 3. Run the llama.cpp OPEA Microservice ```bash @@ -81,4 +82,4 @@ curl -X POST http://localhost:9000/v1/chat/completions \ "top_k": 50, "stream": false }' -``` \ No newline at end of file +``` diff --git a/comps/third_parties/llamacpp/README.md b/comps/third_parties/llamacpp/README.md index 8f9b7e627..00363a784 100644 --- a/comps/third_parties/llamacpp/README.md +++ b/comps/third_parties/llamacpp/README.md @@ -8,7 +8,6 @@ llama.cpp supports this [hardware](https://github.com/ggerganov/llama.cpp?tab=re To use a CUDA server please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly. - ## Get Started ### 1. Download a gguf Model @@ -20,7 +19,7 @@ export MODEL_PATH=~/models mkdir $MODEL_PATH cd $MODEL_PATH wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf -```` +``` ### 2. Set Environment Variables @@ -36,11 +35,10 @@ export LLAMA_ARG_CTX_SIZE=4096 ```bash cd deployment/docker_compose -docker compose -f compose.yaml up llamacpp-server -d +docker compose -f compose.yaml up llamacpp-server -d ``` -To use this in an OPEA text generation component please see [llama.cpp text-generation]( -../../llms/src/text-generation/README_llamacpp.md) +To use this in an OPEA text generation component please see [llama.cpp text-generation](../../llms/src/text-generation/README_llamacpp.md) Note: can use docker logs to observe server. 
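Because the llama.cpp server on port 8008 exposes the OpenAI chat-completions API directly, the same request can also be issued from Python with the openai client, much as llm.py does earlier in this series. This is a sketch under stated assumptions: the host, port, and `/v1` base path follow the curl example in this README, the API key is the placeholder convention llm.py uses, and the model name is illustrative since llama.cpp answers with whichever .gguf it was started with.

```python
# Sketch of calling the llama.cpp backend directly with the openai client,
# mirroring how llm.py constructs its client earlier in this patch series.
# Host/port and the /v1 base path follow this README's curl example; the key
# is a placeholder because llama.cpp does not require one here.
import openai

client = openai.OpenAI(
    base_url="http://localhost:8008/v1",  # ${host_ip}:${LLM_ENDPOINT_PORT} in the compose file
    api_key="sk-no-key-required",
)

completion = client.chat.completions.create(
    # Illustrative name; the server answers with the model it loaded at startup.
    model="qwen2.5-1.5b-instruct-q4_k_m.gguf",
    messages=[{"role": "user", "content": "What is Deep Learning?"}],
    max_tokens=128,
)
print(completion.choices[0].message.content)
```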
@@ -53,4 +51,4 @@ curl http://${host_ip}:8008/v1/chat/completions \ -X POST \ -H "Content-Type: application/json" \ -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -``` \ No newline at end of file +``` From d0e27bf6a1baeb682443d495fe8669fdfc68e6e2 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:19:13 -0600 Subject: [PATCH 13/31] Fix service name Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../llms/deployment/docker_compose/compose_text-generation.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/llms/deployment/docker_compose/compose_text-generation.yaml b/comps/llms/deployment/docker_compose/compose_text-generation.yaml index d1a2b3975..1bb58d0c1 100644 --- a/comps/llms/deployment/docker_compose/compose_text-generation.yaml +++ b/comps/llms/deployment/docker_compose/compose_text-generation.yaml @@ -102,7 +102,7 @@ services: environment: LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative} - textgen-llamacpp: + textgen-service-llamacpp: extends: textgen container_name: textgen-service-llamacpp environment: From 91324af5ca945f1087c59a1efbf384e8110541ee Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:22:25 -0600 Subject: [PATCH 14/31] Revise llamacpp, using smaller Qwen model and remove unnecessary curl model argument Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/llms/src/text-generation/README_llamacpp.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/comps/llms/src/text-generation/README_llamacpp.md b/comps/llms/src/text-generation/README_llamacpp.md index f6197c150..91ec27416 100644 --- a/comps/llms/src/text-generation/README_llamacpp.md +++ b/comps/llms/src/text-generation/README_llamacpp.md @@ -16,9 +16,9 @@ To download an example .gguf model to a model path: ```bash export MODEL_PATH=~/models -mkdir $MODEL_PATH +mkdir -p $MODEL_PATH # -p means make only if doesn't exist cd $MODEL_PATH -wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf +wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf ```` ### 2. Set Environment Variables @@ -29,13 +29,13 @@ export host_ip=$(hostname -I | awk '{print $1}') export TEXTGEN_PORT=9000 export LLM_ENDPOINT_PORT=8008 export LLM_ENDPOINT="http://${host_ip}:80" -export LLM_MODEL_ID="models/Phi-3-mini-4k-instruct-q4.gguf" +export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf" export LLAMA_ARG_CTX_SIZE=4096 ``` ### 3. 
Run the llama.cpp OPEA Microservice ```bash -export service_name="textgen-llamacpp" +export service_name="textgen-service-llamacpp" cd comps/llms/deployment/docker_compose/ docker compose -f compose_text-generation.yaml up ${service_name} -d ``` @@ -51,7 +51,6 @@ curl http://0.0.0.0:8008/v1/chat/completions \ -H "Content-Type: application/json" \ -H "Authorization: Bearer no-key" \ -d '{ - "model": "models/Phi-3-mini-4k-instruct-q4.gguf", "messages": [ { "role": "system", @@ -73,7 +72,6 @@ This component is based on openAI API convention: curl -X POST http://localhost:9000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "models/Phi-3-mini-4k-instruct-q4.gguf", "messages": [{"role": "user", "content": "Write a limerick about python exceptions"}], "max_tokens": 100, "temperature": 0.7, From f295e29e246f7d6e73574f52eb6261e4291b2e2e Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:23:35 -0600 Subject: [PATCH 15/31] Update llamacpp thirdparty readme to use smaller model Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- comps/third_parties/llamacpp/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/comps/third_parties/llamacpp/README.md b/comps/third_parties/llamacpp/README.md index 8f9b7e627..d08e3afb8 100644 --- a/comps/third_parties/llamacpp/README.md +++ b/comps/third_parties/llamacpp/README.md @@ -17,9 +17,10 @@ To download an example .gguf model to a model path: ```bash export MODEL_PATH=~/models -mkdir $MODEL_PATH +mkdir -p $MODEL_PATH # -p means make only if doesn't exist cd $MODEL_PATH -wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf + +wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf ```` ### 2. 
Set Environment Variables @@ -28,7 +29,7 @@ wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/P export MODEL_PATH=~/models export host_ip=$(hostname -I | awk '{print $1}') export LLM_ENDPOINT_PORT=8008 -export LLM_MODEL_ID="models/Phi-3-mini-4k-instruct-q4.gguf" +export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf" export LLAMA_ARG_CTX_SIZE=4096 ``` From 480cb6900d76cb3bbb6fa4238c2073ef11893eb0 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:25:18 -0600 Subject: [PATCH 16/31] Fix healthcheck in llamacpp deployment compose.yaml Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../llamacpp/deployment/docker_compose/compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml index a1058de2b..49ca5a8c4 100644 --- a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml +++ b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml @@ -26,7 +26,7 @@ services: LLAMA_ARG_ENDPOINT_METRICS: 1 ipc: host healthcheck: - test: [ "CMD-SHELL", "curl -f http://${host_ip}:80/health || exit 1" ] + test: [ "CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1" ] interval: 10s timeout: 10s retries: 150 From 2c9f877e8584a49ed1b011515db1188f47ec1e7b Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:26:21 -0600 Subject: [PATCH 17/31] Wrote a test and tested for llamacpp text gen service Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- ...t_llms_text-generation_service_llamacpp.sh | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 tests/llms/test_llms_text-generation_service_llamacpp.sh diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh new file mode 100644 index 000000000..d51399bf6 --- /dev/null +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# Copyright (C) 2024 Prediction Guard, Inc. +# SPDX-License-Identifier: Apache-2.0 + +set -x + +IMAGE_REPO=${IMAGE_REPO:-"opea"} +export REGISTRY=${IMAGE_REPO} +export TAG="comps" +echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" +echo "TAG=${TAG}" + +WORKPATH=$(dirname "$PWD") # Assumes the script is called from GenAIComps/comps +host_ip=$(hostname -I | awk '{print $1}') # Adjust to a more reliable command +if [ -z "$host_ip" ]; then + host_ip="localhost" # Default to localhost if IP address is empty +fi +LOG_PATH="$WORKPATH/tests" +service_name="textgen-service-llamacpp" + + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t ${REGISTRY:-opea}/llm-textgen:${TAG:-latest} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/llm-textgen built fail" + exit 1 + else + echo "opea/llm-textgen built successful" + fi +} + +function start_service() { + export LLM_ENDPOINT_PORT=8008 + export LLM_ENDPOINT="http://${host_ip}:80" + export TEXTGEN_PORT=9000 + export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf" + export LLAMA_ARG_CTX_SIZE=4096 + export LOGFLAG=True + + export MODEL_PATH=~/models + mkdir -p $MODEL_PATH + cd $MODELPATH + wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 60 # Sleep for 1 minute to allow the service to start +} + +function validate_microservice() { + result=$(http_proxy="" curl -X POST http://${host_ip}:${TEXTGEN_PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [{"role": "user", "content": "What is AI?"}], + "max_tokens": 100, + "temperature": 0.7, + "top_p": 0.9, + "top_k": 50, + "stream": false + }') + + if [[ $result == *"content"* ]]; then + echo "Service response is correct." + else + echo "Result wrong. Received was $result" + docker logs ${service_name} + exit 1 + fi +} + +function stop_docker() { + cd $WORKPATH/comps/llms/deployment/docker_compose + docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans +} + +function main() { + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune +} + +main +set +x From 7310d6a8aa22f0210e432fcb363be282ccb29194 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Feb 2025 21:29:49 +0000 Subject: [PATCH 18/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/llms/src/text-generation/README_llamacpp.md | 5 +++-- comps/third_parties/llamacpp/README.md | 10 ++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/comps/llms/src/text-generation/README_llamacpp.md b/comps/llms/src/text-generation/README_llamacpp.md index 91ec27416..237d515f2 100644 --- a/comps/llms/src/text-generation/README_llamacpp.md +++ b/comps/llms/src/text-generation/README_llamacpp.md @@ -19,7 +19,7 @@ export MODEL_PATH=~/models mkdir -p $MODEL_PATH # -p means make only if doesn't exist cd $MODEL_PATH wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf -```` +``` ### 2. Set Environment Variables @@ -32,6 +32,7 @@ export LLM_ENDPOINT="http://${host_ip}:80" export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf" export LLAMA_ARG_CTX_SIZE=4096 ``` + ### 3. Run the llama.cpp OPEA Microservice ```bash @@ -79,4 +80,4 @@ curl -X POST http://localhost:9000/v1/chat/completions \ "top_k": 50, "stream": false }' -``` \ No newline at end of file +``` diff --git a/comps/third_parties/llamacpp/README.md b/comps/third_parties/llamacpp/README.md index d08e3afb8..3f051ca32 100644 --- a/comps/third_parties/llamacpp/README.md +++ b/comps/third_parties/llamacpp/README.md @@ -8,7 +8,6 @@ llama.cpp supports this [hardware](https://github.com/ggerganov/llama.cpp?tab=re To use a CUDA server please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly. 
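As a rough sanity check before adapting the compose file for GPUs, the CUDA build of the server can be run directly. This is only a sketch: the `server-cuda` image tag, the model filename, and the flag values are assumptions to verify against the llama.cpp server documentation, and it presumes the NVIDIA container toolkit is installed.

```bash
# Sketch: run the CUDA variant of the llama.cpp server against a local .gguf model.
docker run --rm --gpus all -v ~/models:/models -p 8008:8080 \
  ghcr.io/ggerganov/llama.cpp:server-cuda \
  -m /models/qwen2.5-1.5b-instruct-q4_k_m.gguf --n-gpu-layers 99 --host 0.0.0.0 --port 8080
```

If this starts and `curl http://localhost:8008/health` responds, the same image and GPU settings can then be carried into docker_compose_llm.yaml.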
- ## Get Started ### 1. Download a gguf Model @@ -21,7 +20,7 @@ mkdir -p $MODEL_PATH # -p means make only if doesn't exist cd $MODEL_PATH wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf -```` +``` ### 2. Set Environment Variables @@ -37,11 +36,10 @@ export LLAMA_ARG_CTX_SIZE=4096 ```bash cd deployment/docker_compose -docker compose -f compose.yaml up llamacpp-server -d +docker compose -f compose.yaml up llamacpp-server -d ``` -To use this in an OPEA text generation component please see [llama.cpp text-generation]( -../../llms/src/text-generation/README_llamacpp.md) +To use this in an OPEA text generation component please see [llama.cpp text-generation](../../llms/src/text-generation/README_llamacpp.md) Note: can use docker logs to observe server. @@ -54,4 +52,4 @@ curl http://${host_ip}:8008/v1/chat/completions \ -X POST \ -H "Content-Type: application/json" \ -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -``` \ No newline at end of file +``` From efde309de6bbf8dd0eb454dbdd19c4d5fb1a14bf Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 16:11:27 -0600 Subject: [PATCH 19/31] Increase the llamacpp-server wait time Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- tests/llms/test_llms_text-generation_service_llamacpp.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index d51399bf6..63d3dda6d 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -43,9 +43,8 @@ function start_service() { cd $MODELPATH wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf cd $WORKPATH/comps/llms/deployment/docker_compose - docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log - - sleep 60 # Sleep for 1 minute to allow the service to start + docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose_llama.log + sleep 120 # Allow the service to start } function validate_microservice() { From c474a643b737c7940a92347ca0f5aad247179210 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 16:34:59 -0600 Subject: [PATCH 20/31] Fixed typos on http environment variables, and volumes Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../llamacpp/deployment/docker_compose/compose.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml index 49ca5a8c4..e1544d3a8 100644 --- a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml +++ b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml @@ -9,11 +9,12 @@ services: - ${LLM_ENDPOINT_PORT:-8008}:80 volumes: # Download the .gguf models to this path. 
- - ${MODEL_PATH:-~/models}:/models + - "${MODEL_PATH:-~/models}:/models" environment: LOGFLAG: False - HTTPS_PROXY: ${http_proxy} - HTTP_PROXY: ${https_proxy} + no_proxy: ${no_proxy} + https_proxy: ${http_proxy} + http_proxy: ${https_proxy} LLM_MODEL_ID: ${LLM_MODEL_ID} LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} host_ip: ${host_ip} @@ -29,7 +30,7 @@ services: test: [ "CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1" ] interval: 10s timeout: 10s - retries: 150 + retries: 100 networks: default: From 712f575a1dec18ff4d352fc2cd863e9347f713e1 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 17:06:17 -0600 Subject: [PATCH 21/31] Splitting the llama.cpp test to use compose up on the llama.cpp third-party service first. Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../test_llms_text-generation_service_llamacpp.sh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index 63d3dda6d..4032c377e 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -41,10 +41,18 @@ function start_service() { export MODEL_PATH=~/models mkdir -p $MODEL_PATH cd $MODELPATH - wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf + wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf \ + -q --show-progress --progress=bar + + # Spin up the third party service first before compose_text-generation.yaml, + # otherwise there's a dependency error. Doesn't have this error when running locally. + cd $WORKPATH/comps/third_parties/llamacpp/deployment/docker_compose/ + docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose_llamacpp.log + sleep 20s + cd $WORKPATH/comps/llms/deployment/docker_compose - docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose_llama.log - sleep 120 # Allow the service to start + docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log + sleep 60s # Allow the service to start } function validate_microservice() { From 68cc00f4d6ac18a43eb57db4aa2e7d7e7d23d802 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 18:28:03 -0600 Subject: [PATCH 22/31] add alternate command to stop and remove docker containers from previous tests Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../test_llms_text-generation_service_llamacpp.sh | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index 4032c377e..84fd4b8ad 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -46,13 +46,15 @@ function start_service() { # Spin up the third party service first before compose_text-generation.yaml, # otherwise there's a dependency error. Doesn't have this error when running locally. 
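  # A possible alternative to ordering the compose invocations by hand (a sketch, assuming
  # compose_text-generation.yaml defines the llamacpp-server service): give the textgen service a
  # "depends_on: llamacpp-server: condition: service_healthy" entry so Docker Compose waits on the
  # server healthcheck instead of relying on start order and sleeps.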
- cd $WORKPATH/comps/third_parties/llamacpp/deployment/docker_compose/ - docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose_llamacpp.log - sleep 20s +# cd $WORKPATH/comps/third_parties/llamacpp/deployment/docker_compose/ +# docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose_llamacpp.log +# sleep 20s cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log + docker ps -a sleep 60s # Allow the service to start + docker ps -a } function validate_microservice() { @@ -82,14 +84,16 @@ function stop_docker() { } function main() { - stop_docker + # stop_docker + # Trying this because stop_docker may not stop and remove containers from previous run tests and may block ports. + docker stop $(docker ps -a -q) && docker rm $(docker ps -a -q) build_docker_images start_service validate_microservice - stop_docker + stop_dockerllm-textgen echo y | docker system prune } From 2dd20646fe41e6dae47995bc8e6a2cf66299df7d Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 18:34:24 -0600 Subject: [PATCH 23/31] Modifying tear down of stop_docker in llamacpp tests to try to remove all containers. Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- tests/llms/test_llms_text-generation_service_llamacpp.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index 84fd4b8ad..acfe9f15c 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -80,20 +80,19 @@ function validate_microservice() { function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose - docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans + # docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans + docker compose -f compose_text-generation.yaml down --remove-orphans } function main() { - # stop_docker - # Trying this because stop_docker may not stop and remove containers from previous run tests and may block ports. 
- docker stop $(docker ps -a -q) && docker rm $(docker ps -a -q) + stop_docker build_docker_images start_service validate_microservice - stop_dockerllm-textgen + stop_docker echo y | docker system prune } From dbff6fcfd1102fcef0b675190565610a0ab19863 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 18:46:05 -0600 Subject: [PATCH 24/31] Adding some logs output to debug llamacpp test Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- tests/llms/test_llms_text-generation_service_llamacpp.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index acfe9f15c..c500b041f 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -53,8 +53,10 @@ function start_service() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log docker ps -a + docker logs llamacpp-server sleep 60s # Allow the service to start docker ps -a + docker logs llamacpp-server } function validate_microservice() { From f184897fce78469e0b672e2210dd0006da38b214 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 19:04:14 -0600 Subject: [PATCH 25/31] Found model path bug and fixed it to run llama.cpp test Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../llamacpp/deployment/docker_compose/compose.yaml | 2 +- tests/llms/test_llms_text-generation_service_llamacpp.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml index e1544d3a8..c352db8e3 100644 --- a/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml +++ b/comps/third_parties/llamacpp/deployment/docker_compose/compose.yaml @@ -9,7 +9,7 @@ services: - ${LLM_ENDPOINT_PORT:-8008}:80 volumes: # Download the .gguf models to this path. - - "${MODEL_PATH:-~/models}:/models" + - ${MODEL_PATH:-~/models}:/models environment: LOGFLAG: False no_proxy: ${no_proxy} diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index c500b041f..d08b42bd8 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -40,9 +40,9 @@ function start_service() { export MODEL_PATH=~/models mkdir -p $MODEL_PATH - cd $MODELPATH + cd $MODEL_PATH wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf \ - -q --show-progress --progress=bar + --show-progress --progress=bar # Spin up the third party service first before compose_text-generation.yaml, # otherwise there's a dependency error. Doesn't have this error when running locally. 
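The test above waits on fixed `sleep` values after `docker compose up`. A readiness poll against the llama.cpp `/health` endpoint (the same endpoint the compose healthcheck uses) is one possible refinement; the sketch below assumes `host_ip` and `LLM_ENDPOINT_PORT` are exported as in `start_service`:

```bash
# Poll the llama.cpp server until /health responds, instead of sleeping a fixed time.
for _ in $(seq 1 60); do
  if curl -sf "http://${host_ip}:${LLM_ENDPOINT_PORT}/health" > /dev/null; then
    echo "llama.cpp server is ready"
    break
  fi
  sleep 5
done
```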
From ea4ea388ad6a5bc753da3e86530e54994e7b2dbb Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 23:00:57 -0600 Subject: [PATCH 26/31] Adjusted LLM_ENDPOINT env variable Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- tests/llms/test_llms_text-generation_service_llamacpp.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index d08b42bd8..fc1f619e6 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -32,7 +32,7 @@ function build_docker_images() { function start_service() { export LLM_ENDPOINT_PORT=8008 - export LLM_ENDPOINT="http://${host_ip}:80" + export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" export TEXTGEN_PORT=9000 export LLM_MODEL_ID="models/qwen2.5-1.5b-instruct-q4_k_m.gguf" export LLAMA_ARG_CTX_SIZE=4096 @@ -55,8 +55,8 @@ function start_service() { docker ps -a docker logs llamacpp-server sleep 60s # Allow the service to start - docker ps -a - docker logs llamacpp-server +# docker ps -a +# docker logs llamacpp-server } function validate_microservice() { From 01fca036a91b97edf30c1300ca0eb24c3ea5036e Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 23:23:03 -0600 Subject: [PATCH 27/31] Cleaned up test file Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../test_llms_text-generation_service_llamacpp.sh | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index fc1f619e6..1ddb2449d 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -42,21 +42,13 @@ function start_service() { mkdir -p $MODEL_PATH cd $MODEL_PATH wget --no-clobber https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf \ - --show-progress --progress=bar - - # Spin up the third party service first before compose_text-generation.yaml, - # otherwise there's a dependency error. Doesn't have this error when running locally. -# cd $WORKPATH/comps/third_parties/llamacpp/deployment/docker_compose/ -# docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose_llamacpp.log -# sleep 20s + -q --show-progress --progress=bar cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log docker ps -a docker logs llamacpp-server sleep 60s # Allow the service to start -# docker ps -a -# docker logs llamacpp-server } function validate_microservice() { @@ -82,7 +74,7 @@ function validate_microservice() { function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose - # docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans + # Using down without particular service_name since there can be containers that aren't taken down from other tests. 
docker compose -f compose_text-generation.yaml down --remove-orphans } From dfd5057cd9465318f3ea3c13316e186de2b11585 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Fri, 21 Feb 2025 23:53:41 -0600 Subject: [PATCH 28/31] Adjust host_ip env variable in scope of start_service Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- tests/llms/test_llms_text-generation_service_llamacpp.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index 1ddb2449d..af43ddbbe 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -31,6 +31,7 @@ function build_docker_images() { } function start_service() { + export host_ip=${host_ip} # must be an environment variable export LLM_ENDPOINT_PORT=8008 export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" export TEXTGEN_PORT=9000 From 4a965da9b0dd4373ac7850b001f686f09e283a24 Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Mon, 24 Feb 2025 10:56:15 -0600 Subject: [PATCH 29/31] Docker ps to debug orphaned containers. Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- .../test_llms_faq-generation_vllm_on_intel_hpu.sh | 5 ++++- .../test_llms_text-generation_service_llamacpp.sh | 14 ++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh b/tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh index 8607f2c55..f176bffc6 100644 --- a/tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_faq-generation_vllm_on_intel_hpu.sh @@ -119,8 +119,11 @@ function stop_docker() { } function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a build_docker_images start_service diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index af43ddbbe..a482724ef 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -31,7 +31,7 @@ function build_docker_images() { } function start_service() { - export host_ip=${host_ip} # must be an environment variable + export host_ip=${host_ip} # must be an environment variable declared in scope of start_service export LLM_ENDPOINT_PORT=8008 export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" export TEXTGEN_PORT=9000 @@ -49,7 +49,7 @@ function start_service() { docker compose -f compose_text-generation.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log docker ps -a docker logs llamacpp-server - sleep 60s # Allow the service to start + sleep 30s # Allow the service to start } function validate_microservice() { @@ -75,13 +75,19 @@ function validate_microservice() { function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose - # Using down without particular service_name since there can be containers that aren't taken down from other tests. + # Using down without particular service_name since still can have orphan containers that aren't taken down from other tests. 
docker compose -f compose_text-generation.yaml down --remove-orphans } function main() { - stop_docker + echo "Docker containers before stop_docker" + docker ps -a + stop_docker + echo "Docker containers after stop_docker" + docker ps -a + + stop_docker build_docker_images start_service From 32b06e9469c562e7421ef25f9767b140eeeeea70 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Feb 2025 16:57:38 +0000 Subject: [PATCH 30/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/llms/test_llms_text-generation_service_llamacpp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index a482724ef..1f2f4fcf1 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -86,7 +86,7 @@ function main() { stop_docker echo "Docker containers after stop_docker" docker ps -a - + stop_docker build_docker_images start_service From 33635042572934681279ce3066ac8c76747aa8fb Mon Sep 17 00:00:00 2001 From: Ed Lee <16417837+edlee123@users.noreply.github.com> Date: Mon, 24 Feb 2025 11:29:10 -0600 Subject: [PATCH 31/31] Adding output to debug orphaned docker containers Signed-off-by: Ed Lee <16417837+edlee123@users.noreply.github.com> --- tests/llms/test_llms_doc-summarization_tgi.sh | 8 +++++--- .../llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh | 8 +++++--- tests/llms/test_llms_doc-summarization_vllm.sh | 8 +++++--- .../llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh | 8 +++++--- tests/llms/test_llms_faq-generation_tgi.sh | 8 +++++--- tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh | 8 +++++--- tests/llms/test_llms_faq-generation_vllm.sh | 8 +++++--- .../llms/test_llms_text-generation_native_on_intel_hpu.sh | 8 +++++--- tests/llms/test_llms_text-generation_service_llamacpp.sh | 2 +- tests/llms/test_llms_text-generation_service_ollama.sh | 5 +++++ tests/llms/test_llms_text-generation_service_tgi.sh | 4 ++++ .../test_llms_text-generation_service_tgi_on_intel_hpu.sh | 4 ++++ ...test_llms_text-generation_service_vllm_on_intel_hpu.sh | 8 +++++--- 13 files changed, 59 insertions(+), 28 deletions(-) diff --git a/tests/llms/test_llms_doc-summarization_tgi.sh b/tests/llms/test_llms_doc-summarization_tgi.sh index 16e201854..a07000d89 100644 --- a/tests/llms/test_llms_doc-summarization_tgi.sh +++ b/tests/llms/test_llms_doc-summarization_tgi.sh @@ -140,10 +140,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh index b8c97f5b6..c14c4e1eb 100644 --- a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh @@ -141,10 +141,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a 
stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_doc-summarization_vllm.sh b/tests/llms/test_llms_doc-summarization_vllm.sh index 42e79aa1e..55d8b2ccc 100644 --- a/tests/llms/test_llms_doc-summarization_vllm.sh +++ b/tests/llms/test_llms_doc-summarization_vllm.sh @@ -155,10 +155,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh index a6096bd30..b245b57c7 100644 --- a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh @@ -154,10 +154,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_doc-summarization.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_faq-generation_tgi.sh b/tests/llms/test_llms_faq-generation_tgi.sh index d0ae7aa95..b95389a27 100644 --- a/tests/llms/test_llms_faq-generation_tgi.sh +++ b/tests/llms/test_llms_faq-generation_tgi.sh @@ -102,10 +102,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_faq-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh b/tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh index 50b1524c0..bf4be175f 100644 --- a/tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_faq-generation_tgi_on_intel_hpu.sh @@ -103,10 +103,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_faq-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_faq-generation_vllm.sh b/tests/llms/test_llms_faq-generation_vllm.sh index 588ed4981..43b7b1c65 100644 --- a/tests/llms/test_llms_faq-generation_vllm.sh +++ b/tests/llms/test_llms_faq-generation_vllm.sh @@ -118,10 +118,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_faq-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service diff --git a/tests/llms/test_llms_text-generation_native_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_native_on_intel_hpu.sh index 0d39a8690..d348d2673 100644 --- a/tests/llms/test_llms_text-generation_native_on_intel_hpu.sh +++ 
b/tests/llms/test_llms_text-generation_native_on_intel_hpu.sh @@ -87,10 +87,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images start_service validate_microservice diff --git a/tests/llms/test_llms_text-generation_service_llamacpp.sh b/tests/llms/test_llms_text-generation_service_llamacpp.sh index a482724ef..1f2f4fcf1 100644 --- a/tests/llms/test_llms_text-generation_service_llamacpp.sh +++ b/tests/llms/test_llms_text-generation_service_llamacpp.sh @@ -86,7 +86,7 @@ function main() { stop_docker echo "Docker containers after stop_docker" docker ps -a - + stop_docker build_docker_images start_service diff --git a/tests/llms/test_llms_text-generation_service_ollama.sh b/tests/llms/test_llms_text-generation_service_ollama.sh index d5087ce7e..dbf638e13 100644 --- a/tests/llms/test_llms_text-generation_service_ollama.sh +++ b/tests/llms/test_llms_text-generation_service_ollama.sh @@ -69,7 +69,12 @@ function stop_docker() { function main() { + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images llm_models=( diff --git a/tests/llms/test_llms_text-generation_service_tgi.sh b/tests/llms/test_llms_text-generation_service_tgi.sh index c60447025..0e691c65f 100644 --- a/tests/llms/test_llms_text-generation_service_tgi.sh +++ b/tests/llms/test_llms_text-generation_service_tgi.sh @@ -118,7 +118,11 @@ function stop_docker() { function main() { + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a build_docker_images pip install --no-cache-dir openai pydantic diff --git a/tests/llms/test_llms_text-generation_service_tgi_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_service_tgi_on_intel_hpu.sh index c91a51498..efa3809b8 100644 --- a/tests/llms/test_llms_text-generation_service_tgi_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_service_tgi_on_intel_hpu.sh @@ -119,7 +119,11 @@ function stop_docker() { function main() { + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a build_docker_images pip install --no-cache-dir openai pydantic diff --git a/tests/llms/test_llms_text-generation_service_vllm_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_service_vllm_on_intel_hpu.sh index ea8c9ee6c..63cb7f955 100644 --- a/tests/llms/test_llms_text-generation_service_vllm_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_service_vllm_on_intel_hpu.sh @@ -129,10 +129,12 @@ function stop_docker() { cd $WORKPATH/comps/llms/deployment/docker_compose docker compose -f compose_text-generation.yaml down ${service_name} --remove-orphans } - -function main() { - + echo "Docker containers before stop_docker" + docker ps -a stop_docker + echo "Docker containers after stop_docker" + docker ps -a + build_docker_images pip install --no-cache-dir openai pydantic