Add timeout param for DocSum and FaqGen to deal with long context (#1329)

* Add timeout param for DocSum and FaqGen to deal with long context

Make the timeout param configurable; solves issue opea-project/GenAIExamples#1481

Signed-off-by: Xinyao Wang <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Xinyao Wang <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
XinyaoWa and pre-commit-ci[bot] authored Mar 4, 2025
1 parent a43d1de commit 50d2e8c
Showing 10 changed files with 17 additions and 10 deletions.
1 change: 1 addition & 0 deletions comps/cores/proto/api_protocol.py
@@ -195,6 +195,7 @@ class ChatCompletionRequest(BaseModel):
# top_p: Optional[float] = None # Priority use openai
typical_p: Optional[float] = None
# repetition_penalty: Optional[float] = None
timeout: Optional[int] = None

# doc: begin-chat-completion-extra-params
echo: Optional[bool] = Field(
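For illustration, a minimal sketch of setting the new field on the request model (the import path matches the file above; the message text and values are placeholders):

```python
from comps.cores.proto.api_protocol import ChatCompletionRequest

# timeout is optional; when left as None, each backend applies its own
# fallback (120 s on the TGI path, the client default on the vLLM path).
req = ChatCompletionRequest(
    messages="Summarize this long document ...",
    max_tokens=32,
    timeout=200,  # seconds
)
print(req.timeout)  # -> 200
```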
6 changes: 4 additions & 2 deletions comps/llms/src/doc-summarization/README.md
@@ -133,6 +133,8 @@ curl http://${your_ip}:9000/v1/docsum \

"summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode.

With long contexts, a request may be canceled because generation takes longer than the default `timeout` value (120 s for TGI). Increase `timeout` as needed.

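For reference, the same call can be made from Python. A minimal sketch using the `requests` library (the host, port, and payload mirror the curl examples below; the client-side HTTP timeout is a separate knob and should stay above the server-side `timeout`):

```python
import requests

url = "http://localhost:9000/v1/docsum"  # substitute your service host
payload = {
    "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying ...",
    "max_tokens": 32,
    "language": "en",
    "stream": False,
    "timeout": 200,  # server-side generation timeout, in seconds
}
# Keep the HTTP client timeout above the server-side value so the
# connection is not dropped before the service finishes generating.
resp = requests.post(url, json=payload, timeout=210)
print(resp.json())
```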
**summary_type=stuff**

In this mode the LLM generates a summary from the complete input text. Set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` carefully according to your model and device memory; otherwise long inputs may exceed the LLM's context limit and raise an error.
@@ -157,7 +159,7 @@ In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.ma
```bash
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
-d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}' \
-H 'Content-Type: application/json'
```

@@ -170,6 +172,6 @@ In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * inpu
```bash
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \
-d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}' \
-H 'Content-Type: application/json'
```
1 change: 1 addition & 0 deletions comps/llms/src/doc-summarization/integrations/tgi.py
@@ -70,6 +70,7 @@ async def invoke(self, input: DocSumChatCompletionRequest):
temperature=input.temperature if input.temperature else 0.01,
repetition_penalty=input.repetition_penalty if input.repetition_penalty else 1.03,
streaming=input.stream,
timeout=input.timeout if input.timeout is not None else 120,
server_kwargs=server_kwargs,
task="text-generation",
)
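For context, a minimal sketch of how a TGI client might be built with this keyword, assuming LangChain's `HuggingFaceEndpoint` (whose `timeout` also defaults to 120 s); the endpoint URL and prompt are placeholders, not taken from the diff:

```python
from langchain_huggingface import HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    endpoint_url="http://localhost:8080",  # hypothetical TGI endpoint
    temperature=0.01,
    repetition_penalty=1.03,
    streaming=False,
    timeout=200,  # overrides the 120 s fallback used when input.timeout is None
    task="text-generation",
)
print(llm.invoke("Summarize: ..."))
```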
1 change: 1 addition & 0 deletions comps/llms/src/doc-summarization/integrations/vllm.py
@@ -63,6 +63,7 @@ async def invoke(self, input: DocSumChatCompletionRequest):
top_p=input.top_p if input.top_p else 0.95,
streaming=input.stream,
temperature=input.temperature if input.temperature else 0.01,
request_timeout=float(input.timeout) if input.timeout is not None else None,
)
result = await self.generate(input, self.client)

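Analogously for the vLLM path, a minimal sketch assuming LangChain's OpenAI-compatible `VLLMOpenAI` client; the base URL and model name are placeholders:

```python
from langchain_community.llms import VLLMOpenAI

llm = VLLMOpenAI(
    openai_api_key="EMPTY",  # vLLM's OpenAI-compatible server ignores the key
    openai_api_base="http://localhost:8000/v1",
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder
    top_p=0.95,
    streaming=False,
    temperature=0.01,
    request_timeout=200.0,  # None keeps the client library's own default
)
print(llm.invoke("Summarize: ..."))
```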
1 change: 1 addition & 0 deletions comps/llms/src/faq-generation/integrations/tgi.py
@@ -67,6 +67,7 @@ async def invoke(self, input: ChatCompletionRequest):
temperature=input.temperature if input.temperature else 0.01,
repetition_penalty=input.repetition_penalty if input.repetition_penalty else 1.03,
streaming=input.stream,
timeout=input.timeout if input.timeout is not None else 120,
server_kwargs=server_kwargs,
)
result = await self.generate(input, self.client)
1 change: 1 addition & 0 deletions comps/llms/src/faq-generation/integrations/vllm.py
@@ -60,6 +60,7 @@ async def invoke(self, input: ChatCompletionRequest):
top_p=input.top_p if input.top_p else 0.95,
streaming=input.stream,
temperature=input.temperature if input.temperature else 0.01,
request_timeout=float(input.timeout) if input.timeout is not None else None,
)
result = await self.generate(input, self.client)

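The FaqGen changes mirror the DocSum ones, and both share the same defaulting pattern. A standalone sketch of the two branches; note the explicit `is not None` check, which keeps falsy-but-valid values instead of silently replacing them:

```python
from typing import Optional

def tgi_timeout(timeout: Optional[int], default: int = 120) -> int:
    # TGI path: fall back to 120 s only when no value was supplied.
    return timeout if timeout is not None else default

def vllm_timeout(timeout: Optional[int]) -> Optional[float]:
    # vLLM path: convert to float, passing None through to the client.
    return float(timeout) if timeout is not None else None

assert tgi_timeout(None) == 120
assert tgi_timeout(200) == 200
assert vllm_timeout(200) == 200.0
assert vllm_timeout(None) is None
```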
4 changes: 2 additions & 2 deletions tests/llms/test_llms_doc-summarization_tgi.sh
@@ -125,15 +125,15 @@ function validate_microservices() {
'text' \
"docsum-tgi" \
"docsum-tgi" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'

echo "Validate refine mode..."
validate_services \
"$URL" \
'text' \
"docsum-tgi" \
"docsum-tgi" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
}

function stop_docker() {
4 changes: 2 additions & 2 deletions tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh
@@ -126,15 +126,15 @@ function validate_microservices() {
'text' \
"docsum-tgi-gaudi" \
"docsum-tgi-gaudi" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'

echo "Validate refine mode..."
validate_services \
"$URL" \
'text' \
"docsum-tgi-gaudi" \
"docsum-tgi-gaudi" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
}

function stop_docker() {
4 changes: 2 additions & 2 deletions tests/llms/test_llms_doc-summarization_vllm.sh
@@ -140,15 +140,15 @@ function validate_microservices() {
'text' \
"docsum-vllm" \
"docsum-vllm" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'

echo "Validate refine mode..."
validate_services \
"$URL" \
'text' \
"docsum-vllm" \
"docsum-vllm" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
}

function stop_docker() {
4 changes: 2 additions & 2 deletions tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh
@@ -139,15 +139,15 @@ function validate_microservices() {
'text' \
"docsum-vllm-gaudi" \
"docsum-vllm-gaudi" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'

echo "Validate refine mode..."
validate_services \
"$URL" \
'text' \
"docsum-vllm-gaudi" \
"docsum-vllm-gaudi" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
}

function stop_docker() {
