[ci] add test workflow for 3090 machine #3561

Merged · 33 commits · May 26, 2025
391 changes: 391 additions & 0 deletions .github/workflows/daily_ete_test_3090.yml

Large diffs are not rendered by default.

77 changes: 77 additions & 0 deletions autotest/config-3090.yaml
@@ -0,0 +1,77 @@
model_path: /nvme/qa_test_models
resource_path: /nvme/qa_test_models/resource
dst_path: /nvme/qa_test_models/autotest_model
log_path: /nvme/qa_test_models/autotest_model/log
benchmark_path: /nvme/qa_test_models/benchmark-reports
dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
env_tag: 3090

tp_config:
empty: 2

turbomind_chat_model:
- meta-llama/Llama-3.2-3B-Instruct
- internlm/internlm3-8b-instruct
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL2_5-1B
- Qwen/Qwen3-8B
- Qwen/Qwen2.5-7B-Instruct

pytorch_chat_model:
- meta-llama/Llama-3.2-3B-Instruct
- internlm/internlm3-8b-instruct
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL2_5-1B
- Qwen/Qwen3-8B
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-VL-7B-Instruct

turbomind_vl_model:
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL2_5-1B

pytorch_vl_model:
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL2_5-1B
- Qwen/Qwen2.5-VL-7B-Instruct

turbomind_base_model:
- internlm/internlm3-8b-instruct
- Qwen/Qwen3-8B

pytorch_base_model:
- internlm/internlm3-8b-instruct
- Qwen/Qwen3-8B

turbomind_quatization:
no_awq:
- OpenGVLab/InternVL3-8B
gptq:
- empty
no_kvint4:
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL2_5-1B
- Qwen/Qwen3-8B
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-VL-7B-Instruct
no_kvint8:
- deepseek-ai/DeepSeek-V2-Chat

pytorch_quatization:
awq:
- meta-llama/Llama-3.2-3B-Instruct
- internlm/internlm3-8b-instruct
- OpenGVLab/InternVL2_5-1B
- Qwen/Qwen3-8B
- Qwen/Qwen2.5-7B-Instruct
w8a8:
- meta-llama/Llama-3.2-3B-Instruct
- internlm/internlm3-8b-instruct
no_kvint4:
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL2_5-1B
- Qwen/Qwen3-8B
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-VL-7B-Instruct
no_kvint8:
- deepseek-ai/DeepSeek-V2-Lite-Chat
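
For context, here is a hypothetical sketch of how a test utility might resolve model lists from this config; the real resolution logic lives in the autotest helpers and may differ, but it shows how the per-backend chat-model lists interact with the `no_kvint4`/`no_kvint8` exclusion lists:

```python
# Hypothetical helper, not the repository's actual code.
import yaml


def load_model_list(config_path, backend, quant_policy=None):
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    models = cfg.get(f'{backend}_chat_model', [])
    if quant_policy in (4, 8):
        # 'quatization' deliberately matches the key spelling used in the config
        excluded = cfg.get(f'{backend}_quatization', {}).get(f'no_kvint{quant_policy}', [])
        models = [m for m in models if m not in excluded]
    return models


# On this config, kv-int4 on turbomind leaves only the two dense LLMs:
print(load_model_list('autotest/config-3090.yaml', 'turbomind', quant_policy=4))
# ['meta-llama/Llama-3.2-3B-Instruct', 'internlm/internlm3-8b-instruct']
```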
1 change: 1 addition & 0 deletions autotest/config.yaml
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@ dst_path: /nvme/qa_test_models/autotest_model
log_path: /nvme/qa_test_models/autotest_model/log
benchmark_path: /nvme/qa_test_models/benchmark-reports
dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
env_tag: a100

tp_config:
internlm2-chat-20b: 2
4 changes: 4 additions & 0 deletions autotest/tools/chat/test_command_chat_hf_pytorch.py
Original file line number Diff line number Diff line change
@@ -10,6 +10,7 @@
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.hf_pytorch_chat
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_torch_model_list(tp_num=1))
def test_hf_pytorch_chat_tp1(config, model, cli_case_config, worker_id):
usercase = 'chat_testcase'
@@ -69,6 +70,7 @@ def test_hf_pytorch_chat_tp4(config, model, cli_case_config, worker_id):
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.hf_pytorch_chat
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4))
def test_hf_pytorch_chat_kvin4_tp1(config, model, cli_case_config, worker_id):
usercase = 'chat_testcase'
@@ -131,6 +133,7 @@ def test_hf_pytorch_chat_kvin4_tp4(config, model, cli_case_config, worker_id):
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.hf_pytorch_chat
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8))
def test_hf_pytorch_chat_kvin8_tp1(config, model, cli_case_config, worker_id):
usercase = 'chat_testcase'
@@ -192,6 +195,7 @@ def test_hf_pytorch_chat_kvin8_tp4(config, model, cli_case_config, worker_id):
@pytest.mark.order(10)
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.hf_pytorch_chat
@pytest.mark.test_3090
@pytest.mark.gpu_num_1
@pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, model_type='base_model'))
def test_hf_pytorch_base_tp1(config, model, cli_case_config, worker_id):
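
The new `test_3090` mark is how the 3090 workflow carves a 1-GPU subset out of the full suite. As a minimal sketch, assuming the mark is not already declared in pytest.ini, it could be registered in a conftest.py hook like this (unregistered marks trigger PytestUnknownMarkWarning):

```python
# conftest.py sketch; the repository may instead declare the mark in
# pytest.ini, in which case this hook is unnecessary.
def pytest_configure(config):
    config.addinivalue_line(
        'markers', 'test_3090: cases selected for the RTX 3090 CI machine')
```

The workflow can then select just this subset with an invocation along the lines of `pytest -m "test_3090 and gpu_num_1" autotest/`.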
4 changes: 4 additions & 0 deletions autotest/tools/chat/test_command_chat_hf_turbomind.py
Original file line number Diff line number Diff line change
@@ -10,6 +10,7 @@
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.hf_turbomind_chat
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1))
@pytest.mark.parametrize('communicator', get_communicator_list())
def test_hf_turbomind_chat_tp1(config, model, communicator, cli_case_config, worker_id):
@@ -78,6 +79,7 @@ def test_hf_turbomind_chat_tp4(config, model, communicator, cli_case_config, worker_id):
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.hf_turbomind_chat
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4))
@pytest.mark.parametrize('communicator', get_communicator_list())
def test_hf_turbomind_chat_kvint4_tp1(config, model, communicator, cli_case_config, worker_id):
@@ -146,6 +148,7 @@ def test_hf_turbomind_chat_kvint4_tp4(config, model, communicator, cli_case_config, worker_id):
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.hf_turbomind_chat
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8))
@pytest.mark.parametrize('communicator', get_communicator_list())
def test_hf_turbomind_chat_kvint8_tp1(config, model, communicator, cli_case_config, worker_id):
@@ -314,6 +317,7 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicator, cli_case_config, worker_id):
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.hf_turbomind_chat
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='base_model'))
@pytest.mark.parametrize('communicator', get_communicator_list())
def test_hf_turbomind_base_tp1(config, model, communicator, cli_case_config, worker_id):
5 changes: 4 additions & 1 deletion autotest/tools/pipeline/llm_case.py
@@ -7,7 +7,7 @@
from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline
from lmdeploy.utils import is_bf16_supported

- gen_config = GenerationConfig(max_new_tokens=500)
+ gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2)


def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None):
@@ -24,6 +24,9 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None):
if 'turbomind' in backend_type and extra is not None and 'communicator' in extra:
backend_config.communicator = extra.get('communicator')

if extra is not None and 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None:
backend_config.cache_max_entry_count = extra.get('cache-max-entry-count')

if 'w4' in model_path or ('4bits' in model_path or 'awq' in model_path.lower()):
backend_config.model_format = 'awq'
if 'gptq' in model_path.lower():
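
The `cache-max-entry-count` plumbing above is what lets the 3090 job shrink the kv cache to fit a 24 GB card: `cache_max_entry_count` is the fraction of free GPU memory (after weights are loaded) that the engine reserves for kv cache. A minimal sketch of what the option controls; the 0.7 ratio and the model are illustrative, not values mandated by this PR:

```python
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

backend_config = TurbomindEngineConfig(
    tp=1,
    cache_max_entry_count=0.7,  # fraction of post-weight free GPU memory kept for kv cache
)
gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2)

pipe = pipeline('internlm/internlm3-8b-instruct', backend_config=backend_config)
print(pipe(['Hi, please introduce yourself'], gen_config=gen_config))
```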
5 changes: 4 additions & 1 deletion autotest/tools/pipeline/mllm_case.py
@@ -10,7 +10,7 @@
from lmdeploy.vl.constants import IMAGE_TOKEN
from lmdeploy.vl.utils import encode_image_base64

- gen_config = GenerationConfig(max_new_tokens=500)
+ gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2)

PIC1 = 'tiger.jpeg'
PIC2 = 'human-pose.jpg'
@@ -33,6 +33,9 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_test, extra: object = None):
if 'turbomind' in backend_type and extra is not None and 'communicator' in extra:
backend_config.communicator = extra.get('communicator')

if extra is not None and 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None:
backend_config.cache_max_entry_count = extra.get('cache-max-entry-count')

if 'w4' in model_path or ('4bits' in model_path or 'awq' in model_path.lower()):
backend_config.model_format = 'awq'
if not is_bf16_supported():
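
mllm_case.py drives the vision-language models through the same pipeline API, passing (prompt, image) tuples. A hedged sketch of that path; the image file mirrors PIC1 above and the model choice is illustrative:

```python
from lmdeploy import GenerationConfig, pipeline
from lmdeploy.vl import load_image

pipe = pipeline('OpenGVLab/InternVL2_5-1B')
image = load_image('tiger.jpeg')  # placeholder; the suite loads it from resource_path
response = pipe(('describe this image', image),
                gen_config=GenerationConfig(max_new_tokens=500, min_new_tokens=2))
print(response.text)
```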
3 changes: 3 additions & 0 deletions autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
@@ -9,6 +9,7 @@
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.pipeline_chat_pytorch
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, exclude_dup=True))
def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id):
@@ -47,6 +48,7 @@ def test_pipeline_chat_pytorch_tp4(config, common_case_config, model, worker_id):
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.pipeline_chat
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4, exclude_dup=True))
def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id):
@@ -85,6 +87,7 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, worker_id):
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.pipeline_chat
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8, exclude_dup=True))
def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, worker_id):
3 changes: 3 additions & 0 deletions autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py
@@ -12,6 +12,7 @@
@pytest.mark.pipeline_chat
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, model_type='vl_model'))
def test_pipeline_chat_tp1(config, model, worker_id):
if 'gw' in worker_id:
@@ -47,6 +48,7 @@ def test_pipeline_chat_tp4(config, model, worker_id):
@pytest.mark.pipeline_chat
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4, model_type='vl_model'))
def test_pipeline_chat_kvint4_tp1(config, model, worker_id):
if 'gw' in worker_id:
@@ -82,6 +84,7 @@ def test_pipeline_chat_kvint4_tp4(config, model, worker_id):
@pytest.mark.pipeline_chat
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8, model_type='vl_model'))
def test_pipeline_chat_kvint8_tp1(config, model, worker_id):
if 'gw' in worker_id:
3 changes: 3 additions & 0 deletions autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py
@@ -9,6 +9,7 @@
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.pipeline_chat
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1))
@pytest.mark.parametrize('communicator', get_communicator_list())
@@ -50,6 +51,7 @@ def test_pipeline_chat_tp4(config, common_case_config, model, communicator, worker_id):
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.pipeline_chat
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4))
@pytest.mark.parametrize('communicator', get_communicator_list())
@@ -103,6 +105,7 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, communicator, worker_id):
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.pipeline_chat
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8))
@pytest.mark.parametrize('communicator', get_communicator_list())
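
The kvint4/kvint8 variants exercise online kv-cache quantization, toggled by `quant_policy` on the engine config (0 = default fp16/bf16 cache, 4 = int4, 8 = int8). A minimal sketch; per the 3090 config, Qwen2.5-7B-Instruct is excluded only from kv-int4, so int8 is a valid choice here:

```python
from lmdeploy import TurbomindEngineConfig, pipeline

backend_config = TurbomindEngineConfig(tp=1, quant_policy=8)  # int8 kv cache
pipe = pipeline('Qwen/Qwen2.5-7B-Instruct', backend_config=backend_config)
print(pipe(['What is 2 + 2?'])[0].text)
```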
3 changes: 3 additions & 0 deletions autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py
@@ -12,6 +12,7 @@
@pytest.mark.pipeline_chat
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='vl_model'))
@pytest.mark.parametrize('communicator', get_communicator_list())
def test_pipeline_chat_tp1(config, model, communicator, worker_id):
@@ -53,6 +54,7 @@ def test_pipeline_chat_tp4(config, model, communicator, worker_id):
@pytest.mark.pipeline_chat
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4, model_type='vl_model'))
@pytest.mark.parametrize('communicator', get_communicator_list())
def test_pipeline_chat_kvint4_tp1(config, model, communicator, worker_id):
@@ -100,6 +102,7 @@ def test_pipeline_chat_kvint4_tp4(config, model, communicator, worker_id):
@pytest.mark.pipeline_chat
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8, model_type='vl_model'))
@pytest.mark.parametrize('communicator', get_communicator_list())
def test_pipeline_chat_kvint8_tp1(config, model, communicator, worker_id):
1 change: 1 addition & 0 deletions autotest/tools/quantization/test_quantization_awq.py
@@ -7,6 +7,7 @@


@pytest.mark.order(3)
@pytest.mark.test_3090
@pytest.mark.timeout(900)
@pytest.mark.parametrize('model', get_quantization_model_list('awq'))
def test_quantization_awq(config, model, worker_id):
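
test_quantization_awq produces the 4-bit checkpoints that the awq/4bits runs consume (note the 'w4'/'4bits'/'awq' model-format detection in llm_case.py above). A sketch of the underlying CLI step; `lmdeploy lite auto_awq` is the documented verb, but both paths below are illustrative:

```python
import subprocess

subprocess.run(
    [
        'lmdeploy', 'lite', 'auto_awq',
        '/nvme/qa_test_models/internlm/internlm3-8b-instruct',        # source model (illustrative)
        '--work-dir',
        '/nvme/qa_test_models/autotest_model/internlm3-8b-instruct-4bits',  # output dir (illustrative)
    ],
    check=True,
)
```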
3 changes: 3 additions & 0 deletions autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
@@ -28,6 +28,7 @@ def getModelList(tp_num):
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api_pytorch
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True)
def test_restful_chat_tp1(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
@@ -73,6 +74,7 @@ def getKvintModelList(tp_num, quant_policy):
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True)
def test_restful_chat_kvint4_tp1(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
@@ -109,6 +111,7 @@ def test_restful_chat_kvint4_tp4(config, common_case_config, worker_id):
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True)
def test_restful_chat_kvint8_tp1(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
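
The restful tests bring up an api_server through the `prepare_environment` fixture and exercise it over HTTP. A minimal client-side sketch, assuming a server is already listening on an illustrative port:

```python
# Client side only: the suite's fixture is assumed to have started
# `lmdeploy serve api_server` on this (illustrative) port already.
from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient('http://0.0.0.0:23333')
model_name = api_client.available_models[0]
for output in api_client.chat_completions_v1(
        model=model_name,
        messages=[{'role': 'user', 'content': 'Hello, who are you?'}]):
    print(output)
```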
3 changes: 3 additions & 0 deletions autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py
@@ -28,6 +28,7 @@ def getModelList(tp_num):
@pytest.mark.order(7)
@pytest.mark.restful_api_vl
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True)
def test_restful_chat_tp1(config, worker_id):
if get_workerid(worker_id) is None:
@@ -70,6 +71,7 @@ def getKvintModelList(tp_num, quant_policy: int = None):
@pytest.mark.order(7)
@pytest.mark.restful_api_vl
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True)
def test_restful_chat_kvint4_tp1(config, worker_id):
if get_workerid(worker_id) is None:
@@ -103,6 +105,7 @@ def test_restful_chat_kvint4_tp4(config, worker_id):
@pytest.mark.order(7)
@pytest.mark.restful_api_vl
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True)
def test_restful_chat_kvint8_tp1(config, worker_id):
if get_workerid(worker_id) is None:
3 changes: 3 additions & 0 deletions autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
@@ -33,6 +33,7 @@ def getModelList(tp_num):
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True)
def test_restful_chat_tp1(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
@@ -81,6 +82,7 @@ def getKvintModelList(tp_num, quant_policy):
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True)
def test_restful_chat_kvint4_tp1(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
@@ -117,6 +119,7 @@ def test_restful_chat_kvint4_tp4(config, common_case_config, worker_id):
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True)
def test_restful_chat_kvint8_tp1(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
3 changes: 3 additions & 0 deletions autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py
@@ -32,6 +32,7 @@ def getModelList(tp_num):
@pytest.mark.order(7)
@pytest.mark.restful_api_vl
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True)
def test_restful_chat_tp1(config, worker_id):
if get_workerid(worker_id) is None:
@@ -77,6 +78,7 @@ def getKvintModelList(tp_num, quant_policy: int = None):
@pytest.mark.order(7)
@pytest.mark.restful_api_vl
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True)
def test_restful_chat_kvint4_tp1(config, worker_id):
if get_workerid(worker_id) is None:
@@ -110,6 +112,7 @@ def test_restful_chat_kvint4_tp4(config, worker_id):
@pytest.mark.order(7)
@pytest.mark.restful_api_vl
@pytest.mark.gpu_num_1
@pytest.mark.test_3090
@pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True)
def test_restful_chat_kvint8_tp1(config, worker_id):
if get_workerid(worker_id) is None: