diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml new file mode 100644 index 0000000000..337c15eeca --- /dev/null +++ b/.github/workflows/daily_ete_test_3090.yml @@ -0,0 +1,391 @@ +name: daily_ete_test_3090 + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM/lmdeploy' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch, tag, or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind, pytorch, or both. Default is "['turbomind', 'pytorch']"' + type: string + default: "['turbomind', 'pytorch']" + model: + required: true + description: 'Set testcase module filter: llm, mllm. Default contains all modules' + type: string + default: "['llm','mllm']" + function: + required: true + description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' + type: string + default: '["pipeline", "restful", "chat"]' + offline_mode: + required: true + description: 'Whether to run in offline mode; if true, you should prepare the code and whl package yourself' + type: boolean + default: false + regression_func: + required: true + description: 'Set regression function filter: quant, tools, restful. Default is "['quant', 'tools']"' + type: string + default: "['quant', 'tools']" + schedule: + - cron: '00 16 * * 0-4' + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + +jobs: + linux-build: + if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.1 + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it so docker run works without a TTY in CI + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: [self-hosted, 3090-r1] + timeout-minutes: 50 + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /data1:/data1 + - 
/usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && contains(needs.download_pkgs.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: [self-hosted, 3090-r1] + timeout-minutes: 150 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /data1:/data1 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - quantization w8a8 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'pytorch') + run: | + pytest autotest/tools/quantization/test_quantization_w8a8.py --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: [self-hosted, 3090-r1] + needs: test_quantization + timeout-minutes: 300 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /data1:/data1 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + - name: Test lmdeploy - pipeline + continue-on-error: true + if: matrix.function == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.function == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: [self-hosted, 3090-r1] + needs: test_quantization + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + timeout-minutes: 60 + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api turbomind + if: matrix.backend == 'turbomind' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm3-8b-instruct > restful.log 2>&1 & + echo "restful_pid=$!" 
>> "$GITHUB_ENV" + sleep 120s + - name: Start restful api pytorch + if: matrix.backend == 'pytorch' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm3-8b-instruct --backend pytorch > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 180s + - name: Test lmdeploy - restful api + timeout-minutes: 75 + run: | + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}}/interface-${{matrix.backend}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Start restful api turbomind - base + if: matrix.backend == 'turbomind' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm3-8b-instruct > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 120s + - name: Start restful api pytorch - base + if: matrix.backend == 'pytorch' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm3-8b-instruct --backend pytorch > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 180s + - name: Test lmdeploy - restful api - base + timeout-minutes: 40 + run: | + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}}/interface-${{matrix.backend}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + get_coverage_report: + if: ${{!cancelled() && success()}} + runs-on: [self-hosted, 3090-r1] + needs: [test_tools, test_restful] + timeout-minutes: 5 + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml new file mode 100644 index 0000000000..3957ef932e --- /dev/null +++ b/autotest/config-3090.yaml @@ -0,0 +1,77 @@ +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: 3090 + +tp_config: + empty: 2 + +turbomind_chat_model: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + +pytorch_chat_model: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + +turbomind_vl_model: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + +pytorch_vl_model: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen2.5-VL-7B-Instruct + +turbomind_base_model: + - internlm/internlm3-8b-instruct + - Qwen/Qwen3-8B + +pytorch_base_model: + - internlm/internlm3-8b-instruct + - Qwen/Qwen3-8B + +turbomind_quatization: + no_awq: + - OpenGVLab/InternVL3-8B + gptq: + - empty + no_kvint4: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Chat + +pytorch_quatization: + awq: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + w8a8: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + no_kvint4: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config.yaml b/autotest/config.yaml index c3f66c4d12..bc82f1d012 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -4,6 +4,7 @@ dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: a100 tp_config: internlm2-chat-20b: 2 diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 8bcc00d3e5..a60790914c 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -10,6 +10,7 @@ @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1)) def test_hf_pytorch_chat_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -69,6 +70,7 @@ def test_hf_pytorch_chat_tp4(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4)) def test_hf_pytorch_chat_kvin4_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -131,6 +133,7 @@ def 
test_hf_pytorch_chat_kvin4_tp4(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8)) def test_hf_pytorch_chat_kvin8_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -192,6 +195,7 @@ def test_hf_pytorch_chat_kvin8_tp4(config, model, cli_case_config, worker_id): @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat +@pytest.mark.test_3090 @pytest.mark.gpu_num_1 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, model_type='base_model')) def test_hf_pytorch_base_tp1(config, model, cli_case_config, worker_id): diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 2a7a6a36b6..70e367d235 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -10,6 +10,7 @@ @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_tp1(config, model, communicator, cli_case_config, worker_id): @@ -78,6 +79,7 @@ def test_hf_turbomind_chat_tp4(config, model, communicator, cli_case_config, wor @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4)) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_kvint4_tp1(config, model, communicator, cli_case_config, worker_id): @@ -146,6 +148,7 @@ def test_hf_turbomind_chat_kvint4_tp4(config, model, communicator, cli_case_conf @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8)) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_kvint8_tp1(config, model, communicator, cli_case_config, worker_id): @@ -314,6 +317,7 @@ def test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicat @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='base_model')) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_base_tp1(config, model, communicator, cli_case_config, worker_id): diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 98b73323b4..2de77d2bd3 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -7,7 +7,7 @@ from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline from lmdeploy.utils import is_bf16_supported -gen_config = GenerationConfig(max_new_tokens=500) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None): @@ -24,6 +24,9 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, if 'turbomind' in 
backend_type and extra is not None and 'communicator' in extra: backend_config.communicator = extra.get('communicator') + if extra is not None and 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: + backend_config.cache_max_entry_count = extra.get('cache-max-entry-count') + if 'w4' in model_path or ('4bits' in model_path or 'awq' in model_path.lower()): backend_config.model_format = 'awq' if 'gptq' in model_path.lower(): diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 7d2b44bf9d..659d13f02e 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -10,7 +10,7 @@ from lmdeploy.vl.constants import IMAGE_TOKEN from lmdeploy.vl.utils import encode_image_base64 -gen_config = GenerationConfig(max_new_tokens=500) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) PIC1 = 'tiger.jpeg' PIC2 = 'human-pose.jpg' @@ -33,6 +33,9 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te if 'turbomind' in backend_type and extra is not None and 'communicator' in extra: backend_config.communicator = extra.get('communicator') + if extra is not None and 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: + backend_config.cache_max_entry_count = extra.get('cache-max-entry-count') + if 'w4' in model_path or ('4bits' in model_path or 'awq' in model_path.lower()): backend_config.model_format = 'awq' if not is_bf16_supported(): diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index f1d67c113e..4e727d6c97 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -9,6 +9,7 @@ @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, exclude_dup=True)) def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): @@ -47,6 +48,7 @@ def test_pipeline_chat_pytorch_tp4(config, common_case_config, model, worker_id) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4, exclude_dup=True)) def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): @@ -85,6 +87,7 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8, exclude_dup=True)) def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, worker_id): diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py index 7115926116..a65465fe0c 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -12,6 +12,7 @@ @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, model_type='vl_model')) def test_pipeline_chat_tp1(config, model, 
worker_id): if 'gw' in worker_id: @@ -47,6 +48,7 @@ def test_pipeline_chat_tp4(config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp1(config, model, worker_id): if 'gw' in worker_id: @@ -82,6 +84,7 @@ def test_pipeline_chat_kvint4_tp4(config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8, model_type='vl_model')) def test_pipeline_chat_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 8e4734f386..e4f18d4690 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -9,6 +9,7 @@ @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) @pytest.mark.parametrize('communicator', get_communicator_list()) @@ -50,6 +51,7 @@ def test_pipeline_chat_tp4(config, common_case_config, model, communicator, work @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4)) @pytest.mark.parametrize('communicator', get_communicator_list()) @@ -103,6 +105,7 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, communicato @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8)) @pytest.mark.parametrize('communicator', get_communicator_list()) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index a245a12bcb..09b16b7656 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -12,6 +12,7 @@ @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='vl_model')) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp1(config, model, communicator, worker_id): @@ -53,6 +54,7 @@ def test_pipeline_chat_tp4(config, model, communicator, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4, model_type='vl_model')) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp1(config, model, communicator, worker_id): @@ -100,6 +102,7 @@ def test_pipeline_chat_kvint4_tp4(config, model, communicator, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8, 
model_type='vl_model')) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp1(config, model, communicator, worker_id): diff --git a/autotest/tools/quantization/test_quantization_awq.py b/autotest/tools/quantization/test_quantization_awq.py index d1d948c1ae..7552e6e2aa 100644 --- a/autotest/tools/quantization/test_quantization_awq.py +++ b/autotest/tools/quantization/test_quantization_awq.py @@ -7,6 +7,7 @@ @pytest.mark.order(3) +@pytest.mark.test_3090 @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('awq')) def test_quantization_awq(config, model, worker_id): diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index 9cbd769a6d..d29d9f526d 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -28,6 +28,7 @@ def getModelList(tp_num): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -73,6 +74,7 @@ def getKvintModelList(tp_num, quant_policy): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True) def test_restful_chat_kvint4_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -109,6 +111,7 @@ def test_restful_chat_kvint4_tp4(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True) def test_restful_chat_kvint8_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py index 573d32e5b0..82d7a7bf7a 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py @@ -28,6 +28,7 @@ def getModelList(tp_num): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -70,6 +71,7 @@ def getKvintModelList(tp_num, quant_policy: int = None): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True) def test_restful_chat_kvint4_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -103,6 +105,7 @@ def test_restful_chat_kvint4_tp4(config, worker_id): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True) def test_restful_chat_kvint8_tp1(config, worker_id): if get_workerid(worker_id) is None: diff --git 
a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index 7597af89cf..490f2a2507 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -33,6 +33,7 @@ def getModelList(tp_num): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -81,6 +82,7 @@ def getKvintModelList(tp_num, quant_policy): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True) def test_restful_chat_kvint4_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -117,6 +119,7 @@ def test_restful_chat_kvint4_tp4(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True) def test_restful_chat_kvint8_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py index 728d8e94c0..cbe530d65c 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py @@ -32,6 +32,7 @@ def getModelList(tp_num): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -77,6 +78,7 @@ def getKvintModelList(tp_num, quant_policy: int = None): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True) def test_restful_chat_kvint4_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -110,6 +112,7 @@ def test_restful_chat_kvint4_tp4(config, worker_id): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True) def test_restful_chat_kvint8_tp1(config, worker_id): if get_workerid(worker_id) is None: diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index f096dbe112..c53e33bf0f 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -1,5 +1,6 @@ import copy import os +from collections import OrderedDict import yaml from utils.get_run_config import get_tp_num @@ -92,7 +93,8 @@ def get_quantization_model_list(type): config = get_config() if type == 'awq': case_list = [ - x for x in config.get('turbomind_chat_model') + config.get('turbomind_base_model') + x + for x in list(OrderedDict.fromkeys(config.get('turbomind_chat_model') + config.get('turbomind_base_model'))) if x not in config.get('turbomind_quatization').get('no_awq') and not 
is_quantization_model(x) ] for key in config.get('pytorch_quatization').get('awq'): diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 9b3f78caae..61559dd444 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -30,6 +30,11 @@ def run_pipeline_chat_test(config, log_path, '_'.join(['pipeline', 'chat', backend_type, worker_id, model_case.split('/')[1] + '.log'])) + if str(config.get('env_tag')) == '3090': + if extra is None: + extra = {} + extra['cache-max-entry-count'] = 0.6 + if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) extra = extra.replace(' ', '').replace('"', '\\"').replace(',', '\\,') @@ -85,6 +90,11 @@ def run_pipeline_vl_chat_test(config, log_path, '_'.join(['pipeline', 'mllm', backend_type, worker_id, model_case.split('/')[1] + '.log'])) + if str(config.get('env_tag')) == '3090': + if extra is None: + extra = {} + extra['cache-max-entry-count'] = 0.5 + if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) extra = extra.replace(' ', '').replace('"', '\\"').replace(',', '\\,') @@ -262,21 +272,23 @@ def internvl_vl_testcase(output_text, f, lang: str = 'en'): assert case_result, 'reason: combined images2: panda should in ' + response with allure.step(f'internvl-separate-images-{lang}'): response = get_response_from_output(output_text, f'internvl-separate-images-{lang}') - case_result = 'panda' in response.lower() or '熊猫' in response or 'same' in response.lower() + case_result = 'panda' in response.lower() or '熊猫' in response or 'same' in response.lower( + ) or 'difference' in response.lower() or 'different' in response.lower() f.writelines(f'internvl-separate-images-{lang} result: ' + str(case_result) + 'reason: separate images: panda should in ' + response + '\n') with assume: assert case_result, 'reason: separate images: panda should in ' + response with allure.step(f'internvl-separate-images2-{lang}'): response = get_response_from_output(output_text, f'internvl-separate-images2-{lang}') - case_result = 'panda' in response.lower() or '熊猫' in response or 'same' in response.lower() + case_result = 'panda' in response.lower() or '熊猫' in response or 'same' in response.lower( + ) or 'difference' in response.lower() or 'different' in response.lower() f.writelines(f'internvl-separate-images2-{lang} result: ' + str(case_result) + 'reason: separate images2: panda should in ' + response + '\n') with assume: assert case_result, 'reason: separate images2: panda should in ' + response with allure.step(f'internvl-video-{lang}'): response = get_response_from_output(output_text, f'internvl-video-{lang}') - case_result = 'red panda' in response.lower() or '熊猫' in response + case_result = 'red panda' in response.lower() or '熊猫' in response or 'stick' in response.lower() f.writelines(f'internvl-video-{lang} result: ' + str(case_result) + 'reason: video: panda should in ' + response + '\n') with assume: diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 6560bb55fa..3606e0bbbb 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -20,22 +20,27 @@ def quantization(config, if quantization_type == 'awq': quantization_cmd = ' '.join( - [cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path, '--work-dir', quantization_model_path]) + ['lmdeploy lite auto_awq', origin_model_path, '--work-dir', quantization_model_path]) elif quantization_type == 'gptq': quantization_cmd = ' '.join( - [cuda_prefix, 
'lmdeploy lite auto_gptq', origin_model_path, '--work-dir', quantization_model_path]) + ['lmdeploy lite auto_gptq', origin_model_path, '--work-dir', quantization_model_path]) elif quantization_type == 'w8a8': quantization_cmd = ' '.join( - [cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path, '--work-dir', quantization_model_path]) + ['lmdeploy lite smooth_quant', origin_model_path, '--work-dir', quantization_model_path]) else: return False, 'quantization type should in [awq, gptq, w8a8], \ now the type is ' + quantization_type + if cuda_prefix is not None: + quantization_cmd = ' '.join([cuda_prefix, quantization_cmd]) + if 'llama-3' in origin_model_name.lower(): quantization_cmd += ' --search-scale' if not is_bf16_supported() or quantization_type == 'gptq': quantization_cmd += ' --batch-size 8' + elif str(config.get('env_tag')) == '3090': + quantization_cmd += ' --batch-size 8' else: quantization_cmd += ' --batch-size 32' diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index e2d1a5a33a..fc66eaf13d 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -49,6 +49,9 @@ def hf_command_line_test(config, else: model_path = model_case + if str(config.get('env_tag')) == '3090': + extra += ' --cache-max-entry-count 0.7' + cmd = get_command_with_extra(' '.join(['lmdeploy chat', model_path, '--backend', type, extra, '--session-len 4096']), config, diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index d9c601d2d8..bb6a7d3626 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -30,7 +30,7 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) if 'extra' in param.keys(): extra = param['extra'] else: - extra = None + extra = '' if 'modelscope' in param.keys(): modelscope = param['modelscope'] @@ -73,23 +73,20 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) if not is_bf16_supported(): cmd += ' --cache-max-entry-count 0.5' + if str(config.get('env_tag')) == '3090': + cmd += ' --cache-max-entry-count 0.5' start_log = os.path.join(log_path, 'start_restful_' + model.split('/')[1] + worker_id + '.log') print('reproduce command restful: ' + cmd) - with open(start_log, 'w') as f: - f.writelines('reproduce command restful: ' + cmd + '\n') + file = open(start_log, 'w') - startRes = subprocess.Popen([cmd], stdout=f, stderr=f, shell=True, text=True, encoding='utf-8') - pid = startRes.pid + startRes = subprocess.Popen([cmd], stdout=file, stderr=file, shell=True, text=True, encoding='utf-8') + pid = startRes.pid http_url = BASE_HTTP_URL + ':' + str(port) - with open(start_log, 'r') as file: - content = file.read() - print(content) start_time = int(time()) - start_timeout = 300 if not is_bf16_supported(): start_timeout = 600 @@ -102,6 +99,17 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) result = health_check(http_url) if result or total_time >= start_timeout: break + try: + # Check if process is still running + return_code = startRes.wait(timeout=1) # Small timeout to check status + if return_code != 0: + with open(start_log, 'r') as f: + content = f.read() + print(content) + return 0, startRes + except subprocess.TimeoutExpired: + continue + file.close() allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) return pid, startRes @@ -196,10 +204,10 @@ def interactive_test(config, case, case_info, model, url, worker_id: str = ''): interactive_log 
= os.path.join(log_path, 'interactive_' + model + worker_id + '_' + case + '.log') - file = open(interactive_log, 'w') - result = True + result = True + file = open(interactive_log, 'w') + api_client = APIClient(url) file.writelines('available_models:' + ','.join(api_client.available_models) + '\n') @@ -285,6 +293,10 @@ def run_vl_testcase(config, port: int = DEFAULT_PORT): http_url = BASE_HTTP_URL + ':' + str(port) log_path = config.get('log_path') + model = get_model(http_url) + if model is None: + assert False, 'server did not start correctly' + client = OpenAI(api_key='YOUR_API_KEY', base_url=http_url + '/v1') model_name = client.models.list().data[0].id diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index f2740caeff..e8d23b8163 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -109,7 +109,6 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | -| Phi-4-mini | 3.8B | LLM | Yes | Yes | No | - | - | ```{note} * [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 88d04eed93..d31b566953 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -109,7 +109,6 @@ | Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | -| Phi-4-mini | 3.8B | LLM | Yes | Yes | No | - | - | ```{note} * [1] 目前，Mono-InternVL不支持FP16，因为数值不稳定。请改用BF16
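For reference (not part of the diff): the new workflow's `workflow_dispatch` inputs are JSON-style arrays encoded as strings, which the jobs decode with `fromJSON` to build their test matrices. A minimal dispatch sketch, assuming an authenticated `gh` CLI and using purely illustrative filter values, might look like this:

```bash
# Hypothetical manual trigger of daily_ete_test_3090.yml (illustrative values).
# backend/model/function/regression_func are JSON arrays encoded as strings;
# the workflow decodes them with fromJSON() into its job matrices.
gh workflow run daily_ete_test_3090.yml \
  --ref main \
  -f repo_org='InternLM/lmdeploy' \
  -f repo_ref='main' \
  -f backend='["turbomind"]' \
  -f model='["llm"]' \
  -f function='["pipeline"]' \
  -f offline_mode=false \
  -f regression_func='["quant", "tools"]'
```

With `offline_mode=true`, the `linux-build` job and the artifact download are skipped, and the test code and wheel are taken from `OFFLINE_CODE_PATH` on the runner instead.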