From a2dee3403ab2a9899d7003785a371af2c5ccb6e6 Mon Sep 17 00:00:00 2001
From: zhulinJulia24
Date: Mon, 12 May 2025 14:51:29 +0800
Subject: [PATCH 01/28] update

---
 .github/workflows/daily_ete_test_3090.yml | 581 ++++++++++++++++++++++
 autotest/config-3090.yaml                 |  77 +++
 2 files changed, 658 insertions(+)
 create mode 100644 .github/workflows/daily_ete_test_3090.yml
 create mode 100644 autotest/config-3090.yaml

diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml
new file mode 100644
index 0000000000..09ce3a535f
--- /dev/null
+++ b/.github/workflows/daily_ete_test_3090.yml
@@ -0,0 +1,581 @@
+name: daily_ete_test_3090
+
+on:
+  workflow_dispatch:
+    inputs:
+      repo_org:
+        required: false
+        description: 'Tested repository organization name. Default is InternLM'
+        type: string
+        default: 'InternLM/lmdeploy'
+      repo_ref:
+        required: false
+        description: 'Set branch, tag, or commit id. Default is "main"'
+        type: string
+        default: 'main'
+      backend:
+        required: true
+        description: 'Set backend testcase filter: turbomind, pytorch, or both. Default is "["turbomind", "pytorch"]"'
+        type: string
+        default: "['turbomind', 'pytorch']"
+      model:
+        required: true
+        description: 'Set testcase module filter: llm, mllm. Default contains all models'
+        type: string
+        default: "['llm','mllm']"
+      function:
+        required: true
+        description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions'
+        type: string
+        default: '["pipeline", "restful", "chat"]'
+      offline_mode:
+        required: true
+        description: 'Whether to start in offline mode; if true, you should prepare the code and whl package yourself'
+        type: boolean
+        default: false
+      regression_func:
+        required: true
+        description: 'regression functions'
+        type: string
+        default: "['quant', 'tools', 'benchmark']"
+  schedule:
+    - cron: '00 16 * * 0-4'
+
+env:
+  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
+  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
+  OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+  REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }}
+  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
+  FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}
+  TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy
+  OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
+  OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt
+  DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL
+
+jobs:
+  linux-build:
+    if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}}
+    strategy:
+      matrix:
+        pyver: [py310]
+    runs-on: ubuntu-latest
+    env:
+      PYTHON_VERSION: ${{ matrix.pyver }}
+      PLAT_NAME: manylinux2014_x86_64
+      DOCKER_TAG: cuda12.1
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Build
+        run: |
+          echo ${PYTHON_VERSION}
+          echo ${PLAT_NAME}
+          echo ${DOCKER_TAG}
+          echo ${OUTPUT_FOLDER}
+          echo ${GITHUB_RUN_ID}
+          # remove -it
+          sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
+          bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error
+          path: builder/manywheel/${{
env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: [self-hosted, 3090-r1] + timeout-minutes: 50 + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: mv autotest/config-3090.yml autotest/config.yml && rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: [self-hosted, 3090-r1] + timeout-minutes: 150 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-*.whl + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - quantization w8a8 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'pytorch') + run: | + pytest autotest/tools/quantization/test_quantization_w8a8.py --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: [self-hosted, 3090-r1] + needs: test_quantization + timeout-minutes: 300 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat + include: + - backend: turbomind + model: llm + function: local_case + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources/lora:/root/lora + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-*.whl + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + rm -rf ${{env.DEEPSEEK_VL}}/build + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + cp -r /root/lora . + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + - name: Test lmdeploy - pipeline + continue-on-error: true + if: matrix.function == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.function == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + - name: Test lmdeploy - local testcase + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'local_case' + run: | + pytest autotest/toolchain --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: [self-hosted, 3090-r1] + needs: test_quantization + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + timeout-minutes: 60 + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-*.whl + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api turbomind + if: matrix.backend == 'turbomind' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm3-8b-instruct > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 120s + - name: Start restful api pytorch + if: matrix.backend == 'pytorch' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm3-8b-instruct --backend pytorch > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 180s + - name: Test lmdeploy - restful api + timeout-minutes: 75 + run: | + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}}/interface-${{matrix.backend}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Start restful api turbomind - base + if: matrix.backend == 'turbomind' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm3-8b-instruct > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 120s + - name: Start restful api pytorch - base + if: matrix.backend == 'pytorch' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm3-8b-instruct --backend pytorch > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 180s + - name: Test lmdeploy - restful api - base + timeout-minutes: 40 + run: | + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}}/interface-${{matrix.backend}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_benchmark: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + runs-on: [self-hosted, 3090-r1] + needs: test_quantization + timeout-minutes: 120 + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-*.whl + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test benchmark script + run: | + pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_evaluation: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + runs-on: [self-hosted, 3090-r1] + needs: test_quantization + timeout-minutes: 120 # 2hours + strategy: + fail-fast: false + matrix: + evaluate_type: ['chat', 'base'] + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-*.whl + python3 -m pip install sentence_transformers==2.2.2 --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt + python3 -m pip install -e . + echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . 
+ - name: Evaluate models + if: matrix.evaluate_type == 'chat' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_internlm2_chat_20b, pytorch_internlm2_chat_20b, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + - name: Evaluate base models + if: matrix.evaluate_type == 'base' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_5_14b, turbomind_internlm2_5_7b_batch1]" "[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + + get_benchmark_result: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + needs: [test_benchmark] + timeout-minutes: 5 + runs-on: [self-hosted, 3090-r1] + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + env: + BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + steps: + - name: Clone repository + uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Get overview + run: | + pip install pandas fire mmengine + python3 .github/scripts/action_tools.py generate_benchmark_report $BENCHMARK_REPORT_DIR + + + get_coverage_report: + if: ${{!cancelled()}} + runs-on: [self-hosted, 3090-r1] + needs: [test_tools, test_restful, test_benchmark] + timeout-minutes: 5 + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml new file mode 100644 index 0000000000..80a9a66703 --- /dev/null +++ b/autotest/config-3090.yaml @@ -0,0 +1,77 @@ +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json + +tp_config: + internlm2-chat-20b: 2 + + +turbomind_chat_model: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + +pytorch_chat_model: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + +turbomind_vl_model: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + +pytorch_vl_model: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + +turbomind_base_model: + - internlm/internlm3-8b-instruct + - Qwen/Qwen3-8B + +pytorch_base_model: + - internlm/internlm3-8b-instruct + - Qwen/Qwen3-8B + +turbomind_quatization: + no_awq: + - meta-llama/Meta-Llama-3-1-70B-Instruct + gptq: + no_kvint4: + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Chat + +pytorch_quatization: + awq: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + w8a8: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - Qwen/Qwen3-8B + no_kvint4: + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat From 53f824cbb0a56b01765aef7027dfb35e47c07ca1 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 15:10:38 +0800 Subject: [PATCH 02/28] update --- .github/workflows/daily_ete_test_3090.yml | 51 +++-------------------- autotest/config-3090.yaml | 6 +++ 2 files changed, 11 insertions(+), 46 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 09ce3a535f..af7303d49c 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -140,10 +140,8 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models - - /mnt/shared:/mnt/shared - - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /data1:/data1 - 
/usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts @@ -151,9 +149,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - # manually install flash attn - # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-*.whl python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -210,10 +205,6 @@ jobs: - backend: pytorch model: mllm function: chat - include: - - backend: turbomind - model: llm - function: local_case env: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub @@ -223,11 +214,8 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - - /nvme/github-actions/resources/lora:/root/lora - /nvme/qa_test_models:/nvme/qa_test_models - - /mnt/shared:/mnt/shared - - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /data1:/data1 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts @@ -235,9 +223,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - # manually install flash attn - # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-*.whl python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -274,13 +259,6 @@ jobs: run: | pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - - name: Test lmdeploy - local testcase - if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'local_case' - run: | - pytest autotest/toolchain --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() run: | @@ -305,7 +283,6 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: @@ -314,9 +291,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - # manually install flash attn - # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-*.whl python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -395,7 +369,6 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: @@ -404,9 +377,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - # manually install flash attn - # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-*.whl python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -446,18 +416,17 @@ jobs: strategy: fail-fast: false matrix: - evaluate_type: ['chat', 'base'] + evaluate_type: ['chat'] container: image: openmmlab/lmdeploy:latest options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - /nvme/github-actions/resources:/root/resources - /nvme/github-actions/opencompass-data:/root/opencompass-data - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - /nvme/qa_test_models:/nvme/qa_test_models - - /mnt/shared:/mnt/shared + - /data1:/data1 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts @@ -465,9 +434,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - # manually install flash attn - # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-*.whl python3 -m pip install sentence_transformers==2.2.2 --no-deps python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy @@ -501,13 +467,7 @@ jobs: run: | export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_internlm2_chat_20b, pytorch_internlm2_chat_20b, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - - name: Evaluate base models - if: matrix.evaluate_type == 'base' - run: | - export LMDEPLOY_DIR=$(pwd) - - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_5_14b, turbomind_internlm2_5_7b_batch1]" "[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - name: Clear workspace if: always() run: | @@ -553,7 +513,6 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml index 80a9a66703..51d16a6a7f 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -75,3 +75,9 @@ pytorch_quatization: - Qwen/Qwen2.5-VL-7B-Instruct no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat + +benchmark_model: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - Qwen/Qwen3-8B From 3c79d380433f9eb1394a578a1d99cd63b9d1b51a Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 16:34:38 +0800 Subject: [PATCH 03/28] update --- .github/workflows/daily_ete_test_3090.yml | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index af7303d49c..b7c29d8bcf 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -52,7 +52,6 @@ env: TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt - DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL jobs: linux-build: @@ -110,10 +109,10 @@ jobs: ref: ${{github.event.inputs.repo_ref || 'main'}} - 
name: Copy repository if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: mv autotest/config-3090.yml autotest/config.yml && rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yml ${{env.TEST_CODE_PATH}}/autotest/config.yml - name: Copy repository - offline if: ${{inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yml ${{env.TEST_CODE_PATH}}/autotest/config.yml - name: Download Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 @@ -154,7 +153,6 @@ jobs: run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Check env run: | pip uninstall -y nvidia-nccl-cu11 @@ -228,8 +226,6 @@ jobs: run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - rm -rf ${{env.DEEPSEEK_VL}}/build - pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Check env run: | pip uninstall -y nvidia-nccl-cu11 @@ -296,7 +292,6 @@ jobs: run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Check env run: | pip uninstall -y nvidia-nccl-cu11 @@ -382,7 +377,6 @@ jobs: run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Check env run: | pip uninstall -y nvidia-nccl-cu11 @@ -434,19 +428,16 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install sentence_transformers==2.2.2 --no-deps python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Install opencompass run: | git clone --depth=1 https://github.com/open-compass/opencompass.git cd opencompass - cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt - python3 -m pip install -e . 
+ python3 -m pip install -e ".[full]" echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - name: Check env run: | From b20758df1b5437aa12e8a4fb02a10b0c8333355e Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 16:42:44 +0800 Subject: [PATCH 04/28] update --- .github/workflows/daily_ete_test_3090.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index b7c29d8bcf..4c206fbbcc 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -99,6 +99,7 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models + - /data1:/data1 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Clone repository From 8a86f10609de6b5096f38b0346f70a2b3933ebf4 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 16:51:53 +0800 Subject: [PATCH 05/28] update --- .github/workflows/daily_ete_test_3090.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 4c206fbbcc..5112c8a7e9 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -110,10 +110,10 @@ jobs: ref: ${{github.event.inputs.repo_ref || 'main'}} - name: Copy repository if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yml ${{env.TEST_CODE_PATH}}/autotest/config.yml + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml - name: Copy repository - offline if: ${{inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yml ${{env.TEST_CODE_PATH}}/autotest/config.yml + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml - name: Download Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 From ba8d937b994bb74a0dae402c9560ef903d443711 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 16:56:08 +0800 Subject: [PATCH 06/28] update --- .github/workflows/daily_ete_test_3090.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 5112c8a7e9..27837152cb 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -187,7 +187,7 @@ jobs: chmod -R 777 $workdir test_tools: - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization timeout-minutes: 300 @@ -267,7 +267,7 @@ jobs: chmod -R 777 $workdir test_restful: - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization strategy: @@ -356,7 +356,7 @@ jobs: chmod -R 777 $workdir test_benchmark: - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization timeout-minutes: 120 @@ -404,7 +404,7 @@ jobs: chmod -R 777 $workdir test_evaluation: - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization timeout-minutes: 120 # 2hours @@ -471,7 +471,7 @@ jobs: get_benchmark_result: - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} needs: [test_benchmark] timeout-minutes: 5 runs-on: [self-hosted, 3090-r1] @@ -496,7 +496,7 @@ jobs: get_coverage_report: - if: ${{!cancelled()}} + if: ${{!cancelled() && success()}} runs-on: [self-hosted, 3090-r1] needs: [test_tools, test_restful, test_benchmark] timeout-minutes: 5 From 5f978e3e119707ca5463f5cff3c8ed65a04150b8 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 17:56:46 +0800 Subject: [PATCH 07/28] update --- .github/workflows/daily_ete_test_3090.yml | 154 +----------------- autotest/config-3090.yaml | 5 +- .../quantization/test_quantization_awq.py | 1 + 3 files changed, 11 insertions(+), 149 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 27837152cb..018af585e8 100644 --- 
a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -37,7 +37,7 @@ on: required: true description: 'regression functions' type: string - default: "['quant', 'tools', 'benchmark']" + default: "['quant', 'tools']" schedule: - cron: '00 16 * * 0-4' @@ -128,7 +128,7 @@ jobs: test_quantization: needs: download_pkgs - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} runs-on: [self-hosted, 3090-r1] timeout-minutes: 150 env: @@ -168,7 +168,7 @@ jobs: continue-on-error: true if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true @@ -242,19 +242,19 @@ jobs: continue-on-error: true if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' run: | - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - name: Test lmdeploy - pipeline continue-on-error: true if: matrix.function == 'pipeline' run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - name: Test lmdeploy - restful continue-on-error: true if: matrix.function == 'restful' run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - name: Clear workfile if: always() @@ -355,150 +355,10 @@ jobs: mkdir $workdir chmod -R 777 $workdir - test_benchmark: - if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} - runs-on: [self-hosted, 3090-r1] - needs: test_quantization - timeout-minutes: 120 - container: - image: openmmlab/lmdeploy:latest - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - 
/nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/qa_test_models:/nvme/qa_test_models - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . - - name: Install lmdeploy - dependency - run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - - name: Install lmdeploy - run: | - python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Check env - run: | - pip uninstall -y nvidia-nccl-cu11 - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - - name: Test benchmark script - run: | - pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir - - test_evaluation: - if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} - runs-on: [self-hosted, 3090-r1] - needs: test_quantization - timeout-minutes: 120 # 2hours - strategy: - fail-fast: false - matrix: - evaluate_type: ['chat'] - container: - image: openmmlab/lmdeploy:latest - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/opencompass-data:/root/opencompass-data - - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - - /nvme/qa_test_models:/nvme/qa_test_models - - /data1:/data1 - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . - - name: Install lmdeploy - dependency - run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - - name: Install lmdeploy - run: | - python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Install opencompass - run: | - git clone --depth=1 https://github.com/open-compass/opencompass.git - cd opencompass - python3 -m pip install -e ".[full]" - echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - - name: Check env - run: | - pip uninstall -y nvidia-nccl-cu11 - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - - name: Setup paths for evaluation - run: | - ln -s /root/opencompass-data ./data - python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . 
- - name: Evaluate models - if: matrix.evaluate_type == 'chat' - run: | - export LMDEPLOY_DIR=$(pwd) - - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - - name: Clear workspace - if: always() - run: | - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir - - - get_benchmark_result: - if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} - needs: [test_benchmark] - timeout-minutes: 5 - runs-on: [self-hosted, 3090-r1] - container: - image: openmmlab/lmdeploy:latest - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/qa_test_models:/nvme/qa_test_models - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - env: - BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} - steps: - - name: Clone repository - uses: actions/checkout@v2 - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Get overview - run: | - pip install pandas fire mmengine - python3 .github/scripts/action_tools.py generate_benchmark_report $BENCHMARK_REPORT_DIR - - get_coverage_report: if: ${{!cancelled() && success()}} runs-on: [self-hosted, 3090-r1] - needs: [test_tools, test_restful, test_benchmark] + needs: [test_tools, test_restful] timeout-minutes: 5 container: image: openmmlab/lmdeploy:latest diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml index 51d16a6a7f..875ab03f17 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -6,8 +6,8 @@ benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json tp_config: - internlm2-chat-20b: 2 - + empty: 2 + turbomind_chat_model: - meta-llama/Llama-3.2-3B-Instruct @@ -49,6 +49,7 @@ turbomind_quatization: no_awq: - meta-llama/Meta-Llama-3-1-70B-Instruct gptq: + - empty no_kvint4: - Qwen/Qwen3-8B - Qwen/Qwen2.5-7B-Instruct diff --git a/autotest/tools/quantization/test_quantization_awq.py b/autotest/tools/quantization/test_quantization_awq.py index d1d948c1ae..7552e6e2aa 100644 --- a/autotest/tools/quantization/test_quantization_awq.py +++ b/autotest/tools/quantization/test_quantization_awq.py @@ -7,6 +7,7 @@ @pytest.mark.order(3) +@pytest.mark.test_3090 @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('awq')) def test_quantization_awq(config, model, worker_id): From a92539a7a9c66da9a3c974386a05e60a2a166f50 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 18:03:11 +0800 Subject: [PATCH 08/28] update --- autotest/tools/chat/test_command_chat_hf_pytorch.py | 4 ++++ autotest/tools/chat/test_command_chat_hf_turbomind.py | 4 ++++ autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py | 3 +++ autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py | 3 +++ autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py | 3 +++ autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py | 3 +++ autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py | 3 +++ autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py | 3 +++ autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py | 3 +++ 
autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py | 3 +++ 10 files changed, 32 insertions(+) diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 8bcc00d3e5..a60790914c 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -10,6 +10,7 @@ @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1)) def test_hf_pytorch_chat_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -69,6 +70,7 @@ def test_hf_pytorch_chat_tp4(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4)) def test_hf_pytorch_chat_kvin4_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -131,6 +133,7 @@ def test_hf_pytorch_chat_kvin4_tp4(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8)) def test_hf_pytorch_chat_kvin8_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -192,6 +195,7 @@ def test_hf_pytorch_chat_kvin8_tp4(config, model, cli_case_config, worker_id): @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat +@pytest.mark.test_3090 @pytest.mark.gpu_num_1 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, model_type='base_model')) def test_hf_pytorch_base_tp1(config, model, cli_case_config, worker_id): diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 2a7a6a36b6..70e367d235 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -10,6 +10,7 @@ @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_tp1(config, model, communicator, cli_case_config, worker_id): @@ -78,6 +79,7 @@ def test_hf_turbomind_chat_tp4(config, model, communicator, cli_case_config, wor @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4)) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_kvint4_tp1(config, model, communicator, cli_case_config, worker_id): @@ -146,6 +148,7 @@ def test_hf_turbomind_chat_kvint4_tp4(config, model, communicator, cli_case_conf @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8)) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_kvint8_tp1(config, model, communicator, cli_case_config, worker_id): @@ -314,6 +317,7 @@ def 
test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicat @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='base_model')) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_base_tp1(config, model, communicator, cli_case_config, worker_id): diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index f1d67c113e..4e727d6c97 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -9,6 +9,7 @@ @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, exclude_dup=True)) def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): @@ -47,6 +48,7 @@ def test_pipeline_chat_pytorch_tp4(config, common_case_config, model, worker_id) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4, exclude_dup=True)) def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): @@ -85,6 +87,7 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8, exclude_dup=True)) def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, worker_id): diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py index 7115926116..a65465fe0c 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -12,6 +12,7 @@ @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, model_type='vl_model')) def test_pipeline_chat_tp1(config, model, worker_id): if 'gw' in worker_id: @@ -47,6 +48,7 @@ def test_pipeline_chat_tp4(config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp1(config, model, worker_id): if 'gw' in worker_id: @@ -82,6 +84,7 @@ def test_pipeline_chat_kvint4_tp4(config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8, model_type='vl_model')) def test_pipeline_chat_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 8e4734f386..e4f18d4690 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ 
b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -9,6 +9,7 @@ @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) @pytest.mark.parametrize('communicator', get_communicator_list()) @@ -50,6 +51,7 @@ def test_pipeline_chat_tp4(config, common_case_config, model, communicator, work @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4)) @pytest.mark.parametrize('communicator', get_communicator_list()) @@ -103,6 +105,7 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, communicato @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8)) @pytest.mark.parametrize('communicator', get_communicator_list()) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index a245a12bcb..09b16b7656 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -12,6 +12,7 @@ @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='vl_model')) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp1(config, model, communicator, worker_id): @@ -53,6 +54,7 @@ def test_pipeline_chat_tp4(config, model, communicator, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4, model_type='vl_model')) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp1(config, model, communicator, worker_id): @@ -100,6 +102,7 @@ def test_pipeline_chat_kvint4_tp4(config, model, communicator, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8, model_type='vl_model')) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp1(config, model, communicator, worker_id): diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index 9cbd769a6d..d29d9f526d 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -28,6 +28,7 @@ def getModelList(tp_num): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -73,6 +74,7 @@ def getKvintModelList(tp_num, quant_policy): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 
@pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True) def test_restful_chat_kvint4_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -109,6 +111,7 @@ def test_restful_chat_kvint4_tp4(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True) def test_restful_chat_kvint8_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py index 573d32e5b0..82d7a7bf7a 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py @@ -28,6 +28,7 @@ def getModelList(tp_num): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -70,6 +71,7 @@ def getKvintModelList(tp_num, quant_policy: int = None): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True) def test_restful_chat_kvint4_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -103,6 +105,7 @@ def test_restful_chat_kvint4_tp4(config, worker_id): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True) def test_restful_chat_kvint8_tp1(config, worker_id): if get_workerid(worker_id) is None: diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index 7597af89cf..490f2a2507 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -33,6 +33,7 @@ def getModelList(tp_num): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -81,6 +82,7 @@ def getKvintModelList(tp_num, quant_policy): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True) def test_restful_chat_kvint4_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -117,6 +119,7 @@ def test_restful_chat_kvint4_tp4(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True) def test_restful_chat_kvint8_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: diff --git 
a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py index 728d8e94c0..cbe530d65c 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py @@ -32,6 +32,7 @@ def getModelList(tp_num): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -77,6 +78,7 @@ def getKvintModelList(tp_num, quant_policy: int = None): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True) def test_restful_chat_kvint4_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -110,6 +112,7 @@ def test_restful_chat_kvint4_tp4(config, worker_id): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True) def test_restful_chat_kvint8_tp1(config, worker_id): if get_workerid(worker_id) is None: From 000e2679065ffa4c755b8d6facbaf36d3e2842c1 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 18:09:01 +0800 Subject: [PATCH 09/28] update --- .github/workflows/daily_ete_test_3090.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 018af585e8..d1a56e7b69 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -128,7 +128,7 @@ jobs: test_quantization: needs: download_pkgs - if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + if: ${{!cancelled() && contains(needs.download_pkgs.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} runs-on: [self-hosted, 3090-r1] timeout-minutes: 150 env: @@ -187,7 +187,7 @@ jobs: chmod -R 777 $workdir test_tools: - if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + if: ${{!cancelled() && contains(needs.test_quantization.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization timeout-minutes: 300 @@ -267,7 +267,7 @@ jobs: chmod -R 777 $workdir test_restful: - if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + if: ${{!cancelled() && contains(needs.test_quantization.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization strategy: From 49179cf26d604d60dc5465ed970b997b225ee624 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 19:09:49 +0800 Subject: [PATCH 10/28] update --- autotest/utils/quantization_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/autotest/utils/quantization_utils.py 
b/autotest/utils/quantization_utils.py index 6560bb55fa..aad5d682af 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -20,17 +20,20 @@ def quantization(config, if quantization_type == 'awq': quantization_cmd = ' '.join( - [cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path, '--work-dir', quantization_model_path]) + ['lmdeploy lite auto_awq', origin_model_path, '--work-dir', quantization_model_path]) elif quantization_type == 'gptq': quantization_cmd = ' '.join( - [cuda_prefix, 'lmdeploy lite auto_gptq', origin_model_path, '--work-dir', quantization_model_path]) + ['lmdeploy lite auto_gptq', origin_model_path, '--work-dir', quantization_model_path]) elif quantization_type == 'w8a8': quantization_cmd = ' '.join( - [cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path, '--work-dir', quantization_model_path]) + ['lmdeploy lite smooth_quant', origin_model_path, '--work-dir', quantization_model_path]) else: return False, 'quantization type should in [awq, gptq, w8a8], \ now the type is ' + quantization_type + if cuda_prefix is not None: + quantization_cmd = ' '.join([cuda_prefix, quantization_cmd]) + if 'llama-3' in origin_model_name.lower(): quantization_cmd += ' --search-scale' From 37aca3dd73d5fd410aa402f1906d86c1664316c5 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 19:11:46 +0800 Subject: [PATCH 11/28] update --- .github/workflows/daily_ete_test_3090.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index d1a56e7b69..8ca86a4fd9 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -187,7 +187,7 @@ jobs: chmod -R 777 $workdir test_tools: - if: ${{!cancelled() && contains(needs.test_quantization.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization timeout-minutes: 300 @@ -232,7 +232,6 @@ jobs: pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env - cp -r /root/lora . 
rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* @@ -267,7 +266,7 @@ jobs: chmod -R 777 $workdir test_restful: - if: ${{!cancelled() && contains(needs.test_quantization.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization strategy: From d8ba44dc9e72e78da2421296082b14edd6440e60 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 13:30:28 +0800 Subject: [PATCH 12/28] update --- .github/workflows/daily_ete_test_3090.yml | 2 +- autotest/utils/config_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 8ca86a4fd9..337c15eeca 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -44,7 +44,7 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} + OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index f096dbe112..9aa1b3f8e5 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -92,7 +92,7 @@ def get_quantization_model_list(type): config = get_config() if type == 'awq': case_list = [ - x for x in config.get('turbomind_chat_model') + config.get('turbomind_base_model') + x for x in list(set(config.get('turbomind_chat_model')) + set(config.get('turbomind_base_model'))) if x not in config.get('turbomind_quatization').get('no_awq') and not is_quantization_model(x) ] for key in config.get('pytorch_quatization').get('awq'): From 91faa59684f5a913e635b0fac02242403dbfcd6f Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 13:37:47 +0800 Subject: [PATCH 13/28] update --- autotest/utils/config_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 9aa1b3f8e5..c13325e58e 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -3,6 +3,7 @@ import yaml from utils.get_run_config import get_tp_num +from collections import OrderedDict from lmdeploy.utils import is_bf16_supported @@ -92,7 +93,7 @@ def get_quantization_model_list(type): config = get_config() if type == 'awq': case_list = [ - x for x in list(set(config.get('turbomind_chat_model')) + set(config.get('turbomind_base_model'))) + x for x in list(OrderedDict.fromkeys(config.get('turbomind_chat_model') + config.get('turbomind_base_model'))) if x not in config.get('turbomind_quatization').get('no_awq') and not is_quantization_model(x) ] for key in config.get('pytorch_quatization').get('awq'): From 7961a27aa2f9442894cefafd07d52e26ed0b720b Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 14:26:08 +0800 Subject: [PATCH 14/28] update --- autotest/config-3090.yaml | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/autotest/config-3090.yaml 
b/autotest/config-3090.yaml index 875ab03f17..6c44ef8af3 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -47,7 +47,10 @@ pytorch_base_model: turbomind_quatization: no_awq: - - meta-llama/Meta-Llama-3-1-70B-Instruct + - OpenGVLab/InternVL3-8B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct gptq: - empty no_kvint4: @@ -61,24 +64,13 @@ pytorch_quatization: awq: - meta-llama/Llama-3.2-3B-Instruct - internlm/internlm3-8b-instruct - - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL2_5-1B - - Qwen/Qwen3-8B - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct w8a8: - meta-llama/Llama-3.2-3B-Instruct - internlm/internlm3-8b-instruct - - Qwen/Qwen3-8B no_kvint4: - Qwen/Qwen3-8B - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-VL-7B-Instruct no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat - -benchmark_model: - - meta-llama/Llama-3.2-3B-Instruct - - internlm/internlm3-8b-instruct - - OpenGVLab/InternVL3-8B - - Qwen/Qwen3-8B From 7204e944146b94e3a06cd4b28662f59ec177bb41 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 14:28:35 +0800 Subject: [PATCH 15/28] update --- autotest/utils/config_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index c13325e58e..c53e33bf0f 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -1,9 +1,9 @@ import copy import os +from collections import OrderedDict import yaml from utils.get_run_config import get_tp_num -from collections import OrderedDict from lmdeploy.utils import is_bf16_supported @@ -93,7 +93,8 @@ def get_quantization_model_list(type): config = get_config() if type == 'awq': case_list = [ - x for x in list(OrderedDict.fromkeys(config.get('turbomind_chat_model') + config.get('turbomind_base_model'))) + x + for x in list(OrderedDict.fromkeys(config.get('turbomind_chat_model') + config.get('turbomind_base_model'))) if x not in config.get('turbomind_quatization').get('no_awq') and not is_quantization_model(x) ] for key in config.get('pytorch_quatization').get('awq'): From 3e52598e73ca7ab8a372995ae688f383b607b8da Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 16:07:17 +0800 Subject: [PATCH 16/28] update --- autotest/config-3090.yaml | 2 +- autotest/config.yaml | 1 + autotest/tools/pipeline/llm_case.py | 5 ++++- autotest/tools/pipeline/mllm_case.py | 3 +++ autotest/utils/pipeline_chat.py | 5 ++++- autotest/utils/run_client_chat.py | 3 +++ autotest/utils/run_restful_chat.py | 13 ++++++++----- 7 files changed, 24 insertions(+), 8 deletions(-) diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml index 6c44ef8af3..7bad2d3daf 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -4,11 +4,11 @@ dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: 3090 tp_config: empty: 2 - turbomind_chat_model: - meta-llama/Llama-3.2-3B-Instruct - internlm/internlm3-8b-instruct diff --git a/autotest/config.yaml b/autotest/config.yaml index c3f66c4d12..bc82f1d012 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -4,6 +4,7 @@ dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: 
/nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: a100 tp_config: internlm2-chat-20b: 2 diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 98b73323b4..d229e68870 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -10,7 +10,7 @@ gen_config = GenerationConfig(max_new_tokens=500) -def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None): +def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = {}): if 'pytorch' in backend_type: backend_config = PytorchEngineConfig(tp=tp) @@ -24,6 +24,9 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, if 'turbomind' in backend_type and extra is not None and 'communicator' in extra: backend_config.communicator = extra.get('communicator') + if 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: + backend_config.cache_max_entry_count = extra.get('cache-max-entry-count') + if 'w4' in model_path or ('4bits' in model_path or 'awq' in model_path.lower()): backend_config.model_format = 'awq' if 'gptq' in model_path.lower(): diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 7d2b44bf9d..05aa3d760d 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -33,6 +33,9 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te if 'turbomind' in backend_type and extra is not None and 'communicator' in extra: backend_config.communicator = extra.get('communicator') + if 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: + backend_config.cache_max_entry_count = extra.get('cache-max-entry-count') + if 'w4' in model_path or ('4bits' in model_path or 'awq' in model_path.lower()): backend_config.model_format = 'awq' if not is_bf16_supported(): diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 9b3f78caae..01dd941b1a 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -14,7 +14,7 @@ def run_pipeline_chat_test(config, model_case, backend_type, worker_id: str = '', - extra: object = None, + extra: object = {}, use_local_model: bool = True, is_smoke: bool = False): log_path = config.get('log_path') @@ -30,6 +30,9 @@ def run_pipeline_chat_test(config, log_path, '_'.join(['pipeline', 'chat', backend_type, worker_id, model_case.split('/')[1] + '.log'])) + if str(config.get('env_tag')) == '3090': + extra['cache_max_entry_count'] = 0.7 + if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) extra = extra.replace(' ', '').replace('"', '\\"').replace(',', '\\,') diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index e2d1a5a33a..fc66eaf13d 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -49,6 +49,9 @@ def hf_command_line_test(config, else: model_path = model_case + if str(config.get('env_tag')) == '3090': + extra += ' --cache-max-entry-count 0.7' + cmd = get_command_with_extra(' '.join(['lmdeploy chat', model_path, '--backend', type, extra, '--session-len 4096']), config, diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index d9c601d2d8..0bec0d2c0f 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -30,7 +30,10 @@ def start_restful_api(config, 
param, model, model_path, backend_type, worker_id) if 'extra' in param.keys(): extra = param['extra'] else: - extra = None + extra = '' + + if str(config.get('env_tag')) == '3090': + extra += ' --cache-max-entry-count 0.7' if 'modelscope' in param.keys(): modelscope = param['modelscope'] @@ -85,8 +88,8 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) pid = startRes.pid http_url = BASE_HTTP_URL + ':' + str(port) - with open(start_log, 'r') as file: - content = file.read() + with open(start_log, 'r') as f: + content = f.read() print(content) start_time = int(time()) @@ -196,10 +199,10 @@ def interactive_test(config, case, case_info, model, url, worker_id: str = ''): interactive_log = os.path.join(log_path, 'interactive_' + model + worker_id + '_' + case + '.log') - file = open(interactive_log, 'w') - result = True + file = open(interactive_log, 'w') + api_client = APIClient(url) file.writelines('available_models:' + ','.join(api_client.available_models) + '\n') From 7cfd5675cbddcae0b923b9bb06192173d9eabbf6 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 17:20:38 +0800 Subject: [PATCH 17/28] update --- autotest/config-3090.yaml | 13 +++++++++---- autotest/utils/quantization_utils.py | 2 ++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml index 7bad2d3daf..dca79c230a 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -47,13 +47,12 @@ pytorch_base_model: turbomind_quatization: no_awq: - - OpenGVLab/InternVL3-8B - - Qwen/Qwen3-8B - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct + - empty gptq: - empty no_kvint4: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B - Qwen/Qwen3-8B - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-VL-7B-Instruct @@ -64,11 +63,17 @@ pytorch_quatization: awq: - meta-llama/Llama-3.2-3B-Instruct - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct w8a8: - meta-llama/Llama-3.2-3B-Instruct - internlm/internlm3-8b-instruct no_kvint4: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B - Qwen/Qwen3-8B - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-VL-7B-Instruct diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index aad5d682af..3606e0bbbb 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -39,6 +39,8 @@ def quantization(config, if not is_bf16_supported() or quantization_type == 'gptq': quantization_cmd += ' --batch-size 8' + elif str(config.get('env_tag')) == '3090': + quantization_cmd += ' --batch-size 8' else: quantization_cmd += ' --batch-size 32' From 5df050461115d5ac8c03a1057be806e0eb18a152 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 18:35:28 +0800 Subject: [PATCH 18/28] update --- autotest/config-3090.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml index dca79c230a..5251f447ea 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -67,7 +67,6 @@ pytorch_quatization: - OpenGVLab/InternVL2_5-1B - Qwen/Qwen3-8B - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct w8a8: - meta-llama/Llama-3.2-3B-Instruct - internlm/internlm3-8b-instruct From 16f46b29242d91c68566e53804800200a937b423 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 15 May 2025 14:12:55 +0800 Subject: [PATCH 19/28] update --- 
autotest/config-3090.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml index 5251f447ea..791e2dc576 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -33,8 +33,6 @@ turbomind_vl_model: pytorch_vl_model: - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL2_5-1B - - Qwen/Qwen3-8B - - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-VL-7B-Instruct turbomind_base_model: From 03ebb243baedfc5bf32fe71e99938d52e114d616 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 15 May 2025 14:18:19 +0800 Subject: [PATCH 20/28] update --- autotest/utils/pipeline_chat.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 01dd941b1a..ed3f1b730d 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -31,7 +31,9 @@ def run_pipeline_chat_test(config, model_case.split('/')[1] + '.log'])) if str(config.get('env_tag')) == '3090': - extra['cache_max_entry_count'] = 0.7 + if extra is None: + extra = {} + extra['cache-max-entry-count'] = 0.7 if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) @@ -76,7 +78,7 @@ def run_pipeline_vl_chat_test(config, model_case, backend_type, worker_id: str = '', - extra: object = None, + extra: object = {}, is_smoke: bool = False): log_path = config.get('log_path') tp = get_tp_num(config, model_case) @@ -88,6 +90,11 @@ def run_pipeline_vl_chat_test(config, log_path, '_'.join(['pipeline', 'mllm', backend_type, worker_id, model_case.split('/')[1] + '.log'])) + if str(config.get('env_tag')) == '3090': + if extra is None: + extra = {} + extra['cache-max-entry-count'] = 0.7 + if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) extra = extra.replace(' ', '').replace('"', '\\"').replace(',', '\\,') From 62e7a1cc094772f53aee27268a8a92f555d4cb17 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 15 May 2025 15:03:47 +0800 Subject: [PATCH 21/28] update --- autotest/tools/pipeline/llm_case.py | 2 +- autotest/utils/pipeline_chat.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index d229e68870..8f70c376ac 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -7,7 +7,7 @@ from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline from lmdeploy.utils import is_bf16_supported -gen_config = GenerationConfig(max_new_tokens=500) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=1) def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = {}): diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index ed3f1b730d..f5bc2f9d0a 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -93,7 +93,7 @@ def run_pipeline_vl_chat_test(config, if str(config.get('env_tag')) == '3090': if extra is None: extra = {} - extra['cache-max-entry-count'] = 0.7 + extra['cache-max-entry-count'] = 0.5 if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) From 3223a19e46d260dea090d11712151247e0c642f5 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 15 May 2025 15:05:45 +0800 Subject: [PATCH 22/28] update --- autotest/utils/pipeline_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/utils/pipeline_chat.py 
b/autotest/utils/pipeline_chat.py index f5bc2f9d0a..452801448b 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -33,7 +33,7 @@ def run_pipeline_chat_test(config, if str(config.get('env_tag')) == '3090': if extra is None: extra = {} - extra['cache-max-entry-count'] = 0.7 + extra['cache-max-entry-count'] = 0.6 if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) From 92dbd991b03e400b7c973d6db2c13bb9ff172cc5 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 15 May 2025 16:21:14 +0800 Subject: [PATCH 23/28] update --- docs/en/supported_models/supported_models.md | 1 - docs/zh_cn/supported_models/supported_models.md | 1 - 2 files changed, 2 deletions(-) diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index f2740caeff..e8d23b8163 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -109,7 +109,6 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | -| Phi-4-mini | 3.8B | LLM | Yes | Yes | No | - | - | ```{note} * [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 88d04eed93..d31b566953 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -109,7 +109,6 @@ | Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | -| Phi-4-mini | 3.8B | LLM | Yes | Yes | No | - | - | ```{note} * [1] 目前,Mono-InternVL不支持FP16,因为数值不稳定。请改用BF16 From d4408381eef4c5a65f52521ee7509154347f8902 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 15 May 2025 19:58:54 +0800 Subject: [PATCH 24/28] update --- autotest/utils/run_restful_chat.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 0bec0d2c0f..0b10fd0630 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -81,18 +81,13 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) print('reproduce command restful: ' + cmd) - with open(start_log, 'w') as f: - f.writelines('reproduce command restful: ' + cmd + '\n') + file = open(start_log, 'w') - startRes = subprocess.Popen([cmd], stdout=f, stderr=f, shell=True, text=True, encoding='utf-8') - pid = startRes.pid + startRes = subprocess.Popen([cmd], stdout=file, stderr=file, shell=True, text=True, encoding='utf-8') + pid = startRes.pid http_url = BASE_HTTP_URL + ':' + str(port) - with open(start_log, 'r') as f: - content = f.read() - print(content) start_time = int(time()) - start_timeout = 300 if not is_bf16_supported(): start_timeout = 600 @@ -105,6 +100,17 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) result = health_check(http_url) if result or total_time >= start_timeout: break + try: + # Check if process is still running + return_code = startRes.wait(timeout=1) # Small timeout to check status + if return_code != 0: + with open(start_log, 'r') as f: + content = f.read() + 
print(content) + break + except subprocess.TimeoutExpired: + continue + file.close() allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) return pid, startRes From 065df44bba477db98a08cee7cdb43ef711164a23 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Fri, 16 May 2025 10:25:19 +0800 Subject: [PATCH 25/28] update --- autotest/tools/pipeline/llm_case.py | 2 +- autotest/tools/pipeline/mllm_case.py | 2 +- autotest/utils/pipeline_chat.py | 6 ++++-- autotest/utils/run_restful_chat.py | 7 +++---- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 8f70c376ac..1e718eb9df 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -7,7 +7,7 @@ from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline from lmdeploy.utils import is_bf16_supported -gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=1) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = {}): diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 05aa3d760d..659d13f02e 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -10,7 +10,7 @@ from lmdeploy.vl.constants import IMAGE_TOKEN from lmdeploy.vl.utils import encode_image_base64 -gen_config = GenerationConfig(max_new_tokens=500) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) PIC1 = 'tiger.jpeg' PIC2 = 'human-pose.jpg' diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 452801448b..eaa43e963d 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -272,14 +272,16 @@ def internvl_vl_testcase(output_text, f, lang: str = 'en'): assert case_result, 'reason: combined images2: panda should in ' + response with allure.step(f'internvl-separate-images-{lang}'): response = get_response_from_output(output_text, f'internvl-separate-images-{lang}') - case_result = 'panda' in response.lower() or '熊猫' in response or 'same' in response.lower() + case_result = 'panda' in response.lower() or '熊猫' in response or 'same' in response.lower( + ) or 'difference' in response.lower() or 'different' in response.lower() f.writelines(f'internvl-separate-images-{lang} result: ' + str(case_result) + 'reason: separate images: panda should in ' + response + '\n') with assume: assert case_result, 'reason: separate images: panda should in ' + response with allure.step(f'internvl-separate-images2-{lang}'): response = get_response_from_output(output_text, f'internvl-separate-images2-{lang}') - case_result = 'panda' in response.lower() or '熊猫' in response or 'same' in response.lower() + case_result = 'panda' in response.lower() or '熊猫' in response or 'same' in response.lower( + ) or 'difference' in response.lower() or 'different' in response.lower() f.writelines(f'internvl-separate-images2-{lang} result: ' + str(case_result) + 'reason: separate images2: panda should in ' + response + '\n') with assume: diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 0b10fd0630..201fece932 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -32,9 +32,6 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) else: extra = '' - if str(config.get('env_tag')) == 
'3090': - extra += ' --cache-max-entry-count 0.7' if 'modelscope' in param.keys(): @@ -76,6 +73,8 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) if not is_bf16_supported(): cmd += ' --cache-max-entry-count 0.5' + if str(config.get('env_tag')) == '3090': + extra += ' --cache-max-entry-count 0.5' start_log = os.path.join(log_path, 'start_restful_' + model.split('/')[1] + worker_id + '.log') @@ -107,7 +106,7 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) with open(start_log, 'r') as f: content = f.read() print(content) - break + return 0, startRes except subprocess.TimeoutExpired: continue file.close() From f2581abd5ab3ab57b4d493cb72564c50dc833b76 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Fri, 16 May 2025 10:27:44 +0800 Subject: [PATCH 26/28] update --- autotest/tools/pipeline/llm_case.py | 4 ++-- autotest/utils/pipeline_chat.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 1e718eb9df..2de77d2bd3 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -10,7 +10,7 @@ gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) -def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = {}): +def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None): if 'pytorch' in backend_type: backend_config = PytorchEngineConfig(tp=tp) @@ -24,7 +24,7 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, if 'turbomind' in backend_type and extra is not None and 'communicator' in extra: backend_config.communicator = extra.get('communicator') - if 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: + if extra is not None and 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: backend_config.cache_max_entry_count = extra.get('cache-max-entry-count') if 'w4' in model_path or ('4bits' in model_path or 'awq' in model_path.lower()): backend_config.model_format = 'awq' if 'gptq' in model_path.lower(): diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index eaa43e963d..376b8d6b4c 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -14,7 +14,7 @@ def run_pipeline_chat_test(config, model_case, backend_type, worker_id: str = '', - extra: object = {}, + extra: object = None, use_local_model: bool = True, is_smoke: bool = False): log_path = config.get('log_path') @@ -78,7 +78,7 @@ def run_pipeline_vl_chat_test(config, model_case, backend_type, worker_id: str = '', - extra: object = {}, + extra: object = None, is_smoke: bool = False): log_path = config.get('log_path') tp = get_tp_num(config, model_case) From dedd709a6a4757fb61045e0892e796429a5c1249 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Fri, 16 May 2025 14:09:08 +0800 Subject: [PATCH 27/28] update --- autotest/utils/run_restful_chat.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 201fece932..e9789f7d66 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -293,6 +293,10 @@ def run_vl_testcase(config, port: int = DEFAULT_PORT): http_url = BASE_HTTP_URL + ':' + str(port) log_path = config.get('log_path') + model = get_model(http_url) + if model is None: + assert False, 'server not start 
correctly' + client = OpenAI(api_key='YOUR_API_KEY', base_url=http_url + '/v1') model_name = client.models.list().data[0].id From 590045f123d13bac1c5b2ec35804168ed60c9c65 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Fri, 16 May 2025 17:52:23 +0800 Subject: [PATCH 28/28] update --- autotest/utils/run_restful_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index e9789f7d66..bb6a7d3626 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -74,7 +74,7 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) if not is_bf16_supported(): cmd += ' --cache-max-entry-count 0.5' if str(config.get('env_tag')) == '3090': - extra += ' --cache-max-entry-count 0.5' + cmd += ' --cache-max-entry-count 0.5' start_log = os.path.join(log_path, 'start_restful_' + model.split('/')[1] + worker_id + '.log')