From a2dee3403ab2a9899d7003785a371af2c5ccb6e6 Mon Sep 17 00:00:00 2001
From: zhulinJulia24
Date: Mon, 12 May 2025 14:51:29 +0800
Subject: [PATCH 01/28] update

---
 .github/workflows/daily_ete_test_3090.yml | 581 ++++++++++++++++++++++
 autotest/config-3090.yaml                 |  77 +++
 2 files changed, 658 insertions(+)
 create mode 100644 .github/workflows/daily_ete_test_3090.yml
 create mode 100644 autotest/config-3090.yaml

diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml
new file mode 100644
index 0000000000..09ce3a535f
--- /dev/null
+++ b/.github/workflows/daily_ete_test_3090.yml
@@ -0,0 +1,581 @@
+name: daily_ete_test_3090
+
+on:
+  workflow_dispatch:
+    inputs:
+      repo_org:
+        required: false
+        description: 'Tested repository organization name. Default is InternLM'
+        type: string
+        default: 'InternLM/lmdeploy'
+      repo_ref:
+        required: false
+        description: 'Set branch, tag, or commit id. Default is "main"'
+        type: string
+        default: 'main'
+      backend:
+        required: true
+        description: 'Set backend testcase filter: turbomind, pytorch, or both. Default is "["turbomind", "pytorch"]"'
+        type: string
+        default: "['turbomind', 'pytorch']"
+      model:
+        required: true
+        description: 'Set testcase module filter: llm, mllm. Default contains all models'
+        type: string
+        default: "['llm','mllm']"
+      function:
+        required: true
+        description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions'
+        type: string
+        default: '["pipeline", "restful", "chat"]'
+      offline_mode:
+        required: true
+        description: 'Whether to start in offline mode; if true, you should prepare the code and whl package yourself'
+        type: boolean
+        default: false
+      regression_func:
+        required: true
+        description: 'regression functions'
+        type: string
+        default: "['quant', 'tools', 'benchmark']"
+  schedule:
+    - cron: '00 16 * * 0-4'
+
+env:
+  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
+  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
+  OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+  REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }}
+  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
+  FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}
+  TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy
+  OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
+  OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt
+  DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL
+
+jobs:
+  linux-build:
+    if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}}
+    strategy:
+      matrix:
+        pyver: [py310]
+    runs-on: ubuntu-latest
+    env:
+      PYTHON_VERSION: ${{ matrix.pyver }}
+      PLAT_NAME: manylinux2014_x86_64
+      DOCKER_TAG: cuda12.1
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Build
+        run: |
+          echo ${PYTHON_VERSION}
+          echo ${PLAT_NAME}
+          echo ${DOCKER_TAG}
+          echo ${OUTPUT_FOLDER}
+          echo ${GITHUB_RUN_ID}
+          # remove -it
+          sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
+          bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error
+          path: builder/manywheel/${{
env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: [self-hosted, 3090-r1] + timeout-minutes: 50 + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: mv autotest/config-3090.yml autotest/config.yml && rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: [self-hosted, 3090-r1] + timeout-minutes: 150 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-*.whl + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - quantization w8a8 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'pytorch') + run: | + pytest autotest/tools/quantization/test_quantization_w8a8.py --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: [self-hosted, 3090-r1] + needs: test_quantization + timeout-minutes: 300 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat + include: + - backend: turbomind + model: llm + function: local_case + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources/lora:/root/lora + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-*.whl + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + rm -rf ${{env.DEEPSEEK_VL}}/build + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + cp -r /root/lora . + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + - name: Test lmdeploy - pipeline + continue-on-error: true + if: matrix.function == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.function == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + - name: Test lmdeploy - local testcase + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'local_case' + run: | + pytest autotest/toolchain --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: [self-hosted, 3090-r1] + needs: test_quantization + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + timeout-minutes: 60 + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-*.whl + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api turbomind + if: matrix.backend == 'turbomind' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm3-8b-instruct > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 120s + - name: Start restful api pytorch + if: matrix.backend == 'pytorch' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm3-8b-instruct --backend pytorch > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 180s + - name: Test lmdeploy - restful api + timeout-minutes: 75 + run: | + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}}/interface-${{matrix.backend}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Start restful api turbomind - base + if: matrix.backend == 'turbomind' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm3-8b-instruct > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 120s + - name: Start restful api pytorch - base + if: matrix.backend == 'pytorch' + run: | + lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm3-8b-instruct --backend pytorch > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 180s + - name: Test lmdeploy - restful api - base + timeout-minutes: 40 + run: | + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}}/interface-${{matrix.backend}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_benchmark: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + runs-on: [self-hosted, 3090-r1] + needs: test_quantization + timeout-minutes: 120 + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-*.whl + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test benchmark script + run: | + pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_evaluation: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + runs-on: [self-hosted, 3090-r1] + needs: test_quantization + timeout-minutes: 120 # 2hours + strategy: + fail-fast: false + matrix: + evaluate_type: ['chat', 'base'] + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-*.whl + python3 -m pip install sentence_transformers==2.2.2 --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt + python3 -m pip install -e . + echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . 
+ - name: Evaluate models + if: matrix.evaluate_type == 'chat' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_internlm2_chat_20b, pytorch_internlm2_chat_20b, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + - name: Evaluate base models + if: matrix.evaluate_type == 'base' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_5_14b, turbomind_internlm2_5_7b_batch1]" "[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + + get_benchmark_result: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + needs: [test_benchmark] + timeout-minutes: 5 + runs-on: [self-hosted, 3090-r1] + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + env: + BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + steps: + - name: Clone repository + uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Get overview + run: | + pip install pandas fire mmengine + python3 .github/scripts/action_tools.py generate_benchmark_report $BENCHMARK_REPORT_DIR + + + get_coverage_report: + if: ${{!cancelled()}} + runs-on: [self-hosted, 3090-r1] + needs: [test_tools, test_restful, test_benchmark] + timeout-minutes: 5 + container: + image: openmmlab/lmdeploy:latest + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml new file mode 100644 index 0000000000..80a9a66703 --- /dev/null +++ b/autotest/config-3090.yaml @@ -0,0 +1,77 @@ +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json + +tp_config: + internlm2-chat-20b: 2 + + +turbomind_chat_model: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + +pytorch_chat_model: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + +turbomind_vl_model: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + +pytorch_vl_model: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + +turbomind_base_model: + - internlm/internlm3-8b-instruct + - Qwen/Qwen3-8B + +pytorch_base_model: + - internlm/internlm3-8b-instruct + - Qwen/Qwen3-8B + +turbomind_quatization: + no_awq: + - meta-llama/Meta-Llama-3-1-70B-Instruct + gptq: + no_kvint4: + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Chat + +pytorch_quatization: + awq: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + w8a8: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - Qwen/Qwen3-8B + no_kvint4: + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat From 53f824cbb0a56b01765aef7027dfb35e47c07ca1 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 15:10:38 +0800 Subject: [PATCH 02/28] update --- .github/workflows/daily_ete_test_3090.yml | 51 +++-------------------- autotest/config-3090.yaml | 6 +++ 2 files changed, 11 insertions(+), 46 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 09ce3a535f..af7303d49c 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -140,10 +140,8 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models - - /mnt/shared:/mnt/shared - - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /data1:/data1 - 
/usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts @@ -151,9 +149,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - # manually install flash attn - # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-*.whl python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -210,10 +205,6 @@ jobs: - backend: pytorch model: mllm function: chat - include: - - backend: turbomind - model: llm - function: local_case env: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub @@ -223,11 +214,8 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - - /nvme/github-actions/resources/lora:/root/lora - /nvme/qa_test_models:/nvme/qa_test_models - - /mnt/shared:/mnt/shared - - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /data1:/data1 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts @@ -235,9 +223,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - # manually install flash attn - # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-*.whl python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -274,13 +259,6 @@ jobs: run: | pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - - name: Test lmdeploy - local testcase - if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'local_case' - run: | - pytest autotest/toolchain --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() run: | @@ -305,7 +283,6 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: @@ -314,9 +291,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - # manually install flash attn - # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-*.whl python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -395,7 +369,6 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: @@ -404,9 +377,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - # manually install flash attn - # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-*.whl python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | @@ -446,18 +416,17 @@ jobs: strategy: fail-fast: false matrix: - evaluate_type: ['chat', 'base'] + evaluate_type: ['chat'] container: image: openmmlab/lmdeploy:latest options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - /nvme/github-actions/resources:/root/resources - /nvme/github-actions/opencompass-data:/root/opencompass-data - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - /nvme/qa_test_models:/nvme/qa_test_models - - /mnt/shared:/mnt/shared + - /data1:/data1 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Copy repository and Artifacts @@ -465,9 +434,6 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - # manually install flash attn - # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-*.whl python3 -m pip install sentence_transformers==2.2.2 --no-deps python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy @@ -501,13 +467,7 @@ jobs: run: | export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_internlm2_chat_20b, pytorch_internlm2_chat_20b, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, turbomind_qwen2_5_7b_instruct, pytorch_qwen2_5_7b_instruct, turbomind_llama2_7b_chat, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, pytorch_gemma_2_27b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - - name: Evaluate base models - if: matrix.evaluate_type == 'base' - run: | - export LMDEPLOY_DIR=$(pwd) - - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_5_14b, turbomind_internlm2_5_7b_batch1]" "[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - name: Clear workspace if: always() run: | @@ -553,7 +513,6 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml index 80a9a66703..51d16a6a7f 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -75,3 +75,9 @@ pytorch_quatization: - Qwen/Qwen2.5-VL-7B-Instruct no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat + +benchmark_model: + - meta-llama/Llama-3.2-3B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - Qwen/Qwen3-8B From 3c79d380433f9eb1394a578a1d99cd63b9d1b51a Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 16:34:38 +0800 Subject: [PATCH 03/28] update --- .github/workflows/daily_ete_test_3090.yml | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index af7303d49c..b7c29d8bcf 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -52,7 +52,6 @@ env: TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt - DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL jobs: linux-build: @@ -110,10 +109,10 @@ jobs: ref: ${{github.event.inputs.repo_ref || 'main'}} - 
name: Copy repository if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: mv autotest/config-3090.yml autotest/config.yml && rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yml ${{env.TEST_CODE_PATH}}/autotest/config.yml - name: Copy repository - offline if: ${{inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yml ${{env.TEST_CODE_PATH}}/autotest/config.yml - name: Download Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 @@ -154,7 +153,6 @@ jobs: run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Check env run: | pip uninstall -y nvidia-nccl-cu11 @@ -228,8 +226,6 @@ jobs: run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - rm -rf ${{env.DEEPSEEK_VL}}/build - pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Check env run: | pip uninstall -y nvidia-nccl-cu11 @@ -296,7 +292,6 @@ jobs: run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Check env run: | pip uninstall -y nvidia-nccl-cu11 @@ -382,7 +377,6 @@ jobs: run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Check env run: | pip uninstall -y nvidia-nccl-cu11 @@ -434,19 +428,16 @@ jobs: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | - python3 -m pip install sentence_transformers==2.2.2 --no-deps python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Install opencompass run: | git clone --depth=1 https://github.com/open-compass/opencompass.git cd opencompass - cp /nvme/qa_test_models/offline_pkg/requirements-oc.txt requirements/runtime.txt - python3 -m pip install -e . 
+ python3 -m pip install -e ".[full]" echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - name: Check env run: | From b20758df1b5437aa12e8a4fb02a10b0c8333355e Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 16:42:44 +0800 Subject: [PATCH 04/28] update --- .github/workflows/daily_ete_test_3090.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index b7c29d8bcf..4c206fbbcc 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -99,6 +99,7 @@ jobs: options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models + - /data1:/data1 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Clone repository From 8a86f10609de6b5096f38b0346f70a2b3933ebf4 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 16:51:53 +0800 Subject: [PATCH 05/28] update --- .github/workflows/daily_ete_test_3090.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 4c206fbbcc..5112c8a7e9 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -110,10 +110,10 @@ jobs: ref: ${{github.event.inputs.repo_ref || 'main'}} - name: Copy repository if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yml ${{env.TEST_CODE_PATH}}/autotest/config.yml + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml - name: Copy repository - offline if: ${{inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yml ${{env.TEST_CODE_PATH}}/autotest/config.yml + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} && mv ${{env.TEST_CODE_PATH}}/autotest/config-3090.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml - name: Download Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 From ba8d937b994bb74a0dae402c9560ef903d443711 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 16:56:08 +0800 Subject: [PATCH 06/28] update --- .github/workflows/daily_ete_test_3090.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 5112c8a7e9..27837152cb 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -187,7 +187,7 @@ jobs: chmod -R 777 $workdir test_tools: - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization timeout-minutes: 300 @@ -267,7 +267,7 @@ jobs: chmod -R 777 $workdir test_restful: - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization strategy: @@ -356,7 +356,7 @@ jobs: chmod -R 777 $workdir test_benchmark: - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization timeout-minutes: 120 @@ -404,7 +404,7 @@ jobs: chmod -R 777 $workdir test_evaluation: - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization timeout-minutes: 120 # 2hours @@ -471,7 +471,7 @@ jobs: get_benchmark_result: - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} needs: [test_benchmark] timeout-minutes: 5 runs-on: [self-hosted, 3090-r1] @@ -496,7 +496,7 @@ jobs: get_coverage_report: - if: ${{!cancelled()}} + if: ${{!cancelled() && success()}} runs-on: [self-hosted, 3090-r1] needs: [test_tools, test_restful, test_benchmark] timeout-minutes: 5 From 5f978e3e119707ca5463f5cff3c8ed65a04150b8 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 17:56:46 +0800 Subject: [PATCH 07/28] update --- .github/workflows/daily_ete_test_3090.yml | 154 +----------------- autotest/config-3090.yaml | 5 +- .../quantization/test_quantization_awq.py | 1 + 3 files changed, 11 insertions(+), 149 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 27837152cb..018af585e8 100644 --- 
a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -37,7 +37,7 @@ on: required: true description: 'regression functions' type: string - default: "['quant', 'tools', 'benchmark']" + default: "['quant', 'tools']" schedule: - cron: '00 16 * * 0-4' @@ -128,7 +128,7 @@ jobs: test_quantization: needs: download_pkgs - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} runs-on: [self-hosted, 3090-r1] timeout-minutes: 150 env: @@ -168,7 +168,7 @@ jobs: continue-on-error: true if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true @@ -242,19 +242,19 @@ jobs: continue-on-error: true if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' run: | - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - name: Test lmdeploy - pipeline continue-on-error: true if: matrix.function == 'pipeline' run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - name: Test lmdeploy - restful continue-on-error: true if: matrix.function == 'restful' run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and test_3090' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - name: Clear workfile if: always() @@ -355,150 +355,10 @@ jobs: mkdir $workdir chmod -R 777 $workdir - test_benchmark: - if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} - runs-on: [self-hosted, 3090-r1] - needs: test_quantization - timeout-minutes: 120 - container: - image: openmmlab/lmdeploy:latest - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - 
/nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/qa_test_models:/nvme/qa_test_models - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . - - name: Install lmdeploy - dependency - run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - - name: Install lmdeploy - run: | - python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Check env - run: | - pip uninstall -y nvidia-nccl-cu11 - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - - name: Test benchmark script - run: | - pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir - - test_evaluation: - if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} - runs-on: [self-hosted, 3090-r1] - needs: test_quantization - timeout-minutes: 120 # 2hours - strategy: - fail-fast: false - matrix: - evaluate_type: ['chat'] - container: - image: openmmlab/lmdeploy:latest - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/opencompass-data:/root/opencompass-data - - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - - /nvme/qa_test_models:/nvme/qa_test_models - - /data1:/data1 - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . - - name: Install lmdeploy - dependency - run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - - name: Install lmdeploy - run: | - python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Install opencompass - run: | - git clone --depth=1 https://github.com/open-compass/opencompass.git - cd opencompass - python3 -m pip install -e ".[full]" - echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - - name: Check env - run: | - pip uninstall -y nvidia-nccl-cu11 - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - rm -rf /nvme/qa_test_models/autotest_model/log/* - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - - name: Setup paths for evaluation - run: | - ln -s /root/opencompass-data ./data - python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . 
- - name: Evaluate models - if: matrix.evaluate_type == 'chat' - run: | - export LMDEPLOY_DIR=$(pwd) - - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm3_8b_instruct, pytorch_internlm3_8b_instruct]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - - name: Clear workspace - if: always() - run: | - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir - - - get_benchmark_result: - if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} - needs: [test_benchmark] - timeout-minutes: 5 - runs-on: [self-hosted, 3090-r1] - container: - image: openmmlab/lmdeploy:latest - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/qa_test_models:/nvme/qa_test_models - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - env: - BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} - steps: - - name: Clone repository - uses: actions/checkout@v2 - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Get overview - run: | - pip install pandas fire mmengine - python3 .github/scripts/action_tools.py generate_benchmark_report $BENCHMARK_REPORT_DIR - - get_coverage_report: if: ${{!cancelled() && success()}} runs-on: [self-hosted, 3090-r1] - needs: [test_tools, test_restful, test_benchmark] + needs: [test_tools, test_restful] timeout-minutes: 5 container: image: openmmlab/lmdeploy:latest diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml index 51d16a6a7f..875ab03f17 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -6,8 +6,8 @@ benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json tp_config: - internlm2-chat-20b: 2 - + empty: 2 + turbomind_chat_model: - meta-llama/Llama-3.2-3B-Instruct @@ -49,6 +49,7 @@ turbomind_quatization: no_awq: - meta-llama/Meta-Llama-3-1-70B-Instruct gptq: + - empty no_kvint4: - Qwen/Qwen3-8B - Qwen/Qwen2.5-7B-Instruct diff --git a/autotest/tools/quantization/test_quantization_awq.py b/autotest/tools/quantization/test_quantization_awq.py index d1d948c1ae..7552e6e2aa 100644 --- a/autotest/tools/quantization/test_quantization_awq.py +++ b/autotest/tools/quantization/test_quantization_awq.py @@ -7,6 +7,7 @@ @pytest.mark.order(3) +@pytest.mark.test_3090 @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('awq')) def test_quantization_awq(config, model, worker_id): From a92539a7a9c66da9a3c974386a05e60a2a166f50 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 18:03:11 +0800 Subject: [PATCH 08/28] update --- autotest/tools/chat/test_command_chat_hf_pytorch.py | 4 ++++ autotest/tools/chat/test_command_chat_hf_turbomind.py | 4 ++++ autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py | 3 +++ autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py | 3 +++ autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py | 3 +++ autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py | 3 +++ autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py | 3 +++ autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py | 3 +++ autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py | 3 +++ 
autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py | 3 +++ 10 files changed, 32 insertions(+) diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 8bcc00d3e5..a60790914c 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -10,6 +10,7 @@ @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1)) def test_hf_pytorch_chat_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -69,6 +70,7 @@ def test_hf_pytorch_chat_tp4(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4)) def test_hf_pytorch_chat_kvin4_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -131,6 +133,7 @@ def test_hf_pytorch_chat_kvin4_tp4(config, model, cli_case_config, worker_id): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8)) def test_hf_pytorch_chat_kvin8_tp1(config, model, cli_case_config, worker_id): usercase = 'chat_testcase' @@ -192,6 +195,7 @@ def test_hf_pytorch_chat_kvin8_tp4(config, model, cli_case_config, worker_id): @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat +@pytest.mark.test_3090 @pytest.mark.gpu_num_1 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, model_type='base_model')) def test_hf_pytorch_base_tp1(config, model, cli_case_config, worker_id): diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 2a7a6a36b6..70e367d235 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -10,6 +10,7 @@ @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_tp1(config, model, communicator, cli_case_config, worker_id): @@ -78,6 +79,7 @@ def test_hf_turbomind_chat_tp4(config, model, communicator, cli_case_config, wor @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4)) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_kvint4_tp1(config, model, communicator, cli_case_config, worker_id): @@ -146,6 +148,7 @@ def test_hf_turbomind_chat_kvint4_tp4(config, model, communicator, cli_case_conf @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8)) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_chat_kvint8_tp1(config, model, communicator, cli_case_config, worker_id): @@ -314,6 +317,7 @@ def 
test_hf_turbomind_chat_fallback_backend_kvint8_tp2(config, model, communicat @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='base_model')) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_hf_turbomind_base_tp1(config, model, communicator, cli_case_config, worker_id): diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index f1d67c113e..4e727d6c97 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -9,6 +9,7 @@ @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, exclude_dup=True)) def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): @@ -47,6 +48,7 @@ def test_pipeline_chat_pytorch_tp4(config, common_case_config, model, worker_id) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4, exclude_dup=True)) def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): @@ -85,6 +87,7 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8, exclude_dup=True)) def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, worker_id): diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py index 7115926116..a65465fe0c 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -12,6 +12,7 @@ @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, model_type='vl_model')) def test_pipeline_chat_tp1(config, model, worker_id): if 'gw' in worker_id: @@ -47,6 +48,7 @@ def test_pipeline_chat_tp4(config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp1(config, model, worker_id): if 'gw' in worker_id: @@ -82,6 +84,7 @@ def test_pipeline_chat_kvint4_tp4(config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_torch_model_list(tp_num=1, quant_policy=8, model_type='vl_model')) def test_pipeline_chat_kvint8_tp1(config, model, worker_id): if 'gw' in worker_id: diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 8e4734f386..e4f18d4690 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ 
b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -9,6 +9,7 @@ @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) @pytest.mark.parametrize('communicator', get_communicator_list()) @@ -50,6 +51,7 @@ def test_pipeline_chat_tp4(config, common_case_config, model, communicator, work @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4)) @pytest.mark.parametrize('communicator', get_communicator_list()) @@ -103,6 +105,7 @@ def test_pipeline_chat_kvint4_tp4(config, common_case_config, model, communicato @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8)) @pytest.mark.parametrize('communicator', get_communicator_list()) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index a245a12bcb..09b16b7656 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -12,6 +12,7 @@ @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, model_type='vl_model')) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_tp1(config, model, communicator, worker_id): @@ -53,6 +54,7 @@ def test_pipeline_chat_tp4(config, model, communicator, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=4, model_type='vl_model')) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint4_tp1(config, model, communicator, worker_id): @@ -100,6 +102,7 @@ def test_pipeline_chat_kvint4_tp4(config, model, communicator, worker_id): @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1, quant_policy=8, model_type='vl_model')) @pytest.mark.parametrize('communicator', get_communicator_list()) def test_pipeline_chat_kvint8_tp1(config, model, communicator, worker_id): diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index 9cbd769a6d..d29d9f526d 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -28,6 +28,7 @@ def getModelList(tp_num): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -73,6 +74,7 @@ def getKvintModelList(tp_num, quant_policy): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 
@pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True) def test_restful_chat_kvint4_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -109,6 +111,7 @@ def test_restful_chat_kvint4_tp4(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True) def test_restful_chat_kvint8_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py index 573d32e5b0..82d7a7bf7a 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py @@ -28,6 +28,7 @@ def getModelList(tp_num): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -70,6 +71,7 @@ def getKvintModelList(tp_num, quant_policy: int = None): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True) def test_restful_chat_kvint4_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -103,6 +105,7 @@ def test_restful_chat_kvint4_tp4(config, worker_id): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True) def test_restful_chat_kvint8_tp1(config, worker_id): if get_workerid(worker_id) is None: diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index 7597af89cf..490f2a2507 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -33,6 +33,7 @@ def getModelList(tp_num): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -81,6 +82,7 @@ def getKvintModelList(tp_num, quant_policy): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True) def test_restful_chat_kvint4_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: @@ -117,6 +119,7 @@ def test_restful_chat_kvint4_tp4(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True) def test_restful_chat_kvint8_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: diff --git 
a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py index 728d8e94c0..cbe530d65c 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py @@ -32,6 +32,7 @@ def getModelList(tp_num): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getModelList(tp_num=1), indirect=True) def test_restful_chat_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -77,6 +78,7 @@ def getKvintModelList(tp_num, quant_policy: int = None): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=4), indirect=True) def test_restful_chat_kvint4_tp1(config, worker_id): if get_workerid(worker_id) is None: @@ -110,6 +112,7 @@ def test_restful_chat_kvint4_tp4(config, worker_id): @pytest.mark.order(7) @pytest.mark.restful_api_vl @pytest.mark.gpu_num_1 +@pytest.mark.test_3090 @pytest.mark.parametrize('prepare_environment', getKvintModelList(tp_num=1, quant_policy=8), indirect=True) def test_restful_chat_kvint8_tp1(config, worker_id): if get_workerid(worker_id) is None: From 000e2679065ffa4c755b8d6facbaf36d3e2842c1 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 18:09:01 +0800 Subject: [PATCH 09/28] update --- .github/workflows/daily_ete_test_3090.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 018af585e8..d1a56e7b69 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -128,7 +128,7 @@ jobs: test_quantization: needs: download_pkgs - if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + if: ${{!cancelled() && contains(needs.download_pkgs.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} runs-on: [self-hosted, 3090-r1] timeout-minutes: 150 env: @@ -187,7 +187,7 @@ jobs: chmod -R 777 $workdir test_tools: - if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + if: ${{!cancelled() && contains(needs.test_quantization.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization timeout-minutes: 300 @@ -267,7 +267,7 @@ jobs: chmod -R 777 $workdir test_restful: - if: ${{!cancelled() && success() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + if: ${{!cancelled() && contains(needs.test_quantization.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization strategy: From 49179cf26d604d60dc5465ed970b997b225ee624 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 19:09:49 +0800 Subject: [PATCH 10/28] update --- autotest/utils/quantization_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/autotest/utils/quantization_utils.py 
b/autotest/utils/quantization_utils.py index 6560bb55fa..aad5d682af 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -20,17 +20,20 @@ def quantization(config, if quantization_type == 'awq': quantization_cmd = ' '.join( - [cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path, '--work-dir', quantization_model_path]) + ['lmdeploy lite auto_awq', origin_model_path, '--work-dir', quantization_model_path]) elif quantization_type == 'gptq': quantization_cmd = ' '.join( - [cuda_prefix, 'lmdeploy lite auto_gptq', origin_model_path, '--work-dir', quantization_model_path]) + ['lmdeploy lite auto_gptq', origin_model_path, '--work-dir', quantization_model_path]) elif quantization_type == 'w8a8': quantization_cmd = ' '.join( - [cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path, '--work-dir', quantization_model_path]) + ['lmdeploy lite smooth_quant', origin_model_path, '--work-dir', quantization_model_path]) else: return False, 'quantization type should in [awq, gptq, w8a8], \ now the type is ' + quantization_type + if cuda_prefix is not None: + quantization_cmd = ' '.join([cuda_prefix, quantization_cmd]) + if 'llama-3' in origin_model_name.lower(): quantization_cmd += ' --search-scale' From 37aca3dd73d5fd410aa402f1906d86c1664316c5 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Mon, 12 May 2025 19:11:46 +0800 Subject: [PATCH 11/28] update --- .github/workflows/daily_ete_test_3090.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index d1a56e7b69..8ca86a4fd9 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -187,7 +187,7 @@ jobs: chmod -R 777 $workdir test_tools: - if: ${{!cancelled() && contains(needs.test_quantization.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization timeout-minutes: 300 @@ -232,7 +232,6 @@ jobs: pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env - cp -r /root/lora . 
rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* @@ -267,7 +266,7 @@ jobs: chmod -R 777 $workdir test_restful: - if: ${{!cancelled() && contains(needs.test_quantization.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + if: ${{!cancelled() && !contains(needs.test_quantization.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} runs-on: [self-hosted, 3090-r1] needs: test_quantization strategy: From d8ba44dc9e72e78da2421296082b14edd6440e60 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 13:30:28 +0800 Subject: [PATCH 12/28] update --- .github/workflows/daily_ete_test_3090.yml | 2 +- autotest/utils/config_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index 8ca86a4fd9..337c15eeca 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -44,7 +44,7 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} + OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index f096dbe112..9aa1b3f8e5 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -92,7 +92,7 @@ def get_quantization_model_list(type): config = get_config() if type == 'awq': case_list = [ - x for x in config.get('turbomind_chat_model') + config.get('turbomind_base_model') + x for x in list(set(config.get('turbomind_chat_model')) + set(config.get('turbomind_base_model'))) if x not in config.get('turbomind_quatization').get('no_awq') and not is_quantization_model(x) ] for key in config.get('pytorch_quatization').get('awq'): From 91faa59684f5a913e635b0fac02242403dbfcd6f Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 13:37:47 +0800 Subject: [PATCH 13/28] update --- autotest/utils/config_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 9aa1b3f8e5..c13325e58e 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -3,6 +3,7 @@ import yaml from utils.get_run_config import get_tp_num +from collections import OrderedDict from lmdeploy.utils import is_bf16_supported @@ -92,7 +93,7 @@ def get_quantization_model_list(type): config = get_config() if type == 'awq': case_list = [ - x for x in list(set(config.get('turbomind_chat_model')) + set(config.get('turbomind_base_model'))) + x for x in list(OrderedDict.fromkeys(config.get('turbomind_chat_model') + config.get('turbomind_base_model'))) if x not in config.get('turbomind_quatization').get('no_awq') and not is_quantization_model(x) ] for key in config.get('pytorch_quatization').get('awq'): From 7961a27aa2f9442894cefafd07d52e26ed0b720b Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 14:26:08 +0800 Subject: [PATCH 14/28] update --- autotest/config-3090.yaml | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/autotest/config-3090.yaml 
b/autotest/config-3090.yaml index 875ab03f17..6c44ef8af3 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -47,7 +47,10 @@ pytorch_base_model: turbomind_quatization: no_awq: - - meta-llama/Meta-Llama-3-1-70B-Instruct + - OpenGVLab/InternVL3-8B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct gptq: - empty no_kvint4: @@ -61,24 +64,13 @@ pytorch_quatization: awq: - meta-llama/Llama-3.2-3B-Instruct - internlm/internlm3-8b-instruct - - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL2_5-1B - - Qwen/Qwen3-8B - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct w8a8: - meta-llama/Llama-3.2-3B-Instruct - internlm/internlm3-8b-instruct - - Qwen/Qwen3-8B no_kvint4: - Qwen/Qwen3-8B - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-VL-7B-Instruct no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat - -benchmark_model: - - meta-llama/Llama-3.2-3B-Instruct - - internlm/internlm3-8b-instruct - - OpenGVLab/InternVL3-8B - - Qwen/Qwen3-8B From 7204e944146b94e3a06cd4b28662f59ec177bb41 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 14:28:35 +0800 Subject: [PATCH 15/28] update --- autotest/utils/config_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index c13325e58e..c53e33bf0f 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -1,9 +1,9 @@ import copy import os +from collections import OrderedDict import yaml from utils.get_run_config import get_tp_num -from collections import OrderedDict from lmdeploy.utils import is_bf16_supported @@ -93,7 +93,8 @@ def get_quantization_model_list(type): config = get_config() if type == 'awq': case_list = [ - x for x in list(OrderedDict.fromkeys(config.get('turbomind_chat_model') + config.get('turbomind_base_model'))) + x + for x in list(OrderedDict.fromkeys(config.get('turbomind_chat_model') + config.get('turbomind_base_model'))) if x not in config.get('turbomind_quatization').get('no_awq') and not is_quantization_model(x) ] for key in config.get('pytorch_quatization').get('awq'): From 3e52598e73ca7ab8a372995ae688f383b607b8da Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 16:07:17 +0800 Subject: [PATCH 16/28] update --- autotest/config-3090.yaml | 2 +- autotest/config.yaml | 1 + autotest/tools/pipeline/llm_case.py | 5 ++++- autotest/tools/pipeline/mllm_case.py | 3 +++ autotest/utils/pipeline_chat.py | 5 ++++- autotest/utils/run_client_chat.py | 3 +++ autotest/utils/run_restful_chat.py | 13 ++++++++----- 7 files changed, 24 insertions(+), 8 deletions(-) diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml index 6c44ef8af3..7bad2d3daf 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -4,11 +4,11 @@ dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: 3090 tp_config: empty: 2 - turbomind_chat_model: - meta-llama/Llama-3.2-3B-Instruct - internlm/internlm3-8b-instruct diff --git a/autotest/config.yaml b/autotest/config.yaml index c3f66c4d12..bc82f1d012 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -4,6 +4,7 @@ dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: 
/nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: a100 tp_config: internlm2-chat-20b: 2 diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 98b73323b4..d229e68870 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -10,7 +10,7 @@ gen_config = GenerationConfig(max_new_tokens=500) -def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None): +def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = {}): if 'pytorch' in backend_type: backend_config = PytorchEngineConfig(tp=tp) @@ -24,6 +24,9 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, if 'turbomind' in backend_type and extra is not None and 'communicator' in extra: backend_config.communicator = extra.get('communicator') + if 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: + backend_config.cache_max_entry_count = extra.get('cache-max-entry-count') + if 'w4' in model_path or ('4bits' in model_path or 'awq' in model_path.lower()): backend_config.model_format = 'awq' if 'gptq' in model_path.lower(): diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 7d2b44bf9d..05aa3d760d 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -33,6 +33,9 @@ def run_pipeline_mllm_test(model_path, resource_path, tp, backend_type, is_pr_te if 'turbomind' in backend_type and extra is not None and 'communicator' in extra: backend_config.communicator = extra.get('communicator') + if 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: + backend_config.cache_max_entry_count = extra.get('cache-max-entry-count') + if 'w4' in model_path or ('4bits' in model_path or 'awq' in model_path.lower()): backend_config.model_format = 'awq' if not is_bf16_supported(): diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 9b3f78caae..01dd941b1a 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -14,7 +14,7 @@ def run_pipeline_chat_test(config, model_case, backend_type, worker_id: str = '', - extra: object = None, + extra: object = {}, use_local_model: bool = True, is_smoke: bool = False): log_path = config.get('log_path') @@ -30,6 +30,9 @@ def run_pipeline_chat_test(config, log_path, '_'.join(['pipeline', 'chat', backend_type, worker_id, model_case.split('/')[1] + '.log'])) + if str(config.get('env_tag')) == '3090': + extra['cache_max_entry_count'] = 0.7 + if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) extra = extra.replace(' ', '').replace('"', '\\"').replace(',', '\\,') diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index e2d1a5a33a..fc66eaf13d 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -49,6 +49,9 @@ def hf_command_line_test(config, else: model_path = model_case + if str(config.get('env_tag')) == '3090': + extra += ' --cache-max-entry-count 0.7' + cmd = get_command_with_extra(' '.join(['lmdeploy chat', model_path, '--backend', type, extra, '--session-len 4096']), config, diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index d9c601d2d8..0bec0d2c0f 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -30,7 +30,10 @@ def start_restful_api(config, 
param, model, model_path, backend_type, worker_id) if 'extra' in param.keys(): extra = param['extra'] else: - extra = None + extra = '' + + if str(config.get('env_tag')) == '3090': + extra += ' --cache-max-entry-count 0.7' if 'modelscope' in param.keys(): modelscope = param['modelscope'] @@ -85,8 +88,8 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) pid = startRes.pid http_url = BASE_HTTP_URL + ':' + str(port) - with open(start_log, 'r') as file: - content = file.read() + with open(start_log, 'r') as f: + content = f.read() print(content) start_time = int(time()) @@ -196,10 +199,10 @@ def interactive_test(config, case, case_info, model, url, worker_id: str = ''): interactive_log = os.path.join(log_path, 'interactive_' + model + worker_id + '_' + case + '.log') - file = open(interactive_log, 'w') - result = True + file = open(interactive_log, 'w') + api_client = APIClient(url) file.writelines('available_models:' + ','.join(api_client.available_models) + '\n') From 7cfd5675cbddcae0b923b9bb06192173d9eabbf6 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 17:20:38 +0800 Subject: [PATCH 17/28] update --- autotest/config-3090.yaml | 13 +++++++++---- autotest/utils/quantization_utils.py | 2 ++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml index 7bad2d3daf..dca79c230a 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -47,13 +47,12 @@ pytorch_base_model: turbomind_quatization: no_awq: - - OpenGVLab/InternVL3-8B - - Qwen/Qwen3-8B - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct + - empty gptq: - empty no_kvint4: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B - Qwen/Qwen3-8B - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-VL-7B-Instruct @@ -64,11 +63,17 @@ pytorch_quatization: awq: - meta-llama/Llama-3.2-3B-Instruct - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct w8a8: - meta-llama/Llama-3.2-3B-Instruct - internlm/internlm3-8b-instruct no_kvint4: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-1B - Qwen/Qwen3-8B - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-VL-7B-Instruct diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index aad5d682af..3606e0bbbb 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -39,6 +39,8 @@ def quantization(config, if not is_bf16_supported() or quantization_type == 'gptq': quantization_cmd += ' --batch-size 8' + elif str(config.get('env_tag')) == '3090': + quantization_cmd += ' --batch-size 8' else: quantization_cmd += ' --batch-size 32' From 5df050461115d5ac8c03a1057be806e0eb18a152 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Wed, 14 May 2025 18:35:28 +0800 Subject: [PATCH 18/28] update --- autotest/config-3090.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml index dca79c230a..5251f447ea 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -67,7 +67,6 @@ pytorch_quatization: - OpenGVLab/InternVL2_5-1B - Qwen/Qwen3-8B - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct w8a8: - meta-llama/Llama-3.2-3B-Instruct - internlm/internlm3-8b-instruct From 16f46b29242d91c68566e53804800200a937b423 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 15 May 2025 14:12:55 +0800 Subject: [PATCH 19/28] update --- 
autotest/config-3090.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/autotest/config-3090.yaml b/autotest/config-3090.yaml index 5251f447ea..791e2dc576 100644 --- a/autotest/config-3090.yaml +++ b/autotest/config-3090.yaml @@ -33,8 +33,6 @@ turbomind_vl_model: pytorch_vl_model: - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL2_5-1B - - Qwen/Qwen3-8B - - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-VL-7B-Instruct turbomind_base_model: From 03ebb243baedfc5bf32fe71e99938d52e114d616 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 15 May 2025 14:18:19 +0800 Subject: [PATCH 20/28] update --- autotest/utils/pipeline_chat.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 01dd941b1a..ed3f1b730d 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -31,7 +31,9 @@ def run_pipeline_chat_test(config, model_case.split('/')[1] + '.log'])) if str(config.get('env_tag')) == '3090': - extra['cache_max_entry_count'] = 0.7 + if extra is None: + extra = {} + extra['cache-max-entry-count'] = 0.7 if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) @@ -76,7 +78,7 @@ def run_pipeline_vl_chat_test(config, model_case, backend_type, worker_id: str = '', - extra: object = None, + extra: object = {}, is_smoke: bool = False): log_path = config.get('log_path') tp = get_tp_num(config, model_case) @@ -88,6 +90,11 @@ def run_pipeline_vl_chat_test(config, log_path, '_'.join(['pipeline', 'mllm', backend_type, worker_id, model_case.split('/')[1] + '.log'])) + if str(config.get('env_tag')) == '3090': + if extra is None: + extra = {} + extra['cache-max-entry-count'] = 0.7 + if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) extra = extra.replace(' ', '').replace('"', '\\"').replace(',', '\\,') From 62e7a1cc094772f53aee27268a8a92f555d4cb17 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 15 May 2025 15:03:47 +0800 Subject: [PATCH 21/28] update --- autotest/tools/pipeline/llm_case.py | 2 +- autotest/utils/pipeline_chat.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index d229e68870..8f70c376ac 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -7,7 +7,7 @@ from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline from lmdeploy.utils import is_bf16_supported -gen_config = GenerationConfig(max_new_tokens=500) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=1) def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = {}): diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index ed3f1b730d..f5bc2f9d0a 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -93,7 +93,7 @@ def run_pipeline_vl_chat_test(config, if str(config.get('env_tag')) == '3090': if extra is None: extra = {} - extra['cache-max-entry-count'] = 0.7 + extra['cache-max-entry-count'] = 0.5 if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) From 3223a19e46d260dea090d11712151247e0c642f5 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 15 May 2025 15:05:45 +0800 Subject: [PATCH 22/28] update --- autotest/utils/pipeline_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/utils/pipeline_chat.py 
b/autotest/utils/pipeline_chat.py index f5bc2f9d0a..452801448b 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -33,7 +33,7 @@ def run_pipeline_chat_test(config, if str(config.get('env_tag')) == '3090': if extra is None: extra = {} - extra['cache-max-entry-count'] = 0.7 + extra['cache-max-entry-count'] = 0.6 if extra is not None: extra = json.dumps(extra, ensure_ascii=False, indent=None) From 92dbd991b03e400b7c973d6db2c13bb9ff172cc5 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 15 May 2025 16:21:14 +0800 Subject: [PATCH 23/28] update --- docs/en/supported_models/supported_models.md | 1 - docs/zh_cn/supported_models/supported_models.md | 1 - 2 files changed, 2 deletions(-) diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index f2740caeff..e8d23b8163 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -109,7 +109,6 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | -| Phi-4-mini | 3.8B | LLM | Yes | Yes | No | - | - | ```{note} * [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 88d04eed93..d31b566953 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -109,7 +109,6 @@ | Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | | Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | -| Phi-4-mini | 3.8B | LLM | Yes | Yes | No | - | - | ```{note} * [1] 目前,Mono-InternVL不支持FP16,因为数值不稳定。请改用BF16 From d4408381eef4c5a65f52521ee7509154347f8902 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Thu, 15 May 2025 19:58:54 +0800 Subject: [PATCH 24/28] update --- autotest/utils/run_restful_chat.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 0bec0d2c0f..0b10fd0630 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -81,18 +81,13 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) print('reproduce command restful: ' + cmd) - with open(start_log, 'w') as f: - f.writelines('reproduce command restful: ' + cmd + '\n') + file = open(start_log, 'w') - startRes = subprocess.Popen([cmd], stdout=f, stderr=f, shell=True, text=True, encoding='utf-8') - pid = startRes.pid + startRes = subprocess.Popen([cmd], stdout=file, stderr=file, shell=True, text=True, encoding='utf-8') + pid = startRes.pid http_url = BASE_HTTP_URL + ':' + str(port) - with open(start_log, 'r') as f: - content = f.read() - print(content) start_time = int(time()) - start_timeout = 300 if not is_bf16_supported(): start_timeout = 600 @@ -105,6 +100,17 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) result = health_check(http_url) if result or total_time >= start_timeout: break + try: + # Check if process is still running + return_code = startRes.wait(timeout=1) # Small timeout to check status + if return_code != 0: + with open(start_log, 'r') as f: + content = f.read() + 
print(content) + break + except subprocess.TimeoutExpired: + continue + file.close() allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) return pid, startRes From 065df44bba477db98a08cee7cdb43ef711164a23 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Fri, 16 May 2025 10:25:19 +0800 Subject: [PATCH 25/28] update --- autotest/tools/pipeline/llm_case.py | 2 +- autotest/tools/pipeline/mllm_case.py | 2 +- autotest/utils/pipeline_chat.py | 6 ++++-- autotest/utils/run_restful_chat.py | 7 +++---- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 8f70c376ac..1e718eb9df 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -7,7 +7,7 @@ from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline from lmdeploy.utils import is_bf16_supported -gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=1) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = {}): diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 05aa3d760d..659d13f02e 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -10,7 +10,7 @@ from lmdeploy.vl.constants import IMAGE_TOKEN from lmdeploy.vl.utils import encode_image_base64 -gen_config = GenerationConfig(max_new_tokens=500) +gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) PIC1 = 'tiger.jpeg' PIC2 = 'human-pose.jpg' diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 452801448b..eaa43e963d 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -272,14 +272,16 @@ def internvl_vl_testcase(output_text, f, lang: str = 'en'): assert case_result, 'reason: combined images2: panda should in ' + response with allure.step(f'internvl-separate-images-{lang}'): response = get_response_from_output(output_text, f'internvl-separate-images-{lang}') - case_result = 'panda' in response.lower() or '熊猫' in response or 'same' in response.lower() + case_result = 'panda' in response.lower() or '熊猫' in response or 'same' in response.lower( + ) or 'difference' in response.lower() or 'different' in response.lower() f.writelines(f'internvl-separate-images-{lang} result: ' + str(case_result) + 'reason: separate images: panda should in ' + response + '\n') with assume: assert case_result, 'reason: separate images: panda should in ' + response with allure.step(f'internvl-separate-images2-{lang}'): response = get_response_from_output(output_text, f'internvl-separate-images2-{lang}') - case_result = 'panda' in response.lower() or '熊猫' in response or 'same' in response.lower() + case_result = 'panda' in response.lower() or '熊猫' in response or 'same' in response.lower( + ) or 'difference' in response.lower() or 'different' in response.lower() f.writelines(f'internvl-separate-images2-{lang} result: ' + str(case_result) + 'reason: separate images2: panda should in ' + response + '\n') with assume: diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 0b10fd0630..201fece932 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -32,9 +32,6 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) else: extra = '' - if str(config.get('env_tag')) == 
'3090': - extra += ' --cache-max-entry-count 0.7' if 'modelscope' in param.keys(): @@ -76,6 +73,8 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) if not is_bf16_supported(): cmd += ' --cache-max-entry-count 0.5' + if str(config.get('env_tag')) == '3090': + extra += ' --cache-max-entry-count 0.5' start_log = os.path.join(log_path, 'start_restful_' + model.split('/')[1] + worker_id + '.log') @@ -107,7 +106,7 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) with open(start_log, 'r') as f: content = f.read() print(content) - break + return 0, startRes except subprocess.TimeoutExpired: continue file.close() From f2581abd5ab3ab57b4d493cb72564c50dc833b76 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Fri, 16 May 2025 10:27:44 +0800 Subject: [PATCH 26/28] update --- autotest/tools/pipeline/llm_case.py | 4 ++-- autotest/utils/pipeline_chat.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 1e718eb9df..2de77d2bd3 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -10,7 +10,7 @@ gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=2) -def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = {}): +def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, extra: object = None): if 'pytorch' in backend_type: backend_config = PytorchEngineConfig(tp=tp) @@ -24,7 +24,7 @@ def run_pipeline_chat_test(model_path, cases_path, tp, backend_type, is_pr_test, if 'turbomind' in backend_type and extra is not None and 'communicator' in extra: backend_config.communicator = extra.get('communicator') - if 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: + if extra is not None and 'cache-max-entry-count' in extra and extra.get('cache-max-entry-count') is not None: backend_config.cache_max_entry_count = extra.get('cache-max-entry-count') if 'w4' in model_path or ('4bits' in model_path or 'awq' in model_path.lower()): backend_config.model_format = 'awq' if 'gptq' in model_path.lower(): diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index eaa43e963d..376b8d6b4c 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -14,7 +14,7 @@ def run_pipeline_chat_test(config, model_case, backend_type, worker_id: str = '', - extra: object = {}, + extra: object = None, use_local_model: bool = True, is_smoke: bool = False): log_path = config.get('log_path') @@ -78,7 +78,7 @@ def run_pipeline_vl_chat_test(config, model_case, backend_type, worker_id: str = '', - extra: object = {}, + extra: object = None, is_smoke: bool = False): log_path = config.get('log_path') tp = get_tp_num(config, model_case) From dedd709a6a4757fb61045e0892e796429a5c1249 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Fri, 16 May 2025 14:09:08 +0800 Subject: [PATCH 27/28] update --- autotest/utils/run_restful_chat.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 201fece932..e9789f7d66 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -293,6 +293,10 @@ def run_vl_testcase(config, port: int = DEFAULT_PORT): http_url = BASE_HTTP_URL + ':' + str(port) log_path = config.get('log_path') + model = get_model(http_url) + if model is None: + assert False, 'server not start 
correctly' + client = OpenAI(api_key='YOUR_API_KEY', base_url=http_url + '/v1') model_name = client.models.list().data[0].id From 590045f123d13bac1c5b2ec35804168ed60c9c65 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 Date: Fri, 16 May 2025 17:52:23 +0800 Subject: [PATCH 28/28] update --- autotest/utils/run_restful_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index e9789f7d66..bb6a7d3626 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -74,7 +74,7 @@ def start_restful_api(config, param, model, model_path, backend_type, worker_id) if not is_bf16_supported(): cmd += ' --cache-max-entry-count 0.5' if str(config.get('env_tag')) == '3090': - extra += ' --cache-max-entry-count 0.5' + cmd += ' --cache-max-entry-count 0.5' start_log = os.path.join(log_path, 'start_restful_' + model.split('/')[1] + worker_id + '.log')