
[WIP] Add Func: npugraph_batch_size auto-adjust to different models #739


Open · wants to merge 1 commit into base: main
61 changes: 61 additions & 0 deletions tests/singlecard/test_dynamic_npugraph_batchsize.py
@@ -0,0 +1,61 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import pytest
import torch
from torch_npu.op_plugin.atb._atb_ops import _register_atb_extensions
from vllm import LLM, SamplingParams

_register_atb_extensions()
Collaborator: what does this do?

Collaborator: torch_npu needs to preload ATB's .so before the dynamo trace procedure.

torch.cuda.CUDAGraph = torch.npu.NPUGraph
Collaborator: Suggested change:
- torch.cuda.CUDAGraph = torch.npu.NPUGraph
+ # TODO: revert me when cuda hard code is fixed in `VllmBackend`
+ torch.cuda.CUDAGraph = torch.npu.NPUGraph

Author: ok


MODELS = [
"Qwen/Qwen2.5-0.5B-Instruct",
]

TENSOR_PARALLELS = [2]
Collaborator: This is a multicard UT; let's move it to tests/multicard to make sure it is tested as expected.

Author: ok, I will move it to multicard.


prompts = [
"Hello, my name is",
"The future of AI is",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("temperature", [0.0])
@pytest.mark.parametrize("ignore_eos", [True])
def test_models(model: str, tp_size: int, max_tokens: int, temperature: float,
                ignore_eos: bool) -> None:
    # Create an LLM.
    llm = LLM(
        model=model,
        tensor_parallel_size=tp_size,
    )
    # Prepare sampling_params.
    sampling_params = SamplingParams(
        max_tokens=max_tokens,
        temperature=temperature,
        ignore_eos=ignore_eos,
    )

    # Generate texts from the prompts.
    # The output is a list of RequestOutput objects.
    outputs = llm.generate(prompts, sampling_params)
    torch.npu.synchronize()
    # The number of outputs should equal the number of prompts.
    assert len(outputs) == len(prompts)
64 changes: 64 additions & 0 deletions vllm_ascend/worker/model_runner_v1.py
@@ -18,6 +18,7 @@
#

import gc
import math
import os
import time
import weakref
@@ -950,6 +951,9 @@ def capture_model(self) -> None:

        start_time = time.perf_counter()
        start_free_npu_memory = torch.npu.mem_get_info()[0]
        # Since the npugraph_batch_sizes list from vLLM can be too large,
        # adjust its length to a proper size for this model.
        self.verify_adjust_npugraph_batch_sizes()

        # Trigger NPU graph capture for specific shapes.
        # Capture the large shapes first so that the smaller shapes
@@ -968,3 +972,63 @@ def capture_model(self) -> None:
        # This usually takes 5~20 seconds.
        logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
                    elapsed_time, npu_graph_size / (1 << 30))

    def verify_adjust_npugraph_batch_sizes(self) -> None:
        # Currently, the max capture size supported by vllm-ascend is 1920.
        max_capture_size = 1920
        original_npugraph_batch_sizes = self.npugraph_batch_sizes
        num_hidden_layers = self.vllm_config.model_config.hf_config.num_hidden_layers
        max_support_len_npugraph = self.get_max_support_len(
            max_capture_size, num_hidden_layers)

        if max_support_len_npugraph < len(original_npugraph_batch_sizes):
            self.npugraph_batch_sizes = self.sample_from_list(
                max_support_len_npugraph)

            logger.info(
                "Model:%s-num_hidden_layers:%d will adjust npugraph_batch_sizes, pre-adjust-len: %s, post-adjust-len: %s",
                self.vllm_config.model_config.architectures[0],
                num_hidden_layers, len(original_npugraph_batch_sizes),
                len(self.npugraph_batch_sizes))
        else:
            logger.info(
                "Model:%s-num_hidden_layers:%d no need to adjust npugraph_batch_sizes, list_len: %s",
                self.vllm_config.model_config.architectures[0],
                num_hidden_layers, len(original_npugraph_batch_sizes))

    def get_max_support_len(self, max_capture_size, num_hidden_layers) -> int:
        parallel_type_cnt = 0
        dp_size = self.vllm_config.parallel_config.data_parallel_size
        tp_size = self.vllm_config.parallel_config.tensor_parallel_size
        if dp_size > 1:
Collaborator: So the bigger the parallel size, the smaller the graph step? Shouldn't it be bigger?

Author: The types of parallel strategies influence the length of the list. Therefore, the more types of parallel strategies there are, the shorter the list becomes. However, the maximum supported batch_size value in the list remains unchanged.
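
For illustration only (numbers assumed, not taken from this PR): with max_capture_size = 1920 and a hypothetical model with num_hidden_layers = 23, TP-only parallelism gives parallel_type_cnt = 1, so get_max_support_len returns math.floor(1920 / (23 + 1) / (1 + 1)) = 40; enabling DP as well (parallel_type_cnt = 2) shrinks that to math.floor(1920 / (23 + 1) / (2 + 1)) = 26, while the largest batch size kept in the list stays the same.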

            parallel_type_cnt += 1
        if tp_size > 1:
            parallel_type_cnt += 1
        max_support_len_npugraph = math.floor(max_capture_size /
                                              (num_hidden_layers + 1) /
                                              (parallel_type_cnt + 1))
        logger.info(
            "max_capture_size:%s, dp_size:%s, tp_size:%s, parallel_type_cnt:%s, max_support_len_npugraph: %s",
            max_capture_size,
            dp_size,
            tp_size,
            parallel_type_cnt,
            max_support_len_npugraph,
        )

        return max_support_len_npugraph

    def sample_from_list(self, sample_len) -> list[int]:
        # Sample a new list of the given length from the old list while
        # keeping the spacing roughly uniform, for example:
        # original: [1 8 16 24 32 40 48 56 64]
        # --> sample length = 3: [1 32 64]
        # --> sample length = 5: [1 16 32 48 64]
        original_len = len(self.npugraph_batch_sizes)
        step = (original_len - 1) / (sample_len - 1)
        indices = [round(i * step) for i in range(sample_len)]
        # Align the first and last elements of the sub-list with the original list.
        indices[0] = 0
        indices[-1] = original_len - 1
        # Sample the new list.
        new_list = [self.npugraph_batch_sizes[i] for i in indices]
        return new_list
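
For reference, a minimal standalone sketch of the same uniform-sampling idea (hypothetical helper name and example values, not part of this PR):

def sample_uniform(values: list[int], sample_len: int) -> list[int]:
    # Pick sample_len indices spread evenly across the original list,
    # always keeping its first and last elements.
    step = (len(values) - 1) / (sample_len - 1)
    indices = [round(i * step) for i in range(sample_len)]
    indices[0], indices[-1] = 0, len(values) - 1
    return [values[i] for i in indices]

batch_sizes = [1, 8, 16, 24, 32, 40, 48, 56, 64]
print(sample_uniform(batch_sizes, 3))  # [1, 32, 64]
print(sample_uniform(batch_sizes, 5))  # [1, 16, 32, 48, 64]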