
Commit cf8a1d2

[https://nvbugs/5596377][fix] Fix mm dummy calculation (#8498)
Signed-off-by: yechank <[email protected]>
1 parent 24167d0 commit cf8a1d2

4 files changed: +90 additions, -116 deletions

tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 2 additions & 79 deletions
@@ -2,10 +2,8 @@
 import os
 from typing import Any, Dict, List, Optional, Tuple, Union

-import numpy as np
 import torch
 import torch.nn as nn
-from PIL import Image
 from torch.nn import functional as F
 from transformers import (AutoProcessor, AutoTokenizer, PretrainedConfig,
                           PreTrainedModel)
@@ -31,7 +29,6 @@
                        ExtraProcessedInputs, InputProcessor,
                        MultimodalPlaceholderMetadata,
                        MultimodalPlaceholderPlacement, TextPrompt,
-                       default_multimodal_input_loader,
                        register_input_processor)
 from ...logger import logger
 from ...sampling_params import SamplingParams
@@ -95,6 +92,8 @@ def __init__(self,
                  model_config: PretrainedConfig,
                  tokenizer: AutoTokenizer,
                  trust_remote_code: bool = True):
+
+        super().__init__()
         self.model_config = model_config
         self.tokenizer = tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained(
             model_path)
@@ -284,81 +283,6 @@ def get_rope_index(
             mrope_position_deltas, device=input_ids.device).unsqueeze(1)
         return position_ids, mrope_position_deltas

-    def get_dummy_text(self, input_seq_len: int) -> str:
-        ids = np.random.randint(
-            low=0,
-            high=int(
-                self.model_config.vocab_size), # high is exclusive in NumPy
-            size=input_seq_len,
-        ).tolist()
-        return self.tokenizer.decode(ids, skip_special_tokens=True)
-
-    def get_dummy_image(self, max_width: int, max_height: int):
-        image = Image.new("RGB", (max_width, max_height), color=255)
-        return image
-
-    def get_dummy_prompt(self, input_seq_len: int):
-        text = ""
-        # we use the max resolution as starting point
-        img_max_dim = 3584
-        image = self.get_dummy_image(max_width=img_max_dim,
-                                     max_height=img_max_dim)
-
-        test_mm_prompt = default_multimodal_input_loader(
-            tokenizer=self.tokenizer,
-            model_dir=self.model_path,
-            model_type=self.model_config.model_type,
-            modality="image",
-            prompts=[text],
-            media=[[image]],
-            image_data_format="pt")[0]
-
-        prompt_token_ids_single_img, _ = self(test_mm_prompt, None)
-
-        # if the max img resolution results in a number of tokens greater then
-        # input_seq_len, we keep lowering the resolution such as to find the
-        # max resolution such as it does not exceed the input_seq_len
-        while len(prompt_token_ids_single_img) > input_seq_len:
-            # reduce img resolution
-            img_max_dim = img_max_dim >> 1
-
-            image = self.get_dummy_image(max_width=img_max_dim,
-                                         max_height=img_max_dim)
-
-            test_mm_prompt = default_multimodal_input_loader(
-                tokenizer=self.tokenizer,
-                model_dir=self.model_path,
-                model_type=self.model_config.model_type,
-                modality="image",
-                prompts=[text],
-                media=[[image]],
-                image_data_format="pt")[0]
-
-            prompt_token_ids_single_img, _ = self(test_mm_prompt, None)
-
-        len_prompt_tokens_ids = len(prompt_token_ids_single_img)
-        # There are corner cases where if we strictly try to generate a text based
-        # on how many tokens we need to complete the input_seq_len, the output of
-        # default_multimodal_input_loader may give more tokens then the input_seq_len and this
-        # can lead to errors.
-        # That is why we try to clip the variable text_token_left to a lower threshold
-        # but close enough to the actual input_seq_len
-        text_generation_perc_threshold = 0.95
-        text_token_left = int((input_seq_len - len_prompt_tokens_ids) *
-                              text_generation_perc_threshold)
-
-        if text_token_left > 0:
-            text = self.get_dummy_text(text_token_left)
-
-        return default_multimodal_input_loader(
-            tokenizer=self.tokenizer,
-            model_dir=self.model_path,
-            model_type=self.model_config.model_type,
-            modality="image",
-            prompts=[text],
-            media=[[image]],
-            image_data_format="pt")[0]
-
     def _preprocess(self, text: dict[str, any], mm_data: dict[str, any],
                     mm_processor_kwargs: Dict[str, Any]):
         images = mm_data.get("image")
@@ -1018,7 +942,6 @@ def forward(

         mm_embeds = find_input_mm_embeds(
             mm_embeds, multimodal_params[:num_context_requests])
-
         if not self.model_config.pretrained_config.disable_fuse_rope:
             mrope_config = self.prepare_mrope_config(
                 multimodal_params, num_context_requests)
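
Net effect of this file's changes: the Qwen2-VL processor drops its private get_dummy_text / get_dummy_image / get_dummy_prompt helpers (and the numpy / PIL imports they needed) and simply chains to its base classes, which now own the dummy-prompt logic (see the registry.py diff below). A minimal sketch of the resulting class shape, simplified and using illustrative names where the diff does not show them:

# Sketch only: not the real TensorRT-LLM classes; base-class and argument
# details beyond what the diff shows are assumptions.
class DummyPromptBase:
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.image_max_dim = 16384  # defaults now come from the shared base __init__
        self.img_min_dim = 128

class Qwen2VLStyleProcessor(DummyPromptBase):
    def __init__(self, model_config, tokenizer, trust_remote_code=True):
        super().__init__()  # new: chain to the base so its defaults are set up
        self.model_config = model_config
        self.tokenizer = tokenizer
    # get_dummy_text / get_dummy_image / get_dummy_prompt no longer live here;
    # the shared implementations in tensorrt_llm/inputs/registry.py are used.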

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 45 additions & 35 deletions
@@ -1,5 +1,4 @@
 import os
-import random
 from typing import Dict, List, Optional

 import torch
@@ -144,39 +143,51 @@ def _create_dummy_mm_context_request(
                "Profiling with the default input dummy context request. This may not take into account the memory consumption of " \
                "the image encoder")
            return requests
-        prompt = input_processor.get_dummy_prompt(input_seq_len)

-        prompt_token_ids, extra_processed_inputs = self._model_engine.input_processor_with_hash(
-            prompt, None)
-
-        multimodal_input = extra_processed_inputs.get('multimodal_input')
-        multimodal_data = extra_processed_inputs.get('multimodal_data')
+        max_num_tokens = self._max_num_tokens
+        max_beam_width = self._max_beam_width
+        vocab_size = self._model_engine.model.model_config.pretrained_config.vocab_size

-        max_num_tokens = len(prompt_token_ids)
-        assert max_num_tokens > 0, "the length of the prompt of the dummy mm req is less than or equal to 0"
-        remaining_tokens = min(max_num_tokens, input_seq_len)
-        if remaining_tokens > input_seq_len:
-            logger.warning(f"Profiling with multimedia prompt which contains more tokens than the allowed input_seq_len. " \
-                           f"Multimodal prompt has {remaining_tokens} while the input_seq_len is: {input_seq_len}")
+        input_seq_len = min(max_num_tokens, input_seq_len)
+        remaining_tokens = max_num_tokens
         while remaining_tokens > 0:
-            req_mm_input = trtllm.MultimodalInput(
-                multimodal_hashes=multimodal_input.multimodal_hashes,
-                multimodal_positions=multimodal_input.multimodal_positions,
-                multimodal_lengths=multimodal_input.multimodal_lengths
-            ) if multimodal_input else None
-            request = trtllm.Request(prompt_token_ids,
-                                     max_tokens=1,
-                                     streaming=False,
-                                     sampling_config=trtllm.SamplingConfig(
-                                         beam_width=self._max_beam_width, ),
-                                     output_config=trtllm.OutputConfig(),
-                                     end_id=-1,
-                                     multimodal_input=req_mm_input)
-            # TODO:
-            # create_input_processor_with_hash shouldn’t be required during profiling,
-            # but is temporarily needed due to the multimodal input dependency for chunked prefill
-            request.py_multimodal_data = multimodal_data
-            remaining_tokens -= max_num_tokens
+            input_seq_len = min(input_seq_len, remaining_tokens)
+            dummy_mm_prompt = input_processor.get_dummy_prompt(input_seq_len)
+
+            if dummy_mm_prompt is not None:
+                prompt_token_ids, extra_processed_inputs = self._model_engine.input_processor(
+                    dummy_mm_prompt, sampling_params=None)
+                multimodal_data = extra_processed_inputs.get('multimodal_data')
+
+                request = trtllm.Request(prompt_token_ids,
+                                         max_tokens=1,
+                                         streaming=False,
+                                         sampling_config=trtllm.SamplingConfig(
+                                             beam_width=max_beam_width, ),
+                                         output_config=trtllm.OutputConfig(),
+                                         end_id=-1)
+                request.py_multimodal_data = multimodal_data
+            else:
+                # Fall back to text-only prompt when we could not find the small image size.
+                prompt_token_ids = torch.randint(
+                    low=0, high=vocab_size, size=(input_seq_len, )).tolist()
+                request = trtllm.Request(prompt_token_ids,
+                                         max_tokens=1,
+                                         streaming=False,
+                                         sampling_config=trtllm.SamplingConfig(
+                                             beam_width=max_beam_width, ),
+                                         output_config=trtllm.OutputConfig(),
+                                         end_id=-1)
+                if self._model_engine.use_mrope:
+                    request.py_multimodal_data = {
+                        "mrope_config": {
+                            "mrope_position_ids":
+                            torch.zeros(3, 1, input_seq_len, dtype=torch.int32),
+                            "mrope_position_deltas":
+                            torch.zeros(1, 1, dtype=torch.int32)
+                        }
+                    }
+            remaining_tokens -= len(prompt_token_ids)
             requests.append(request)

         if self._mapping.enable_attention_dp:
@@ -190,7 +201,6 @@ def _create_dummy_context_requests(
        if hasattr(self._model_engine.model,
                   "original_arch") and MODEL_CLASS_VISION_ENCODER_MAPPING.get(
                       self._model_engine.model.original_arch, None):
-            input_seq_len = min(self._max_num_tokens, input_seq_len)
             requests = self._create_dummy_mm_context_request(input_seq_len)
             # if succeed profiling with multimodal requests then return, otherwise profile
             # with default case
@@ -204,9 +214,9 @@
         remaining_tokens = max_num_tokens
         while remaining_tokens > 0:
             input_seq_len = min(input_seq_len, remaining_tokens)
-            input_tokens = [
-                random.randint(0, vocab_size - 1) for _ in range(input_seq_len)
-            ]
+            input_tokens = torch.randint(low=0,
+                                         high=vocab_size,
+                                         size=(input_seq_len, )).tolist()
             request = trtllm.Request(input_tokens,
                                      max_tokens=1,
                                      streaming=False,
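
Taken together, the _util.py changes make profiling request-budget driven: input_seq_len is clamped to max_num_tokens up front, each loop iteration asks the input processor for a multimodal dummy prompt that fits, falls back to random text tokens (torch.randint replaces the removed random module) when no image size fits, and the budget is decremented by the length of the prompt actually produced. A self-contained sketch of that control flow, with stubs standing in for the input processor and trtllm.Request; the 256-token image cost below is an arbitrary assumption:

import torch

def build_dummy_requests(max_num_tokens: int, input_seq_len: int, vocab_size: int):
    # Sketch of the fixed loop; returns plain token-id lists instead of trtllm.Request.
    requests = []
    input_seq_len = min(max_num_tokens, input_seq_len)  # clamp to the budget up front
    remaining_tokens = max_num_tokens
    while remaining_tokens > 0:
        input_seq_len = min(input_seq_len, remaining_tokens)
        prompt_token_ids = dummy_mm_prompt_ids(input_seq_len)
        if prompt_token_ids is None:
            # Text-only fallback when no image resolution fits input_seq_len.
            prompt_token_ids = torch.randint(low=0, high=vocab_size,
                                             size=(input_seq_len, )).tolist()
        remaining_tokens -= len(prompt_token_ids)  # decrement by the actual length
        requests.append(prompt_token_ids)
    return requests

def dummy_mm_prompt_ids(budget: int):
    # Stand-in for get_dummy_prompt + tokenization: assume the smallest usable
    # dummy image still costs 256 tokens, so very small budgets return None.
    return list(range(256)) if budget >= 256 else None

print([len(r) for r in build_dummy_requests(2000, 700, 32000)])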

tensorrt_llm/inputs/registry.py

Lines changed: 40 additions & 2 deletions
@@ -1,11 +1,14 @@
 import enum
+import random
 from dataclasses import dataclass, field
 from typing import (Any, Callable, Dict, List, Optional, Protocol, Tuple, Type,
                     TypeVar)

 from PIL import Image
 from torch import Tensor, nn

+import tensorrt_llm
+
 from .._utils import nvtx_range_debug
 from ..logger import logger
 from ..sampling_params import SamplingParams
@@ -47,9 +50,41 @@ class BaseDummyInputsBuilder:
     Base class for generating dummy inputs. Specially for profiling
     """

+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.image_max_dim = 16384
+        self.img_min_dim = 128
+
+    def get_dummy_image(self, max_width: int, max_height: int):
+        image = Image.new("RGB", (max_width, max_height),
+                          color=random.randint(0, 256))
+        return image
+
     def get_dummy_prompt(self, input_seq_len: int):
-        raise NotImplementedError(
-            "Please ensure this method is implemented in your inherited class")
+        # TODO(yechank): We use the max resolution as starting point and keep reducing the resolution until the prompt length is less than the input sequence length.
+        # Need to find better way to calculate the dummy prompt length as this iteration may not be efficient.
+        while self.image_max_dim >= self.img_min_dim:
+            image = self.get_dummy_image(max_width=self.image_max_dim,
+                                         max_height=self.image_max_dim)
+
+            test_mm_prompt = tensorrt_llm.inputs.utils.default_multimodal_input_loader(
+                tokenizer=self.tokenizer,
+                model_dir=self.model_path,
+                model_type=self.model_config.model_type,
+                modality="image",
+                prompts=[""],
+                media=[[image]],
+                image_data_format="pt")[0]
+
+            prompt_token_ids_single_img, _ = self(test_mm_prompt, None)
+
+            if len(prompt_token_ids_single_img) <= input_seq_len:
+                return test_mm_prompt
+
+            # reduce img resolution
+            self.image_max_dim = self.image_max_dim >> 1
+
+        return None


 class BaseMultimodalInputProcessor:
@@ -61,6 +96,9 @@ class BaseMultimodalInputProcessor:
     models. Specific processors can override these methods if they need custom logic.
     """

+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
     def get_processor(self) -> Optional[Any]:
         """Return the processor object if available; otherwise raise NotImplementedError.
         """

tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py

Lines changed: 3 additions & 0 deletions
@@ -1,5 +1,6 @@
 import os
 import tempfile
+from pathlib import Path
 from typing import List

 import openai
@@ -240,6 +241,8 @@ def test_single_chat_session_video(client: openai.OpenAI, model_name: str):
 @pytest.mark.asyncio(loop_scope="module")
 def test_single_chat_session_image_embed(client: openai.OpenAI,
                                          model_name: str):
+    test_data_root = Path(
+        os.path.join(llm_models_root(), "multimodals", "test_data"))
     content_text = "Describe the natural environment in the image."
     image_url = str(llm_models_root() / "multimodals" / "test_data" /
                     "seashore.png")
