Commit 0273726
Changed weight map to tensor and fix the refit bug (#3573)
1 parent 07e4643 commit 0273726

7 files changed: +197 -56 lines changed

examples/apps/README.md

Lines changed: 56 additions & 0 deletions (new file)

# Flux Demo with Torch-TensorRT

This demo showcases the Flux image generation model accelerated using Torch-TensorRT, with support for different precision modes (FP8, INT8, FP16) and dynamic shapes.

## Installation

1. Install the required dependencies:

```bash
pip install gradio==5.29.0 nvidia-modelopt==0.27.1 diffusers==0.33.1 accelerate==1.3.0
```

## Usage

The demo can be run with different configurations:

### Basic Usage (FP16)

```bash
python flux_demo.py
```

### Using Different Precision Modes

- FP8 mode:
```bash
python flux_demo.py --dtype fp8
```

- INT8 mode:
```bash
python flux_demo.py --dtype int8
```

- FP16 mode (default):
```bash
python flux_demo.py --dtype fp16
```

### Additional Options

- Enable dynamic shapes (allows variable batch sizes):
```bash
python flux_demo.py --dynamic_shapes
```

- Low VRAM mode (for GPUs with ≤32GB VRAM):
```bash
python flux_demo.py --low_vram_mode
```

You can combine these options as needed. For example:
```bash
python flux_demo.py --dtype fp8 --dynamic_shapes --low_vram_mode
```
py/torch_tensorrt/dynamo/conversion/_ConversionContext.py

Lines changed: 15 additions & 6 deletions

```diff
@@ -1,7 +1,5 @@
 from dataclasses import dataclass, field
-from typing import Union
 
-import numpy as np
 import torch
 from torch_tensorrt.dynamo._settings import CompilationSettings
 from torch_tensorrt.dynamo.types import TRTNetwork
@@ -24,10 +22,21 @@ class ConversionContext:
         default_factory=CompilationSettings
     )
     requires_output_allocator: bool = False
-    weight_refit_map: dict[str, np.array] = field(default_factory=dict)
-    cpu_weights_reference_holder: dict[str, Union[torch.Tensor]] = field(
-        default_factory=dict
-    )
+    weight_refit_map: dict[str, torch.Tensor] = field(default_factory=dict)
+    cpu_weights_reference_holder: list[torch.Tensor] = field(default_factory=list)
+
+    def record_weight(self, name: str, weight: torch.Tensor) -> None:
+        """
+        Record the weight and name for refitting and CPU reference.
+        For the refit map, the key is the weight name that appears in the TRT engine and the value is the weight tensor.
+        For the CPU reference holder, we need to hold the reference to the weight tensor until the whole compilation process is complete.
+
+        Args:
+            name: Name of the weight
+            weight: Weight to record
+        """
+        self.weight_refit_map[name] = weight
+        self.cpu_weights_reference_holder.append(weight)
 
     def clear_cpu_weights_reference_holder(self) -> None:
         self.cpu_weights_reference_holder.clear()
```
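For orientation, here is a minimal sketch of the new bookkeeping. The weight value, its engine-side name, and the placeholder `net=None` are made up for illustration, and the context is constructed directly only for the sake of a runnable snippet; in the real flow the interpreter owns the context and `to_trt_weights` composes the name and calls `record_weight` during conversion.

```python
import torch

from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext

# Placeholder network for illustration only; the interpreter normally constructs
# the context around the live TensorRT INetworkDefinition.
ctx = ConversionContext(net=None)

weight = torch.randn(128, 64)  # hypothetical CPU-side weight tensor
engine_weight_name = "example_constant CONSTANT"  # hypothetical engine-side name

ctx.record_weight(engine_weight_name, weight)

# The tensor is now reachable from both structures:
# - weight_refit_map keys it by the engine-side weight name (consumed later by refit)
# - cpu_weights_reference_holder keeps it alive until the engine is built
assert ctx.weight_refit_map[engine_weight_name] is weight
assert ctx.cpu_weights_reference_holder[-1] is weight

# Once compilation finishes, the CPU references are released:
ctx.clear_cpu_weights_reference_holder()
assert len(ctx.cpu_weights_reference_holder) == 0
```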

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 12 additions & 11 deletions

```diff
@@ -404,7 +404,7 @@ def _construct_trt_network_def(self) -> None:
     @staticmethod
     def find_weight(
         weight_name: str,
-        np_map: dict[str, Any],
+        weight_refit_map: dict[str, Any],
         state_dict: dict[str, Any],
         device: torch.device,
     ) -> str:
@@ -417,7 +417,7 @@ def find_weight(
             state_dict: state of the graph module
         """
         with unset_fake_temporarily():
-            network_weight = torch.from_numpy(np_map[weight_name]).to(device)
+            network_weight = weight_refit_map[weight_name].to(device)
             for sd_w_name, sd_weight in state_dict.items():
                 if TRTInterpreter.check_weight_equal(sd_weight, network_weight, device):
                     del state_dict[sd_w_name]
@@ -431,8 +431,8 @@ def check_weight_equal(
         device: torch.device,
     ) -> Any:
         with unset_fake_temporarily():
-            if not isinstance(network_weight, torch.Tensor):
-                network_weight = torch.from_numpy(network_weight).to(device)
+            if network_weight.device != device:
+                network_weight = network_weight.to(device)
             try:
                 return sd_weight.shape == network_weight.shape and torch.all(
                     torch.abs(sd_weight - network_weight) < 0.01
```
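Since `weight_refit_map` now holds `torch.Tensor` values, the equality check no longer needs `torch.from_numpy`. Below is a trimmed, CPU-only sketch of that comparison; the real method additionally runs under `unset_fake_temporarily` and wraps the comparison in a try/except, and the tensor shapes here are arbitrary.

```python
import torch

def check_weight_equal_sketch(
    sd_weight: torch.Tensor, network_weight: torch.Tensor, device: torch.device
) -> bool:
    # The network weight is already a tensor; only move it if it lives on another device.
    if network_weight.device != device:
        network_weight = network_weight.to(device)
    # Same shape and element-wise difference below 0.01 counts as a match.
    return sd_weight.shape == network_weight.shape and bool(
        torch.all(torch.abs(sd_weight - network_weight) < 0.01)
    )

device = torch.device("cpu")  # a CUDA device during real compilation
w = torch.randn(16, 8)
print(check_weight_equal_sketch(w, w.clone(), device))           # True: identical values
print(check_weight_equal_sketch(w, torch.randn(16, 8), device))  # False: values differ
```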
```diff
@@ -501,8 +501,8 @@ def _save_weight_mapping(self) -> None:
         self.module.to(torch_device)
         sd = self.module.state_dict()
         weight_name_map: dict[str, Any] = {}
-        np_map = self.ctx.weight_refit_map
-        constant_mapping = {k: v for k, v in np_map.items() if v.size == 1}
+        weight_refit_map = self.ctx.weight_refit_map
+        constant_mapping = {k: v for k, v in weight_refit_map.items() if v.size == 1}
         net = self.ctx.net
         for i in range(net.num_layers):
             layer = net[i]
@@ -544,7 +544,7 @@ def _save_weight_mapping(self) -> None:
             else:
                 sd_weight_name = f"{sd_weight_name}.{torch_attr}"
 
-            if engine_weight_name in np_map:
+            if engine_weight_name in weight_refit_map:
                 weight_name_map[engine_weight_name] = sd_weight_name
 
         # Stage 2: Value mapping
@@ -553,10 +553,10 @@ def _save_weight_mapping(self) -> None:
                 # There is no direct connection in batch_norm layer. So skip it
                 pass
             elif sd_weight_name not in sd or not TRTInterpreter.check_weight_equal(
-                sd[sd_weight_name], np_map[engine_weight_name], torch_device
+                sd[sd_weight_name], weight_refit_map[engine_weight_name], torch_device
             ):
                 weight_name_map[engine_weight_name] = TRTInterpreter.find_weight(
-                    engine_weight_name, np_map, sd, torch_device
+                    engine_weight_name, weight_refit_map, sd, torch_device
                 )
                 if (
                     weight_name_map[engine_weight_name] != ""
@@ -567,12 +567,13 @@ def _save_weight_mapping(self) -> None:
 
             weight_name_map[engine_weight_name] = [
                 weight_name_map[engine_weight_name],
-                np_map[engine_weight_name].dtype,
+                weight_refit_map[engine_weight_name].dtype,
             ]
 
         weight_name_map["constant_mapping"] = constant_mapping
         self.weight_name_map = weight_name_map
-        del np_map, sd
+
+        del weight_refit_map, sd
         gc.collect()
         torch.cuda.empty_cache()
```
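The mapping that `_save_weight_mapping` produces then drives refit: each engine-side weight name points to the matching `state_dict` entry and the original dtype, and size-1 constants are collected under `constant_mapping`. The snippet below only illustrates that shape; every name and dtype in it is hypothetical.

```python
import torch

# Hypothetical example of the structure _save_weight_mapping builds; the keys come
# from ctx.weight_refit_map and the values are [state_dict name, original dtype].
weight_name_map = {
    "[CONVOLUTION]-[aten_ops.convolution]-[conv1] KERNEL": ["conv1.weight", torch.float16],
    "[CONVOLUTION]-[aten_ops.convolution]-[conv1] BIAS": ["conv1.bias", torch.float16],
    # Size-1 weights are additionally kept verbatim so small constants can be refit directly.
    "constant_mapping": {"scale CONSTANT": torch.tensor([0.5])},
}
```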

py/torch_tensorrt/dynamo/conversion/converter_utils.py

Lines changed: 73 additions & 30 deletions

```diff
@@ -3,7 +3,18 @@
 import functools
 import logging
 import os
-from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, overload
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+    overload,
+)
 
 import numpy as np
 import tensorrt as trt
@@ -321,7 +332,16 @@ def cast_int_or_float_to_bool(
 
 
 def to_trt_weights(
-    value: Any, target_quantized_type: Optional[trt.DataType] = None
+    ctx: ConversionContext,
+    value: torch.Tensor,
+    name: str,
+    layer_type_name: Literal["CONVOLUTION", "DECONVOLUTION", "CONSTANT"],
+    weight_type_name: Literal["KERNEL", "BIAS", "CONSTANT"],
+    target: Optional[Union[Target, str]] = None,
+    source_ir: Optional[SourceIR] = None,
+    target_quantized_type: Optional[trt.DataType] = None,
+    dtype: Optional[trt.DataType] = None,
+    count: Optional[int] = None,
 ) -> trt.Weights:
     """
     Convert a PyTorch tensor or NumPy array to TensorRT weights.
@@ -336,20 +356,51 @@ def to_trt_weights(
     - Input tensors are made contiguous before conversion
     - Data type is preserved from the original tensor/array
     """
-    if isinstance(value, torch.Tensor):
-        # Tensor must be contiguous before conversion
-        value = value.contiguous()
-        value_trt_dtype = _enums.dtype._from(value.dtype).to(trt.DataType)
-        return trt.Weights(value_trt_dtype, value.data_ptr(), value.nelement())
-    elif isinstance(value, np.ndarray):
-        value = np.ascontiguousarray(value)
-        value_np_dtype = _enums.dtype._from(value.dtype).to(np.dtype, use_default=True)
-        return trt.Weights(value_np_dtype, value.data, value.size)
-    else:
+    if not isinstance(value, torch.Tensor):
         raise AssertionError(
-            f"to_trt_weights can only be called on torch.Tensor or np.ndarray, got an object of type: {type(value)}"
+            f"to_trt_weights can only be called on torch.Tensor, got an object of type: {type(value)}"
         )
 
+    # Weight Recording
+    supported_layer_types = ["CONVOLUTION", "DECONVOLUTION", "CONSTANT"]
+    supported_weight_types = ["KERNEL", "BIAS", "CONSTANT"]
+    assert (
+        layer_type_name in supported_layer_types
+    ), f"Encountered unsupported layer type: {layer_type_name}. Supported types are: {supported_layer_types}. Manually calling to_trt_weights with a custom layer type is not intended for general use."
+    assert (
+        weight_type_name in supported_weight_types
+    ), f"Encountered unsupported weight type: {weight_type_name}. Supported types are: {supported_weight_types}. Manually calling to_trt_weights with a custom weight type is not intended for general use."
+
+    if weight_type_name == "CONSTANT" and layer_type_name == "CONSTANT":
+        weight_name = f"{name} CONSTANT"
+        ctx.record_weight(weight_name, value)
+
+    else:
+        assert (
+            target is not None
+        ), "target must be provided if the weight type and layer type is not CONSTANT"
+        source_ir = source_ir if source_ir is not None else SourceIR.UNKNOWN
+        target_name = (
+            f"{source_ir}_ops.{target}"
+            if isinstance(target, str)
+            else f"{source_ir}_ops.{target.__name__}"
+        )
+
+        weight_name = f"[{layer_type_name}]-[{target_name}]-[{name}] {weight_type_name}"
+        ctx.record_weight(weight_name, value)
+
+    # TRT Weights Creation
+
+    # Tensor must be contiguous before conversion
+    value = value.contiguous()
+    if dtype is None:
+        dtype = _enums.dtype._from(value.dtype).to(trt.DataType)
+
+    if count is None:
+        count = value.nelement()
+
+    return trt.Weights(dtype, value.data_ptr(), count)
+
 
 def create_constant(
     ctx: ConversionContext,
@@ -405,34 +456,26 @@ def create_constant(
                 "Currently supported target_quantized_type for uint8 is FP4, got {target_quantized_type=}"
            )
        shape[-1] = shape[-1] * 2
-        weights = trt.Weights(
-            type=trt.DataType.FP4,
-            ptr=torch_value.data_ptr(),
+        weights = to_trt_weights(
+            ctx,
+            torch_value,
+            name,
+            "CONSTANT",
+            "CONSTANT",
+            dtype=trt.DataType.FP4,
             count=torch_value.numel() * 2,
         )
         constant = ctx.net.add_constant(
             shape,
             weights,
         )
         constant.name = name
-        ctx.cpu_weights_reference_holder[name + " FP4_CONSTANT"] = torch_value
         return constant.get_output(0)
 
-    # TODO: Refit map uses numpy arrays. Remove this once refit is updated to use torch.Tensor
-    if torch_value.dtype == torch.bfloat16:
-        torch_value_fp32 = torch_value.to(torch.float32)
-        numpy_value = torch_value_fp32.numpy()
-    else:
-        numpy_value = torch_value.numpy()
-
-    # Used for refit
-    ctx.weight_refit_map[name + " CONSTANT"] = numpy_value.reshape(-1)
-
-    # This is a buffer to hold the torch.Tensor so that they are alive during the course of TRT compilation.
-    ctx.cpu_weights_reference_holder[name] = torch_value
+    # Record the weight in ctx for refit and cpu memory reference
 
     # Convert the torch.Tensor to a trt.Weights object
-    trt_weights = to_trt_weights(torch_value)
+    trt_weights = to_trt_weights(ctx, torch_value, name, "CONSTANT", "CONSTANT")
     constant = ctx.net.add_constant(
         shape,
         trt_weights,
```
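The refit fix hinges on the engine-side weight names that `to_trt_weights` now records through `ctx.record_weight`. A standalone sketch of that naming scheme follows; the layer type, weight type, target, and name values are hypothetical stand-ins for what a converter would pass.

```python
# Standalone illustration of the weight-name scheme recorded into weight_refit_map.
layer_type_name = "CONVOLUTION"  # one of CONVOLUTION / DECONVOLUTION / CONSTANT
weight_type_name = "KERNEL"      # one of KERNEL / BIAS / CONSTANT
source_ir = "aten"               # stands in for SourceIR.ATEN
target = "convolution"           # stands in for the ATen target being converted
name = "conv1"                   # the converter-assigned layer name

if layer_type_name == "CONSTANT" and weight_type_name == "CONSTANT":
    # Constant layers keep the simple "<name> CONSTANT" key.
    weight_name = f"{name} CONSTANT"
else:
    # Convolution/deconvolution kernels and biases get a structured key.
    target_name = f"{source_ir}_ops.{target}"
    weight_name = f"[{layer_type_name}]-[{target_name}]-[{name}] {weight_type_name}"

print(weight_name)  # [CONVOLUTION]-[aten_ops.convolution]-[conv1] KERNEL
```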

py/torch_tensorrt/dynamo/conversion/impl/conv.py

Lines changed: 21 additions & 4 deletions

```diff
@@ -55,7 +55,15 @@ def convNd(
     # Process bias terms
     if isinstance(bias, (torch.Tensor, np.ndarray)):
         bias = to_torch(bias, dtype=input.dtype)
-        bias = get_trt_tensor(ctx, bias, f"{name}_bias")
+        bias = to_trt_weights(
+            ctx,
+            bias,
+            name,
+            layer_type_name="CONVOLUTION",
+            weight_type_name="BIAS",
+            target=target,
+            source_ir=source_ir,
+        )
 
     elif isinstance(bias, TRTTensor):
         bias = get_trt_tensor(ctx, bias, f"{name}_bias")
@@ -85,7 +93,15 @@ def convNd(
 
         num_output_maps = weight.shape[0]
         kernel_shape = weight.shape[2:]
-        weight = to_trt_weights(weight)
+        weight = to_trt_weights(
+            ctx,
+            weight,
+            name,
+            layer_type_name="CONVOLUTION",
+            weight_type_name="KERNEL",
+            target=target,
+            source_ir=source_ir,
+        )
 
     else:
         raise RuntimeError(
@@ -105,6 +121,9 @@ def convNd(
         kernel=trt.Weights() if isinstance(weight, TRTTensor) else weight,
         bias=trt.Weights() if isinstance(bias, TRTTensor) else bias,
     )
+
+    set_layer_name(conv_layer, target, name, source_ir)
+
     # If the weight is a TRTTensor, set it as an input of the layer
     if isinstance(weight, TRTTensor):
         weight = cast_trt_tensor(ctx, weight, input.dtype, name)
@@ -145,8 +164,6 @@ def convNd(
         extend_attr_to_tuple(dilation, 2) if dilation is not None else dilation
     )
 
-    set_layer_name(conv_layer, target, name, source_ir)
-
     # Set relevant attributes of convolution layer
     if padding is not None:
         conv_layer.padding_nd = padding
```

0 commit comments