     KernelPreference,
 )
 from torchao.quantization.quantize_.workflows import (
-    Float8OpaqueTensor,
-    Float8PackingFormat,
     Float8Tensor,
     Int4ChooseQParamsAlgorithm,
     Int4MarlinSparseTensor,
@@ -1808,23 +1806,14 @@ class Float8DynamicActivationFloat8WeightConfig(AOBaseConfig):
     kernel_preference: KernelPreference = KernelPreference.AUTO
     set_inductor_config: bool = True
     version: int = 2
-    float8_packing_format: Float8PackingFormat = Float8PackingFormat.PLAIN
 
     def __post_init__(self):
         torch._C._log_api_usage_once(
             "torchao.quantization.Float8DynamicActivationFloat8WeightConfig"
         )
-        if (
-            self.version == 2
-            and self.float8_packing_format == Float8PackingFormat.OPAQUE
-        ):
-            activation_granularity, weight_granularity = (
-                Float8OpaqueTensor._normalize_and_check_granularity(self.granularity)
-            )
-        else:
-            activation_granularity, weight_granularity = _normalize_granularity(
-                self.granularity
-            )
+        activation_granularity, weight_granularity = _normalize_granularity(
+            self.granularity
+        )
         self.granularity = [activation_granularity, weight_granularity]
 
         default_use_fast_accum = True
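
With `float8_packing_format` gone from the dataclass, `__post_init__` always routes through `_normalize_granularity`. A minimal sketch of what construction looks like after this change (usage is illustrative; the import paths are assumed from torchao's public API):

```python
# Minimal sketch, assuming torchao is installed and exposes these names.
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
)

config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
# __post_init__ normalizes a single granularity into an
# [activation_granularity, weight_granularity] pair:
assert len(config.granularity) == 2
# Passing float8_packing_format=... would now raise a TypeError,
# since the field no longer exists on the dataclass.
```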
@@ -1854,48 +1843,43 @@ def _float8_dynamic_activation_float8_weight_quantize_tensor(weight, config):
     activation_value_lb = config.activation_value_lb
     activation_value_ub = config.activation_value_ub
     kernel_preference = config.kernel_preference
-    float8_packing_format = config.float8_packing_format
 
     # Ensure works on device
+    _check_hardware_support(granularity)
     activation_granularity, weight_granularity = granularity
 
-    if float8_packing_format == Float8PackingFormat.PLAIN:
-        # Note: right now we assume it's weights of conv2d and conv3d purely based
-        # on the dimension of weight, currently there is no conflict with linear 2d
-        # and moe weights 3d
-        # if we need to support conv1d, which also has 3d weight, we may have to
-        # pass around the module as well to distinguish between conv1d and 3d moe weight
-        if weight.dim() in [4, 5]:
-            # weights for conv2d or 3d
-            assert isinstance(activation_granularity, PerTensor) and isinstance(
-                weight_granularity, PerTensor
-            ), (
-                "4D/5D tensor only supports per tensor activation and weight quantization"
-            )
-
-            # conv3d weight dim: (C_out, C_in, K1, K2, K3)
-            # conv2d weight dim: (C_out, C_in, K1, K2)
-            # skip quantization when either C_out or C_in
-            # is not a multiple of 16
-            if weight.shape[0] % 16 != 0 or weight.shape[1] % 16 != 0:
-                return weight
-
-        elif not _fp8_mm_compat(weight):
-            # TODO(future PR): this should really throw an exception instead of silently
-            # not doing what the user asked
+    # Note: right now we assume it's weights of conv2d and conv3d purely based
+    # on the dimension of weight, currently there is no conflict with linear 2d
+    # and moe weights 3d
+    # if we need to support conv1d, which also has 3d weight, we may have to
+    # pass around the module as well to distinguish between conv1d and 3d moe weight
+    if weight.dim() in [4, 5]:
+        # weights for conv2d or 3d
+        assert isinstance(activation_granularity, PerTensor) and isinstance(
+            weight_granularity, PerTensor
+        ), "4D/5D tensor only supports per tensor activation and weight quantization"
+
+        # conv3d weight dim: (C_out, C_in, K1, K2, K3)
+        # conv2d weight dim: (C_out, C_in, K1, K2)
+        # skip quantization when either C_out or C_in
+        # is not a multiple of 16
+        if weight.shape[0] % 16 != 0 or weight.shape[1] % 16 != 0:
             return weight
+    elif not _fp8_mm_compat(weight):
+        # TODO(future PR): this should really throw an exception instead of silently
+        # not doing what the user asked
+        return weight
 
-        if isinstance(weight_granularity, PerRow):
-            assert weight.dtype == torch.bfloat16, (
-                "PerRow quantization only works for bfloat16 precision input weight"
-            )
+    if isinstance(weight_granularity, PerRow):
+        assert weight.dtype == torch.bfloat16, (
+            "PerRow quantization only works for bfloat16 precision input weight"
+        )
 
     if config.version == 1:
         warnings.warn(
             "Config Deprecation: version 1 of Float8DynamicActivationFloat8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2649 for more details"
         )
 
-    _check_hardware_support(granularity)
     block_size = get_block_size(weight.shape[-2:], weight_granularity)
     if weight.dim() == 3:
         block_size = tuple([1] + list(block_size))
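
Note that `_check_hardware_support(granularity)` moves up to the top of the function, so an unsupported granularity now fails before any shape-based skipping. The multiple-of-16 gate for conv weights can be sanity-checked in isolation; this is a standalone sketch of that check, not torchao code:

```python
# Standalone sketch of the channel-alignment gate above.
# conv2d weight layout: (C_out, C_in, K1, K2); both channel dims
# must be multiples of 16, otherwise quantization is skipped.
import torch

def would_skip(weight: torch.Tensor) -> bool:
    return weight.shape[0] % 16 != 0 or weight.shape[1] % 16 != 0

print(would_skip(torch.empty(32, 64, 3, 3)))  # False: gets quantized
print(would_skip(torch.empty(30, 64, 3, 3)))  # True: returned unquantized
```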
@@ -1926,26 +1910,14 @@ def _float8_dynamic_activation_float8_weight_quantize_tensor(weight, config):
         kernel_preference=kernel_preference,
     )
 
-    if float8_packing_format == Float8PackingFormat.PLAIN:
-        quantized_weight = Float8Tensor.from_hp(
-            weight,
-            float8_dtype=weight_dtype,
-            granularity=weight_granularity,
-            mm_config=mm_config,
-            kernel_preference=kernel_preference,
-            act_quant_kwargs=act_quant_kwargs,
-        )
-    elif float8_packing_format == Float8PackingFormat.OPAQUE:
-        block_size = get_block_size(weight.shape, weight_granularity)
-        quantized_weight = Float8OpaqueTensor.from_hp(
-            weight,
-            block_size=block_size,
-            act_quant_kwargs=act_quant_kwargs,
-        )
-    else:
-        raise ValueError(
-            f"Unsupported float8 packing format: {float8_packing_format}"
-        )
+    quantized_weight = Float8Tensor.from_hp(
+        weight,
+        float8_dtype=weight_dtype,
+        granularity=weight_granularity,
+        mm_config=mm_config,
+        kernel_preference=kernel_preference,
+        act_quant_kwargs=act_quant_kwargs,
+    )
 
     return quantized_weight
 
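
With the OPAQUE and error branches removed, `Float8Tensor.from_hp` is the single construction path. A hedged end-to-end sketch, assuming a GPU that passes the hardware checks and bfloat16 weights (as the asserts in this file require):

```python
# Sketch: quantizing a linear module with the simplified config.
# Assumes an SM 8.9+ CUDA GPU (or MI300+) is available.
import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)

m = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
quantize_(m, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
# m.weight is now a Float8Tensor built by Float8Tensor.from_hp
print(type(m.weight))
```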
@@ -1957,10 +1929,9 @@ def _float8_dynamic_activation_float8_weight_transform(
     *,
     parameter_name: str = "weight",
 ):
-    if config.float8_packing_format == Float8PackingFormat.PLAIN:
-        assert is_sm_at_least_89() or is_MI300(), (
-            "Float8 dynamic activation quantization is only supported on CUDA>=8.9 and MI300+"
-        )
+    assert is_sm_at_least_89() or is_MI300(), (
+        "Float8 dynamic activation quantization is only supported on CUDA>=8.9 and MI300+"
+    )
     if config.set_inductor_config:
         torchao.quantization.utils.recommended_inductor_config_setter()
 
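
The hardware assert now runs unconditionally in the transform. For reference, the CUDA side of `is_sm_at_least_89` amounts to a compute-capability check; a rough plain-torch equivalent is below (the MI300 check is elided, and this is an approximation, not the torchao utility itself):

```python
# Rough equivalent of the CUDA-capability gate, plain torch only.
import torch

def sm_at_least_89() -> bool:
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability()
    return (major, minor) >= (8, 9)  # SM 8.9 == Ada generation

print("float8 dynamic activation supported:", sm_at_least_89())
```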