
Commit 688037e

weight convertor
1 parent 6b71921 commit 688037e

5 files changed: +135 −25 lines changed

src/transformers/core_model_loading.py

Lines changed: 0 additions & 1 deletion
@@ -622,7 +622,6 @@ def convert_and_load_state_dict_in_model(

     Now that this is done, we can quantize / dequantize accordingly the collected_tensors.
     """
-
     prefix = model.base_model_prefix
     tp_plan = tp_plan or {}
     device_map = device_map or {"": "cpu"}

src/transformers/integrations/torchao.py

Lines changed: 94 additions & 21 deletions
@@ -24,13 +24,32 @@

 logger = logging.get_logger(__name__)

+
+def _quantization_type(weight):
+    from torchao.dtypes import AffineQuantizedTensor
+    from torchao.quantization.linear_activation_quantized_tensor import LinearActivationQuantizedTensor
+
+    if isinstance(weight, AffineQuantizedTensor):
+        return f"{weight.__class__.__name__}({weight._quantization_type()})"
+
+    if isinstance(weight, LinearActivationQuantizedTensor):
+        return f"{weight.__class__.__name__}(activation={weight.input_quant_func}, weight={_quantization_type(weight.original_weight_tensor)})"
+
+
+def _linear_extra_repr(self):
+    weight = _quantization_type(self.weight)
+    if weight is None:
+        return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight=None"
+    else:
+        return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight={weight}"
+
 class TorchAoQuantize(ConversionOps):
     def __init__(self, hf_quantizer):
         self.hf_quantizer = hf_quantizer

     def convert(
         self, input_dict: dict[str, torch.Tensor], model: Optional[torch.nn.Module] = None, missing_keys=None, **kwargs
     ) -> dict[str, torch.Tensor]:
+        # print("input_dict", input_dict)
         target_key, value = tuple(input_dict.items())[0]
         value = value[0] if isinstance(value, list) else value

@@ -39,8 +58,13 @@ def convert(
         target_key = self.hf_quantizer.get_param_name(target_key)
         module, _ = get_module_from_name(model, target_key)

+        """
+        Each nn.Linear layer that needs to be quantized is processed here.
+        First, we set the value of the weight tensor, then we move it to the target device. Finally, we quantize the module.
+        """
         from torchao.quantization import quantize_

+        full_name = target_key
         # Those are the pre quantized weights
         if ":" in target_key:
             target_key = target_key.rsplit(":", 1)[0]
@@ -51,7 +75,7 @@ def convert(
         # already done) - if it's unsafe-serialized (i.e. not safetensors), no need for anything either
         is_unsafe_serialization = ":" not in full_name
         if tensor_name == "bias" or is_unsafe_serialization:
-            return {target_key: value}
+            return {full_name: value}
         # Sanity check for the new serialization format
         elif not (TORCHAO_VERSION >= version.parse("0.14.0") and is_metadata_torchao(self.hf_quantizer.metadata)):
             raise ValueError("To use `safetensors` serialization, you should have `torchao>=0.14.0` installed")
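The branches above dispatch on the `:` key scheme used by torchao's safetensors serialization, where one quantized weight is flattened into several tensors whose names carry a `:`-suffixed sub-key. A small illustration (the `qdata` sub-key name is invented for the example):

# Hypothetical flattened key; real sub-key names depend on the torchao format.
full_name = "model.layers.0.self_attn.q_proj.weight:qdata"
is_unsafe_serialization = ":" not in full_name
# Strip the sub-key to recover the parameter the tensor belongs to.
target_key = full_name if is_unsafe_serialization else full_name.rsplit(":", 1)[0]
print(target_key)  # model.layers.0.self_attn.q_proj.weight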
@@ -60,38 +84,87 @@ def convert(
         if not hasattr(self.hf_quantizer, "ao_params"):
             self.hf_quantizer.ao_params = defaultdict(dict)
         self.hf_quantizer.ao_params[target_key].update({full_name: value})
+        missing_keys.discard(full_name)

         # We are ready for quantization in this case (we retrieved all the needed keys)
         if len(self.hf_quantizer.ao_params[target_key]) == len(self.hf_quantizer.weight_ao_keys):
-            new_param = unflatten_tensor_state_dict(
-                self.hf_quantizer.ao_params[target_key], self.hf_quantizer.metadata
-            )[target_key]
+            new_param = unflatten_tensor_state_dict(self.hf_quantizer.ao_params[target_key], self.hf_quantizer.metadata)[target_key]
+            # Free memory
             del self.hf_quantizer.ao_params[target_key]
-            return {target_key: new_param}

             # Add repr to the module
             if isinstance(module, torch.nn.Linear):
-                module.extra_repr = types.MethodType(self.hf_quantizer._linear_extra_repr, module)
-            return {}
+                module.extra_repr = types.MethodType(_linear_extra_repr, module)
+
+            return {full_name: new_param}
         else:
-            module._parameters[tensor_name] = torch.nn.Parameter(value, requires_grad=value.requires_grad).to(
-                value.device
-            )
+            module._parameters[tensor_name] = torch.nn.Parameter(
+                value, requires_grad=value.requires_grad
+            ).to(value.device)
             # if we are quantizing tied parameters, to avoid tying the quantized weights
             # the correct order to do it is
             # 1. load the weight to model
             # 2. run tie_weights to populate the weights
             # 3. quantize
-            mm: Any = model
-            input_embed = mm.get_input_embeddings() if hasattr(mm, "get_input_embeddings") else None
+            input_embed = model.get_input_embeddings()
             if self.hf_quantizer.quantization_config.untie_embedding_weights and id(module) == id(input_embed):
-                if hasattr(mm, "tie_weights"):
-                    mm.tie_weights()
-                if hasattr(mm, "config") and hasattr(mm.config, "get_text_config"):
-                    setattr(mm.config.get_text_config(decoder=True), "tie_word_embeddings", False)
+                model.tie_weights()
+                setattr(model.config.get_text_config(decoder=True), "tie_word_embeddings", False)
+
+            # handle FqnToConfig, introduced in torchao 0.15.0+
+            if self.hf_quantizer.quantization_config._get_ao_version() >= version.Version("0.15.0"):
+                from torchao.quantization import FqnToConfig
+
+                config = self.hf_quantizer.quantization_config.get_apply_tensor_subclass()
+                if isinstance(config, FqnToConfig):
+                    module_fqn, top_level_param_name = target_key.rsplit(".", 1)
+                    c = None
+                    if target_key in config.fqn_to_config:
+                        assert not target_key.startswith("re:"), (
+                            "param fqn should not start with `re:`, which is used for specifying regex"
+                        )
+                        c = config.fqn_to_config[target_key]
+                    elif module_fqn in config.fqn_to_config:
+                        assert not module_fqn.startswith("re:"), (
+                            "module fqn should not start with `re:`, which is used for specifying regex"
+                        )
+                        c = config.fqn_to_config[module_fqn]
+                    # regex match module and param
+                    else:
+                        for maybe_module_fqn_pattern in config.fqn_to_config:
+                            # if the key doesn't start with "re:", it is an exact fqn key, so we don't regex match
+                            if not maybe_module_fqn_pattern.startswith("re:"):
+                                continue
+                            # see if the param matches first
+                            elif re.fullmatch(maybe_module_fqn_pattern[3:], target_key):
+                                c = config.fqn_to_config[maybe_module_fqn_pattern]
+                                break
+                            elif re.fullmatch(maybe_module_fqn_pattern[3:], module_fqn):
+                                # we apply the config for the first fully matched pattern
+                                c = config.fqn_to_config[maybe_module_fqn_pattern]
+                                break
+                        else:
+                            c = config.fqn_to_config.get("_default", None)
+
+                    if c is not None:
+                        if top_level_param_name == "weight":
+                            # we can apply the module config directly
+                            quantize_(module, c, (lambda x, fqn: True))
+                            missing_keys.discard(target_key)
+                            module._is_hf_initialized = True
+                            return {}
+                        else:
+                            # need to apply to a custom param name
+                            custom_param_fqn_config = FqnToConfig({top_level_param_name: c})
+                            quantize_(module, custom_param_fqn_config, filter_fn=None)
+                            missing_keys.discard(target_key)
+                            module._is_hf_initialized = True
+                            return {}
+                    return {full_name: value}

             # handle ModuleFqnToConfig, introduced in torchao 0.12.0+
-            if self.hf_quantizer.quantization_config._get_ao_version() >= version.Version("0.12.0"):
+            # TODO: deprecate this when we deprecate ModuleFqnToConfig
+            elif self.hf_quantizer.quantization_config._get_ao_version() >= version.Version("0.12.0"):
                 from torchao.quantization import ModuleFqnToConfig

                 config = self.hf_quantizer.quantization_config.get_apply_tensor_subclass()
@@ -113,14 +186,14 @@ def convert(
                             break
                     else:
                         c = config.module_fqn_to_config.get("_default", None)
-
                 if c is not None:
                     # filter_fn: not filtering out any modules
                     quantize_(module, c, filter_fn=lambda x, fqn: True)
+                    missing_keys.discard(full_name)
                     module._is_hf_initialized = True
-                    missing_keys.discard(target_key)
-                    return {}
+                return {full_name: value}
+
         quantize_(module, self.hf_quantizer.quantization_config.get_apply_tensor_subclass())
+        missing_keys.discard(full_name)
         module._is_hf_initialized = True
-        missing_keys.discard(target_key)
         return {}
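For reference, a standalone sketch of the lookup order the `FqnToConfig` branch above implements — exact parameter fqn first, then module fqn, then `re:` regex patterns, then the `_default` entry (the config values here are placeholder strings, not real torchao configs):

import re

def resolve_config(fqn_to_config: dict, target_key: str):
    # Mirrors the matching order above: exact param fqn > module fqn > "re:" pattern > "_default".
    module_fqn, _param_name = target_key.rsplit(".", 1)
    if target_key in fqn_to_config:
        return fqn_to_config[target_key]
    if module_fqn in fqn_to_config:
        return fqn_to_config[module_fqn]
    for pattern in fqn_to_config:
        if not pattern.startswith("re:"):
            continue
        if re.fullmatch(pattern[3:], target_key) or re.fullmatch(pattern[3:], module_fqn):
            return fqn_to_config[pattern]
    return fqn_to_config.get("_default", None)

cfg = {r"re:model\.layers\.\d+\.self_attn\.q_proj": "int4-config", "_default": "int8-config"}
print(resolve_config(cfg, "model.layers.0.self_attn.q_proj.weight"))  # int4-config
print(resolve_config(cfg, "model.layers.0.mlp.up_proj.weight"))       # int8-config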

src/transformers/modeling_utils.py

Lines changed: 1 addition & 1 deletion
@@ -4043,7 +4043,7 @@ def from_pretrained(
             weight_conversions.extend(
                 [WeightRenaming(source_keys=k, target_keys=v) for k, v in key_mapping.items()]
             )
-
+
         if gguf_file:
             if hf_quantizer is not None:
                 raise ValueError(

src/transformers/quantizers/base.py

Lines changed: 2 additions & 0 deletions
@@ -406,6 +406,8 @@ def get_quantize_ops(self):
                 f"{self.quantization_config.quant_method} is not available yet and will be supported soon."
             )

+    def get_weight_conversions(self):
+        return []

 class SequentialLlama4TextExperts(ModuleList):
     """

src/transformers/quantizers/quantizer_torchao.py

Lines changed: 38 additions & 2 deletions
@@ -29,7 +29,7 @@
 from safetensors import safe_open

 from ..utils import is_torch_available, is_torchao_available, logging
-
+from ..core_model_loading import WeightConverter

 if is_torch_available():
     import torch

@@ -533,4 +533,40 @@ def set_metadata(self, checkpoint_files: list[str]):

     def get_quantize_ops(self):
         from ..integrations.torchao import TorchAoQuantize
-        return TorchAoQuantize(self)
+        return TorchAoQuantize(self)
+
+    def get_weight_conversions(self):
+        from ..integrations.torchao import TorchAoQuantize
+
+        return [
+            WeightConverter(
+                source_keys=["self_attn.q_proj.weight:*"],
+                target_keys="self_attn.q_proj.weight",
+                operations=[TorchAoQuantize(self)],
+            ),
+            WeightConverter(
+                source_keys=["self_attn.k_proj.weight:*"],
+                target_keys="self_attn.k_proj.weight",
+                operations=[TorchAoQuantize(self)],
+            ),
+            WeightConverter(
+                source_keys=["self_attn.v_proj.weight:*"],
+                target_keys="self_attn.v_proj.weight",
+                operations=[TorchAoQuantize(self)],
+            ),
+            WeightConverter(
+                source_keys=["mlp.gate_proj.weight:*"],
+                target_keys="mlp.gate_proj.weight",
+                operations=[TorchAoQuantize(self)],
+            ),
+            WeightConverter(
+                source_keys=["mlp.up_proj.weight:*"],
+                target_keys="mlp.up_proj.weight",
+                operations=[TorchAoQuantize(self)],
+            ),
+            WeightConverter(
+                source_keys=["mlp.down_proj.weight:*"],
+                target_keys="mlp.down_proj.weight",
+                operations=[TorchAoQuantize(self)],
+            ),
+        ]
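The six converters differ only in the projection name. A hedged sketch of how the same list could be built programmatically inside the quantizer class, assuming the `WeightConverter` and `TorchAoQuantize` signatures used in this diff (not standalone code, it is meant to live on the torchao quantizer):

PROJECTIONS = (
    "self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj",
    "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj",
)

def get_weight_conversions(self):
    from ..integrations.torchao import TorchAoQuantize

    return [
        WeightConverter(
            source_keys=[f"{proj}.weight:*"],  # ":*" collects the flattened safetensors sub-keys
            target_keys=f"{proj}.weight",
            operations=[TorchAoQuantize(self)],
        )
        for proj in PROJECTIONS
    ]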
