
Commit 32b3e92

yuweivvvzzhx1 authored and committed

Update linear_op.py

Co-authored-by: zzhx1 <[email protected]>
Signed-off-by: 子潜 <[email protected]>
1 parent 9741455 commit 32b3e92

File tree

5 files changed: +51 -37 lines


vllm_ascend/distributed/parallel_state.py

Lines changed: 8 additions & 5 deletions
@@ -38,11 +38,13 @@ def get_lmhead_tp_group() -> GroupCoordinator:
         "lm head tensor parallel group is not initialized")
     return _LMTP
 
+
 def get_dftp_group() -> GroupCoordinator:
     assert _DFTP is not None, (
         "denseffn tensor parallel group is not initialized")
     return _DFTP
 
+
 def get_flashcomm2_otp_group() -> GroupCoordinator:
     return _FLASHCOMM2_OTP
 

@@ -183,17 +185,18 @@ def init_ascend_model_parallel(parallel_config: ParallelConfig, ):
                                           get_world_group().local_rank,
                                           backend,
                                           group_name="lmheadtp")
-
-    denseffn_tensor_parallel_size = get_ascend_config().denseffn_tensor_parallel_size
+
+    denseffn_tensor_parallel_size = get_ascend_config(
+    ).denseffn_tensor_parallel_size
     if denseffn_tensor_parallel_size is not None:
         group_ranks = []
         global _DFTP
-        num_denseffn_tensor_parallel_groups: int = (world_size //
-                                                    denseffn_tensor_parallel_size)
+        num_denseffn_tensor_parallel_groups: int = (
+            world_size // denseffn_tensor_parallel_size)
        for i in range(num_denseffn_tensor_parallel_groups):
            ranks = list(
                range(i * denseffn_tensor_parallel_size,
-                       (i + 1) * denseffn_tensor_parallel_size))
+                      (i + 1) * denseffn_tensor_parallel_size))
            group_ranks.append(ranks)
        _DFTP = init_model_parallel_group(group_ranks,
                                          get_world_group().local_rank,
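For reference, the group construction in this hunk just slices consecutive ranks into `world_size // denseffn_tensor_parallel_size` groups before handing them to `init_model_parallel_group`. A minimal standalone sketch of that grouping; the world size and TP size below are made-up example values, not taken from this commit:

```python
# Minimal sketch of the denseffn TP rank grouping; world_size and
# denseffn_tensor_parallel_size are illustrative values only.
world_size = 8
denseffn_tensor_parallel_size = 4

num_denseffn_tensor_parallel_groups = world_size // denseffn_tensor_parallel_size
group_ranks = [
    list(range(i * denseffn_tensor_parallel_size,
               (i + 1) * denseffn_tensor_parallel_size))
    for i in range(num_denseffn_tensor_parallel_groups)
]
print(group_ranks)  # [[0, 1, 2, 3], [4, 5, 6, 7]]
```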

vllm_ascend/ops/linear.py

Lines changed: 2 additions & 1 deletion
@@ -37,7 +37,8 @@
 from vllm.model_executor.utils import set_weight_attrs
 
 from vllm_ascend.ops.linear_op import get_parallel_op, get_replicated_op
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz, is_first_k_dense
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, is_enable_nz,
+                               is_first_k_dense)
 
 
 class AscendUnquantizedLinearMethod(UnquantizedLinearMethod):

vllm_ascend/ops/linear_op.py

Lines changed: 20 additions & 16 deletions
@@ -52,17 +52,17 @@
 from vllm.forward_context import get_forward_context
 
 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.distributed.parallel_state import (get_flashcomm2_odp_group,
+from vllm_ascend.distributed.parallel_state import (get_dftp_group,
+                                                    get_flashcomm2_odp_group,
                                                     get_flashcomm2_otp_group,
                                                     get_mlp_tp_group,
-                                                    get_otp_group,
-                                                    get_dftp_group)
-from vllm_ascend.utils import (dense_optim_enable, enable_sp,
-                               flashcomm2_enable,
+                                                    get_otp_group)
+from vllm_ascend.utils import (dense_optim_enable, denseffn_tp_enable,
+                               enable_sp, flashcomm2_enable,
                                get_flashcomm2_reorgnized_batch_ids,
-                               matmul_allreduce_enable, mlp_tp_enable,
-                               oproj_tp_enable, shared_expert_dp_enabled,
-                               denseffn_tp_enable, is_first_k_dense)
+                               is_first_k_dense, matmul_allreduce_enable,
+                               mlp_tp_enable, oproj_tp_enable,
+                               shared_expert_dp_enabled)
 
 
 class CustomLinearOp:

@@ -161,10 +161,10 @@ def __init__(self, layer):
 
     @property
     def comm_group(self):
-        if denseffn_tp_enable():
-            return get_dftp_group()
-        else:
+        if mlp_tp_enable():
             return get_mlp_tp_group()
+        else:
+            return get_dftp_group()
 
     def apply_impl(
         self,

@@ -187,10 +187,10 @@ def __init__(self, layer):
 
     @property
     def comm_group(self):
-        if denseffn_tp_enable():
-            return get_dftp_group()
-        else:
+        if mlp_tp_enable():
             return get_mlp_tp_group()
+        else:
+            return get_dftp_group()
 
     def apply_impl(
         self, input_: torch.Tensor
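Both `comm_group` properties now check `mlp_tp_enable()` first and fall back to the denseffn TP group, reversing the previous precedence. A self-contained sketch of that selection order, with the enable helper and group getters stubbed out for illustration (the real ones query the ascend config and the initialized process groups):

```python
# Stubbed stand-ins for the real helpers in vllm_ascend.
def mlp_tp_enable() -> bool:
    return False  # pretend the MLP TP optimization is off

def get_mlp_tp_group() -> str:
    return "mlp_tp_group"

def get_dftp_group() -> str:
    return "dftp_group"

def comm_group() -> str:
    # MLP TP takes precedence; otherwise use the denseffn TP group.
    if mlp_tp_enable():
        return get_mlp_tp_group()
    else:
        return get_dftp_group()

print(comm_group())  # dftp_group
```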
@@ -613,7 +613,9 @@ def update_attrs(self):
 def _get_column_parallel_op(
     prefix, layer
 ) -> Optional[Union[MLPColumnParallelOp, SequenceColumnParallelOp]]:
-    if (mlp_tp_enable() or (denseffn_tp_enable() and is_first_k_dense(prefix))) and "gate_up_proj" in prefix:
+    if (mlp_tp_enable() or
+        (denseffn_tp_enable()
+         and is_first_k_dense(prefix))) and "gate_up_proj" in prefix:
         return MLPColumnParallelOp(layer)
     if enable_sp():
         if "shared_expert" in prefix:

@@ -633,7 +635,9 @@ def _get_row_parallel_op(
 ) -> Optional[Union[MLPRowParallelOp, OProjRowParallelOp,
                     Flashcomm2OProjRowParallelOp, MatmulAllreduceRowParallelOp,
                     SequenceRowParallelOp]]:
-    if "down_proj" in prefix and (mlp_tp_enable() or (denseffn_tp_enable() and is_first_k_dense(prefix))):
+    if "down_proj" in prefix and (mlp_tp_enable() or
+                                  (denseffn_tp_enable()
+                                   and is_first_k_dense(prefix))):
         return MLPRowParallelOp(layer)
     if "o_proj" in prefix and oproj_tp_enable():
         return OProjRowParallelOp(layer)
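The dispatch condition for `gate_up_proj` (and, symmetrically, `down_proj`) now also accepts denseffn TP, but only for the first-k dense layers. A hedged sketch of that predicate in isolation; `is_first_k_dense` and the enable flags are stubbed here, whereas the real helpers parse the layer index and consult the ascend/model config:

```python
def mlp_tp_enable() -> bool:
    return False          # illustrative: MLP TP off

def denseffn_tp_enable() -> bool:
    return True           # illustrative: denseffn TP on

def is_first_k_dense(prefix: str) -> bool:
    # Stand-in: treat only layer 0 as a first-k dense layer.
    return "layers.0." in prefix

def wants_mlp_column_op(prefix: str) -> bool:
    # Same predicate shape as the reformatted condition above.
    return (mlp_tp_enable() or
            (denseffn_tp_enable()
             and is_first_k_dense(prefix))) and "gate_up_proj" in prefix

print(wants_mlp_column_op("model.layers.0.mlp.gate_up_proj"))   # True
print(wants_mlp_column_op("model.layers.10.mlp.gate_up_proj"))  # False
```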

vllm_ascend/quantization/quant_config.py

Lines changed: 9 additions & 6 deletions
@@ -36,14 +36,15 @@
 from vllm.model_executor.utils import set_weight_attrs
 
 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.distributed.parallel_state import (get_flashcomm2_otp_group,
+from vllm_ascend.distributed.parallel_state import (get_dftp_group,
+                                                    get_flashcomm2_otp_group,
                                                     get_mlp_tp_group,
-                                                    get_otp_group,
-                                                    get_dftp_group)
+                                                    get_otp_group)
 from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod
 from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
-from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, flashcomm2_enable,
-                               mlp_tp_enable, oproj_tp_enable, denseffn_tp_enable)
+from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, denseffn_tp_enable,
+                               flashcomm2_enable, mlp_tp_enable,
+                               oproj_tp_enable)
 
 from .utils import get_quant_method
 

@@ -349,7 +350,9 @@ def apply(
         if isinstance(layer, RowParallelLinear):
             if layer.prefix.find("o_proj") != -1 and oproj_tp_enable():
                 tp_rank = get_otp_group().rank_in_group
-            elif layer.prefix.find("down_proj") != -1 and (mlp_tp_enable() or (denseffn_tp_enable() and layer.is_first_k_dense)):
+            elif layer.prefix.find("down_proj") != -1 and (
+                    mlp_tp_enable() or
+                    (denseffn_tp_enable() and layer.is_first_k_dense)):
                 if denseffn_tp_enable() and layer.is_first_k_dense:
                     tp_rank = get_dftp_group().rank_in_group
                 else:
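The quantized path mirrors the same precedence when choosing which group's rank to shard `down_proj` weights with. A simplified standalone sketch of the selection; the flags and rank values are plain parameters here, whereas the real code reads them from the ascend config and from `get_otp_group()` / `get_dftp_group()` / `get_mlp_tp_group()`:

```python
def pick_tp_rank(prefix: str, oproj_tp: bool, mlp_tp: bool,
                 denseffn_tp: bool, is_first_k_dense: bool,
                 otp_rank: int = 0, dftp_rank: int = 1,
                 mlp_rank: int = 2, default_rank: int = 3) -> int:
    # Mirrors the branch order in the hunk above, with groups stubbed as ints.
    if "o_proj" in prefix and oproj_tp:
        return otp_rank
    if "down_proj" in prefix and (mlp_tp or
                                  (denseffn_tp and is_first_k_dense)):
        # First-k dense layers shard along the denseffn TP group when enabled.
        return dftp_rank if (denseffn_tp and is_first_k_dense) else mlp_rank
    return default_rank

# Example: a down_proj in a first-k dense layer with denseffn TP enabled.
print(pick_tp_rank("model.layers.0.mlp.down_proj", oproj_tp=False,
                   mlp_tp=False, denseffn_tp=True,
                   is_first_k_dense=True))  # 1 (dftp_rank)
```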

vllm_ascend/utils.py

Lines changed: 12 additions & 9 deletions
@@ -768,9 +768,11 @@ def lmhead_tp_enable() -> bool:
 def oproj_tp_enable() -> bool:
     return get_ascend_config().oproj_tensor_parallel_size is not None
 
+
 def denseffn_tp_enable() -> bool:
     return get_ascend_config().denseffn_tensor_parallel_size is not None
 
+
 def mlp_tp_enable() -> bool:
     return envs_ascend.VLLM_ASCEND_ENABLE_MLP_OPTIMIZE
 
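`denseffn_tp_enable()` follows the same opt-in convention as the neighbouring helpers: the feature is on exactly when its TP size is set in the Ascend config. A small self-contained sketch of that pattern; the config dataclass below is a stand-in, not the real `AscendConfig`:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeAscendConfig:
    # Stand-in for the real ascend config object.
    denseffn_tensor_parallel_size: Optional[int] = None

def denseffn_tp_enable(cfg: FakeAscendConfig) -> bool:
    # Enabled iff a denseffn TP size was explicitly configured.
    return cfg.denseffn_tensor_parallel_size is not None

print(denseffn_tp_enable(FakeAscendConfig()))   # False
print(denseffn_tp_enable(FakeAscendConfig(4)))  # True
```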

@@ -1018,24 +1020,25 @@ def get_flashcomm2_reorgnized_batch_ids(global_tp_size) -> list[list[int]]:
 
     return reorgnized_batch_ids
 
+
 def is_first_k_dense(prefix: str) -> bool:
     from vllm.config import get_current_vllm_config
     match = re.search(r'layers\.(\d+)\.', prefix)
     if not match:
         return False
 
     layer_idx = int(match.group(1))
-
+
     vllm_config = get_current_vllm_config()
     if vllm_config is None:
-        raise ValueError("get_current_vllm_config() returned None. "
-                         "Ensure this function is called within the model initialization context.")
+        raise ValueError(
+            "get_current_vllm_config() returned None. "
+            "Ensure this function is called within the model initialization context."
+        )
     config = vllm_config.model_config.hf_config
 
-    is_moe_layer = (
-        config.n_routed_experts is not None and
-        layer_idx >= config.first_k_dense_replace and
-        layer_idx % config.moe_layer_freq == 0
-    )
+    is_moe_layer = (config.n_routed_experts is not None
+                    and layer_idx >= config.first_k_dense_replace
+                    and layer_idx % config.moe_layer_freq == 0)
 
-    return not is_moe_layer
+    return not is_moe_layer
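`is_first_k_dense` extracts the layer index from the weight prefix and reports True when that layer is not a routed-MoE layer under the model's MoE layout. A standalone sketch with a stubbed HF config; the field names follow the DeepSeek-style attributes used above, but the concrete values are invented for the example:

```python
import re
from types import SimpleNamespace

# Stubbed HF config with DeepSeek-style MoE fields; values are illustrative.
config = SimpleNamespace(n_routed_experts=64,
                         first_k_dense_replace=3,
                         moe_layer_freq=1)

def is_first_k_dense(prefix: str) -> bool:
    match = re.search(r'layers\.(\d+)\.', prefix)
    if not match:
        return False
    layer_idx = int(match.group(1))
    is_moe_layer = (config.n_routed_experts is not None
                    and layer_idx >= config.first_k_dense_replace
                    and layer_idx % config.moe_layer_freq == 0)
    return not is_moe_layer

print(is_first_k_dense("model.layers.1.mlp.down_proj"))   # True  (dense layer)
print(is_first_k_dense("model.layers.10.mlp.down_proj"))  # False (MoE layer)
```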
