vllm-project · flying632 · Apr 29, 2025
diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -276,8 +276,7 @@ def fused_experts(hidden_states: torch.Tensor,
                               group_list_type=group_list_type)
 
     if expert_map is not None:
-        weighted_down_out = down_out_list * sorted_weights.unsqueeze(1)
-
+        down_out_list.mul_(sorted_weights.unsqueeze(1))
         final_hidden_states = torch.zeros(*original_shape,
                                           device=hidden_states.device,
                                           dtype=dtype)
@@ -286,10 +285,8 @@ def fused_experts(hidden_states: torch.Tensor,
         valid_token_mask = torch.arange(
             0, sorted_token_indices.shape[0],
             device=device).unsqueeze(1) < num_valid_tokens
-        valid_output = torch.where(
-            valid_token_mask, weighted_down_out,
-            torch.zeros_like(weighted_down_out)).to(dtype)
-        final_hidden_states.index_add_(0, sorted_token_indices, valid_output)
+        down_out_list.mul_(valid_token_mask)
+        final_hidden_states.index_add_(0, sorted_token_indices, down_out_list)
     else:
         # TODO: Reorder device memory 2 times here, replace the current
         # implementation here when suitable operators become available.