Commit d78a82c

instead of assert, check in should_fuse

1 parent 0ec69fb
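
This commit replaces a hard runtime assertion with an up-front fusion check: instead of GGML_ASSERTing inside ggml_cuda_op_topk_moe that the clamp's max bound is +INF, ggml_cuda_should_use_topk_moe now receives the clamp node and returns false for any clamp the fused path cannot handle, so the graph falls back to the unfused ops rather than aborting.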

File tree

2 files changed (+14, −4 lines)


ggml/src/ggml-cuda/topk-moe.cu

Lines changed: 13 additions & 3 deletions
@@ -254,8 +254,6 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
     if (with_norm) {
         if (clamp) {
             clamp_val = ggml_get_op_params_f32(clamp, 0);
-            float max_val = ggml_get_op_params_f32(clamp, 1);
-            GGML_ASSERT(max_val == INFINITY);
         }
         launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used, clamp_val);
     } else {
@@ -269,7 +267,7 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
     }
 }

-bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights) {
+bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp) {
     float scale    = 1.0f;
     float max_bias = 0.0f;

@@ -295,6 +293,18 @@ bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tenso
         return false;
     }

+    if (clamp) {
+        if (clamp->op != GGML_OP_CLAMP) {
+            return false;
+        }
+        float max_val = ggml_get_op_params_f32(clamp, 1);
+
+        if (max_val != INFINITY) {
+            return false;
+        }
+    }
+
+
     return true;
 }
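
To make the new guard concrete, here is a small illustration of what it accepts and rejects. The ggml_clamp call is ggml's public graph-building API (it stores its min/max bounds as op params 0 and 1, the same slots the check reads back); the surrounding ctx and t are assumed for the sketch and are not part of this patch.

    // Illustration only -- `ctx` (a ggml_context) and `t` are assumed, not
    // part of the patch. ggml_clamp stores (min, max) as op params 0 and 1,
    // which is what the new guard reads with ggml_get_op_params_f32.
    ggml_tensor * min_only = ggml_clamp(ctx, t, 1e-6f, INFINITY); // max == +INF: fusable
    ggml_tensor * bounded  = ggml_clamp(ctx, t, 1e-6f, 1.0f);     // finite max: rejected

    // ggml_cuda_should_use_topk_moe(softmax, weights, min_only) can still
    // return true; passing `bounded` now returns false up front, instead of
    // the old GGML_ASSERT firing later inside ggml_cuda_op_topk_moe.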

ggml/src/ggml-cuda/topk-moe.cuh

Lines changed: 1 addition & 1 deletion
@@ -11,6 +11,6 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
                            const bool delayed_softmax = false,
                            ggml_tensor * weight_clamp = nullptr);

-bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights);
+bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp = nullptr);

 std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false);
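
Note that the new parameter defaults to nullptr, so existing call sites that pass only (softmax, weights) keep compiling unchanged; with no clamp supplied, the new guard is skipped entirely, matching the old behavior for graphs without a clamp.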
