Commit 1434760

[Model] Deepseek-v3 support (#3192)

1 parent e89a484, commit 1434760

26 files changed: +3107 -74 lines

ci/task/pylint.sh (+1 -1)

@@ -10,7 +10,7 @@ if [[ -n ${MLC_CI_SETUP_DEPS:-} ]]; then
 echo "MLC_CI_SETUP_DEPS=1 start setup deps"
 # TVM Unity is a dependency to this testing
 pip install --quiet --pre -U --no-index -f https://mlc.ai/wheels mlc-ai-nightly-cpu
-pip install requests
+pip install requests triton
 pip install --quiet --pre -U cuda-python
 fi

python/mlc_llm/compiler_pass/dispatch_triton_kernel.py (new file, +174)

@@ -0,0 +1,174 @@
"""A pass that dispatches generic calls of triton kernels to specific kernel implementations."""

# pylint: disable=invalid-name

from typing import List

import tvm
from tvm import IRModule, relax
from tvm.relax.expr_functor import PyExprMutator, mutator

from mlc_llm.op.triton import (
    get_tir_w8a8_block_fp8_group_matmul,
    get_tir_w8a8_block_fp8_matmul,
)
from mlc_llm.support import logging

logger = logging.getLogger(__name__)


@mutator
class _Rewriter(PyExprMutator):  # pylint: disable=abstract-method
    def __init__(self, mod: IRModule, target: tvm.target.Target) -> None:
        super().__init__(mod)
        self.mod = mod
        self.target = target
        self.extern_mods: List[tvm.runtime.Module] = []

    def transform(self) -> tvm.IRModule:  # pylint: disable=too-many-locals
        """Entry point of the transformation"""
        for g_var, func in self.mod.functions_items():
            if not isinstance(func, relax.Function):
                continue
            new_func = self.visit_expr(func)
            # new_func = remove_all_unused(new_func)
            self.builder_.update_func(g_var, new_func)

        mod = self.builder_.finalize()
        mod_attrs = dict(mod.attrs) if mod.attrs else {}
        mod = mod.with_attr(
            "external_mods", list(mod_attrs.get("external_mods", [])) + self.extern_mods
        )
        return mod

    def visit_call_(self, call: relax.Call) -> relax.Expr:  # pylint: disable=arguments-renamed
        call = super().visit_call_(call)

        if (
            call.op != tvm.ir.Op.get("relax.call_dps_packed")
            or not isinstance(call.args[0], relax.ExternFunc)
            or not str(call.args[0].global_symbol).startswith("mlc.triton.")
        ):
            return call

        global_symbol = str(call.args[0].global_symbol)
        assert isinstance(call.args[1], relax.Tuple)
        if global_symbol == "mlc.triton.w8a8_block_fp8_matmul":
            return self.w8a8_block_fp8_matmul(call.args[1].fields, call.struct_info)
        if global_symbol == "mlc.triton.w8a8_block_fp8_group_matmul":
            return self.w8a8_block_fp8_group_matmul(call.args[1].fields, call.struct_info)
        raise ValueError(f"Unknown mlc.triton kernel identifier: {global_symbol}")

    def w8a8_block_fp8_matmul(  # pylint: disable=too-many-locals
        self, args: List[relax.Expr], out_sinfo: relax.StructInfo
    ) -> relax.Expr:
        """Emit the w8a8_block_fp8_matmul triton kernel."""
        assert len(args) == 16
        x, weight, x_scale, weight_scale = args[:4]
        (
            N,
            K,
            block_n,
            block_k,
            BLOCK_SIZE_M,
            BLOCK_SIZE_N,
            BLOCK_SIZE_K,
            GROUP_SIZE_M,
            num_warps,
            num_stages,
        ) = [arg.value.value for arg in args[4:14]]
        in_dtype, out_dtype = str(args[14].value), str(args[15].value)

        prim_func, func_name = get_tir_w8a8_block_fp8_matmul(
            N,
            K,
            block_n,
            block_k,
            in_dtype,  # type: ignore
            out_dtype,  # type: ignore
            BLOCK_SIZE_M,
            BLOCK_SIZE_N,
            BLOCK_SIZE_K,
            GROUP_SIZE_M,
            num_warps,
            num_stages,
            self.extern_mods,
        )
        if prim_func is None:
            # The TIR function is already in the IRModule
            gv = self.builder_.get().get_global_var(func_name)
        else:
            # Add the TIR function to the IRModule
            gv = self.builder_.add_func(prim_func, func_name)

        return relax.call_tir(gv, [x, weight, x_scale, weight_scale], out_sinfo=out_sinfo)

    def w8a8_block_fp8_group_matmul(  # pylint: disable=too-many-locals
        self, args: List[relax.Expr], out_sinfo: relax.StructInfo
    ) -> relax.Expr:
        """Emit the w8a8_block_fp8_group_matmul triton kernel."""
        assert len(args) == 19
        x, weight, x_scale, weight_scale, expert_ids, indptr = args[:6]
        (
            N,
            K,
            num_experts,
            block_n,
            block_k,
            BLOCK_SIZE_M,
            BLOCK_SIZE_N,
            BLOCK_SIZE_K,
            GROUP_SIZE_M,
            num_warps,
            num_stages,
        ) = [arg.value.value for arg in args[6:17]]
        in_dtype, out_dtype = str(args[17].value), str(args[18].value)

        prim_func, func_name = get_tir_w8a8_block_fp8_group_matmul(
            N,
            K,
            num_experts,
            block_n,
            block_k,
            in_dtype,  # type: ignore
            out_dtype,  # type: ignore
            BLOCK_SIZE_M,
            BLOCK_SIZE_N,
            BLOCK_SIZE_K,
            GROUP_SIZE_M,
            num_warps,
            num_stages,
            self.extern_mods,
        )
        if prim_func is None:
            # The TIR function is already in the IRModule
            gv = self.builder_.get().get_global_var(func_name)
        else:
            # Add the TIR function to the IRModule
            gv = self.builder_.add_func(prim_func, func_name)

        return relax.call_tir(
            gv,
            [x, weight, x_scale, weight_scale, expert_ids, indptr],
            out_sinfo=out_sinfo,
        )


@tvm.transform.module_pass(opt_level=0, name="DispatchTritonKernel")
class DispatchTritonKernel:  # pylint: disable=too-many-instance-attributes,too-few-public-methods
    """Dispatch generic mlc.triton kernel calls to concrete Triton kernel implementations."""

    def __init__(self, target: tvm.target.Target) -> None:
        """Initializer.

        Parameters
        ----------
        target : tvm.target.Target
            The compilation target of the pass.
        """
        self.target = target

    def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule:
        """Entrypoint"""
        if self.target.kind.name != "cuda":
            return mod

        return _Rewriter(mod, self.target).transform()
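
For context, the decorated class behaves as a standard TVM module pass, so it can also be applied outside the full compilation pipeline. Below is a minimal sketch of standalone use, not taken from the commit: `mod` is just an empty placeholder IRModule, whereas real modules emitted by MLC model definitions carry relax.call_dps_packed calls to "mlc.triton.*" extern symbols that this pass rewrites; running it assumes an environment with mlc_llm and triton installed.

import tvm
from mlc_llm.compiler_pass.dispatch_triton_kernel import DispatchTritonKernel

# Placeholder module for illustration; a real module would contain
# relax.call_dps_packed("mlc.triton.w8a8_block_fp8_matmul", ...) calls.
mod = tvm.IRModule()
# Applying the pass rewrites matching calls into call_tir of generated TIR kernels
# and attaches the compiled Triton modules under the "external_mods" attribute;
# non-CUDA targets are returned unchanged.
mod = DispatchTritonKernel(tvm.target.Target("cuda"))(mod)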

python/mlc_llm/compiler_pass/fuse_add_norm.py (+1)

@@ -182,6 +182,7 @@ def visit_call_(self, call: relax.Call) -> relax.Expr: # pylint: disable=argume
         call = super().visit_call_(call)

         # Match the "rms_norm(add(x1, x2), w)" pattern
+        # Todo: support bf16  # pylint: disable=fixme
         if call.op != tvm.ir.Op.get("relax.nn.rms_norm") or call.struct_info.dtype != "float16":
             return call
         assert len(call.args) == 2

python/mlc_llm/compiler_pass/fuse_dequantize_take.py (+1 -1)

@@ -15,7 +15,7 @@
 class FuseDequantizeTake:  # pylint: disable=too-few-public-methods
     """A compiler pass that fuses dequantize + take."""

-    def transform_module(
+    def transform_module(  # pylint: disable=too-many-locals
         self,
         mod: IRModule,
         _ctx: tvm.transform.PassContext,

python/mlc_llm/compiler_pass/pipeline.py (+3 -1)

@@ -28,6 +28,7 @@
 from .blas_dispatch import BLASDispatch
 from .clean_up_tir_attrs import CleanUpTIRAttrs
 from .dispatch_kv_cache_creation import DispatchKVCacheCreation
+from .dispatch_triton_kernel import DispatchTritonKernel
 from .estimate_memory_usage import AttachMetadataWithMemoryUsage
 from .fuse_add_norm import FuseAddRMSNorm
 from .fuse_dequantize_matmul_ewise import FuseDequantizeMatmulEwise

@@ -117,6 +118,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I
                 _DebugDump("debug-phase0.py", debug_dump, show_meta=False),
                 # Phase 1. Passes on high-level operator graph
                 _LogProgress("Running TVM Relax graph-level optimizations"),
+                DispatchTritonKernel(target),
                 FuseFTDequantizeEpilogue(),
                 FuseDequantizeTranspose(),
                 BLASDispatch(target) if cublas_gemm else tvm.transform.Sequential([]),

@@ -185,6 +187,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I
                 ),
                 tvm.relax.transform.StaticPlanBlockMemory(),
                 AttachMetadataWithMemoryUsage(metadata),
+                _DebugDump("debug-phase5.py", debug_dump, show_meta=False),
                 tvm.relax.transform.RewriteCUDAGraph(),
                 AttachCUDAGraphAllocInitFunc(),
                 tvm.relax.transform.LowerGPUIPCAllocStorage(),

@@ -193,7 +196,6 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I
                 tvm.relax.transform.LowerRuntimeBuiltin(),
                 tvm.relax.transform.VMShapeLower(),
                 tvm.relax.transform.AttachGlobalSymbol(),
-                _DebugDump("debug-final.py", debug_dump, show_meta=False),
                 _LogProgress("Compiling external modules"),
                 tvm.relax.transform.AttachExternModules(ext_mods),
                 _LogProgress("Compilation complete! Exporting to disk"),

python/mlc_llm/conversation_template/deepseek.py (+16)

@@ -36,6 +36,22 @@
     )
 )

+# DeepSeek-V3
+ConvTemplateRegistry.register_conv_template(
+    Conversation(
+        name="deepseek_v3",
+        system_template=f"<|begin▁of▁sentence|>{MessagePlaceholders.SYSTEM.value}",
+        system_message="You are Deepseek-V3, an AI assistant created exclusively by the Chinese "
+        "Company DeepSeek. You'll provide helpful, harmless, and detailed responses to all "
+        "user inquiries.",
+        roles={"user": "<|User|>", "assistant": "<|Assistant|>"},
+        seps=["", "<|end▁of▁sentence|>"],
+        role_content_sep="",
+        role_empty_sep="",
+        stop_token_ids=[1],
+    )
+)
+
 # DeepSeek-R1-Distill-Qwen
 ConvTemplateRegistry.register_conv_template(
     Conversation(
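
For a rough sense of what this template produces, the sketch below hand-assembles a single-turn prompt from the fields registered above. It is an illustration only: the exact assembly is performed by the Conversation class, the layout here is an assumption based on the registered fields, and the user message is made up.

# Hypothetical illustration of the deepseek_v3 prompt layout (not produced by running MLC code).
system = "You are Deepseek-V3, an AI assistant created exclusively by the Chinese Company DeepSeek."
prompt = (
    "<|begin▁of▁sentence|>" + system    # system_template with the system message filled in
    + "<|User|>" + "What is MLC LLM?"   # user role tag; role_content_sep and the first sep are ""
    + "<|Assistant|>"                   # assistant role tag; generation continues from here
)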

python/mlc_llm/interface/gen_config.py (+1)

@@ -309,6 +309,7 @@ def gen_config( # pylint: disable=too-many-locals,too-many-arguments,too-many-b
         "aya-23",
         "deepseek",
         "deepseek_v2",
+        "deepseek_v3",
         "deepseek_r1_qwen",
         "deepseek_r1_llama",
         "olmo",

python/mlc_llm/loader/utils.py (+10 -2)

@@ -55,13 +55,21 @@ def load_torch_shard(path: Path) -> Iterator[Tuple[str, np.ndarray]]:
 def load_safetensor_shard(path: Path) -> Iterator[Tuple[str, np.ndarray]]:
     """Load and yield SafeTensor format parameters."""
     import safetensors  # pylint: disable=import-outside-toplevel,import-error
+    import torch  # pylint: disable=import-outside-toplevel

     with safetensors.safe_open(path, framework="pt", device="cpu") as in_file:
         for name in in_file.keys():
             param = in_file.get_tensor(name)
             param = param.detach().cpu()
             dtype = str(param.dtype)
             if dtype == "torch.bfloat16":
-                param = param.float()
-                param = param.numpy()
+                import ml_dtypes  # pylint: disable=import-outside-toplevel
+
+                param = param.view(torch.float16).cpu().numpy().view(ml_dtypes.bfloat16)
+            elif dtype == "torch.float8_e4m3fn":
+                import ml_dtypes  # pylint: disable=import-outside-toplevel
+
+                param = param.view(torch.uint8).cpu().numpy().view(ml_dtypes.float8_e4m3fn)
+            else:
+                param = param.numpy()
             yield name, param
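
The new branches avoid widening bfloat16 and fp8 weights to float32: the tensor's raw bits are viewed as a same-width torch dtype that numpy accepts, then relabeled with the matching ml_dtypes dtype. A small self-contained sketch of the same round trip (not part of the commit), assuming torch and ml_dtypes are installed:

import ml_dtypes
import numpy as np
import torch

t = torch.randn(4, dtype=torch.bfloat16)
# Reinterpret the 16-bit payload as float16 so .numpy() accepts it,
# then relabel the same bytes as bfloat16 on the numpy side.
a = t.view(torch.float16).numpy().view(ml_dtypes.bfloat16)
assert a.dtype == ml_dtypes.bfloat16
# No precision is lost: both paths decode to identical float32 values.
assert np.array_equal(a.astype(np.float32), t.to(torch.float32).numpy())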
