6 changes: 5 additions & 1 deletion setup.py
@@ -634,6 +634,10 @@ def get_extensions():
mxfp8_src_files_exist = all(os.path.exists(f) for f in mxfp8_sources)
if mxfp8_src_files_exist and build_for_sm100a:
print("Building mxfp8_cuda extension")
arch_flags = [
Contributor:
In your compile logs from before this change in the PR description, I already see the proper gencodes:

...
-DTORCH_EXTENSION_NAME=mxfp8_cuda -gencode=arch=compute_100,code=sm_100 -gencode=arch=compute_100a,code=sm_100a -gencode=arch=compute_120,code=compute_120 -gencode=arch=compute_120,code=sm_120 -gencode=arch=compute_120a,code=sm_120a -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=sm_90a

Contributor Author:
The proper gencodes are there, but the kernel compilation was still getting skipped... If you remove the CUDA_ARCH check, you'll see other errors instead, such as instructions not being available on SM90, since all of these gencodes are passed to every source file. That's why I needed to conditionally add the gencodes through setup.py for these specific source files, just as is already done for other files in setup.py.

"-gencode=arch=compute_100,code=sm_100",
"-gencode=arch=compute_120,code=sm_120"
]
ext_modules.append(
CUDAExtension(
name="torchao.prototype.mxfp8_cuda",
@@ -647,7 +651,7 @@ def get_extensions():
],
extra_compile_args={
"cxx": ["-std=c++17", "-O3"],
"nvcc": nvcc_args,
"nvcc": nvcc_args + arch_flags,
},
extra_link_args=["-lcuda", "-lcudart"],
),
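
A minimal sketch of the per-file gencode pattern described in the reply above, using a hypothetical helper name and source path; the real logic lives inside get_extensions() in setup.py and carries more conditions.

from torch.utils.cpp_extension import CUDAExtension

def mxfp8_extension(nvcc_args, build_for_sm100a):
    # Emit SASS only for the architectures whose instructions these kernels
    # use; older gencodes such as sm_90 cannot compile them.
    arch_flags = []
    if build_for_sm100a:
        arch_flags = [
            "-gencode=arch=compute_100,code=sm_100",
            "-gencode=arch=compute_120,code=sm_120",
        ]
    return CUDAExtension(
        name="torchao.prototype.mxfp8_cuda",
        sources=["torchao/csrc/cuda/mx_kernels/mxfp8_quantize.cu"],  # hypothetical path
        extra_compile_args={
            "cxx": ["-std=c++17", "-O3"],
            # Per-extension arch flags are appended to the shared nvcc args.
            "nvcc": nvcc_args + arch_flags,
        },
        extra_link_args=["-lcuda", "-lcudart"],
    )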
21 changes: 0 additions & 21 deletions torchao/csrc/cuda/mx_kernels/mxfp8_quantize.cuh
@@ -22,21 +22,6 @@
#include <cuda/barrier>
#include <cuda/ptx>

#define MIN_CUDA_SM 1000 // SM90 = 900, SM100 = 1000

// Check if we're compiling for supported architecture
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < MIN_CUDA_SM)
#warning \
"MXFP8 quantization requires SM90+ (Hopper) or SM100+ (Blackwell) architecture. Kernel will be disabled for this architecture."
#endif

// Architecture detection for native FP8 support
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
#define HAS_NATIVE_FP8_CONVERSION 1
#else
#define HAS_NATIVE_FP8_CONVERSION 0
#endif

enum class DType {
kByte,
kFloat32,
@@ -975,11 +960,6 @@ public:
output_bits_per_elem); // bits per elem in output fp8e4m3
}

// Launch kernel based on input/output types and scaling dimensions
// Only compile kernel launches for SM90+
#if defined(__CUDACC__) && \
(!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= MIN_CUDA_SM)

// Use TMA and mbarrier instructions
#define LAUNCH_KERNEL(IType, OType, SCALE_Y, SCALE_X, ScalingMode) \
mxfp8_quantize_kernel<IType, OType, SCALE_Y, SCALE_X, ScalingMode> \
@@ -1044,6 +1024,5 @@ public:

#undef LAUNCH_KERNEL

#endif
}
};
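
For reference, the general effect of the __CUDA_ARCH__ guard removed above, shown as a standalone sketch with a hypothetical kernel: nvcc compiles device code once per -gencode pass, and anything inside the guard is dropped for passes whose __CUDA_ARCH__ is below the threshold. Restricting the gencodes for these sources in setup.py makes such a guard unnecessary.

// Standalone sketch (hypothetical kernel), illustrating the guard pattern only.
#include <cstdio>

#define MIN_CUDA_SM 1000  // SM100 = 1000

__global__ void guarded_kernel() {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= MIN_CUDA_SM)
  // Compiled only when the current gencode pass targets compute capability
  // 10.0 or newer; for lower gencodes the kernel body is empty.
  printf("running on SM100+\n");
#endif
}

int main() {
  guarded_kernel<<<1, 1>>>();
  cudaDeviceSynchronize();
  return 0;
}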