Skip to content

Commit 90d3b0f

Browse files
CUTLASS 3.2.1 (NVIDIA#1113)
* Updates for 3.2.1 release. * Minor fix in gemm op profiler for raster order. * Add scheduler mapping for raster order in the kernels.
1 parent e0aaa3c commit 90d3b0f

File tree

428 files changed

+22241
-21750
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

428 files changed

+22241
-21750
lines changed

CHANGELOG.md

+11-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
# NVIDIA CUTLASS Changelog
22

3+
## [3.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.1) (2023-09-22)
4+
* Python support SM90 Epilogue Visitor Tree (EVT) on top of the C++ support released in 3.2.0.
5+
* SM80 EVT support in C++ and Python.
6+
* Other SM90 epilogue improvements.
7+
* Splitting CUTLASS library into smaller units based on operation, arch and datatypes. See [1105](https://github.com/NVIDIA/cutlass/discussions/1105) for details.
8+
* Making `tools/library/scripts` packageable - `tools/library/scripts` is now moving to `python/cutlass_library`. See the Python [README](/python/README.md) for details.
9+
* SM90 TF32 kernel improvements for all layouts.
10+
* SM90 rasterization direction support in the CUTLASS profiler.
11+
* Improvement for CUTLASS profiler build times.
12+
* Remove Python-C++ bindings.
313

414
## [3.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.0) (2023-08-03)
515

@@ -91,7 +101,7 @@
91101
* [Few channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h) specialization for reduced alignment capabilities
92102
* [Fixed channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h) further specialized when channel count perfectly matches the access vector size
93103
* [Unit tests](/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu)
94-
* [Python-based instance emitter](/tools/library/scripts/generator.py) in the CUTLASS Library and support in the Profiler
104+
* [Python-based instance emitter](/python/cutlass_library/generator.py) in the CUTLASS Library and support in the Profiler
95105
* [BLAS3](https://docs.nvidia.com/cuda/cublas/index.html#cublas-level-3-function-reference) operators accelerated by Tensor Cores
96106
* Supported types: f32, cf32, f64, cf64, tf32x3, complex tf32x3
97107
* [HERK](/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu) with [emitter](/tools/library/scripts/rank_k_operation.py)

CMakeLists.txt

+43-16
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ endif()
4040
message(STATUS "CMake Version: ${CMAKE_VERSION}")
4141
set(IMPLICIT_CMAKE_CXX_STANDARD OFF CACHE BOOL "Do not explicitly specify -std=c++11 if set")
4242

43-
project(CUTLASS VERSION 3.2.0 LANGUAGES CXX)
43+
project(CUTLASS VERSION 3.2.1 LANGUAGES CXX)
4444
include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)
4545

4646
if (CUDA_VERSION VERSION_LESS 11.3)
@@ -85,17 +85,38 @@ message(STATUS "Default Install Location: ${CMAKE_INSTALL_PREFIX}")
8585
set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
8686
# 0 - Sanity, 1 - Release-Quality, 2 - Exhaustive
8787

88+
find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED)
89+
90+
# Install cutlass_library Python package
91+
execute_process(
92+
WORKING_DIRECTORY ${CUTLASS_DIR}/python
93+
COMMAND ${Python3_EXECUTABLE} ${CUTLASS_DIR}/python/setup_library.py develop --user
94+
RESULT_VARIABLE cutlass_lib_GENERATOR_INSTALL_RESULT
95+
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/cutlass_library_installation.log
96+
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/cutlass_library_installation.log
97+
)
98+
99+
if(NOT cutlass_lib_GENERATOR_INSTALL_RESULT EQUAL 0)
100+
message(FATAL_ERROR "Error installing cutlass_library package. See ${CMAKE_CURRENT_BINARY_DIR}/cutlass_library_installation.log")
101+
endif()
102+
88103
################################################################################
89104
set(CUTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library")
90105

91106
if(CUTLASS_ENABLE_HEADERS_ONLY)
92107
set(CUTLASS_ENABLE_EXAMPLES_INIT OFF)
93108
set(CUTLASS_ENABLE_TOOLS_INIT ON)
94109
set(CUTLASS_ENABLE_LIBRARY_INIT OFF)
110+
set(CUTLASS_ENABLE_TESTS_INIT OFF)
95111
else()
96112
set(CUTLASS_ENABLE_EXAMPLES_INIT ON)
97113
set(CUTLASS_ENABLE_TOOLS_INIT ON)
98114
set(CUTLASS_ENABLE_LIBRARY_INIT ON)
115+
if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
116+
set(CUTLASS_ENABLE_TESTS_INIT ON)
117+
else()
118+
set(CUTLASS_ENABLE_TESTS_INIT OFF)
119+
endif()
99120
endif()
100121

101122
set(CUTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.")
@@ -104,20 +125,10 @@ set(CUTLASS_ENABLE_EXAMPLES ${CUTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable C
104125
set(CUTLASS_ENABLE_TOOLS ${CUTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable CUTLASS Tools")
105126
set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_LIBRARY_INIT} CACHE BOOL "Enable CUTLASS Library")
106127
set(CUTLASS_ENABLE_PROFILER ${CUTLASS_ENABLE_LIBRARY} CACHE BOOL "Enable CUTLASS Profiler")
107-
set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUTLASS Proformance")
108-
109-
if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
110-
set(CUTLASS_ENABLE_TESTS_INIT ${CUTLASS_ENABLE_LIBRARY})
111-
else()
112-
set(CUTLASS_ENABLE_TESTS_INIT OFF)
113-
endif()
128+
set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUTLASS Performance")
114129

115130
set(CUTLASS_ENABLE_TESTS ${CUTLASS_ENABLE_TESTS_INIT} CACHE BOOL "Enable CUTLASS Tests")
116-
117-
if (CUTLASS_ENABLE_TESTS)
118-
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake)
119-
endif()
120-
131+
set(CUTLASS_ENABLE_GTEST_UNIT_TESTS ${CUTLASS_ENABLE_TESTS} CACHE BOOL "Enable CUTLASS GTest-based Unit Tests")
121132
################################################################################
122133

123134
set(CUTLASS_NVCC_ARCHS_SUPPORTED "")
@@ -285,6 +296,8 @@ if (CUTLASS_ENABLE_TENSOR_CORE_MMA)
285296
endif()
286297

287298

299+
300+
288301
if (NOT MSVC AND CUTLASS_NVCC_KEEP)
289302
# MSVC flow handles caching already, but for other generators we handle it here.
290303
set(CUTLASS_NVCC_KEEP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp CACHE PATH "Location to store NVCC scratch files")
@@ -395,6 +408,7 @@ endif()
395408
# Some tests require this build option in order to link.
396409
if (MSVC)
397410
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
411+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /bigobj")
398412
endif()
399413

400414
function(cutlass_apply_cuda_gencode_flags TARGET)
@@ -572,11 +586,17 @@ target_include_directories(
572586
$<INSTALL_INTERFACE:include>
573587
$<BUILD_INTERFACE:${CUTLASS_INCLUDE_DIR}>
574588
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
575-
$<BUILD_INTERFACE:${CUDA_TOOLKIT_ROOT_DIR}/include>
576589
$<BUILD_INTERFACE:${cute_SOURCE_DIR}/include>
577590
$<BUILD_INTERFACE:${cute_SOURCE_DIR}/examples>
578591
)
579592

593+
# Mark CTK headers as system to suppress warnings from them
594+
target_include_directories(
595+
CUTLASS
596+
SYSTEM INTERFACE
597+
$<BUILD_INTERFACE:${CUDA_TOOLKIT_ROOT_DIR}/include>
598+
)
599+
580600
install(
581601
DIRECTORY
582602
${CUTLASS_INCLUDE_DIR}/
@@ -633,6 +653,11 @@ endif()
633653

634654
include(CTest)
635655
enable_testing()
656+
657+
if (CUTLASS_ENABLE_GTEST_UNIT_TESTS)
658+
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake)
659+
endif()
660+
636661
if (NOT TARGET test_all)
637662
add_custom_target(test_all)
638663
endif()
@@ -818,7 +843,7 @@ function(cutlass_add_executable_tests NAME TARGET)
818843

819844
set(CUTLASS_CTEST_GENERATED_FILES ${CUTLASS_CTEST_GENERATED_FILES};ctest/${TEST_NAME}/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "")
820845

821-
if (CUTLASS_INSTALL_TESTS)
846+
if (CUTLASS_INSTALL_TESTS)
822847

823848
file(GENERATE
824849
OUTPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake"
@@ -831,7 +856,7 @@ function(cutlass_add_executable_tests NAME TARGET)
831856
RENAME CTestTestfile.${TEST_NAME}.cmake
832857
)
833858

834-
endif()
859+
endif()
835860

836861
endfunction()
837862

@@ -849,7 +874,9 @@ endif()
849874

850875
if (CUTLASS_ENABLE_TESTS)
851876
add_subdirectory(test)
877+
if (CUTLASS_ENABLE_GTEST_UNIT_TESTS)
852878
add_dependencies(test_all test_unit)
879+
endif()
853880
endif()
854881

855882
if (CUTLASS_INSTALL_TESTS)

CUDA.cmake

+2-2
Original file line numberDiff line numberDiff line change
@@ -305,10 +305,10 @@ function(cutlass_add_library NAME)
305305

306306
if(CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "clang")
307307
cutlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS})
308-
add_library(${NAME} ${TARGET_SOURCE_ARGS})
308+
add_library(${NAME} ${TARGET_SOURCE_ARGS} "")
309309
else()
310310
set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
311-
cuda_add_library(${NAME} ${TARGET_SOURCE_ARGS})
311+
cuda_add_library(${NAME} ${TARGET_SOURCE_ARGS} "")
312312
endif()
313313

314314
cutlass_apply_standard_compile_options(${NAME})

README.md

+12-3
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ In addition to GEMMs, CUTLASS implements high-performance convolution via the im
4343

4444
# What's New in CUTLASS 3.2
4545

46-
CUTLASS 3.2 is an update to CUTLASS adding:
46+
CUTLASS 3.2.0 is an update to CUTLASS adding:
4747
- New warp-specialized persistent FP8 GEMM kernel [kernel schedules](/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) and [mainloops](/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. An example showcasing [Hopper warp-specialized FP8 GEMMs](/examples/54_hopper_fp8_warp_specialized_gemm).
4848
- New [Epilogue Visitor Tree (EVT)](/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu) support for Hopper TMA epilogues. EVTs allows for user-defined customized epilogue fusion patterns without having to write a new epilogue.
4949
- [Stream-K](/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp) feature for Hopper. Note that this is only a functional implementation of stream-K, and should not be used for performance comparison. Optimizations are expected in a future release.
@@ -53,6 +53,14 @@ CUTLASS 3.2 is an update to CUTLASS adding:
5353
- New CUTLASS 2D Convolution Python interface. New [example](/examples/python/03_basic_conv2d.ipynb) here.
5454
- Support for Windows (MSVC) builds.
5555

56+
CUTLASS 3.2.1 is an update to CUTLASS adding:
57+
- Python support SM90 Epilogue Visitor Tree (EVT) on top of the C++ support released in 3.2.0.
58+
- SM80 EVT support in C++ and Python.
59+
- Splitting CUTLASS library into smaller units based on operation, arch and datatypes. See [1105](https://github.com/NVIDIA/cutlass/discussions/1105) for details.
60+
- Making `tools/library/scripts` packageable - `tools/library/scripts` is now moving to `python/cutlass_library`. See the Python [README](/python/README.md) for details.
61+
- SM90 TF32 kernel improvements for all layouts.
62+
- SM90 rasterization direction support in the CUTLASS profiler.
63+
- Improvement for CUTLASS profiler build times.
5664

5765
Minimum requirements:
5866

@@ -176,7 +184,8 @@ CUTLASS is a header-only template library and does not need to be built to be us
176184
projects. Client applications should target CUTLASS's `include/` directory in their include
177185
paths.
178186

179-
CUTLASS unit tests, examples, and utilities can be build with CMake starting version 3.12.
187+
CUTLASS unit tests, examples, and utilities can be built with CMake.
188+
The minimum version of CMake is given in the [Quickstart guide](media/docs/quickstart.md).
180189
Make sure the `CUDACXX` environment variable points to NVCC in the CUDA Toolkit installed
181190
on your system.
182191

@@ -512,7 +521,7 @@ reference_device: Passed
512521
## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler
513522
- Please follow the links for more CMake examples on selectively compiling CUTLASS kernels:
514523
- [GEMM CMake Examples](media/docs/quickstart.md#gemm-cmake-examples)
515-
- [Implicit GEMM conovlution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples)
524+
- [Implicit GEMM convolution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples)
516525
- [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md)
517526

518527

cmake/NvidiaCutlassConfig.cmake

+7-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@ get_filename_component(NvidiaCutlass_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
22

33
include(CMakeFindDependencyMacro)
44

5-
if(NOT TARGET nvidia::cutlass::CUTLASS)
6-
include("${NvidiaCutlass_CMAKE_DIR}/NvidiaCutlassTargets.cmake")
5+
if(TARGET nvidia::cutlass::CUTLASS)
6+
return()
77
endif()
8+
9+
include("${NvidiaCutlass_CMAKE_DIR}/NvidiaCutlassTargets.cmake")
10+
11+
# For backward compatibility with the old name
12+
add_library(cutlass_lib ALIAS cutlass_library)

examples/08_turing_tensorop_gemm/CMakeLists.txt

-1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,5 @@
3131
cutlass_example_add_executable(
3232
08_turing_tensorop_gemm
3333
turing_tensorop_gemm.cu
34-
DISABLE_TESTS ON
3534
)
3635

examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu

+2-3
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,8 @@ int run() {
291291
LayoutInputB,
292292
ElementOutput,
293293
LayoutOutput,
294-
ElementComputeEpilogue,
295-
ElementComputeEpilogue>
294+
int32_t,
295+
int32_t>
296296
gemm_device;
297297

298298
// Launch device reference gemm kernel
@@ -355,4 +355,3 @@ int main() {
355355

356356
return run();
357357
}
358-

examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu

-6
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@ compare if the output from CUTLASS kernel is same as the reference implicit GEMM
143143
#include "cutlass/util/tensor_view_io.h"
144144

145145
#include "helper.h"
146-
147146
// The code section below describes datatype for input, output tensors and computation between
148147
// elements
149148
using ElementAccumulator = int32_t; // Data type of accumulator
@@ -675,7 +674,6 @@ Result profile_convolution(Options const &options) {
675674

676675
return result;
677676
}
678-
679677
/////////////////////////////////////////////////////////////////////////////////////////////////
680678

681679
int main(int argc, char const **args) {
@@ -762,11 +760,7 @@ int main(int argc, char const **args) {
762760
Result::print_header(std::cout, options) << std::endl;
763761
result.print(std::cout, 1, options) << std::endl;
764762
}
765-
766763
return 0;
767764
}
768765

769766
/////////////////////////////////////////////////////////////////////////////////////////////////
770-
771-
772-

examples/12_gemm_bias_relu/CMakeLists.txt

-1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,5 @@
3131
cutlass_example_add_executable(
3232
12_gemm_bias_relu
3333
gemm_bias_relu.cu
34-
DISABLE_TESTS ON
3534
)
3635

examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu

-5
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,6 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75_rf_res() {
220220

221221
return pass;
222222
}
223-
224223
int main() {
225224

226225
std::vector<bool (*)()>funcs = {
@@ -229,10 +228,6 @@ int main() {
229228
};
230229

231230
return testRun(75, funcs, "conv int8 RF residency");
232-
233231
}
234232

235-
236-
237233
////////////////////////////////////////////////////////////////////////////////
238-

examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu

-8
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
#include "device/b2b_implicit_gemm_convolution.h"
4040
#include "b2b_interleaved_conv2d_run.h"
4141
#include "test_run.h"
42-
4342
////////////////////////////////////////////////////////////////////////////////
4443

4544
cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 (
@@ -219,20 +218,13 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75_shmem() {
219218

220219
return pass;
221220
}
222-
223-
224221
int main() {
225-
226222
std::vector<bool (*)()>funcs = {
227223
&run_nonfused_conv2d_fprop_optimized_s8_sm75,
228224
&run_fused_conv2d_fprop_optimized_s8_sm75_shmem
229225
};
230226

231227
return testRun(75, funcs, "conv int8 shmem staging");
232-
233228
}
234229

235-
236-
237230
////////////////////////////////////////////////////////////////////////////////
238-

examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu

-4
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,6 @@ bool run_fused_gemm_s8_rf_res() {
195195
return passed;
196196

197197
}
198-
199198
int main() {
200199

201200
std::vector<bool (*)()>funcs = {
@@ -204,9 +203,6 @@ int main() {
204203
};
205204

206205
return testRun(75, funcs, "gemm int8 RF residency");
207-
208-
209206
}
210207

211-
212208
////////////////////////////////////////////////////////////////////////////////

examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu

-6
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@
4343
#include "device/b2b_gemm.h"
4444
#include "b2b_interleaved_gemm_run.h"
4545
#include "test_run.h"
46-
4746
////////////////////////////////////////////////////////////////////////////////
4847

4948
cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*640, 64, 576);
@@ -197,18 +196,13 @@ bool run_fused_gemm_s8_shmem() {
197196
return passed;
198197

199198
}
200-
201199
int main() {
202200

203201
std::vector<bool (*)()>funcs = {
204202
&run_nonfused_gemm_s8,
205203
&run_fused_gemm_s8_shmem
206204
};
207-
208205
return testRun(75, funcs, "gemm int8 shmem staing");
209-
210-
211206
}
212207

213-
214208
////////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)