Skip to content

Commit 90d3b0f

Browse files
CUTLASS 3.2.1 (NVIDIA#1113)
* Updates for 3.2.1 release. * Minor fix in gemm op profiler for raster order. * Add scheduler mapping for raster order in the kernels.
1 parent e0aaa3c commit 90d3b0f

File tree

428 files changed

+22241
-21750
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

428 files changed

+22241
-21750
lines changed

CHANGELOG.md

+11-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
# NVIDIA CUTLASS Changelog
22

3+
## [3.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.1) (2023-09-22)
4+
* Python support SM90 Epilogue Visitor Tree (EVT) on top of the C++ support released in 3.2.0.
5+
* SM80 EVT support in C++ and Python.
6+
* Other SM90 epilogue improvements.
7+
* Splitting CUTLASS library into smaller units based on operation, arch and datatypes. See [1105](https://github.com/NVIDIA/cutlass/discussions/1105) for details.
8+
* Making `tools/library/scripts` packageable - `tools/library/scripts` is now moving to `python/cutlass_library`. See the Python [README](/python/README.md) for details.
9+
* SM90 TF32 kernel improvements for all layouts.
10+
* SM90 rasterization direction support in the CUTLASS profiler.
11+
* Improvement for CUTLASS profiler build times.
12+
* Remove Python-C++ bindings.
313

414
## [3.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.0) (2023-08-03)
515

@@ -91,7 +101,7 @@
91101
* [Few channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h) specialization for reduced alignment capabilities
92102
* [Fixed channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h) further specialized when channel count perfectly matches the access vector size
93103
* [Unit tests](/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu)
94-
* [Python-based instance emitter](/tools/library/scripts/generator.py) in the CUTLASS Library and support in the Profiler
104+
* [Python-based instance emitter](/python/cutlass_library/generator.py) in the CUTLASS Library and support in the Profiler
95105
* [BLAS3](https://docs.nvidia.com/cuda/cublas/index.html#cublas-level-3-function-reference) operators accelerated by Tensor Cores
96106
* Supported types: f32, cf32, f64, cf64, tf32x3, complex tf32x3
97107
* [HERK](/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu) with [emitter](/tools/library/scripts/rank_k_operation.py)

CMakeLists.txt

+43-16
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ endif()
4040
message(STATUS "CMake Version: ${CMAKE_VERSION}")
4141
set(IMPLICIT_CMAKE_CXX_STANDARD OFF CACHE BOOL "Do not explicitly specify -std=c++11 if set")
4242

43-
project(CUTLASS VERSION 3.2.0 LANGUAGES CXX)
43+
project(CUTLASS VERSION 3.2.1 LANGUAGES CXX)
4444
include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)
4545

4646
if (CUDA_VERSION VERSION_LESS 11.3)
@@ -85,17 +85,38 @@ message(STATUS "Default Install Location: ${CMAKE_INSTALL_PREFIX}")
8585
set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
8686
# 0 - Sanity, 1 - Release-Quality, 2 - Exhaustive
8787

88+
find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED)
89+
90+
# Install cutlass_library Python package
91+
execute_process(
92+
WORKING_DIRECTORY ${CUTLASS_DIR}/python
93+
COMMAND ${Python3_EXECUTABLE} ${CUTLASS_DIR}/python/setup_library.py develop --user
94+
RESULT_VARIABLE cutlass_lib_GENERATOR_INSTALL_RESULT
95+
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/cutlass_library_installation.log
96+
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/cutlass_library_installation.log
97+
)
98+
99+
if(NOT cutlass_lib_GENERATOR_INSTALL_RESULT EQUAL 0)
100+
message(FATAL_ERROR "Error installing cutlass_library package. See ${CMAKE_CURRENT_BINARY_DIR}/cutlass_library_installation.log")
101+
endif()
102+
88103
################################################################################
89104
set(CUTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library")
90105

91106
if(CUTLASS_ENABLE_HEADERS_ONLY)
92107
set(CUTLASS_ENABLE_EXAMPLES_INIT OFF)
93108
set(CUTLASS_ENABLE_TOOLS_INIT ON)
94109
set(CUTLASS_ENABLE_LIBRARY_INIT OFF)
110+
set(CUTLASS_ENABLE_TESTS_INIT OFF)
95111
else()
96112
set(CUTLASS_ENABLE_EXAMPLES_INIT ON)
97113
set(CUTLASS_ENABLE_TOOLS_INIT ON)
98114
set(CUTLASS_ENABLE_LIBRARY_INIT ON)
115+
if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
116+
set(CUTLASS_ENABLE_TESTS_INIT ON)
117+
else()
118+
set(CUTLASS_ENABLE_TESTS_INIT OFF)
119+
endif()
99120
endif()
100121

101122
set(CUTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.")
@@ -104,20 +125,10 @@ set(CUTLASS_ENABLE_EXAMPLES ${CUTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable C
104125
set(CUTLASS_ENABLE_TOOLS ${CUTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable CUTLASS Tools")
105126
set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_LIBRARY_INIT} CACHE BOOL "Enable CUTLASS Library")
106127
set(CUTLASS_ENABLE_PROFILER ${CUTLASS_ENABLE_LIBRARY} CACHE BOOL "Enable CUTLASS Profiler")
107-
set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUTLASS Proformance")
108-
109-
if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
110-
set(CUTLASS_ENABLE_TESTS_INIT ${CUTLASS_ENABLE_LIBRARY})
111-
else()
112-
set(CUTLASS_ENABLE_TESTS_INIT OFF)
113-
endif()
128+
set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUTLASS Performance")
114129

115130
set(CUTLASS_ENABLE_TESTS ${CUTLASS_ENABLE_TESTS_INIT} CACHE BOOL "Enable CUTLASS Tests")
116-
117-
if (CUTLASS_ENABLE_TESTS)
118-
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake)
119-
endif()
120-
131+
set(CUTLASS_ENABLE_GTEST_UNIT_TESTS ${CUTLASS_ENABLE_TESTS} CACHE BOOL "Enable CUTLASS GTest-based Unit Tests")
121132
################################################################################
122133

123134
set(CUTLASS_NVCC_ARCHS_SUPPORTED "")
@@ -285,6 +296,8 @@ if (CUTLASS_ENABLE_TENSOR_CORE_MMA)
285296
endif()
286297

287298

299+
300+
288301
if (NOT MSVC AND CUTLASS_NVCC_KEEP)
289302
# MSVC flow handles caching already, but for other generators we handle it here.
290303
set(CUTLASS_NVCC_KEEP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp CACHE PATH "Location to store NVCC scratch files")
@@ -395,6 +408,7 @@ endif()
395408
# Some tests require this build option in order to link.
396409
if (MSVC)
397410
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
411+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /bigobj")
398412
endif()
399413

400414
function(cutlass_apply_cuda_gencode_flags TARGET)
@@ -572,11 +586,17 @@ target_include_directories(
572586
$<INSTALL_INTERFACE:include>
573587
$<BUILD_INTERFACE:${CUTLASS_INCLUDE_DIR}>
574588
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
575-
$<BUILD_INTERFACE:${CUDA_TOOLKIT_ROOT_DIR}/include>
576589
$<BUILD_INTERFACE:${cute_SOURCE_DIR}/include>
577590
$<BUILD_INTERFACE:${cute_SOURCE_DIR}/examples>
578591
)
579592

593+
# Mark CTK headers as system to suppress warnings from them
594+
target_include_directories(
595+
CUTLASS
596+
SYSTEM INTERFACE
597+
$<BUILD_INTERFACE:${CUDA_TOOLKIT_ROOT_DIR}/include>
598+
)
599+
580600
install(
581601
DIRECTORY
582602
${CUTLASS_INCLUDE_DIR}/
@@ -633,6 +653,11 @@ endif()
633653

634654
include(CTest)
635655
enable_testing()
656+
657+
if (CUTLASS_ENABLE_GTEST_UNIT_TESTS)
658+
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake)
659+
endif()
660+
636661
if (NOT TARGET test_all)
637662
add_custom_target(test_all)
638663
endif()
@@ -818,7 +843,7 @@ function(cutlass_add_executable_tests NAME TARGET)
818843

819844
set(CUTLASS_CTEST_GENERATED_FILES ${CUTLASS_CTEST_GENERATED_FILES};ctest/${TEST_NAME}/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "")
820845

821-
if (CUTLASS_INSTALL_TESTS)
846+
if (CUTLASS_INSTALL_TESTS)
822847

823848
file(GENERATE
824849
OUTPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.install.cmake"
@@ -831,7 +856,7 @@ function(cutlass_add_executable_tests NAME TARGET)
831856
RENAME CTestTestfile.${TEST_NAME}.cmake
832857
)
833858

834-
endif()
859+
endif()
835860

836861
endfunction()
837862

@@ -849,7 +874,9 @@ endif()
849874

850875
if (CUTLASS_ENABLE_TESTS)
851876
add_subdirectory(test)
877+
if (CUTLASS_ENABLE_GTEST_UNIT_TESTS)
852878
add_dependencies(test_all test_unit)
879+
endif()
853880
endif()
854881

855882
if (CUTLASS_INSTALL_TESTS)

CUDA.cmake

+2-2
Original file line numberDiff line numberDiff line change
@@ -305,10 +305,10 @@ function(cutlass_add_library NAME)
305305

306306
if(CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "clang")
307307
cutlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS})
308-
add_library(${NAME} ${TARGET_SOURCE_ARGS})
308+
add_library(${NAME} ${TARGET_SOURCE_ARGS} "")
309309
else()
310310
set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
311-
cuda_add_library(${NAME} ${TARGET_SOURCE_ARGS})
311+
cuda_add_library(${NAME} ${TARGET_SOURCE_ARGS} "")
312312
endif()
313313

314314
cutlass_apply_standard_compile_options(${NAME})

README.md

+12-3
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ In addition to GEMMs, CUTLASS implements high-performance convolution via the im
4343

4444
# What's New in CUTLASS 3.2
4545

46-
CUTLASS 3.2 is an update to CUTLASS adding:
46+
CUTLASS 3.2.0 is an update to CUTLASS adding:
4747
- New warp-specialized persistent FP8 GEMM kernel [kernel schedules](/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) and [mainloops](/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. An example showcasing [Hopper warp-specialized FP8 GEMMs](/examples/54_hopper_fp8_warp_specialized_gemm).
4848
- New [Epilogue Visitor Tree (EVT)](/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu) support for Hopper TMA epilogues. EVTs allows for user-defined customized epilogue fusion patterns without having to write a new epilogue.
4949
- [Stream-K](/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp) feature for Hopper. Note that this is only a functional implementation of stream-K, and should not be used for performance comparison. Optimizations are expected in a future release.
@@ -53,6 +53,14 @@ CUTLASS 3.2 is an update to CUTLASS adding:
5353
- New CUTLASS 2D Convolution Python interface. New [example](/examples/python/03_basic_conv2d.ipynb) here.
5454
- Support for Windows (MSVC) builds.
5555

56+
CUTLASS 3.2.1 is an update to CUTLASS adding:
57+
- Python support SM90 Epilogue Visitor Tree (EVT) on top of the C++ support released in 3.2.0.
58+
- SM80 EVT support in C++ and Python.
59+
- Splitting CUTLASS library into smaller units based on operation, arch and datatypes. See [1105](https://github.com/NVIDIA/cutlass/discussions/1105) for details.
60+
- Making `tools/library/scripts` packageable - `tools/library/scripts` is now moving to `python/cutlass_library`. See the Python [README](/python/README.md) for details.
61+
- SM90 TF32 kernel improvements for all layouts.
62+
- SM90 rasterization direction support in the CUTLASS profiler.
63+
- Improvement for CUTLASS profiler build times.
5664

5765
Minimum requirements:
5866

@@ -176,7 +184,8 @@ CUTLASS is a header-only template library and does not need to be built to be us
176184
projects. Client applications should target CUTLASS's `include/` directory in their include
177185
paths.
178186

179-
CUTLASS unit tests, examples, and utilities can be build with CMake starting version 3.12.
187+
CUTLASS unit tests, examples, and utilities can be built with CMake.
188+
The minimum version of CMake is given in the [Quickstart guide](media/docs/quickstart.md).
180189
Make sure the `CUDACXX` environment variable points to NVCC in the CUDA Toolkit installed
181190
on your system.
182191

@@ -512,7 +521,7 @@ reference_device: Passed
512521
## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler
513522
- Please follow the links for more CMake examples on selectively compiling CUTLASS kernels:
514523
- [GEMM CMake Examples](media/docs/quickstart.md#gemm-cmake-examples)
515-
- [Implicit GEMM conovlution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples)
524+
- [Implicit GEMM convolution CMake Examples](media/docs/quickstart.md#convolution-cmake-examples)
516525
- [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md)
517526

518527

cmake/NvidiaCutlassConfig.cmake

+7-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@ get_filename_component(NvidiaCutlass_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
22

33
include(CMakeFindDependencyMacro)
44

5-
if(NOT TARGET nvidia::cutlass::CUTLASS)
6-
include("${NvidiaCutlass_CMAKE_DIR}/NvidiaCutlassTargets.cmake")
5+
if(TARGET nvidia::cutlass::CUTLASS)
6+
return()
77
endif()
8+
9+
include("${NvidiaCutlass_CMAKE_DIR}/NvidiaCutlassTargets.cmake")
10+
11+
# For backward compatibility with the old name
12+
add_library(cutlass_lib ALIAS cutlass_library)

examples/08_turing_tensorop_gemm/CMakeLists.txt

-1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,5 @@
3131
cutlass_example_add_executable(
3232
08_turing_tensorop_gemm
3333
turing_tensorop_gemm.cu
34-
DISABLE_TESTS ON
3534
)
3635

examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu

+2-3
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,8 @@ int run() {
291291
LayoutInputB,
292292
ElementOutput,
293293
LayoutOutput,
294-
ElementComputeEpilogue,
295-
ElementComputeEpilogue>
294+
int32_t,
295+
int32_t>
296296
gemm_device;
297297

298298
// Launch device reference gemm kernel
@@ -355,4 +355,3 @@ int main() {
355355

356356
return run();
357357
}
358-

examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu

-6
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@ compare if the output from CUTLASS kernel is same as the reference implicit GEMM
143143
#include "cutlass/util/tensor_view_io.h"
144144

145145
#include "helper.h"
146-
147146
// The code section below describes datatype for input, output tensors and computation between
148147
// elements
149148
using ElementAccumulator = int32_t; // Data type of accumulator
@@ -675,7 +674,6 @@ Result profile_convolution(Options const &options) {
675674

676675
return result;
677676
}
678-
679677
/////////////////////////////////////////////////////////////////////////////////////////////////
680678

681679
int main(int argc, char const **args) {
@@ -762,11 +760,7 @@ int main(int argc, char const **args) {
762760
Result::print_header(std::cout, options) << std::endl;
763761
result.print(std::cout, 1, options) << std::endl;
764762
}
765-
766763
return 0;
767764
}
768765

769766
/////////////////////////////////////////////////////////////////////////////////////////////////
770-
771-
772-

examples/12_gemm_bias_relu/CMakeLists.txt

-1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,5 @@
3131
cutlass_example_add_executable(
3232
12_gemm_bias_relu
3333
gemm_bias_relu.cu
34-
DISABLE_TESTS ON
3534
)
3635

examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu

-5
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,6 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75_rf_res() {
220220

221221
return pass;
222222
}
223-
224223
int main() {
225224

226225
std::vector<bool (*)()>funcs = {
@@ -229,10 +228,6 @@ int main() {
229228
};
230229

231230
return testRun(75, funcs, "conv int8 RF residency");
232-
233231
}
234232

235-
236-
237233
////////////////////////////////////////////////////////////////////////////////
238-

examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu

-8
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
#include "device/b2b_implicit_gemm_convolution.h"
4040
#include "b2b_interleaved_conv2d_run.h"
4141
#include "test_run.h"
42-
4342
////////////////////////////////////////////////////////////////////////////////
4443

4544
cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 (
@@ -219,20 +218,13 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75_shmem() {
219218

220219
return pass;
221220
}
222-
223-
224221
int main() {
225-
226222
std::vector<bool (*)()>funcs = {
227223
&run_nonfused_conv2d_fprop_optimized_s8_sm75,
228224
&run_fused_conv2d_fprop_optimized_s8_sm75_shmem
229225
};
230226

231227
return testRun(75, funcs, "conv int8 shmem staging");
232-
233228
}
234229

235-
236-
237230
////////////////////////////////////////////////////////////////////////////////
238-

examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu

-4
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,6 @@ bool run_fused_gemm_s8_rf_res() {
195195
return passed;
196196

197197
}
198-
199198
int main() {
200199

201200
std::vector<bool (*)()>funcs = {
@@ -204,9 +203,6 @@ int main() {
204203
};
205204

206205
return testRun(75, funcs, "gemm int8 RF residency");
207-
208-
209206
}
210207

211-
212208
////////////////////////////////////////////////////////////////////////////////

examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu

-6
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@
4343
#include "device/b2b_gemm.h"
4444
#include "b2b_interleaved_gemm_run.h"
4545
#include "test_run.h"
46-
4746
////////////////////////////////////////////////////////////////////////////////
4847

4948
cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*640, 64, 576);
@@ -197,18 +196,13 @@ bool run_fused_gemm_s8_shmem() {
197196
return passed;
198197

199198
}
200-
201199
int main() {
202200

203201
std::vector<bool (*)()>funcs = {
204202
&run_nonfused_gemm_s8,
205203
&run_fused_gemm_s8_shmem
206204
};
207-
208205
return testRun(75, funcs, "gemm int8 shmem staing");
209-
210-
211206
}
212207

213-
214208
////////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)