Skip to content

Commit da1a3cc

Browse files
Becker-ZH authored and copybara-github committed
Fix Blackwell Roofline
PiperOrigin-RevId: 816766361
1 parent 8afa2e9 commit da1a3cc

File tree

2 files changed

+8
-7
lines changed

2 files changed

+8
-7
lines changed

xprof/utils/hardware_type_utils.cc

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ namespace {
3535
// Below data are calculated from the various NVidia whitepapers/specs.
3636

3737
// https://resources.nvidia.com/en-us-blackwell-architecture?ncid=pa-srch-goog-585983-Intel-Brand-Broad
38+
// Dense Compute as default.
3839
const GpuFlopCapabilities kComputeCap_PerSM_PerCycle_10_0 = {
3940
.cuda_core =
4041
{
@@ -47,11 +48,11 @@ const GpuFlopCapabilities kComputeCap_PerSM_PerCycle_10_0 = {
4748
.tensor_core =
4849
{
4950
.fp64_tflops = 148,
50-
.fp32_tflops = 8192,
51-
.bf16_tflops = 16384,
52-
.fp16_tflops = 16384,
53-
.fp8_tflops = 32768,
54-
.int8_tops = 32768,
51+
.fp32_tflops = 4096,
52+
.bf16_tflops = 8192,
53+
.fp16_tflops = 8192,
54+
.fp8_tflops = 16384,
55+
.int8_tops = 16384,
5556
},
5657
.has_tensor_core_sparsity_support = true,
5758
};

xprof/utils/hardware_type_utils_test.cc

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ TEST(HardwareTypeUtilsTest, B200PeakComputTFlops) {
3939
// Get target TFLOPS per SM and check.
4040
double peak_tflops =
4141
GetFlopMaxThroughputPerSM(device_cap) * device_cap.num_cores() / 1000.0;
42-
EXPECT_NEAR(peak_tflops, 4438, /*abs_error=*/1.0);
42+
EXPECT_NEAR(peak_tflops, 2218, /*abs_error=*/1.0);
4343
}
4444

4545
// It should fall back to the highest compute cap less than 10.9.
@@ -59,7 +59,7 @@ TEST(HardwareTypeUtilsTest, FutureBlackwellPeakComputTFlops) {
5959
// Get target TFLOPS per SM and check.
6060
double peak_tflops =
6161
GetFlopMaxThroughputPerSM(device_cap) * device_cap.num_cores() / 1000.0;
62-
EXPECT_NEAR(peak_tflops, 4438, /*abs_error=*/1.0);
62+
EXPECT_NEAR(peak_tflops, 2218, /*abs_error=*/1.0);
6363
}
6464

6565
TEST(HardwareTypeUtilsTest, H100PeakComputTFlops) {

0 commit comments

Comments
 (0)