Commit 84f60cb

Model-specific tests for flash attention decode
1 parent 0e24202 commit 84f60cb

4 files changed: +426 −6 lines changed

test/unit/flash_attention/flash_attention_decode/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -58,13 +58,20 @@ cutlass_test_unit_add_executable(
   xe_flash_decode_fp16_fp32_fp32_h192_1024_nonpaged.cpp
 )

+cutlass_test_unit_add_executable(
+  cutlass_test_unit_flash_attention_decode_models_xe
+  xe_flash_decode_models_fp16_nonpaged.cpp
+  xe_flash_decode_models_bf16_nonpaged.cpp
+)
+
 add_custom_target(
   cutlass_test_unit_flash_attention_decode
   DEPENDS
   cutlass_test_unit_flash_attention_decode_h64_xe
   cutlass_test_unit_flash_attention_decode_h96_xe
   cutlass_test_unit_flash_attention_decode_h128_xe
   cutlass_test_unit_flash_attention_decode_h192_xe
+  cutlass_test_unit_flash_attention_decode_models_xe
 )

 add_custom_target(
@@ -74,4 +81,5 @@ add_custom_target(
   test_unit_flash_attention_decode_h96_xe
   test_unit_flash_attention_decode_h128_xe
   test_unit_flash_attention_decode_h192_xe
+  test_unit_flash_attention_decode_models_xe
 )

test/unit/flash_attention/flash_attention_decode/flash_decode_testbed_3x.hpp

Lines changed: 48 additions & 6 deletions
@@ -798,14 +798,56 @@ struct Testbed3x {
 };

 template <typename FlashDecode>
-bool TestFlashDecodeAll(int head_size) {
+bool TestFlashDecodeAll(int head_size, std::string config = "default") {
   Testbed3x<FlashDecode> testbed;

-  std::vector<int> problem_size_batch{16};
-  std::vector<int> problem_size_num_heads{32};
-  std::vector<int> problem_size_seq_len{1024};
-  std::vector<int> problem_size_seq_len_cache{0, 1024};
-  std::vector<int> cache_page_size{64, 128};
+  std::vector<int> problem_size_batch;
+  std::vector<int> problem_size_num_heads;
+  std::vector<int> problem_size_seq_len;
+  std::vector<int> problem_size_seq_len_cache;
+  std::vector<int> cache_page_size;
+  if (config == "whisper_v3_large") {
+    problem_size_batch = {1, 2, 4};
+    problem_size_num_heads = {20};
+    problem_size_seq_len = {512, 1024};
+    problem_size_seq_len_cache = {0, 1024};
+    cache_page_size = {64, 128};
+  }
+  else if (config == "llama3_8b") {
+    problem_size_batch = {1, 2, 4};
+    problem_size_num_heads = {32};
+    problem_size_seq_len = {512, 1024};
+    problem_size_seq_len_cache = {0, 1024};
+    cache_page_size = {64, 128};
+  }
+  else if (config == "llama3_405b") {
+    problem_size_batch = {1, 2};
+    problem_size_num_heads = {128};
+    problem_size_seq_len = {512, 1024};
+    problem_size_seq_len_cache = {0, 1024};
+    cache_page_size = {64, 128};
+  }
+  else if (config == "qwen2_5_72b") {
+    problem_size_batch = {1, 2};
+    problem_size_num_heads = {64};
+    problem_size_seq_len = {512, 1024};
+    problem_size_seq_len_cache = {0, 1024};
+    cache_page_size = {64, 128};
+  }
+  else if (config == "deepseek_r1") {
+    problem_size_batch = {1, 2};
+    problem_size_num_heads = {64};
+    problem_size_seq_len = {512, 1024};
+    problem_size_seq_len_cache = {0, 1024};
+    cache_page_size = {64, 128};
+  }
+  else {
+    problem_size_batch = {16};
+    problem_size_num_heads = {32};
+    problem_size_seq_len = {1024};
+    problem_size_seq_len_cache = {0, 1024};
+    cache_page_size = {64, 128};
+  }
   std::vector<float> problem_size_softmax_scale{ 1.f / sqrt(static_cast<float>(head_size)) };
   bool passed = true;
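Each branch in the new TestFlashDecodeAll fills the same five vectors, so the per-model settings could equally live in a lookup table. A minimal, hypothetical refactor sketch, not the committed code: ModelProblemSizes and lookup_config are invented names, and the values are copied from the diff above.

#include <map>
#include <string>
#include <vector>

// Hypothetical aggregate bundling the five problem-size vectors.
struct ModelProblemSizes {
  std::vector<int> batch, num_heads, seq_len, seq_len_cache, page_size;
};

// Table-driven equivalent of the if/else chain; unknown configs fall back
// to the "default" entry, matching the final else branch above.
inline ModelProblemSizes const& lookup_config(std::string const& config) {
  static std::map<std::string, ModelProblemSizes> const table{
    {"whisper_v3_large", {{1, 2, 4}, {20},  {512, 1024}, {0, 1024}, {64, 128}}},
    {"llama3_8b",        {{1, 2, 4}, {32},  {512, 1024}, {0, 1024}, {64, 128}}},
    {"llama3_405b",      {{1, 2},    {128}, {512, 1024}, {0, 1024}, {64, 128}}},
    {"qwen2_5_72b",      {{1, 2},    {64},  {512, 1024}, {0, 1024}, {64, 128}}},
    {"deepseek_r1",      {{1, 2},    {64},  {512, 1024}, {0, 1024}, {64, 128}}},
    {"default",          {{16},      {32},  {1024},      {0, 1024}, {64, 128}}},
  };
  auto it = table.find(config);
  return it != table.end() ? it->second : table.at("default");
}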
test/unit/flash_attention/flash_attention_decode/xe_flash_decode_models_bf16_nonpaged.cpp

Lines changed: 210 additions & 0 deletions

@@ -0,0 +1,210 @@
/****************************************************************************
 * Copyright (C) 2025 Intel Corporation. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * BF16 model-specific flash attention decode tests, non-paged.
 * Coverage: 5 models × 4 head sizes (h64, h96, h128, h192), KV512, causal,
 * varlen = 20 BF16 tests. See xe_flash_decode_models_fp16_nonpaged.cpp for
 * the FP16 counterpart.
 ***************************************************************************/

#include "flash_decode_testbed_3x.hpp"

namespace cutlass {

using MMAOperationBF16 = test::flash_attention::MMAOperationBF16;
using GmemTiledCopyQ = test::flash_attention::GmemTiledCopyQU16;
using GmemTiledCopyK = test::flash_attention::GmemTiledCopyKU16;
using GmemTiledCopyV = test::flash_attention::GmemTiledCopyVU16;
using GmemTiledCopyStore = test::flash_attention::GmemTiledCopyStoreU32;

// 20 tests: 5 models × 4 head sizes, KV512, causal, varlen

// h64 × KV512 × Causal × VarLen
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h64_kv512_causal_varlen_whisper) {
  using Shape_h = test::flash_attention::Shape_h64<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(64, "whisper_v3_large"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h64_kv512_causal_varlen_llama8b) {
  using Shape_h = test::flash_attention::Shape_h64<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(64, "llama3_8b"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h64_kv512_causal_varlen_llama405b) {
  using Shape_h = test::flash_attention::Shape_h64<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(64, "llama3_405b"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h64_kv512_causal_varlen_qwen25) {
  using Shape_h = test::flash_attention::Shape_h64<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(64, "qwen2_5_72b"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h64_kv512_causal_varlen_deepseek) {
  using Shape_h = test::flash_attention::Shape_h64<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(64, "deepseek_r1"));
}

// h96 × KV512 × Causal × VarLen
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h96_kv512_causal_varlen_whisper) {
  using Shape_h = test::flash_attention::Shape_h96<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(96, "whisper_v3_large"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h96_kv512_causal_varlen_llama8b) {
  using Shape_h = test::flash_attention::Shape_h96<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(96, "llama3_8b"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h96_kv512_causal_varlen_llama405b) {
  using Shape_h = test::flash_attention::Shape_h96<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(96, "llama3_405b"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h96_kv512_causal_varlen_qwen25) {
  using Shape_h = test::flash_attention::Shape_h96<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(96, "qwen2_5_72b"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h96_kv512_causal_varlen_deepseek) {
  using Shape_h = test::flash_attention::Shape_h96<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(96, "deepseek_r1"));
}

// h128 × KV512 × Causal × VarLen
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h128_kv512_causal_varlen_whisper) {
  using Shape_h = test::flash_attention::Shape_h128<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(128, "whisper_v3_large"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h128_kv512_causal_varlen_llama8b) {
  using Shape_h = test::flash_attention::Shape_h128<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(128, "llama3_8b"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h128_kv512_causal_varlen_llama405b) {
  using Shape_h = test::flash_attention::Shape_h128<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(128, "llama3_405b"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h128_kv512_causal_varlen_qwen25) {
  using Shape_h = test::flash_attention::Shape_h128<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(128, "qwen2_5_72b"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h128_kv512_causal_varlen_deepseek) {
  using Shape_h = test::flash_attention::Shape_h128<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(128, "deepseek_r1"));
}

// h192 × KV512 × Causal × VarLen
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h192_kv512_causal_varlen_whisper) {
  using Shape_h = test::flash_attention::Shape_h192<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(192, "whisper_v3_large"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h192_kv512_causal_varlen_llama8b) {
  using Shape_h = test::flash_attention::Shape_h192<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(192, "llama3_8b"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h192_kv512_causal_varlen_llama405b) {
  using Shape_h = test::flash_attention::Shape_h192<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(192, "llama3_405b"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h192_kv512_causal_varlen_qwen25) {
  using Shape_h = test::flash_attention::Shape_h192<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(192, "qwen2_5_72b"));
}
TEST(XE_Flash_Attention_Decode_BF16_Short, bf16_h192_kv512_causal_varlen_deepseek) {
  using Shape_h = test::flash_attention::Shape_h192<512, 8>;
  using Kernel = test::flash_attention::XE_Flash_Attention_Decode<bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, true, true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore, false>::Kernel;
  EXPECT_TRUE(test::flash_attention::TestFlashDecodeAll<Kernel>(192, "deepseek_r1"));
}

} // namespace cutlass
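
The 20 TEST blocks above differ only in the shape alias, the head size, and the config string. A hypothetical helper (not part of the commit) could factor out the repeated kernel instantiation; it reuses the aliases defined at the top of the file, and the commented boolean parameter names (Causal, VarLen, PagedKV) are inferred from the test names rather than taken from the kernel's declaration.

// Hypothetical helper; not part of the commit. Template arguments mirror
// the TEST bodies above.
template <typename Shape_h>
bool RunModelDecode(int head_size, std::string const& config) {
  using Kernel = typename test::flash_attention::XE_Flash_Attention_Decode<
      bfloat16_t, float, float,
      typename Shape_h::ShapeQK, typename Shape_h::ShapePV,
      typename Shape_h::ShapeOutput, typename Shape_h::SubgroupLayout,
      MMAOperationBF16, /*Causal=*/true, /*VarLen=*/true,
      GmemTiledCopyQ, GmemTiledCopyK, GmemTiledCopyV, GmemTiledCopyStore,
      /*PagedKV=*/false>::Kernel;
  return test::flash_attention::TestFlashDecodeAll<Kernel>(head_size, config);
}

// Each test body would then reduce to a single line, e.g.:
//   EXPECT_TRUE(RunModelDecode<test::flash_attention::Shape_h64<512, 8>>(64, "whisper_v3_large"));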
