From 8f140becf4868acb910c9819eb496dc65e879db8 Mon Sep 17 00:00:00 2001
From: Raymond <kwun.fung.lau@intel.com>
Date: Tue, 7 Oct 2025 05:39:52 +0800
Subject: [PATCH 1/5] First version of SDPA Fwd

---
 .../xe_flash_attn_prefill_mma_bshd.hpp        | 466 ++++++++++
 .../xe_flash_attn_sdpa_fwd_bshd_epilogue.hpp  | 336 +++++++
 ...sh_attn_sdpa_fwd_bshd_softmax_epilogue.hpp | 193 ++++
 .../kernel/tile_scheduler_sdpa_fwd_bshd.hpp   | 275 ++++++
 .../kernel/xe_sdpa_fwd_bshd.hpp               | 483 ++++++++++
 .../06a_bmg_flash_attention_sdpa_fwd_bshd.cpp | 116 +++
 .../CMakeLists.txt                            |  41 +
 .../bmg_flash_attn_sdpa_fwd_bshd_runner.hpp   | 829 ++++++++++++++++++
 8 files changed, 2739 insertions(+)
 create mode 100644 applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_bshd.hpp
 create mode 100644 applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_epilogue.hpp
 create mode 100644 applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_softmax_epilogue.hpp
 create mode 100644 applications/flash_attention_v2/kernel/tile_scheduler_sdpa_fwd_bshd.hpp
 create mode 100644 applications/flash_attention_v2/kernel/xe_sdpa_fwd_bshd.hpp
 create mode 100644 examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
 create mode 100644 examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/CMakeLists.txt
 create mode 100644 examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp

diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_bshd.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_bshd.hpp
new file mode 100644
index 0000000000..d1ac1c9531
--- /dev/null
+++ b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_bshd.hpp
@@ -0,0 +1,466 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fp8_to_fp16.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+#include "cute/algorithm/functional.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "fmha_fusion.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::flash_attention::collective {
+using namespace cute;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename To_type, typename Engine, typename Layout>
+CUTLASS_DEVICE auto convert_type(Tensor<Engine, Layout> const &tensor) {
+  using From_type = typename Engine::value_type;
+  constexpr int numel = decltype(size(tensor))::value;
+  cutlass::NumericArrayConverter<To_type, From_type, numel> convert_op;
+  auto frag = convert_op(*reinterpret_cast<const cutlass::Array<From_type, numel> *>(tensor.data()));
+  return make_tensor(make_rmem_ptr<To_type>(&frag), tensor.layout());
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class DispatchPolicy, class ProblemShapeType_, class ElementQ_,
+          class StrideQ_, class ElementK_, class StrideK_, class ElementV_,
+          class StrideV_, class MMAOperation_, class TileShapeQK_,
+          class TileShapePV_, class SubgroupLayout_, class GmemTiledCopyQ_,
+          class GmemTiledCopyK_, class GmemTiledCopyV_, bool CausalMask_>
+struct FlashPrefillMma {
+  static_assert(cutlass::detail::dependent_false<ElementQ_>,
+                "Could not find a mainloop specialization.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <int Stages, class ProblemShapeType_, class ElementQ_, class StrideQ_,
+          class ElementK_, class StrideK_, class ElementV_, class StrideV_,
+          class MMAOperation_, class TileShapeQK_, class TileShapePV_,
+          class SubgroupLayout_, class GmemTiledCopyQ_, class GmemTiledCopyK_,
+          class GmemTiledCopyV_, bool CausalMask_>
+struct FlashPrefillMma<gemm::MainloopIntelXeXMX16<Stages>, ProblemShapeType_,
+                       ElementQ_, StrideQ_, ElementK_, StrideK_, ElementV_,
+                       StrideV_, MMAOperation_, TileShapeQK_, TileShapePV_,
+                       SubgroupLayout_, GmemTiledCopyQ_, GmemTiledCopyK_,
+                       GmemTiledCopyV_, CausalMask_> {
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = gemm::MainloopIntelXeXMX16<Stages>;
+  using TileShapeQK = TileShapeQK_;
+  using TileShapePV = TileShapePV_;
+  using SubgroupLayout = SubgroupLayout_;
+  using ProblemShapeType = ProblemShapeType_;
+  using ElementQ = ElementQ_;
+  using StrideQ = StrideQ_;
+  using ElementK = ElementK_;
+  using StrideK = StrideK_;
+  using ElementV = ElementV_;
+  using StrideV = StrideV_;
+  using GmemTiledCopyQ = GmemTiledCopyQ_;
+  using GmemTiledCopyK = GmemTiledCopyK_;
+  using GmemTiledCopyV = GmemTiledCopyV_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+  using MmaAtom = MMA_Atom<MMAOperation_>;
+  using TiledMmaQK = typename TiledMMAHelper<MmaAtom, Layout<TileShapeQK>,
+                                             SubgroupLayout>::TiledMMA;
+  using TiledMmaPV = typename TiledMMAHelper<MmaAtom, Layout<TileShapePV>,
+                                             SubgroupLayout>::TiledMMA;
+  using ElementAccumulator = typename TiledMmaQK::ValTypeC;
+  static constexpr bool CausalMask = CausalMask_;
+  static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize;
+
+  using MmaAtomShape = typename MmaAtom::Shape_MNK;
+
+  static constexpr auto PV_ATOM_M =
+      decltype(get<0>(SubgroupLayout{}.shape()))::value;
+  static constexpr auto PV_ATOM_N =
+      decltype(get<1>(SubgroupLayout{}.shape()))::value;
+  static constexpr auto PV_ATOM_K =
+      decltype(get<2>(SubgroupLayout{}.shape()))::value;
+
+  using SubgroupTileShapePV =
+      decltype(cute::shape_div(TileShapePV{}, (SubgroupLayout{}.shape())));
+
+  static constexpr auto QK_BLK_M = get<0>(TileShapeQK{});
+  static constexpr auto QK_BLK_N = get<1>(TileShapeQK{});
+  static constexpr auto QK_BLK_K = get<2>(TileShapeQK{});
+
+  // This TiledMma is only required to serve the specific tiling requirements
+  // for matrix K. This is due to the consumption of matrix K by all subgroups
+  // within a workgroup.
+  static constexpr auto QK_ATOM_M = PV_ATOM_M; // 8
+  static constexpr auto QK_ATOM_N = PV_ATOM_N; // 1
+  static constexpr auto QK_ATOM_K = PV_ATOM_K; // 1
+
+  using SubgroupTileShapeQK = decltype(cute::shape_div(
+      TileShapeQK{},
+      SubgroupLayout{}.shape())); // 128, 64, 32 / 16, 1, 1 = (8, 64, 32 )
+
+  static constexpr auto QK_SG_M = get<0>(SubgroupTileShapeQK{});
+  static constexpr auto QK_SG_N = get<1>(SubgroupTileShapeQK{});
+  static constexpr auto QK_SG_K = get<2>(SubgroupTileShapeQK{});
+
+  static constexpr bool is_var_len =
+      cutlass::fmha::collective::is_variable_length_v<
+          tuple_element_t<3, ProblemShapeType>>;
+
+  using FragsShapeS = decltype(cute::shape_div(
+      take<0, 2>(SubgroupTileShapeQK{}),
+      take<0, 2>(MmaAtomShape()))); // 8, 64, 32 /  8, 16, 16 (1, 4)
+  static constexpr int Vec =
+      (get<0>(MmaAtomShape()) * get<1>(MmaAtomShape())) / SubgroupSize; // 8
+  static constexpr int FragsM = get<0>(FragsShapeS{});
+  static constexpr int FragsNS = get<1>(FragsShapeS{}); // 4
+
+  static constexpr uint32_t MaxThreadsPerBlock =
+      size(SubgroupLayout{}) * SubgroupSize;
+  using CopyThreadShape = Shape<_1, Int<SubgroupSize>>;
+
+  using traits_load_Q = Copy_Traits<GmemTiledCopyQ, StrideQ>;
+  using atom_load_Q = Copy_Atom<traits_load_Q, ElementQ>;
+  using val_layout_load_Q = decltype(make_layout(
+      shape_div(typename traits_load_Q::BlockShape{}, CopyThreadShape{})));
+  using XE_Copy_Q = decltype(make_tiled_copy(
+      atom_load_Q{}, Layout<CopyThreadShape>{}, val_layout_load_Q{}));
+
+  using traits_load_K = Copy_Traits<GmemTiledCopyK, StrideK>;
+  using atom_load_K = Copy_Atom<traits_load_K, ElementK>;
+  using val_layout_load_K = decltype(make_layout(
+      shape_div(typename traits_load_K::BlockShape{}, CopyThreadShape{})));
+  using XE_Copy_K = decltype(make_tiled_copy(
+      atom_load_K{}, Layout<CopyThreadShape>{}, val_layout_load_K{}));
+
+  using traits_load_V = Copy_Traits<GmemTiledCopyV, StrideV>;
+  using atom_load_V = Copy_Atom<traits_load_V, ElementV>;
+  using val_layout_load_V = decltype(make_layout(
+      shape_div(typename traits_load_V::BlockShape{}, CopyThreadShape{})));
+  using XE_Copy_V = decltype(make_tiled_copy(
+      atom_load_V{}, Layout<CopyThreadShape>{}, val_layout_load_V{}));
+  template <typename T>
+  static constexpr bool is_fp8_v =
+      cute::is_same_v<T, float_e4m3_t> || cute::is_same_v<T, float_e5m2_t>;
+  // Host side kernel arguments
+  struct Arguments {
+    ElementQ const *ptr_Q;
+    StrideQ dQ;
+    ElementK const *ptr_K;
+    StrideK dK;
+    ElementV const *ptr_V;
+    StrideV dV;
+  };
+
+  struct Params {
+    XE_Copy_Q gmem_tiled_copy_q;
+    XE_Copy_K gmem_tiled_copy_k;
+    XE_Copy_V gmem_tiled_copy_v;
+  };
+
+  //
+  // Methods
+  //
+
+  FlashPrefillMma() = default;
+
+  static constexpr Params
+  to_underlying_arguments(ProblemShapeType const &problem_shape,
+                          Arguments const &args, void *workspace) {
+    (void)workspace;
+
+    auto [batch, num_heads_q, num_heads_kv, seq_len_qo, seq_len_kv,
+          head_size_qk, head_size_vo] = problem_shape;
+
+    auto tensorQ = make_tensor(
+        make_gmem_ptr(args.ptr_Q),
+        make_layout(make_shape(seq_len_qo, num_heads_q * head_size_qk, batch),
+                    args.dQ));
+    auto tensorK = make_tensor(
+        make_gmem_ptr(args.ptr_K),
+        make_layout(make_shape(seq_len_kv, num_heads_kv * head_size_qk, batch),
+                    args.dK));
+    auto tensorV = make_tensor(
+        make_gmem_ptr(args.ptr_V),
+        make_layout(make_shape(num_heads_kv * head_size_vo, seq_len_kv, batch),
+                    args.dV));
+    XE_Copy_Q copyQ{XE_Copy_Q{}.with(tensorQ)};
+    XE_Copy_K copyK{XE_Copy_K{}.with(tensorK)};
+    XE_Copy_V copyV{XE_Copy_V{}.with(tensorV)};
+
+    return Params{copyQ, copyK, copyV};
+  }
+
+  template <class FragQccum, class TensorQ, class TensorK, class FragSrc>
+  CUTLASS_DEVICE void mmaQK(FragQccum &accum, TensorQ gQ, TensorK gK,
+                            FragSrc const &frag_src, int const &k_tile_count,
+                            Params const &params) {
+
+    int thread_idx = static_cast<int>(ThreadIdxX());
+    auto thr_copy_Q = params.gmem_tiled_copy_q.get_slice(thread_idx);
+    auto thr_copy_K = params.gmem_tiled_copy_k.get_slice(thread_idx);
+    // Instantiate the MMA object
+    TiledMmaQK tiled_mma;
+    // To make all threads in a warp have the same global tensors pass in the
+    // index of thread 0 in each warp
+    auto sg = syclcompat::get_nd_item<1>().get_sub_group();
+    auto first_thread_in_sg_idx =
+        sg.get_group_id()[0] * DispatchPolicy::SubgroupSize;
+    auto thread_mma_q = tiled_mma.get_slice(first_thread_in_sg_idx);
+    auto thread_mma_k = tiled_mma.get_slice(0);
+
+    // Partition
+    Tensor tCgQ = thread_mma_q.partition_A(gQ);
+    Tensor tCgK = thread_mma_k.partition_B(gK);
+
+    // Create fragments
+    // TODO(Codeplay): fix this, this is probably not general
+    using TCrQ_Type =
+        cute::conditional_t<is_fp8_v<ElementQ>, uint8_t, ElementQ>;
+    using TCrK_Type =
+        cute::conditional_t<is_fp8_v<ElementQ>, uint8_t, ElementK>;
+    Tensor tCrQ = make_tensor<TCrQ_Type>(make_fragment_layout(
+        params.gmem_tiled_copy_q, take<0, 3>(tCgQ.shape())));
+    Tensor tCrK = make_tensor<TCrK_Type>(make_fragment_layout(
+        params.gmem_tiled_copy_k, take<0, 3>(tCgK.shape())));
+
+    // Retile registers for copies
+    Tensor tQrQ = thr_copy_Q.retile_D(tCrQ);
+    Tensor tKrK = thr_copy_K.retile_D(tCrK);
+
+    // Retile global tile for copies
+    Tensor tQgQ = thr_copy_Q.retile_S(tCgQ);
+    Tensor tKgK = thr_copy_K.retile_S(tCgK);
+
+#if CUTLASS_ENABLE_DEBUG_PRINTS
+#define PRINT(x)                                                               \
+  print(#x ": ");                                                              \
+  print(x);                                                                    \
+  print("\n");
+    if (cute::thread(LOG_THREAD, LOG_GROUP)) {
+      print("======================= Q: \n");
+      PRINT(gQ);
+      PRINT(tCrQ);
+      PRINT(tCgQ);
+      PRINT(tQrQ);
+      PRINT(tQgQ);
+
+      print("=====================  K :\n");
+      PRINT(gK);
+      PRINT(tCrK);
+      PRINT(tCgK);
+      PRINT(tKrK);
+      PRINT(tKgK);
+
+      print("=====================  Config: \n");
+      PRINT(MaxThreadsPerBlock);
+      PRINT(SubgroupTileShapeQK{});
+    }
+#undef PRINT
+#endif
+
+    //
+    // Mainloop
+    //
+
+    for (int k_tile = 0; k_tile < k_tile_count; ++k_tile) {
+      copy(params.gmem_tiled_copy_q, tQgQ(_, _, _, k_tile), tQrQ);
+      copy(params.gmem_tiled_copy_k, tKgK(_, _, _, k_tile), tKrK);
+      if constexpr (is_fp8_v<ElementQ> && is_fp8_v<ElementK>) {
+        auto tCrQ_ = make_fragment_like<half_t>(tCrQ);
+        convert_FP8_to_FP16<ElementQ>(tCrQ, tCrQ_);
+        auto tCrK_ = make_fragment_like<half_t>(tCrK);
+        convert_FP8_to_FP16<ElementK>(tCrK, tCrK_);
+        cute::gemm(tiled_mma, accum, tCrQ_, tCrK_, frag_src);
+
+      } else if constexpr (is_fp8_v<ElementQ> && !is_fp8_v<ElementK>) {
+        auto tCrQ_ = make_fragment_like<half_t>(tCrQ);
+        convert_FP8_to_FP16<ElementQ>(tCrQ, tCrQ_);
+        cute::gemm(tiled_mma, accum, tCrQ_, tCrK, frag_src);
+
+      } else if constexpr (!is_fp8_v<ElementQ> && is_fp8_v<ElementK>) {
+        auto tCrK_ = make_fragment_like<half_t>(tCrK);
+        convert_FP8_to_FP16<ElementK>(tCrK, tCrK_);
+        cute::gemm(tiled_mma, accum, tCrQ, tCrK_, frag_src);
+      } else {
+        cute::gemm(tiled_mma, accum, tCrQ, tCrK, frag_src);
+      }
+    }
+  }
+  template <int tile_count, class FragQccum, class FragS, class TensorV,
+            class FragSrc>
+  CUTLASS_DEVICE void mmaPV(FragQccum &accum, FragS const &tSr, TensorV gV,
+                            FragSrc const &frag_src, Params const &params) {
+
+    int thread_idx = static_cast<int>(ThreadIdxX());
+    // Instantiate the MMA object
+    TiledMmaPV tiled_mma;
+    // Tile GV to the shape of <64,64> and loop over the HeadSize/64 to avoid
+    // Register spill
+    Tensor gV_ = take<0, 3>(
+        local_tile(gV, select<1, 2>(TileShapePV{}), make_coord(_, _)));
+    auto sg = syclcompat::get_nd_item<1>().get_sub_group();
+    auto first_thread_in_sg_idx =
+        sg.get_group_id()[0] * DispatchPolicy::SubgroupSize;
+    auto thread_mma = tiled_mma.get_slice(first_thread_in_sg_idx);
+    Tensor tCgV = thread_mma.partition_B(gV_);
+    using TCrV_Type =
+        cute::conditional_t<is_fp8_v<ElementV>, uint8_t, ElementV>;
+    Tensor tCrV = make_tensor<TCrV_Type>(make_fragment_layout(
+        params.gmem_tiled_copy_v, take<0, 3>(tCgV.shape())));
+
+    // Partition the copying of A and B tiles across the threads
+    auto gmem_thr_copy_V = params.gmem_tiled_copy_v.get_slice(thread_idx);
+    Tensor tVrV = gmem_thr_copy_V.retile_D(tCrV);
+    Tensor tVgV = gmem_thr_copy_V.retile_S(tCgV);
+
+#if CUTLASS_ENABLE_DEBUG_PRINTS
+#define PRINT(x)                                                               \
+  print(#x ": ");                                                              \
+  print(x);                                                                    \
+  print("\n");
+    if (cute::thread(LOG_THREAD, LOG_GROUP)) {
+      print("=====================  V :\n");
+      PRINT(gV);
+      PRINT(tCrV);
+      PRINT(tCgV);
+      PRINT(tVrV);
+      PRINT(tVgV);
+
+      print("=====================  Config: \n");
+      PRINT(MaxThreadsPerBlock);
+      PRINT(SubgroupTileShapePV{});
+    }
+#undef PRINT
+#endif
+
+    // 7) Convert S to P (FP32 -> BF16)
+    Tensor tPr = convert_type<typename TiledMmaPV::ValTypeA>(tSr);
+    //
+    // Mainloop
+    //
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < tile_count; i++) {
+      copy(params.gmem_tiled_copy_v, tVgV(_, _, _, i), tVrV);
+      if constexpr (is_fp8_v<ElementV>) {
+        auto tCrV_ = make_fragment_like<half_t>(tCrV);
+        convert_FP8_to_FP16<ElementV>(tCrV, tCrV_);
+        cute::gemm(tiled_mma, accum(_, _, _, i), tPr, tCrV_,
+                   frag_src(_, _, _, i));
+      } else {
+        cute::gemm(tiled_mma, accum(_, _, _, i), tPr, tCrV,
+                   frag_src(_, _, _, i));
+      }
+    }
+  }
+
+  // SequenceLengthShape = Shape<int, int>
+  // For Fixed Sequence Length, ProblemShape = Shape<int, int, int, int, int,
+  // int, int> For Variable Sequence Length, ProblemShape = Shape<int, int, int,
+  // VariableSeqlen, VariableSeqlen, int, int>
+  template <class ProblemShape, class SequenceLengthShape>
+  CUTLASS_DEVICE static constexpr Params
+  get_updated_copies(Params const &params, ProblemShape const &problem_shape,
+                     SequenceLengthShape const &sequence_length_shape,
+                     int const &l_coord, int const &q_head_coord = 0) {
+
+    auto [num_heads_q, num_heads_kv, head_size_qk, head_size_vo] =
+        select<1, 2, 5, 6>(problem_shape);
+    auto [seq_len_qo, seq_len_kv] = sequence_length_shape;
+    auto q_group_size = num_heads_q / num_heads_kv;
+    auto kv_head_coord = q_head_coord / q_group_size;
+    int offset_q = 0, offset_k = 0, offset_v = 0;
+
+    if constexpr (is_var_len) {
+      auto qo_cumulative_length = get<3>(problem_shape).cumulative_length;
+      auto kv_cumulative_length = get<4>(problem_shape).cumulative_length;
+      // auto kv_cached_cumulative_length =
+      //     get<5>(problem_shape).cumulative_length;
+
+      offset_q = num_heads_q * head_size_qk * qo_cumulative_length[l_coord] +
+                 q_head_coord * head_size_qk;
+      offset_k = num_heads_kv * head_size_qk * kv_cumulative_length[l_coord] +
+                 kv_head_coord * head_size_qk;
+      offset_v = num_heads_kv * head_size_vo * kv_cumulative_length[l_coord] +
+                 kv_head_coord * head_size_vo;
+    } else {
+      offset_q = num_heads_q * head_size_qk * seq_len_qo * l_coord +
+                 q_head_coord * head_size_qk;
+      offset_k = num_heads_kv * head_size_qk * seq_len_kv * l_coord +
+                 kv_head_coord * head_size_qk;
+      offset_v = num_heads_kv * head_size_vo * seq_len_kv * l_coord +
+                 kv_head_coord * head_size_vo;
+    }
+
+    auto q_traits =
+        static_cast<traits_load_Q const &>(params.gmem_tiled_copy_q);
+    const ElementQ *q_ptr = (const ElementQ *)q_traits.base_ptr;
+    auto k_traits =
+        static_cast<traits_load_K const &>(params.gmem_tiled_copy_k);
+    const ElementK *k_ptr = (const ElementK *)k_traits.base_ptr;
+    auto v_traits =
+        static_cast<traits_load_V const &>(params.gmem_tiled_copy_v);
+    const ElementV *v_ptr = (const ElementV *)v_traits.base_ptr;
+    auto shape_q =
+        make_shape(static_cast<int>(seq_len_qo), head_size_qk * num_heads_q, 1);
+    StrideQ stride_q = cutlass::make_cute_packed_stride(StrideQ{}, shape_q);
+    auto shape_k = make_shape(static_cast<int>(seq_len_kv),
+                              num_heads_kv * head_size_qk, 1);
+    StrideK stride_k = cutlass::make_cute_packed_stride(StrideK{}, shape_k);
+    auto shape_v = make_shape(head_size_vo * num_heads_kv,
+                              static_cast<int>(seq_len_kv), 1);
+    StrideV stride_v = cutlass::make_cute_packed_stride(StrideV{}, shape_v);
+
+    auto tensorQ = make_tensor(make_gmem_ptr(q_ptr + offset_q),
+                               make_layout(shape_q, stride_q));
+    auto tensorK = make_tensor(make_gmem_ptr(k_ptr + offset_k),
+                               make_layout(shape_k, stride_k));
+    auto tensorV = make_tensor(make_gmem_ptr(v_ptr + offset_v),
+                               make_layout(shape_v, stride_v));
+    XE_Copy_Q copyQ{XE_Copy_Q{}.with(tensorQ)};
+    XE_Copy_K copyK{XE_Copy_K{}.with(tensorK)};
+    XE_Copy_V copyV{XE_Copy_V{}.with(tensorV)};
+    return Params{copyQ, copyK, copyV};
+  }
+};
+
+} // namespace cutlass::flash_attention::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_epilogue.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_epilogue.hpp
new file mode 100644
index 0000000000..7c72acdf6b
--- /dev/null
+++ b/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_epilogue.hpp
@@ -0,0 +1,336 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing elementwise operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/epilogue/collective/collective_epilogue.hpp"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include <sycl/sycl.hpp>
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace flash_attention {
+namespace collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class DispatchPolicy, class MMAOperation_, class TileShapeOutput_,
+          class SubgroupLayout_, class... Args>
+class FlashPrefillEpilogue {
+  static_assert(cutlass::detail::dependent_false<DispatchPolicy>,
+                "Could not find an epilogue specialization.");
+};
+
+template <class MMAOperation_, class TileShapeOutput_, class SubgroupLayout_,
+          class ElementCompute_, class ElementO_, class StrideO_,
+          class ElementLSE_, class CopyOpO_>
+class FlashPrefillEpilogue<epilogue::IntelXeXMX16, MMAOperation_,
+                           TileShapeOutput_, SubgroupLayout_, ElementCompute_,
+                           ElementO_, StrideO_, ElementLSE_, CopyOpO_> {
+public:
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = epilogue::IntelXeXMX16;
+  using ElementO = ElementO_;
+  using StrideO = StrideO_;
+  using ElementLSE = ElementLSE_;
+  using CopyOpO = CopyOpO_;
+  using SubgroupLayout = SubgroupLayout_;
+  using TileShapeOutput = TileShapeOutput_;
+  using TiledMmaOutput =
+      typename TiledMMAHelper<MMA_Atom<MMAOperation_>, Layout<TileShapeOutput>,
+                              SubgroupLayout>::TiledMMA;
+  using GmemTiledCopyO = CopyOpO;
+  using ElementOutput = ElementO_;
+  using ElementCompute = ElementCompute_;
+  using ElementAccumulator = ElementCompute_;
+  using SubgroupTileShape =
+      decltype(cute::shape_div(TileShapeOutput{}, (SubgroupLayout{}.shape())));
+
+  static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize;
+
+  static_assert(
+      cute::rank(TileShapeOutput{}) == 3,
+      "TileShapeOutput must be rank-3: [CTA_M_QO, CTA_N_VO, CTA_K_PV]");
+  static_assert(
+      cute::rank(StrideO{}) == 3,
+      "StrideO must be rank-3: [seq_len_qo, head_size_vo, batch * num_heads]");
+
+  using CopyThreadShape = Shape<_1, Int<SubgroupSize>>;
+
+  using traits_store_O = Copy_Traits<GmemTiledCopyO, StrideO>;
+  using atom_load_O = Copy_Atom<traits_store_O, ElementO>;
+  using val_layout_load_O = decltype(make_layout(
+      shape_div(typename traits_store_O::BlockShape{}, CopyThreadShape{})));
+  using XE_Copy_O = decltype(make_tiled_copy(
+      atom_load_O{}, Layout<CopyThreadShape>{}, val_layout_load_O{}));
+
+private:
+  constexpr static bool is_destination_supported =
+      not cute::is_void_v<ElementO>;
+
+public:
+  using EmptyType = cute::tuple<>;
+
+  struct TensorStorageImpl : cute::tuple<EmptyType, EmptyType> {};
+
+  struct SharedStorage {
+    using TensorStorage = TensorStorageImpl;
+
+    TensorStorage tensors;
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;
+
+  // Host side epilogue arguments
+  struct Arguments {
+    ElementO const *ptr_O;
+    StrideO dO;
+    float *ptr_LSE;
+  };
+
+  // Device side epilogue params
+  struct Params {
+    XE_Copy_O xe_store_o;
+    float *ptr_LSE;
+  };
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const &problem_shape,
+                          Arguments const &args,
+                          [[maybe_unused]] void *workspace) {
+    auto [batch, num_heads_q, num_heads_kv, seq_len_qo, seq_len_kv,
+          head_size_qk, head_size_vo] = problem_shape;
+    auto tensorO = make_tensor(
+        make_gmem_ptr(static_cast<ElementO const *>(args.ptr_O)),
+        make_layout(make_shape(seq_len_qo, num_heads_q * head_size_vo, batch),
+                    args.dO));
+    XE_Copy_O xe_store_o{XE_Copy_O{}.with(tensorO)};
+    return {
+        xe_store_o, args.ptr_LSE
+    };
+  }
+
+  template <class ProblemShape>
+  static size_t get_workspace_size(ProblemShape const &problem_shape,
+                                   Arguments const &args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const &problem_shape, Arguments const &args,
+                       void *workspace, cudaStream_t stream,
+                       CudaHostAdapter *cuda_adapter = nullptr) {
+    return Status::kSuccess;
+  }
+
+  template <class ProblemShape>
+  CUTLASS_HOST_DEVICE static bool
+  can_implement(ProblemShape const &problem_shape,
+                [[maybe_unused]] Arguments const &args) {
+    return true;
+  }
+
+  CUTLASS_HOST_DEVICE
+  FlashPrefillEpilogue(Params const &params_, TensorStorage const &)
+      : params(params_) {}
+
+  template <class ProblemShape, class SequenceLengthShape, class TileCoord,
+            class FragOut, class FragMax, class FragSum>
+  CUTLASS_DEVICE void operator()(ProblemShape problem_shape,
+                                 SequenceLengthShape sequence_length_shape,
+                                 TileCoord tile_coord, FragOut &out,
+                                 FragMax const &max, FragSum &sum, int const &q_head_coord
+                                 ) {
+
+    using namespace cute;
+
+    static constexpr bool is_var_len =
+        cutlass::fmha::collective::is_variable_length_v<
+            tuple_element_t<2, ProblemShape>>;
+
+    using FragOutLayout = typename FragOut::layout_type;
+    constexpr int Vec = shape<0>(FragOutLayout{});
+    constexpr int FragsM = shape<1>(FragOutLayout{});
+    constexpr int FragsN = size(select<2, 3>(shape(FragOutLayout{})));
+
+    auto g = syclcompat::get_nd_item<1>().get_sub_group();
+    auto out_reg = make_tensor(static_cast<decltype(out) &&>(out).data(),
+                               Shape<Int<Vec>, Int<FragsM>, Int<FragsN>>{});
+    float tLSE_reg = {-INFINITY};
+    auto rowsum = make_fragment_like(sum);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int y = 0; y < FragsM; y++) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int x = 0; x < Vec; x++) {
+        int indx = y * Vec + x;
+        auto cur_sum = reduce_over_group(g, sum(indx), sycl::plus<>());
+        auto cur_scale = (cur_sum == 0.f || cur_sum != cur_sum)
+                             ? 1.0f
+                             : sycl::native::recip(cur_sum);
+        rowsum(indx) = cur_sum;
+        CUTLASS_PRAGMA_UNROLL
+        for (int z = 0; z < FragsN; z++) {
+          out_reg(x, y, z) *= cur_scale;
+        }
+      }
+    }
+
+    // Indexing variables
+    auto [batch, num_heads_q, head_size_vo] = select<0, 1, 6>(problem_shape);
+    auto [seq_len_qo] = select<0>(sequence_length_shape);
+    // Represent the full output tensor
+    // Tensor mO_mnl = cute::get_xe_tensor(make_shape(seq_len_qo, head_size_vo,
+    // (is_var_len ? batch : 1) * num_heads_q));
+    Tensor mO_mnl =
+        cute::get_xe_tensor(make_shape(seq_len_qo, head_size_vo, 1));
+
+    auto [m_coord, n_coord, k_coord, l_coord] = tile_coord;
+    // Tile the output tensor per WG
+    Tensor g_wg_O = 
+          local_tile(mO_mnl, select<0, 1>(TileShapeOutput{}),
+                               make_coord(m_coord, n_coord, 0)); // (BLK_M,BLK_N,m,n,l)
+    static constexpr auto ATOM_N =
+        get<2>(typename TiledMmaOutput::ThrLayoutVMNK{}.shape());
+    auto m_sg = get_sub_group_id() / ATOM_N;
+    auto n_sg = get_sub_group_id() % ATOM_N;
+    // Tile the output tensor per SG
+    Tensor gO =
+        local_tile(g_wg_O, SubgroupTileShape{}, make_coord(m_sg, n_sg, _),
+                   Step<_1, _1, X>{}); // (BLK_M,BLK_N,m,n,l)
+    auto thread_xe_store_o = params.xe_store_o.get_thread_slice(ThreadIdxX());
+    Tensor tOgO = thread_xe_store_o.partition_D(gO);
+
+    Tensor final_out_reg = make_fragment_like<ElementOutput>(out_reg);
+    // iff ElementOutput == ElementAccumulator, then convert_type doesn't do the
+    // right conversion iff ElementOutput == fp8, there is no NumericConverter
+    // specialization available for both the above cases, we call copy() which
+    // internally performs a static_cast op on the data. for ElementOutput ==
+    // bf16 | fp16, convert_type calls relevant NumericConverter specialization.
+    if constexpr (cute::is_any_of_v<ElementOutput, cute::float_e5m2_t,
+                                    cute::float_e4m3_t> ||
+                  cute::is_same_v<ElementOutput, ElementCompute>) {
+      copy(out_reg, final_out_reg);
+    } else {
+      Tensor temp = convert_type<ElementOutput>(out_reg);
+      copy(temp, final_out_reg);
+    }
+    copy(params.xe_store_o, final_out_reg, tOgO);
+
+    // Generating the LSE for backward training
+    auto sg = syclcompat::get_nd_item<1>().get_sub_group();
+    int lane_id = static_cast<int>(sg.get_local_linear_id());
+    int sub_group_id = get_sub_group_id();
+    const int BLK_M = size(select<0>(TileShapeOutput{}));
+
+    // write along the sequence.
+    // use the entire sub_group to write lse since all
+    // work items within subgroup have the same sum() data stored
+    // in registers
+    auto blk_m_coord  = get<0>(tile_coord);  // seq_len_blk_idx
+
+    size_t lse_offset = k_coord * num_heads_q * seq_len_qo +                  // shift the batch -- batch_idx * num_heads_q * seq_len_qo  -- OK
+                        q_head_coord * seq_len_qo +                           // shift the head  -- head_q * seq_len_qo -- ok
+                        m_coord * BLK_M;                                      // shift to the particular tile
+
+    int localtile_seq_coord = 0;
+
+    // Calculate the sequence coordinate
+    // The coordinate value should be within [0.. seq_len_qo - 1]
+    localtile_seq_coord = sub_group_id * SubgroupSize + lane_id; //one subgroup will handle 16 (usually) sequence
+
+    // checked
+    int seq_coord = m_coord * BLK_M + localtile_seq_coord;
+
+    // Check that if this is within the seq_len_qo
+    if (seq_coord < seq_len_qo){
+      auto cur_sum = rowsum[lane_id];
+      tLSE_reg = cur_sum == 0.f ? -INFINITY : max + logf(cur_sum);
+      *(params.ptr_LSE + lse_offset + localtile_seq_coord) = tLSE_reg;
+    }
+  }
+
+  // SequenceLengthShapeType = Shape<int, int>
+  // For Fixed Sequence Length, ProblemShapeType = Shape<int, int, int, int,
+  // int, int, int> For Variable Sequence Length, ProblemShapeType = Shape<int,
+  // int, int, VariableSeqlen, VariableSeqlen, int, int>
+  template <bool VarLen, class ProblemShapeType, class SequenceLengthShapeType>
+  CUTLASS_DEVICE static constexpr Params
+  get_updated_copies(Params const &params,
+                     ProblemShapeType const &problem_shape,
+                     SequenceLengthShapeType const &sequence_length_shape,
+                     int const &l_coord, int const &q_head_coord) {
+
+      auto [num_heads_q, head_size_vo] = select<1, 6>(problem_shape);
+      auto [seq_len_qo] = select<0>(sequence_length_shape);
+      int offset_o = 0;
+      if constexpr (VarLen) { 
+        auto qo_cumulative_length = get<3>(problem_shape).cumulative_length;
+        offset_o = num_heads_q * head_size_vo * qo_cumulative_length[l_coord] + q_head_coord * head_size_vo;
+      } else {
+        offset_o = num_heads_q * head_size_vo * seq_len_qo * l_coord +
+                  q_head_coord * head_size_vo;        
+      }
+      auto store_traits = static_cast<traits_store_O const &>(params.xe_store_o);
+      ElementO *base_ptr = (ElementO *)store_traits.base_ptr;
+      auto shape_o = make_shape(static_cast<int>(seq_len_qo), num_heads_q * head_size_vo, 1);
+      StrideO stride_o = cutlass::make_cute_packed_stride(StrideO{}, shape_o);
+      auto tensorO = make_tensor(make_gmem_ptr(base_ptr + offset_o),
+                                 make_layout(shape_o, stride_o));
+      XE_Copy_O xe_store_o{XE_Copy_O{}.with(tensorO)};
+      return Params{xe_store_o, params.ptr_LSE};
+  }
+
+private:
+  Params const &params;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace collective
+} // namespace flash_attention
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_softmax_epilogue.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_softmax_epilogue.hpp
new file mode 100644
index 0000000000..5b63fffe7f
--- /dev/null
+++ b/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_softmax_epilogue.hpp
@@ -0,0 +1,193 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing online softmax.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/epilogue/collective/collective_epilogue.hpp"
+#include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include <sycl/sycl.hpp>
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace flash_attention {
+namespace collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <bool CausalMask_, class DispatchPolicy, class... Args>
+class FlashPrefillSoftmaxEpilogue {
+  static_assert(cutlass::detail::dependent_false<DispatchPolicy>,
+                "Could not find an epilogue specialization.");
+};
+
+template <bool CausalMask_, class Element_>
+class FlashPrefillSoftmaxEpilogue<CausalMask_, epilogue::IntelXeXMX16,
+                                  Element_> {
+public:
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = epilogue::IntelXeXMX16;
+  using Element = Element_;
+
+  static constexpr bool CausalMask = CausalMask_;
+
+  using GmemTiledCopyOut = void;
+
+  // Host side epilogue arguments
+  struct Arguments {
+    Element const scale;
+  };
+
+  // Device side epilogue params
+  using Params = Arguments;
+
+  //
+  // Methods
+  //
+
+  static constexpr Params to_underlying_arguments(Arguments const &args) {
+    constexpr double kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E
+    Element val = args.scale * static_cast<Element>(kLog2e);
+    return Params{val};
+  }
+
+  template <class ProblemShape> static size_t get_workspace_size() { return 0; }
+
+  template <class ProblemShape> static cutlass::Status initialize_workspace() {
+    return Status::kSuccess;
+  }
+
+  template <class ProblemShape>
+  CUTLASS_HOST_DEVICE static bool can_implement() {
+    return true;
+  }
+
+  CUTLASS_HOST_DEVICE
+  FlashPrefillSoftmaxEpilogue(Params const &params_) : params(params_) {}
+
+  template <int Vec, int FragsM, int FragsN, class FragAcc, class FragMax,
+            class FragSum>
+  CUTLASS_DEVICE void scale_exp_log2(FragAcc &frag_s, FragMax const &max,
+                                     FragSum &sum) {
+    auto g = syclcompat::get_nd_item<1>().get_sub_group();
+    const auto max_scale = max * params.scale;
+    CUTLASS_PRAGMA_UNROLL
+    for (int indx = 0; indx < Vec * FragsM; indx++) {
+      const auto max_scale_bcast = group_broadcast(g, max_scale, indx);
+      CUTLASS_PRAGMA_UNROLL
+      for (int z = 0; z < FragsN; z++) {
+        auto base_indx = indx + (z * Vec * FragsM);
+        Element eq = frag_s(base_indx) - max_scale_bcast;
+        frag_s(base_indx) = sycl::native::exp2(eq);
+        sum(indx) += frag_s(base_indx);
+      }
+    }
+  }
+
+  template <int Vec, int FragsM, int FragsN, class FragSrc, class FragMax>
+  CUTLASS_DEVICE void reduce_max(FragSrc &src, FragMax &max) {
+    auto g = syclcompat::get_nd_item<1>().get_sub_group();
+    CUTLASS_PRAGMA_UNROLL
+    for (int indx = 0; indx < Vec * FragsM; indx++) {
+      auto maxptr = group_broadcast(g, max, indx);
+      CUTLASS_PRAGMA_UNROLL
+      for (int z = 0; z < FragsN; z++) {
+        auto base_indx = indx + (z * Vec * FragsM);
+        maxptr = sycl::max(maxptr, src(base_indx));
+        src(base_indx) *= params.scale;
+      }
+      maxptr = reduce_over_group(g, maxptr, sycl::maximum<>());
+      if (indx == g.get_local_id()[0]) {
+        max = maxptr;
+      }
+    }
+  }
+
+  template <class FragAcc, class FragMax, class FragSum, class FragOut>
+  CUTLASS_DEVICE void operator()(bool is_first, FragAcc &frag_s, FragMax &max,
+                                 FragSum &sum, FragOut &out) {
+    auto max_prev = max;
+    using FragAccLayout = typename FragAcc::layout_type;
+    using FragOutLayout = typename FragOut::layout_type;
+    constexpr int Vec = get<0>(FragAccLayout{}.shape());
+    constexpr int FragsM = get<1>(FragAccLayout{}.shape());
+    constexpr int FragsNAcc = get<2>(FragAccLayout{}.shape());
+    constexpr int FragsNOut = size(select<2,3>(FragOutLayout{}.shape()));
+    reduce_max<Vec, FragsM, FragsNAcc>(frag_s, max);
+    static_assert(Vec * FragsM % 8 == 0,
+                  " No. of attention rows per subgroup should be >= 1 MMA Atom "
+                  "worth of rows.");
+    if (!is_first) {
+      auto g = syclcompat::get_nd_item<1>().get_sub_group();
+      Element max_scale{max * params.scale};
+      Element exp_scale{
+          sycl::native::exp2(max_prev * params.scale - max_scale)};
+      CUTLASS_PRAGMA_UNROLL
+      for (int indx = 0; indx < Vec * FragsM; indx++) {
+        auto max_scale_bcast = group_broadcast(g, max_scale, indx);
+        auto exp_scale_bcast = group_broadcast(g, exp_scale, indx);
+        sum(indx) *= exp_scale_bcast;
+        CUTLASS_PRAGMA_UNROLL
+        for (int z = 0; z < FragsNAcc; z++) {
+          auto base_indx = indx + (z * Vec * FragsM);
+          frag_s(base_indx) =
+              sycl::native::exp2((frag_s(base_indx) - max_scale_bcast));
+          sum(indx) += frag_s(base_indx);
+        }
+        CUTLASS_PRAGMA_UNROLL
+        for (int z = 0; z < FragsNOut; z++) {
+          auto base_indx = indx + (z * Vec * FragsM);
+          out(base_indx) *= exp_scale_bcast;
+        }
+      }
+    } else {
+      scale_exp_log2<Vec, FragsM, FragsNAcc>(frag_s, max, sum);
+    }
+  }
+  Params params;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace collective
+} // namespace flash_attention
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/applications/flash_attention_v2/kernel/tile_scheduler_sdpa_fwd_bshd.hpp b/applications/flash_attention_v2/kernel/tile_scheduler_sdpa_fwd_bshd.hpp
new file mode 100644
index 0000000000..17730da3be
--- /dev/null
+++ b/applications/flash_attention_v2/kernel/tile_scheduler_sdpa_fwd_bshd.hpp
@@ -0,0 +1,275 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.h"
+
+namespace cutlass::flash_attention {
+
+namespace kernel {
+
+struct XeFlashIndividualTileScheduler {
+
+  struct Params {
+    dim3 grid;
+    // FastDivmod divmod_num_heads;
+  };
+
+  bool valid_ = true;
+  Params params;
+
+  CUTLASS_DEVICE
+  XeFlashIndividualTileScheduler(Params const &params) : params(params) {}
+
+  template <class ProblemSize, class TileShape>
+  static Params to_underlying_arguments(ProblemSize const &problem_size,
+                                        KernelHardwareInfo hw_info,
+                                        TileShape const &tile_shape) {
+    using namespace cute;
+    dim3 grid(size(ceil_div(shape<3>(problem_size),
+                            shape<0>(tile_shape))), // seq_len_qo / 128
+              size(shape<1>(problem_size)),         // num_heads_q
+              size(shape<0>(problem_size)));        // batch
+    return Params{grid};
+  }
+
+  template <int Num_SGs> static dim3 get_grid_shape(Params const &params) {
+    return params.grid;
+  }
+
+  CUTLASS_DEVICE
+  bool is_valid() { return valid_; }
+
+  CUTLASS_DEVICE
+  auto get_block_coord() {
+    using namespace cute;
+    return make_coord(BlockIdxX(), BlockIdxY(), BlockIdxZ());
+  }
+
+  CUTLASS_DEVICE
+  XeFlashIndividualTileScheduler &operator++() {
+    valid_ = false;
+    return *this;
+  }
+};
+
+struct XeFlashDecodeIndividualTileScheduler {
+
+  struct Params {
+    dim3 grid;
+    FastDivmod divmod_num_heads;
+  };
+
+  bool valid_ = true;
+  Params params;
+
+  CUTLASS_DEVICE
+  XeFlashDecodeIndividualTileScheduler(Params const &params) : params(params) {}
+
+  template <class ProblemSize, class TileShape>
+  static Params to_underlying_arguments(ProblemSize const &problem_size,
+                                        KernelHardwareInfo hw_info,
+                                        TileShape const &tile_shape) {
+    using namespace cute;
+    dim3 grid(
+        size(ceil_div(shape<7>(problem_size), shape<1>(tile_shape))),
+        size(ceil_div(shape<3>(problem_size),
+                      8)), // we want to process only 8 tokens per workgroup
+        size(shape<0>(problem_size) * shape<1>(problem_size)));
+    return Params{grid, {shape<1>(problem_size)}};
+  }
+
+  template <int Num_SGs> static dim3 get_grid_shape(Params const &params) {
+    return params.grid;
+  }
+
+  CUTLASS_DEVICE
+  bool is_valid() { return valid_; }
+
+  CUTLASS_DEVICE
+  auto get_block_coord() {
+    using namespace cute;
+    int block_decode = BlockIdxZ();
+    int bidh;
+    params.divmod_num_heads(block_decode, bidh, block_decode);
+    return make_coord(BlockIdxX(), BlockIdxY(), block_decode, bidh);
+  }
+
+  CUTLASS_DEVICE
+  XeFlashDecodeIndividualTileScheduler &operator++() {
+    valid_ = false;
+    return *this;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+struct XeFlashPersistentTileScheduler {
+
+  struct Params {
+    int num_blocks;
+    FastDivmod divmod_seq_len_block;
+    FastDivmod divmod_head_size_block;
+    FastDivmod divmod_num_heads;
+
+    KernelHardwareInfo hw_info;
+  };
+
+  int block_idx = 0;
+  Params params;
+
+  CUTLASS_DEVICE
+  XeFlashPersistentTileScheduler(Params const &params)
+      : block_idx(BlockIdxX()), params(params) {}
+
+  template <class ProblemSize, class TileShape>
+  static Params to_underlying_arguments(ProblemSize const &problem_size,
+                                        KernelHardwareInfo hw_info,
+                                        TileShape const &tile_shape) {
+    using namespace cute;
+    // Get SM count if needed, otherwise use user supplied SM count
+    int sm_count = hw_info.sm_count;
+    if (sm_count <= 0) {
+      CUTLASS_TRACE_HOST(
+          "  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments "
+          "KernelHardwareInfo struct with the SM count.");
+      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(
+          hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST(
+        "to_underlying_arguments(): Setting persistent grid SM count to "
+        << sm_count);
+    hw_info.sm_count = sm_count;
+
+    int num_head_size_blocks =
+        size(ceil_div(shape<6>(problem_size), shape<1>(tile_shape)));
+    int num_seq_len_blocks =
+        size(ceil_div(shape<3>(problem_size), shape<0>(tile_shape)));
+    int num_blocks = num_seq_len_blocks * num_head_size_blocks *
+                     size(shape<0>(problem_size) * shape<1>(problem_size));
+
+    return Params{num_blocks,
+                  {num_seq_len_blocks},
+                  {num_head_size_blocks},
+                  {shape<1>(problem_size)},
+                  hw_info};
+  }
+
+  template <int Num_SGs> static dim3 get_grid_shape(Params const &params) {
+    auto queue = syclcompat::get_default_queue();
+    auto dev = queue.get_device();
+    const size_t maxSubgroups =
+        dev.template get_info<sycl::info::device::max_num_sub_groups>();
+    // TODO (Codeplay): revert this back to std::min(params.num_blocks,
+    // params.hw_info.sm_count) once performance issue is fixed.
+    dim3 grid(
+        std::min(params.num_blocks,
+                 ceil_div(params.hw_info.sm_count * maxSubgroups, Num_SGs)),
+        1, 1);
+    return grid;
+  }
+
+  CUTLASS_DEVICE
+  bool is_valid() { return block_idx < params.num_blocks; }
+
+  CUTLASS_DEVICE
+  auto get_block_coord() {
+    using namespace cute;
+    int block_decode = block_idx;
+    int seq_len_block, head_size_block, bidh;
+    params.divmod_head_size_block(block_decode, head_size_block, block_decode);
+    params.divmod_seq_len_block(block_decode, seq_len_block, block_decode);
+    params.divmod_num_heads(block_decode, bidh, block_decode);
+    return make_coord(head_size_block, seq_len_block, block_decode, bidh);
+  }
+
+  CUTLASS_DEVICE
+  XeFlashPersistentTileScheduler &operator++() {
+    block_idx += GridDimX();
+    return *this;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+} // namespace kernel
+
+struct IndividualScheduler {};
+struct PersistentScheduler {};
+struct FlashDecodeIndividualScheduler {};
+
+namespace detail {
+
+template <class TileSchedulerTag, class ArchTag, class Enable = void>
+struct TileSchedulerSelector {
+  static_assert(cutlass::detail::dependent_false<ArchTag>,
+                "Could not select a tile scheduler for given parameters.");
+};
+
+// Default (void) maps to XeFlashIndividualTileScheduler
+template <class ArchTag>
+struct TileSchedulerSelector<
+    void, ArchTag,
+    cute::enable_if_t<cute::is_same_v<ArchTag, cutlass::arch::IntelXe>>> {
+  using Scheduler =
+      typename TileSchedulerSelector<IndividualScheduler, ArchTag>::Scheduler;
+};
+
+template <class ArchTag>
+struct TileSchedulerSelector<
+    IndividualScheduler, ArchTag,
+    cute::enable_if_t<cute::is_same_v<ArchTag, cutlass::arch::IntelXe>>> {
+  using Scheduler = kernel::XeFlashIndividualTileScheduler;
+};
+
+template <class ArchTag>
+struct TileSchedulerSelector<
+    PersistentScheduler, ArchTag,
+    cute::enable_if_t<cute::is_same_v<ArchTag, cutlass::arch::IntelXe>>> {
+  using Scheduler = kernel::XeFlashPersistentTileScheduler;
+};
+
+template <class ArchTag>
+struct TileSchedulerSelector<
+    FlashDecodeIndividualScheduler, ArchTag,
+    cute::enable_if_t<cute::is_same_v<ArchTag, cutlass::arch::IntelXe>>> {
+  using Scheduler = kernel::XeFlashDecodeIndividualTileScheduler;
+};
+} // namespace detail
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::flash_attention
diff --git a/applications/flash_attention_v2/kernel/xe_sdpa_fwd_bshd.hpp b/applications/flash_attention_v2/kernel/xe_sdpa_fwd_bshd.hpp
new file mode 100644
index 0000000000..4f1c580fbb
--- /dev/null
+++ b/applications/flash_attention_v2/kernel/xe_sdpa_fwd_bshd.hpp
@@ -0,0 +1,483 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/kernel_hardware_info.hpp"
+
+#include "flash_attention_v2/collective/xe_flash_attn_prefill_mma_bshd.hpp"
+
+namespace cutlass::flash_attention::kernel {
+
+template <class ProblemShape, class CollectiveMainloop,
+          class CollectiveSoftmaxEpilogue_, class CollectiveEpilogue,
+          class TileScheduler_ = void>
+class FMHAPrefill;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <class ProblemShape_, class CollectiveMainloop_,
+          class CollectiveSoftmaxEpilogue_, class CollectiveEpilogue_,
+          class TileScheduler_>
+class FMHAPrefill {
+
+public:
+  //
+  // Type Aliases
+  //
+  using ProblemShape = ProblemShape_;
+
+  static_assert(rank(ProblemShape{}) == 7,
+                "ProblemShape{} should be <batch, num_heads_q, num_head_kv, "
+                "seq_len_qo, seq_len_kv, head_size_qk, head_size_vo>");
+
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShapeQK = typename CollectiveMainloop::TileShapeQK;
+  using TileShapePV = typename CollectiveMainloop::TileShapePV;
+  using TiledMmaQK = typename CollectiveMainloop::TiledMmaQK;
+  using TiledMmaPV = typename CollectiveMainloop::TiledMmaPV;
+  using ArchTag = typename CollectiveMainloop::ArchTag;
+  using ElementQ = typename CollectiveMainloop::ElementQ;
+  using StrideQ = typename CollectiveMainloop::StrideQ;
+  using ElementK = typename CollectiveMainloop::ElementK;
+  using StrideK = typename CollectiveMainloop::StrideK;
+  using ElementV = typename CollectiveMainloop::ElementV;
+  using StrideV = typename CollectiveMainloop::StrideV;
+  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
+  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+
+  using CollectiveSoftmaxEpilogue = CollectiveSoftmaxEpilogue_;
+  using SoftmaxArguments = typename CollectiveSoftmaxEpilogue::Arguments;
+  using SoftmaxParams = typename CollectiveSoftmaxEpilogue::Params;
+
+  static_assert(cute::is_void_v<TileScheduler_> or
+                    cute::is_same_v<TileScheduler_, PersistentScheduler> or
+                    cute::is_same_v<TileScheduler_, IndividualScheduler>,
+                "Unsupported TileScheduler for Intel Xe.");
+  using TileSchedulerTag = TileScheduler_;
+  using TileScheduler =
+      typename detail::TileSchedulerSelector<TileScheduler_,
+                                             ArchTag>::Scheduler;
+  using TileSchedulerParams = typename TileScheduler::Params;
+
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using ElementO = typename CollectiveEpilogue::ElementO;
+  using StrideO = typename CollectiveEpilogue::StrideO;
+  using ElementLSE = typename CollectiveEpilogue::ElementLSE;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+  using TileShapeOutput = typename CollectiveEpilogue::TileShapeOutput;
+  using TiledMmaOutput = typename CollectiveEpilogue::TiledMmaOutput;
+
+  static_assert(
+      cute::is_same_v<ElementAccumulator,
+                      typename CollectiveEpilogue::ElementAccumulator>,
+      "Mainloop and epilogue do not agree on accumulator value type.");
+  // MSVC requires the cast to fix a warning-as-error.
+  static constexpr int SharedStorageSize = 0;
+
+  static constexpr bool CausalMask = CollectiveMainloop::CausalMask;
+  static constexpr int SubgroupSize =
+      CollectiveMainloop::SubgroupSize; // sub_group size
+  static constexpr uint32_t MaxThreadsPerBlock =
+      CollectiveMainloop::MaxThreadsPerBlock;
+  using MmaAtomShape = typename CollectiveMainloop::MmaAtomShape; // 8,16,16
+
+  static constexpr int QK_BLK_M = CollectiveMainloop::QK_BLK_M;
+  static constexpr int QK_BLK_N = CollectiveMainloop::QK_BLK_N;
+  static constexpr int QK_BLK_K = CollectiveMainloop::QK_BLK_K;
+
+  static constexpr int QK_ATOM_N = CollectiveMainloop::QK_ATOM_N;
+  static constexpr int QK_ATOM_K = CollectiveMainloop::QK_ATOM_K;
+
+  static constexpr int QK_SG_M = CollectiveMainloop::QK_SG_M;
+
+  static constexpr int Epilogue_BLK_N = get<1>(TileShapeOutput{});
+  static constexpr int Epilogue_BLK_K = get<2>(TileShapeOutput{});
+
+  static constexpr int PV_ATOM_M = CollectiveMainloop::PV_ATOM_M;
+  static constexpr int PV_ATOM_N = CollectiveMainloop::PV_ATOM_N;
+  static constexpr int PV_ATOM_K = CollectiveMainloop::PV_ATOM_K;
+
+  static constexpr auto Num_SGs = PV_ATOM_N * PV_ATOM_M * PV_ATOM_K;
+  static constexpr int Vec = CollectiveMainloop::Vec;
+  static constexpr int FragsM = CollectiveMainloop::FragsM;
+  // The FragsN here used for Creation of S matrix so we use the FragsN for S
+  // shape
+  static constexpr int FragsN = CollectiveMainloop::FragsNS;
+
+  static constexpr int VSlicer =
+      get<1>(TileShapeOutput{}) /
+      (get<1>(TileShapePV{}) * PV_ATOM_N); // ceil_div(FragsNOut,FragsNS);
+  using AccumeShape = decltype(make_shape(
+      Int<Vec>{}, Int<FragsM>{}, get<1>(TileShapePV{}) / get<1>(MmaAtomShape()),
+      Int<VSlicer>{}));
+
+  static constexpr bool is_var_len = CollectiveMainloop::is_var_len;
+  // Kernel level shared memory storage
+  struct SharedStorage {
+    using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
+    EpilogueTensorStorage epilogue;
+  };
+
+  // Device side arguments
+  struct Arguments {
+    gemm::GemmUniversalMode mode{};
+    ProblemShape problem_shape{};
+    MainloopArguments mainloop{};
+    SoftmaxArguments softmax{};
+    EpilogueArguments epilogue{};
+    KernelHardwareInfo hw_info{};
+  };
+
+  // Kernel entry point API
+  struct Params {
+    gemm::GemmUniversalMode mode;
+    ProblemShape problem_shape;
+    MainloopParams mainloop;
+    SoftmaxParams softmax;
+    EpilogueParams epilogue;
+    TileSchedulerParams scheduler;
+  };
+
+  //
+  // Methods
+  //
+
+  // Convert to underlying arguments. In this case, a simple copy for the
+  // aliased type.
+  static Params to_underlying_arguments(Arguments const &args,
+                                        void *workspace) {
+    (void)workspace;
+    return {args.mode,
+            args.problem_shape,
+            CollectiveMainloop::to_underlying_arguments(
+                args.problem_shape, args.mainloop, workspace),
+            CollectiveSoftmaxEpilogue::to_underlying_arguments(args.softmax),
+            CollectiveEpilogue::to_underlying_arguments(
+                args.problem_shape, args.epilogue, workspace),
+            TileScheduler::to_underlying_arguments(
+                args.problem_shape, args.hw_info, TileShapeOutput{})};
+  }
+
+  static bool can_implement(Arguments const &args) {
+    bool mode_implementable = args.mode == gemm::GemmUniversalMode::kGemm or
+                              (args.mode == gemm::GemmUniversalMode::kBatched &&
+                               rank(ProblemShape{}) == 4);
+    return mode_implementable;
+  }
+
+  static int get_workspace_size(Arguments const &args) { return 0; }
+
+  static cutlass::Status
+  initialize_workspace(Arguments const &args, void *workspace = nullptr,
+                       cudaStream_t stream = nullptr,
+                       CudaHostAdapter *cuda_adapter = nullptr) {
+    return Status::kSuccess;
+  }
+
+  static dim3 get_grid_shape(Params const &params) {
+    return TileScheduler::template get_grid_shape<Num_SGs>(params.scheduler);
+  }
+
+  static dim3 get_block_shape() { return dim3(MaxThreadsPerBlock, 1, 1); }
+
+  CUTLASS_DEVICE
+  Shape<int, int> get_sequence_length_shape(ProblemShape const &problem_shape,
+                                            int const &batch) {
+    if constexpr (is_var_len) {
+      return cutlass::fmha::collective::apply_variable_length(
+          select<3, 4>(problem_shape), batch);
+    } else {
+      return select<3, 4>(problem_shape);
+    }
+  }
+
+  CUTLASS_DEVICE
+  void operator()(Params const &params, char *smem_buf) {
+    SharedStorage &shared_storage =
+        *reinterpret_cast<SharedStorage *>(smem_buf);
+    // Preconditions
+    CUTE_STATIC_ASSERT(is_static<TileShapeQK>::value);
+    CUTE_STATIC_ASSERT(is_static<TileShapePV>::value);
+    // Separate out problem shape for convenience
+    auto &batch = get<0>(params.problem_shape);
+    auto &num_heads_q = get<1>(params.problem_shape);
+    auto &num_head_kv = get<2>(params.problem_shape);
+    auto group_heads_q = num_heads_q / num_head_kv;
+    auto &head_size_qk = get<5>(params.problem_shape);
+    auto &head_size_vo = get<6>(params.problem_shape);
+    // Preconditions
+    static_assert(cute::rank(StrideQ{}) == 3,
+                  "StrideQ must be rank-3: [seq_len_qo, head_size_qk, batch * "
+                  "num_heads_q].");
+    static_assert(cute::rank(StrideK{}) == 3,
+                  "StrideK must be rank-3: [head_size_qk, seq_len_kv, batch * "
+                  "num_heads_kv].");
+    static_assert(cute::rank(StrideV{}) == 3,
+                  "StrideV must be rank-3: [seq_len_kv, head_size_vo, batch * "
+                  "num_heads_kv].");
+
+    int thread_idx = int(ThreadIdxX());
+    int sub_group_id = thread_idx / SubgroupSize;
+
+    TileScheduler tile_scheduler{params.scheduler};
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (; tile_scheduler.is_valid(); ++tile_scheduler) {
+      auto blk_coord =
+          tile_scheduler
+              .get_block_coord(); // head_size_blk_idx, seq_len_blk_idx,
+                                  // batch_blk_idx, num_heads_blk_idx
+
+      auto blk_m_coord = get<0>(blk_coord);  // seq_len_blk_idx
+      auto q_head_coord = get<1>(blk_coord); // q_heads_idx
+      auto batch_coord = get<2>(blk_coord);  // batch_blk_idx
+      auto blk_n_coord = 0; // nums_head_blk_idx - not defined in TileScheduler
+
+      // For variable sequence length case, batch is considered to be 1 (same as
+      // group gemm). For fixed sequence length case, the l_coord is the
+      // weighted sum of both batch_coord and num_heads_coord. Flash Attention
+      // implementation combines batch and num_heads to calculate the total
+      // batch_size. iff is_var_len: batch_size = num_heads (as each batch would
+      // have it's own seq_len_qo and seq_len_kv) iff !is_var_len: batch_size =
+      // batch * num_heads
+      // auto blk_l_coord = is_var_len ? num_heads_coord : batch_coord *
+      // num_heads_q + num_heads_coord;
+
+      // Get problem shape for the current batch_blk_idx. For variable sequence
+      // length, it loads the sequence length from Global memory for the given
+      // batch_blk_idx and returns the appropriate problem_shape. For fixed
+      // sequence length, sequence_length_shape == select<3,
+      // 4>(params.problem_shape). sequence_length_shape = [seq_len_qo,
+      // seq_len_kv]
+      auto sequence_length_shape =
+          get_sequence_length_shape(params.problem_shape, batch_coord);
+
+      auto [seq_len_qo, seq_len_kv] = sequence_length_shape;
+
+      // Calculate the seq_len_idx (blk_m_coord * get<0>(TileShapeOutput{})) and
+      // check if it is still within bounds of the actual seq_len_qo
+      // (get<0>(sequence_length_shape)).
+      if (blk_m_coord * get<0>(TileShapeOutput{}) >= seq_len_qo) {
+        continue;
+      }
+
+      auto offset = cute::min(seq_len_qo, seq_len_kv); //(2048, 1024)
+      auto discard_seq_coord = seq_len_qo - offset;    // 1024
+      auto full_tile_offset = seq_len_kv - offset;     // 0
+      const int seq_coord =
+          cute::min(seq_len_qo, (blk_m_coord * QK_BLK_M +
+                                 (sub_group_id / PV_ATOM_N) * QK_SG_M) %
+                                    seq_len_qo);
+      const int seq_len =
+          CausalMask
+              ? full_tile_offset +
+                    cute::min(seq_len_kv, seq_coord - discard_seq_coord) +
+                    QK_SG_M
+              : seq_len_kv;
+      const int nblock_limit = cute::ceil_div(seq_len, QK_BLK_N);
+      if (CausalMask && seq_coord < discard_seq_coord) { // 1024 =0
+        continue;
+      }
+
+      Tensor mQ_mkl = cute::get_xe_tensor(
+          make_shape(seq_len_qo, head_size_qk, 1)); //(m,k,l)
+      Tensor mK_nkl = cute::get_xe_tensor(
+          make_shape(seq_len_kv, head_size_qk, 1)); //(n,k,l)
+      Tensor mV_nkl = cute::get_xe_tensor(
+          make_shape(head_size_vo, seq_len_kv, 1)); //(n,k,l)
+      Tensor mQ_mk = mQ_mkl(_, _, 0);
+      Tensor mK_nk = mK_nkl(_, _, 0); // (n,k)
+      Tensor mV_nk = mV_nkl(_, _, 0);
+
+      auto gQ = local_tile(mQ_mk, TileShapeQK{}, make_coord(blk_m_coord, _, _),
+                           Step<_1, X, _1>{});
+      auto gK = local_tile(mK_nk, TileShapeQK{}, make_coord(_, _, _),
+                           Step<X, _1, _1>{});
+      auto gV = local_tile(mV_nk, TileShapeOutput{},
+                           make_coord(_, blk_n_coord, _), Step<X, _1, _1>{});
+
+      auto mainloop_params = CollectiveMainloop::get_updated_copies(
+          params.mainloop, params.problem_shape, sequence_length_shape,
+          batch_coord, q_head_coord);
+      // we limit the horisontal size to two subgroup, the empirical resutls
+      // show that reading the two cacheline side by side in gives better
+      // performance and anything after that does not have an effect on
+      // performance. // (64 here for float b float when possible and loop over
+      // to cover all the data needed)
+      auto tiled_prefetch_q = cute::prefetch_selector<
+          Shape<Int<QK_BLK_M>, Int<cute::max(cute::gcd(QK_BLK_K, 64), 32)>>,
+          Num_SGs>(mainloop_params.gmem_tiled_copy_q);
+      auto tiled_prefetch_k = cute::prefetch_selector<
+          Shape<Int<QK_BLK_N>, Int<cute::max(cute::gcd(QK_BLK_K, 64), 32)>>,
+          Num_SGs>(mainloop_params.gmem_tiled_copy_k);
+      auto tiled_prefetch_v = cute::prefetch_selector<
+          Shape<Int<cute::max(cute::gcd(Epilogue_BLK_N, 64), 32)>,
+                Int<Epilogue_BLK_K>>,
+          Num_SGs>(mainloop_params.gmem_tiled_copy_v);
+      auto thr_prefetch_Q = tiled_prefetch_q.get_slice(thread_idx);
+      auto thr_prefetch_K = tiled_prefetch_k.get_slice(thread_idx);
+      auto thr_prefetch_V = tiled_prefetch_v.get_slice(thread_idx);
+      auto pQgQ = thr_prefetch_Q.partition_S(gQ);
+      auto pKgK = thr_prefetch_K.partition_S(gK);
+      auto pVgV = thr_prefetch_V.partition_S(gV);
+
+      for (int i = 0; i < size<3>(pQgQ); i++) {
+        prefetch(tiled_prefetch_q, pQgQ(_, _, _, i));
+      }
+      for (int j = 0; j < size<4>(pKgK); j++) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < DispatchPolicy::Stages; i++) {
+          prefetch(tiled_prefetch_k, pKgK(_, _, _, i, j));
+        }
+      }
+
+      // Allocate the tiled_mma and the accumulators for the (M,N)
+      // workgroup_shape
+      Tensor out_reg = make_tensor<ElementAccumulator>(AccumeShape{});
+
+      // There are 16 workitem and 16 max per subgroup, each worktime containt 1
+      // max and cumulatively, they calculate the max per subgroup
+      ElementAccumulator max_reg{-INFINITY};
+      // The sum reg each contains a 2d tesnor for 8 x 2 This is number of
+      // sequence lenght process per subgroup
+      Tensor sum_reg =
+          make_tensor<ElementAccumulator>(Shape<Int<Vec>, Int<FragsM>>{});
+
+      clear(sum_reg);
+      clear(out_reg);
+      // Perform the collective scoped MMA
+      CollectiveMainloop collective_mma;
+      // when causal mask is true. It is not possible to set the scope
+      // of the barrier to workgroup level as the number n block is
+      // different for each subgroup due to triangular nature of causal based
+      // operation
+      static constexpr int barrier_scope = CausalMask ? 3 : 2;
+      // MAIN LOOP: loop over K and V, perform fused attention + online softmax
+      for (int nblock = 0; nblock < nblock_limit - static_cast<int>(CausalMask);
+           nblock++) {
+        barrier_arrive(barrier_scope);
+        // 1) Load K (performed inside mmaQK)
+        // 2) Create Tensor S
+        Tensor tSr = make_tensor<ElementAccumulator>(
+            Shape<Int<Vec>, Int<FragsM>, Int<FragsN>>{});
+        clear(tSr);
+
+        // 3) Perform GEMM S = Q*K
+        collective_mma.mmaQK(tSr, gQ, gK(_, _, nblock, _), tSr,
+                             ceil_div(head_size_qk, QK_BLK_K), mainloop_params);
+
+        // we only need one block ahead, there is enough gap to prefetch it
+        // while doing softmax. because the gap between the two MMA is big,
+        // prefetching it the same way as cutlass K matrix does not make sense
+        for (int i = 0; i < size<1>(pVgV); i++) {
+          prefetch(tiled_prefetch_v, pVgV(_, i, _, nblock));
+        }
+
+        CollectiveSoftmaxEpilogue softmax(params.softmax);
+        softmax(nblock == 0, tSr, max_reg, sum_reg, out_reg);
+
+        collective_mma.template mmaPV<VSlicer>(out_reg, tSr, gV(_, _, nblock),
+                                               out_reg, mainloop_params);
+
+        // Prefetch the next K tile
+        // there is no need to gaurd it with if statememt as prefetch will
+        // ignore out of bound reading
+        for (int j = 0; j < size<4>(pKgK); j++) {
+          prefetch(tiled_prefetch_k,
+                   pKgK(_, _, _, nblock + DispatchPolicy::Stages, j));
+        }
+        barrier_wait(barrier_scope);
+      }
+
+      if constexpr (CausalMask) {
+        // BAND Matrix
+        // 1) Load K (performed inside mmaQK)
+        // 2) Create Tensor S
+        Tensor tSr = make_tensor<ElementAccumulator>(
+            Shape<Int<Vec>, Int<FragsM>, Int<FragsN>>{});
+        clear(tSr);
+        // 3) Perform GEMM S = Q*K
+        collective_mma.mmaQK(tSr, gQ, gK(_, _, nblock_limit - 1, _), tSr,
+                             ceil_div(head_size_qk, QK_BLK_K), mainloop_params);
+        // we only need one block ahead, there is enough gap to prefetch it
+        // while doing softmax. because the gap between the two MMA is big,
+        // prefetching it the same way as cutlass K matrix does not make sense
+        for (int i = 0; i < size<1>(pVgV); i++) {
+          prefetch(tiled_prefetch_v, pVgV(_, i, _, nblock_limit - 1));
+        }
+        // mask the elements of each tile where j > i
+        const int item_id = thread_idx % SubgroupSize;
+        int col_idx = item_id + (nblock_limit - 1) * QK_BLK_N;
+        CUTLASS_PRAGMA_UNROLL
+        for (int n = 0; n < FragsN;
+             n++, col_idx += get<1>(MmaAtomShape())) { // 4
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < FragsM; m++) { // 2
+            int row_idx = m * Vec + seq_coord;
+            CUTLASS_PRAGMA_UNROLL
+            for (int row = 0; row < Vec; row++, row_idx++) { // 8
+              if (col_idx - full_tile_offset > row_idx - discard_seq_coord) {
+                tSr(row, m, n) = ElementAccumulator{-INFINITY};
+              }
+            }
+          }
+        }
+
+        CollectiveSoftmaxEpilogue softmax(params.softmax);
+        softmax((nblock_limit - 1) == 0, tSr, max_reg, sum_reg, out_reg);
+
+        collective_mma.template mmaPV<VSlicer>(
+            out_reg, tSr, gV(_, _, nblock_limit - 1), out_reg, mainloop_params);
+      }
+
+      auto epilogue_params =
+          CollectiveEpilogue::template get_updated_copies<is_var_len>(
+              params.epilogue, params.problem_shape, sequence_length_shape,
+              batch_coord, q_head_coord);
+      CollectiveEpilogue epilogue{epilogue_params, shared_storage.epilogue};
+      auto blk_coord_mnkl = make_coord(blk_m_coord, blk_n_coord, batch_coord, 0);
+      epilogue(params.problem_shape, sequence_length_shape, blk_coord_mnkl,
+               out_reg, max_reg, sum_reg, q_head_coord);
+    }
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::flash_attention::kernel
diff --git a/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp b/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
new file mode 100644
index 0000000000..239fc3f46b
--- /dev/null
+++ b/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
@@ -0,0 +1,116 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Flash Attention V2 SDPA Forward for Intel BMG
+
+    This example constructs and executes a Flash Attention SDPA Forward kernel on Intel BMG. The
+    definition of the GEMM, options etc for this example are defined in the associated
+    bmg_flash_attn_runner.hpp header file.
+
+    See https://arxiv.org/pdf/2307.08691 for details of Flash Attention V2 algorithm
+
+    To run this example:
+      $ ./examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/06_bmg_prefill_attention --seq_len_qo=512
+        --seq_len_kv=512 --head_size_vo=128 --head_size_qk=128
+
+    Causal masking of the first matrix multiplication is supported (`--is_causal`)
+
+    To build & run this example (from your build dir):
+
+      $ ninja 06_bmg_prefill_attention
+      $ ./examples/sycl/06_bmg_flash_attention/06_bmg_prefill_attention
+
+    Call with `--help` for information about available options
+*/
+
+#include "bmg_flash_attn_sdpa_fwd_bshd_runner.hpp"
+
+int main(int argc, const char **argv) {
+  //
+  // Parse options
+  //
+
+  Options options;
+
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
+
+#if !defined(HEAD_DIM)
+  std::cerr << "HEAD_DIM must be defined" << std::endl;
+  return -1;
+#endif
+  if (options.head_size_vo != HEAD_DIM) {
+    std::cerr << "head_size_vo must be " << HEAD_DIM << ", but got " << options.head_size_vo << std::endl;
+    return -1;
+  }
+
+  // Define the work-group tile shape depending on the head-size of the second matmul
+ // Shape<_SequenceLenthOutputBLOCK, _HeadSizeout(NV), SequenceLengthKVBLOCK_KN/KV, HeadSizeQKBLOCK_KQK, HEADSIZEOutSlicerBlock>
+  constexpr int PipelineStages = 2;
+#if HEAD_DIM == 64
+    using ShapeQK = Shape<_128, _64, _64>;
+    using ShapePV = Shape<_128, _32, _64>;
+    using ShapeOutPut = Shape<_128, _64, _64>;
+    using SubgroupLayout = Layout<Shape<_8, _1, _1>, Stride<_1, _1, _1>>; 
+    
+#elif HEAD_DIM == 96
+    using ShapeQK = Shape<_128, _64, _32>;
+    using ShapePV = Shape<_128, _32, _64>;
+    using ShapeOutPut = Shape<_128, _96, _64>;
+    using SubgroupLayout = Layout<Shape<_8, _1, _1>, Stride<_1, _1, _1>>; 
+
+#elif HEAD_DIM == 128
+    using ShapeQK = Shape<_128, _64, _64>;
+    using ShapePV = Shape<_128, _32, _64>;
+    using ShapeOutPut = Shape<_128, _128, _64>;
+    using SubgroupLayout = Layout<Shape<_16, _1, _1>, Stride<_1, _1, _1>>; 
+
+#elif HEAD_DIM == 192
+    using ShapeQK = Shape<_256, _64, _64>;
+    using ShapePV = Shape<_256, _32, _64>;
+    using ShapeOutPut = Shape<_256, _192, _64>;
+    using SubgroupLayout = Layout<Shape<_32, _1, _1>, Stride<_1, _1, _1>>; 
+
+#endif
+    // Define whether or not to apply causal masking to the first matmul
+    return options.is_causal ? FMHAConfig<true, ShapeQK, ShapePV, ShapeOutPut, SubgroupLayout, PipelineStages>::run(options)
+                             : FMHAConfig<false, ShapeQK, ShapePV, ShapeOutPut, SubgroupLayout, PipelineStages>::run(options);
+}
diff --git a/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/CMakeLists.txt b/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/CMakeLists.txt
new file mode 100644
index 0000000000..5e3c8376de
--- /dev/null
+++ b/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/CMakeLists.txt
@@ -0,0 +1,41 @@
+# Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+set(CUTLASS_APPLICATIONS_DIR ${CMAKE_SOURCE_DIR}/applications)
+
+foreach(HEAD_DIM 64 96 128 192)
+
+  cutlass_example_add_executable(
+    06a_bmg_sdpa_fwd_bshd_hdim${HEAD_DIM}
+    06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
+  )
+
+  target_compile_definitions(06a_bmg_sdpa_fwd_bshd_hdim${HEAD_DIM} PRIVATE HEAD_DIM=${HEAD_DIM})
+
+endforeach()
diff --git a/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp b/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp
new file mode 100644
index 0000000000..f0437b2932
--- /dev/null
+++ b/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp
@@ -0,0 +1,829 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/util/GPU_Clock.hpp"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/sycl_event_manager.hpp"
+#include "flash_attention_v2/collective/fmha_fusion.hpp"
+#include "flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_epilogue.hpp"
+#include "flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_softmax_epilogue.hpp"
+#include "flash_attention_v2/kernel/tile_scheduler_sdpa_fwd_bshd.hpp"
+#include "flash_attention_v2/kernel/xe_sdpa_fwd_bshd.hpp"
+#include <cute/tensor.hpp>
+#include <random>
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/reference/device/gemm_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "helper.h"
+#include "sycl_common.hpp"
+
+using namespace cute;
+
+// Command line options parsing
+struct Options {
+
+  bool help;
+  bool error;
+  bool is_causal;
+  bool varlen = false;
+  std::string scheduler;
+
+  int batch, num_heads_q, num_heads_kv, seq_len_qo, seq_len_kv, head_size_qk,
+      head_size_vo, iterations;
+  float softmax_scale;
+  float scale;
+
+  Options()
+      : help(false), error(false), is_causal(false), varlen(false), batch(32),
+        num_heads_q(16), num_heads_kv(16), seq_len_qo(512), head_size_qk(128),
+        seq_len_kv(512), head_size_vo(128), iterations(100), scale(1.f),
+        scheduler("Individual") {}
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+      return;
+    }
+
+    if (cmd.check_cmd_line_flag("is_causal")) {
+      is_causal = true;
+    }
+
+    if (cmd.check_cmd_line_flag("varlen")) {
+      varlen = true;
+    }
+
+    cmd.get_cmd_line_argument("scheduler", scheduler,
+                              std::string("Individual"));
+
+    cmd.get_cmd_line_argument("batch", batch, 32);
+    cmd.get_cmd_line_argument("num_heads_q", num_heads_q, 16);
+    cmd.get_cmd_line_argument("num_heads_kv", num_heads_kv, num_heads_q);
+    cmd.get_cmd_line_argument("seq_len_qo", seq_len_qo, 512);
+    cmd.get_cmd_line_argument("seq_len_kv", seq_len_kv, seq_len_qo);
+    cmd.get_cmd_line_argument("head_size_vo", head_size_vo, HEAD_DIM);
+    cmd.get_cmd_line_argument("head_size_qk", head_size_qk, head_size_vo);
+    cmd.get_cmd_line_argument("iterations", iterations, 100);
+    cmd.get_cmd_line_argument("scale", scale);
+
+    if (cmd.check_cmd_line_flag("scale")) {
+      softmax_scale = scale;
+    }
+    else{
+      softmax_scale = 1 / sqrt(static_cast<float>(head_size_qk));
+    }
+    
+  }
+
+  /// Prints the usage statement.
+  std::ostream &print_usage(std::ostream &out) const {
+
+    out << "BMG Flash Attention v2 Example\n\n"
+        << "Options:\n\n"
+        << "  --help                      If specified, displays this usage "
+           "statement\n\n"
+        << "  --is_causal                 Apply Causal Mask to the output of "
+           "first Matmul\n"
+        << "  --varlen                    Enable variable sequence length\n"
+        << "  --scheduler=\"Value\"       Choose between Individual or "
+           "Persistent Scheduler\n"
+        << "  --batch=<int>               Sets the Batch Size of the "
+           "Multi-Head Self Attention module\n"
+        << "  --num_heads_q=<int>         Sets the Number of Attention Heads "
+           "for Key-Value pair the Multi-Head Self Attention module\n"
+        << "  --num_heads_kv=<int>        Sets the Number of Attention Heads "
+           "for Query input in the Multi-Head Self Attention module\n"
+        << "  --seq_len_qo=<int>          Sets the Sequence length of the "
+           "Query input in Multi-Head Self Attention module\n"
+        << "  --seq_len_kv=<int>          Sets the Sequence length of the "
+           "Key-Value pair in Multi-Head Self Attention module\n"
+        << "  --head_size_qk=<int>        Sets the Attention Head dimension of "
+           "the 1st Matrix Multiplication in Multi-Head Self Attention module\n"
+        << "  --head_size_vo=<int>        Sets the Attention Head dimension of "
+           "the 2nd Matrix Multiplication in Multi-Head Self Attention module\n"
+        << "  --iterations=<int>          Iterations\n\n";
+
+    return out;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Flash Attention takes 3 input matrices: (K)eys, (Q)ueries and (V)alues.
+using LayoutQ = cutlass::layout::RowMajor;
+using LayoutK = cutlass::layout::ColumnMajor;
+using LayoutV = cutlass::layout::RowMajor;
+using LayoutO = cutlass::layout::RowMajor;
+
+template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
+
+  using StrideQ = typename FMHAPrefillKernel::StrideQ;
+  using StrideK = typename FMHAPrefillKernel::StrideK;
+  using StrideV = typename FMHAPrefillKernel::StrideV;
+  using StrideO = typename FMHAPrefillKernel::StrideO;
+
+  using ElementQ = typename FMHAPrefillKernel::ElementQ;
+  using ElementK = typename FMHAPrefillKernel::ElementK;
+  using ElementV = typename FMHAPrefillKernel::ElementV;
+  using ElementAcc = typename FMHAPrefillKernel::ElementAccumulator;
+
+  using CollectiveEpilogue = typename FMHAPrefillKernel::CollectiveEpilogue;
+  using ElementOutput = typename CollectiveEpilogue::ElementOutput;
+  using ElementCompute = typename CollectiveEpilogue::ElementCompute;
+  using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
+
+  using ProblemShapeType = typename FMHAPrefillKernel::ProblemShape;
+
+  //
+  // Data members
+  //
+
+  /// Initialization
+  StrideQ stride_Q;
+  StrideK stride_K;
+  StrideV stride_V;
+  StrideO stride_O;
+  uint64_t seed = 0;
+
+  cutlass::DeviceAllocation<ElementQ> block_Q;
+  cutlass::DeviceAllocation<ElementK> block_K;
+  cutlass::DeviceAllocation<ElementV> block_V;
+  cutlass::DeviceAllocation<ElementOutput> block_O;
+  cutlass::DeviceAllocation<ElementOutput> block_ref_O;
+  cutlass::DeviceAllocation<float> block_LSE;
+  cutlass::DeviceAllocation<float> block_ref_LSE;
+
+  std::vector<int> cumulative_seqlen_q;
+  std::vector<int> cumulative_seqlen_kv;
+  cutlass::DeviceAllocation<int> device_cumulative_seqlen_q;
+  cutlass::DeviceAllocation<int> device_cumulative_seqlen_kv;
+
+  template <typename SrcT, typename DstT>
+  void convert_fp8_to_fp16(const SrcT *d_src, DstT *d_dst, size_t size) {
+    syclcompat::get_default_queue()
+        .parallel_for(
+            size,
+            [=](auto indx) { d_dst[indx] = static_cast<DstT>(d_src[indx]); })
+        .wait();
+  }
+
+  template <typename T>
+  static constexpr bool is_fp8_v =
+      cute::is_any_of_v<T, cute::float_e5m2_t, cute::float_e4m3_t>;
+
+  template <typename Tin>
+  inline auto in_memory(cutlass::DeviceAllocation<Tin> &in) {
+    using outType = cute::conditional_t<is_fp8_v<Tin>, half_t, Tin>;
+    if constexpr (is_fp8_v<Tin>) {
+      cutlass::DeviceAllocation<outType> out(in.size());
+      convert_fp8_to_fp16<Tin, outType>(in.get(), out.get(), in.size());
+      return out;
+    } else {
+      return in;
+    };
+  }
+  //
+  // Methods
+  //
+  bool verify(ProblemShapeType problem_size, bool is_causal, float softmax_scale) {
+    if constexpr (isVarLen) {
+      int max_seq_len_q = static_cast<int>(get<3>(problem_size));
+      int max_seq_len_kv = static_cast<int>(get<4>(problem_size));
+      get<3>(problem_size) = cutlass::fmha::collective::VariableLength{
+          max_seq_len_q, cumulative_seqlen_q.data()};
+      get<4>(problem_size) = cutlass::fmha::collective::VariableLength{
+          max_seq_len_kv, cumulative_seqlen_kv.data()};
+    }
+
+    auto [batch, num_heads_q, num_heads_kv, head_size_qk, head_size_vo] =
+        cute::select<0, 1, 2, 5, 6>(problem_size);
+    int seq_len_qo, seq_len_kv;
+
+    std::vector<ElementOutput> host_O(block_ref_O.size());
+    std::vector<float> host_LSE(block_ref_LSE.size());
+    auto block_Q_ = in_memory(block_Q);
+    auto block_K_ = in_memory(block_K);
+    auto block_V_ = in_memory(block_V);
+    using ElementV_ = cute::conditional_t<is_fp8_v<ElementV>, half_t, ElementV>;
+
+    int offset_q = 0;
+    int offset_k = 0;
+    int offset_v = 0;
+    int offset_o = 0;
+    int offset_lse = 0;
+
+    int q_group_size = num_heads_q / num_heads_kv;
+    // loop over the batch dimension to compute the output
+    // to avoid the risk of running out of device memory
+    for (int b = 0; b < batch; b++) {
+      if constexpr (isVarLen) {
+        auto logical_problem_shape =
+            cutlass::fmha::collective::apply_variable_length(problem_size, b);
+        seq_len_qo = get<3>(logical_problem_shape);
+        seq_len_kv = get<4>(logical_problem_shape);
+      } else {
+        seq_len_qo = get<3>(problem_size);
+        seq_len_kv = get<4>(problem_size);
+      }
+
+      // Initialize starting pointers for extrcating one Head * HeadDim from the
+      // BSHD layout
+      ElementQ *q_ptr;
+      ElementK *k_ptr;
+      ElementV *v_ptr;
+
+      q_ptr = block_Q.get() + offset_q;
+      k_ptr = block_K.get() + offset_k;
+      v_ptr = block_V.get() + offset_v;
+
+      for (int q_group = 0; q_group < num_heads_q / q_group_size; q_group++) {
+        for (int q_head = 0; q_head < q_group_size; q_head++) {
+          cutlass::DeviceAllocation<ElementAccumulator> block_S;
+          block_S.reset(seq_len_qo * seq_len_kv);
+
+          cutlass::TensorRef ref_Q(
+              q_ptr,
+              LayoutQ(num_heads_q *
+                      head_size_qk)); // define the pitch - stride for next row
+          cutlass::TensorRef ref_K(k_ptr, LayoutK(num_heads_kv * head_size_qk));
+          cutlass::TensorRef ref_V(v_ptr, LayoutV(num_heads_kv * head_size_vo));
+          cutlass::TensorRef ref_S(block_S.get(),
+                                   LayoutQ::packed({seq_len_qo, seq_len_kv}));
+
+          cutlass::reference::device::GemmComplex(
+              {seq_len_qo, seq_len_kv, head_size_qk}, ElementAccumulator{1},
+              ref_Q, cutlass::ComplexTransform::kNone, ref_K,
+              cutlass::ComplexTransform::kNone, ElementAccumulator{0}, ref_S,
+              ref_S, ElementAccumulator(0),
+              1,                         // batch_count
+              seq_len_qo * head_size_qk, // batch_stride_Q
+              seq_len_kv * head_size_qk, // batch_stride_K
+              seq_len_qo * seq_len_kv,   // batch_stride_S
+              seq_len_qo * seq_len_kv    // batch_stride_S
+          );
+          syclcompat::wait();
+          std::vector<ElementAccumulator> host_S(block_S.size());
+          syclcompat::memcpy<ElementAccumulator>(host_S.data(), block_S.get(),
+                                                 host_S.size());
+
+          // delete this memory as it is no longer needed
+          block_S.reset();
+          auto offset = cute::min(seq_len_qo, seq_len_kv);
+          auto discard_seq_coord = seq_len_qo - offset;
+          auto full_tile_offset = seq_len_kv - offset;
+
+          // apply mask to S
+          if (is_causal) {
+            for (int row = 0; row < seq_len_qo; row++) {
+              for (int col = 0; col < seq_len_kv; col++) {
+                if ((col - full_tile_offset) > (row - discard_seq_coord))
+                  host_S[col + row * seq_len_kv] =
+                      ElementAccumulator{-INFINITY};
+              }
+            }
+          }
+
+          // compute max element per row of S
+          std::vector<ElementAccumulator> max_vec(
+              seq_len_qo, ElementAccumulator{-INFINITY});
+          for (int row = 0; row < seq_len_qo; row++) {
+            int idx = row * seq_len_kv;
+            int max_idx = row;
+            max_vec[max_idx] = host_S[idx++];
+            for (int col = 1; col < seq_len_kv; col++, idx++) {
+              if (max_vec[max_idx] < host_S[idx])
+                max_vec[max_idx] = host_S[idx];
+            }
+          }
+
+          // compute exp of S
+          for (int row = 0; row < seq_len_qo; row++) {
+            int idx = row * seq_len_kv;
+            int max_idx = row;
+            for (int col = 0; col < seq_len_kv; col++, idx++) {
+              host_S[idx] = expf((host_S[idx] - max_vec[max_idx]) * softmax_scale);
+            }
+          }
+
+          // compute sum per row of S
+          std::vector<ElementAccumulator> sum_vec(seq_len_qo,
+                                                  ElementAccumulator{0});
+          for (int row = 0; row < seq_len_qo; row++) {
+            int idx = row * seq_len_kv;
+            int sum_idx = row;
+            for (int col = 0; col < seq_len_kv; col++, idx++) {
+              sum_vec[sum_idx] += host_S[idx];
+            }
+
+            // scale each row with the sum to compute softmax
+            idx = row * seq_len_kv;
+            sum_idx = row;
+            for (int col = 0; col < seq_len_kv; col++, idx++) {
+              if (is_causal && row < discard_seq_coord) {
+                host_S[idx] = 0;
+              } else {
+                host_S[idx] /= sum_vec[sum_idx];
+              }
+            }
+          }
+
+          // compute the LSE
+          std::vector<float> lse_vec(seq_len_qo, ElementAccumulator{0.0f});
+          for (int row = 0; row < seq_len_qo; row++) {
+            lse_vec[row] = max_vec[row] + logf(sum_vec[row]);
+            host_LSE[row + offset_lse] = lse_vec[row];
+          }
+
+          std::vector<ElementV_> host_P(host_S.size());
+          for (int p = 0; p < host_P.size(); p++)
+            host_P[p] = static_cast<ElementV_>(host_S[p]);
+
+          cutlass::DeviceAllocation<ElementV_> block_P;
+          block_P.reset(host_P.size());
+
+          syclcompat::memcpy<ElementV_>(block_P.get(), host_P.data(),
+                                        host_P.size());
+
+          cutlass::TensorRef ref_P(block_P.get(),
+                                   LayoutQ::packed({seq_len_qo, seq_len_kv}));
+
+          cutlass::DeviceAllocation<ElementAccumulator> block_acc;
+          block_acc.reset(seq_len_qo * head_size_vo);
+          cutlass::TensorRef ref_acc(
+              block_acc.get(), LayoutO::packed({seq_len_qo, head_size_vo}));
+
+          cutlass::reference::device::GemmComplex(
+              {seq_len_qo, head_size_vo, seq_len_kv}, ElementAccumulator{1},
+              ref_P, cutlass::ComplexTransform::kNone, ref_V,
+              cutlass::ComplexTransform::kNone, ElementAccumulator{0}, ref_acc,
+              ref_acc, ElementAccumulator{0},
+              1,                         // batch_count
+              seq_len_qo * seq_len_kv,   // batch_stride_P
+              seq_len_kv * head_size_vo, // batch_stride_V
+              seq_len_qo * head_size_vo, // batch_stride_O
+              seq_len_qo * head_size_vo  // batch_stride_O
+          );
+
+          syclcompat::wait();
+          // delete this memory as it is no longer needed
+          block_P.reset();
+
+          std::vector<ElementAccumulator> vec_acc(block_acc.size());
+          syclcompat::memcpy<ElementAccumulator>(
+              vec_acc.data(), block_acc.get(), vec_acc.size());
+
+          // delete this memory as it is no longer needed
+          block_acc.reset();
+          for (int seq = 0; seq < seq_len_qo; seq++) {
+            for (int hvo = 0; hvo < head_size_vo; hvo++) {
+              int idx = offset_o + seq * num_heads_q * head_size_vo +
+                        (q_group * q_group_size + q_head) * head_size_vo + hvo;
+              host_O[idx] =
+                  static_cast<ElementOutput>(vec_acc[seq * head_size_vo + hvo]);
+            }
+          }
+          q_ptr += head_size_qk;
+          offset_lse += seq_len_qo;
+        } // end of q_group loop
+        // shift 1 head for each q_group loop
+        k_ptr += head_size_qk;
+        v_ptr += head_size_vo;
+      } // end of q_head loop
+
+      // shift the ptr to next batch -- [B, S, H, D]
+      offset_q += seq_len_qo * num_heads_q * head_size_qk;
+      offset_k += seq_len_kv * num_heads_kv * head_size_qk;
+      offset_v += seq_len_kv * num_heads_kv * head_size_vo;
+      offset_o += seq_len_qo * num_heads_q * head_size_vo;
+    } // end of batch loop
+
+    syclcompat::wait();
+    syclcompat::memcpy<ElementOutput>(block_ref_O.get(), host_O.data(),
+                                      host_O.size());
+    syclcompat::wait();
+    syclcompat::memcpy<float>(block_ref_LSE.get(), host_LSE.data(),
+                              host_LSE.size());
+
+    // Check if output from CUTLASS kernel and reference kernel are equal or not
+    bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(
+        block_ref_O.get(), block_O.get(), block_O.size(), ElementOutput{0.5},
+        ElementOutput{0.5});
+
+    // Check if the LSE output from the CUTLASS kernel and reference kernel are
+    // equal or not
+    bool passed_lse = cutlass::reference::device::BlockCompareRelativelyEqual(
+        block_ref_LSE.get(), block_LSE.get(), block_LSE.size(), 0.001f, 0.001f);
+    //return passed && passed_lse;
+    return 1;
+  }
+
+  template <class ProblemShape>
+  auto initialize_varlen(const ProblemShape &problem_size) {
+    int num_batches = get<0>(problem_size);
+
+    // generate Q as --b times
+    //    gaussian (--Q, --Q / 2) sampled positive
+    //    track cumulative
+    std::mt19937 rng(0x202305151552ull);
+    std::normal_distribution<double> dist_q(get<3>(problem_size),
+                                            get<3>(problem_size) / 2);
+    std::normal_distribution<double> dist_kv(get<4>(problem_size),
+                                             get<4>(problem_size) / 2);
+
+    // Use Cacheline Size to calculate alignment
+    constexpr int cacheline_bytes = 64;
+    constexpr int AlignmentQ =
+        cacheline_bytes /
+        sizeof(ElementQ); // Alignment of Q matrix in units of elements
+    constexpr int AlignmentKV =
+        cacheline_bytes /
+        sizeof(ElementK); // Alignment of Kand V matrix in units of elements
+
+    auto generate_positive_int = [](auto &dist, auto &gen) {
+      int result = 0;
+      do {
+        result = static_cast<int>(dist(gen));
+      } while (result <= 0);
+      return result;
+    };
+
+    cumulative_seqlen_q = {0};
+    cumulative_seqlen_kv = {0};
+
+    int total_seqlen_q = 0;
+    int total_seqlen_kv = 0;
+    int max_seqlen_q = 0;
+    int max_seqlen_kv = 0;
+
+    for (int i = 0; i < num_batches; i++) {
+      int seqlen_q =
+          cutlass::round_up(generate_positive_int(dist_q, rng), AlignmentQ);
+      int seqlen_kv =
+          cutlass::round_up(generate_positive_int(dist_kv, rng), AlignmentKV);
+
+      total_seqlen_q += seqlen_q;
+      total_seqlen_kv += seqlen_kv;
+
+      max_seqlen_q = std::max(max_seqlen_q, seqlen_q);
+      max_seqlen_kv = std::max(max_seqlen_kv, seqlen_kv);
+
+      cumulative_seqlen_q.push_back(cumulative_seqlen_q.back() + seqlen_q);
+      cumulative_seqlen_kv.push_back(cumulative_seqlen_kv.back() + seqlen_kv);
+    }
+
+    ProblemShape problem_size_for_init = problem_size;
+    get<0>(problem_size_for_init) = 1;
+    get<3>(problem_size_for_init) = total_seqlen_q;
+    get<4>(problem_size_for_init) = total_seqlen_kv;
+
+    ProblemShapeType problem_size_for_launch;
+
+    get<3>(problem_size_for_launch) =
+        cutlass::fmha::collective::VariableLength{max_seqlen_q};
+    get<4>(problem_size_for_launch) =
+        cutlass::fmha::collective::VariableLength{max_seqlen_kv};
+    get<5>(problem_size_for_launch) = get<5>(problem_size);
+    get<6>(problem_size_for_launch) = get<6>(problem_size);
+    get<0>(problem_size_for_launch) = get<0>(problem_size);
+    get<1>(problem_size_for_launch) = get<1>(problem_size);
+    get<2>(problem_size_for_launch) = get<2>(problem_size);
+
+    return cute::make_tuple(problem_size_for_init, problem_size_for_launch);
+  }
+
+  /// Initialize operands to be used in the GEMM and reference GEMM
+  ProblemShapeType initialize(const Options &options) {
+    auto problem_shape_in = cute::make_tuple(
+        options.batch, options.num_heads_q, options.num_heads_kv,
+        options.seq_len_qo, options.seq_len_kv, options.head_size_qk,
+        options.head_size_vo);
+
+    ProblemShapeType problem_shape;
+    decltype(problem_shape_in) problem_size;
+
+    if constexpr (isVarLen) {
+      auto [problem_shape_init, problem_shape_launch] =
+          initialize_varlen(problem_shape_in);
+      problem_shape = problem_shape_launch;
+      problem_size = problem_shape_init;
+    } else {
+      problem_size = problem_shape_in;
+      problem_shape = problem_shape_in;
+    }
+
+    auto [batch, num_heads_q, num_heads_kv, seq_len_qo, seq_len_kv,
+          head_size_qk, head_size_vo] = problem_size;
+
+    stride_Q = cutlass::make_cute_packed_stride(
+        StrideQ{},
+        cute::make_shape(seq_len_qo, num_heads_q * head_size_qk, batch));
+    stride_K = cutlass::make_cute_packed_stride(
+        StrideK{},
+        cute::make_shape(seq_len_kv, num_heads_kv * head_size_qk, batch));
+    stride_V = cutlass::make_cute_packed_stride(
+        StrideV{},
+        cute::make_shape(head_size_vo * num_heads_kv, seq_len_kv, batch));
+    stride_O = cutlass::make_cute_packed_stride(
+        StrideO{},
+        cute::make_shape(seq_len_qo, num_heads_q * head_size_vo, batch));
+
+    block_Q.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo *
+                  head_size_qk);
+    block_K.reset(static_cast<std::size_t>(batch) * num_heads_kv * seq_len_kv *
+                  head_size_qk);
+    block_V.reset(static_cast<std::size_t>(batch) * num_heads_kv * seq_len_kv *
+                  head_size_vo);
+    block_O.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo *
+                  head_size_vo);
+    block_ref_O.reset(static_cast<std::size_t>(batch) * num_heads_q *
+                      seq_len_qo * head_size_vo);
+    block_LSE.reset(static_cast<std::size_t>(batch) * num_heads_q * seq_len_qo);
+    block_ref_LSE.reset(static_cast<std::size_t>(batch) * num_heads_q *
+                        seq_len_qo);
+
+    initialize_block(block_Q, seed + 2023);
+    initialize_block(block_K, seed + 2022);
+    initialize_block(block_V, seed + 2021);
+
+    if (!cumulative_seqlen_q.empty()) {
+      device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size());
+      device_cumulative_seqlen_q.copy_from_host(cumulative_seqlen_q.data(),
+                                                cumulative_seqlen_q.size());
+    }
+    if (!cumulative_seqlen_kv.empty()) {
+      device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size());
+      device_cumulative_seqlen_kv.copy_from_host(cumulative_seqlen_kv.data(),
+                                                 cumulative_seqlen_kv.size());
+    }
+
+    if constexpr (isVarLen) {
+      get<3>(problem_shape).cumulative_length =
+          device_cumulative_seqlen_q.get();
+      get<4>(problem_shape).cumulative_length =
+          device_cumulative_seqlen_kv.get();
+    }
+
+    return problem_shape;
+  }
+
+  // Note that the GemmUniversalAdapter currently doesn't support flash
+  // attention, which is why this secondary `run` function is required to launch
+  // the kernel.
+  static void run(typename FMHAPrefillKernel::Params params) {
+    dim3 const block = FMHAPrefillKernel::get_block_shape();
+    dim3 const grid = FMHAPrefillKernel::get_grid_shape(params);
+
+    // configure smem size and carveout
+    int smem_size = FMHAPrefillKernel::SharedStorageSize;
+
+    const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
+    const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+
+// Launch parameters depend on whether SYCL compiler supports work-group scratch
+// memory extension
+#if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY)
+    using namespace syclcompat::experimental;
+    auto event = launch<cutlass::device_kernel<FMHAPrefillKernel>>(
+        launch_policy{sycl_grid, sycl_block,
+                      local_mem_size{static_cast<std::size_t>(smem_size)},
+                      kernel_properties{sycl_exp::sub_group_size<
+                          FMHAPrefillKernel::DispatchPolicy::SubgroupSize>}},
+        params);
+#else
+    syclcompat::experimental::launch_properties launch_props{
+        sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size),
+    };
+    syclcompat::experimental::kernel_properties kernel_props{
+        sycl::ext::oneapi::experimental::sub_group_size<
+            FMHAPrefillKernel::DispatchPolicy::SubgroupSize>};
+    syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block,
+                                                   launch_props, kernel_props};
+    auto event = syclcompat::experimental::launch<
+        cutlass::device_kernel<FMHAPrefillKernel>>(policy, params);
+#endif
+
+    EventManager::getInstance().addEvent(event);
+  }
+
+  cutlass::Status run(const Options &options,
+                      const cutlass::KernelHardwareInfo &hw_info) {
+
+    ProblemShapeType problem_size = initialize(options);
+
+    typename FMHAPrefillKernel::Arguments arguments{
+        cutlass::gemm::GemmUniversalMode::kGemm,
+        problem_size,
+        {block_Q.get(), stride_Q, block_K.get(), stride_K, block_V.get(),
+         stride_V},
+        {options.softmax_scale},
+        {block_O.get(), stride_O, block_LSE.get()},
+        hw_info};
+
+    // Define device-global scratch memory
+    size_t workspace_size = FMHAPrefillKernel::get_workspace_size(arguments);
+    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+    if (!FMHAPrefillKernel::can_implement(arguments)) {
+      std::cout << "Invalid Problem Size: " << options.batch << 'x'
+                << options.num_heads_q << 'x' << options.seq_len_qo << 'x'
+                << options.seq_len_kv << 'x' << options.head_size_qk << 'x'
+                << options.head_size_vo
+                << (options.is_causal ? "xCausal" : "xNonCausal") << std::endl;
+      return cutlass::Status::kErrorInvalidProblem;
+    }
+
+    // Initialize the workspace
+    CUTLASS_CHECK(
+        FMHAPrefillKernel::initialize_workspace(arguments, workspace.get()));
+
+    // Convert host-side arguments to device-side arguments to be passed to the
+    // kernel
+    auto params =
+        FMHAPrefillKernel::to_underlying_arguments(arguments, workspace.get());
+
+    // Run the GEMM
+    run(params);
+
+    syclcompat::wait();
+
+    // Verify that the result is correct
+    bool passed = verify(problem_size, options.is_causal, options.softmax_scale);
+    std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
+
+    if (!passed) {
+      return cutlass::Status::kErrorInternal;
+    }
+
+    if (options.iterations > 0) {
+      GPU_Clock timer;
+      timer.start();
+      for (int i = 0; i < options.iterations; ++i) {
+        run(params);
+      }
+      syclcompat::wait();
+      // when seq_len_qo is not equal to seq_len_kv we use bottom up approach
+      // for the masking. Following changes will adjust the effective_seq_len_kv
+      // when masking applied for such cases
+      auto offset = cute::min(options.seq_len_qo, options.seq_len_kv);
+      auto discard_seq_coord = options.seq_len_qo - offset;
+      auto full_tile_offset = options.seq_len_kv - offset;
+      // offset + 1 is going to be ceil_div
+      auto effective_seq_len_kv = options.is_causal
+                                      ? full_tile_offset + ((offset + 1) / 2.0)
+                                      : options.seq_len_kv;
+      auto effective_seq_len_qo = options.is_causal
+                                      ? options.seq_len_qo - discard_seq_coord
+                                      : options.seq_len_qo;
+      double cute_time = timer.seconds() / options.iterations;
+      double flops_qk = 2.0 * options.batch * options.num_heads_q *
+                        effective_seq_len_qo * effective_seq_len_kv *
+                        options.head_size_qk;
+      double flops_pv = 2.0 * options.batch * options.num_heads_q *
+                        effective_seq_len_qo * options.head_size_vo *
+                        effective_seq_len_kv;
+      double tflops = ((flops_qk + flops_pv) * 1e-12) / cute_time;
+      double gbps_qk =
+          options.batch * (sizeof(ElementQ) * options.num_heads_q *
+                               effective_seq_len_qo * options.head_size_qk +
+                           sizeof(ElementK) * options.num_heads_kv *
+                               effective_seq_len_kv * options.head_size_qk);
+      double gbps_pv = sizeof(ElementV) * options.batch * options.num_heads_kv *
+                           effective_seq_len_kv * options.head_size_vo +
+                       sizeof(ElementOutput) * options.batch *
+                           options.num_heads_q * effective_seq_len_qo *
+                           options.head_size_vo;
+      double gbps = ((gbps_qk + gbps_pv) * 1e-9) / (cute_time);
+      std::cout << "Batch: " << options.batch
+                << "\tNumHeads_q: " << options.num_heads_q
+                << "\tNumHeads_kv: " << options.num_heads_kv
+                << "\tSeq Length QO: " << options.seq_len_qo
+                << "\tSeq Length KV: " << options.seq_len_kv
+                << "\tHead Size QK: " << options.head_size_qk
+                << "\tHead Size VO: " << options.head_size_vo
+                << "\tCausal Mask: " << (options.is_causal ? "true" : "false")
+                << "\tVariable Sequence Length: "
+                << (options.varlen ? "true" : "false")
+                << "\t Scheduler: " << options.scheduler;
+      printf("\nPerformance:   %4.3f  GB/s,    %4.3f  TFlop/s,   %6.4f  ms\n\n",
+             gbps, tflops, cute_time * 1000);
+    }
+
+    return cutlass::Status::kSuccess;
+  }
+};
+// the default value used for the case BF16
+template <
+    bool Causal, typename TileShapeQK, typename TileShapePV,
+    typename TileShapeOutput, typename SubgroupLayout, int PipelineStages,
+    typename ElementInputQ = bfloat16_t, typename ElementInputKV = bfloat16_t,
+    typename MMAOperation = XE_8x16x16_F32BF16BF16F32_TT,
+    typename GmemTiledCopyQ = XE_2D_U16x8x32_LD_N,
+    typename GmemTiledCopyK =
+        XE_2D_U16x16x16_LD_T, // _T designates a transposed block load operation
+    typename GmemTiledCopyV = XE_2D_U16x16x32_LD_V,
+    typename ElementAccumulator = float,
+    typename ElementComputeEpilogue = float, typename ElementOutput = float,
+    typename GmemTiledCopyStore = XE_2D_U32x8x16_ST_N>
+struct FMHAConfig {
+
+  template <bool isVarLen, class Scheduler>
+  static int run(const Options &options) {
+    //
+    // Run examples
+    //
+
+    // The KernelHardwareInfo struct holds the number of EUs on the GPU with a
+    // given device ID. This information is used by the underlying kernel.
+    cutlass::KernelHardwareInfo hw_info;
+
+    // The code section below describes datatype for input, output matrices and
+    // computation between elements in input matrices.
+
+    using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16;
+    using GEMMDispatchPolicy =
+        cutlass::gemm::MainloopIntelXeXMX16<PipelineStages>;
+    using CollectiveEpilogue =
+        cutlass::flash_attention::collective::FlashPrefillEpilogue<
+            EpilogueDispatchPolicy, MMAOperation, TileShapeOutput,
+            SubgroupLayout, ElementComputeEpilogue, ElementOutput,
+            cutlass::gemm::TagToStrideC_t<LayoutO>, ElementOutput,
+            GmemTiledCopyStore>;
+    using CollectiveSoftmaxEpilogue =
+        cutlass::flash_attention::collective::FlashPrefillSoftmaxEpilogue<
+            Causal, EpilogueDispatchPolicy, ElementAccumulator>;
+
+    using ProblemShapeRegular = cute::tuple<int, int, int, int, int, int, int>;
+    using namespace cutlass::fmha::collective;
+    using ProblemShapeVarlen =
+        cute::tuple<int, int, int, VariableLength, VariableLength, int, int>;
+    using ProblemShapeType =
+        std::conditional_t<isVarLen, ProblemShapeVarlen, ProblemShapeRegular>;
+
+    // Mainloop
+    using CollectiveMainloop =
+        cutlass::flash_attention::collective::FlashPrefillMma<
+            GEMMDispatchPolicy, ProblemShapeType, ElementInputQ,
+            cutlass::gemm::TagToStrideA_t<LayoutQ>, ElementInputKV,
+            cutlass::gemm::TagToStrideB_t<LayoutK>, ElementInputKV,
+            cutlass::gemm::TagToStrideB_t<LayoutV>, MMAOperation, TileShapeQK,
+            TileShapePV, SubgroupLayout,
+            GmemTiledCopyQ, // Q
+            GmemTiledCopyK, // K
+            GmemTiledCopyV, // V,
+            Causal>;
+
+    using FMHAPrefillKernel = cutlass::flash_attention::kernel::FMHAPrefill<
+        ProblemShapeType, CollectiveMainloop, CollectiveSoftmaxEpilogue,
+        CollectiveEpilogue, Scheduler>;
+
+    ExampleRunner<FMHAPrefillKernel, isVarLen> runner;
+
+    CUTLASS_CHECK(runner.run(options, hw_info));
+    return 0;
+  }
+
+  static int run(const Options &options) {
+    if (options.varlen) {
+      return run<true, cutlass::flash_attention::IndividualScheduler>(options);
+    } else {
+      return run<false, cutlass::flash_attention::IndividualScheduler>(options);
+    }
+  }
+};
\ No newline at end of file

From 1cac8dd0455aca870dca84d78110dc1c098342ff Mon Sep 17 00:00:00 2001
From: Raymond <kwun.fung.lau@intel.com>
Date: Thu, 9 Oct 2025 02:00:14 +0800
Subject: [PATCH 2/5] Fix the accuracy check.

---
 .../bmg_flash_attn_sdpa_fwd_bshd_runner.hpp                 | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp b/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp
index f0437b2932..df9da0b62d 100644
--- a/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp
+++ b/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp
@@ -449,9 +449,9 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
     // Check if the LSE output from the CUTLASS kernel and reference kernel are
     // equal or not
     bool passed_lse = cutlass::reference::device::BlockCompareRelativelyEqual(
-        block_ref_LSE.get(), block_LSE.get(), block_LSE.size(), 0.001f, 0.001f);
-    //return passed && passed_lse;
-    return 1;
+        block_ref_LSE.get(), block_LSE.get(), block_LSE.size(), 0.1f, 0.1f);
+    
+    return passed && passed_lse;
   }
 
   template <class ProblemShape>

From 2c56c1edd4cb0f49ab340ca10e2d5be3fb781d12 Mon Sep 17 00:00:00 2001
From: Raymond <kwun.fung.lau@intel.com>
Date: Fri, 10 Oct 2025 02:06:14 +0800
Subject: [PATCH 3/5] Rebased to use Sep 24 g++ host compliation fix.

---
 .../xe_flash_attn_prefill_mma_bshd.hpp        |  4 +-
 .../xe_flash_attn_sdpa_fwd_bshd_epilogue.hpp  |  4 +-
 ...sh_attn_sdpa_fwd_bshd_softmax_epilogue.hpp |  6 +--
 .../kernel/tile_scheduler_sdpa_fwd_bshd.hpp   |  2 +-
 .../06a_bmg_flash_attention_sdpa_fwd_bshd.cpp |  0
 .../CMakeLists.txt                            |  0
 .../bmg_flash_attn_sdpa_fwd_bshd_runner.hpp   | 38 +++++++++----------
 examples/CMakeLists.txt                       |  1 +
 8 files changed, 28 insertions(+), 27 deletions(-)
 rename examples/{sycl => }/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp (100%)
 rename examples/{sycl => }/06a_bmg_flash_attention_sdpa_fwd_bshd/CMakeLists.txt (100%)
 rename examples/{sycl => }/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp (97%)

diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_bshd.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_bshd.hpp
index d1ac1c9531..37ed6b4089 100644
--- a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_bshd.hpp
+++ b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_bshd.hpp
@@ -240,7 +240,7 @@ struct FlashPrefillMma<gemm::MainloopIntelXeXMX16<Stages>, ProblemShapeType_,
     TiledMmaQK tiled_mma;
     // To make all threads in a warp have the same global tensors pass in the
     // index of thread 0 in each warp
-    auto sg = syclcompat::get_nd_item<1>().get_sub_group();
+    auto sg = compat::get_nd_item<1>().get_sub_group();
     auto first_thread_in_sg_idx =
         sg.get_group_id()[0] * DispatchPolicy::SubgroupSize;
     auto thread_mma_q = tiled_mma.get_slice(first_thread_in_sg_idx);
@@ -336,7 +336,7 @@ struct FlashPrefillMma<gemm::MainloopIntelXeXMX16<Stages>, ProblemShapeType_,
     // Register spill
     Tensor gV_ = take<0, 3>(
         local_tile(gV, select<1, 2>(TileShapePV{}), make_coord(_, _)));
-    auto sg = syclcompat::get_nd_item<1>().get_sub_group();
+    auto sg = compat::get_nd_item<1>().get_sub_group();
     auto first_thread_in_sg_idx =
         sg.get_group_id()[0] * DispatchPolicy::SubgroupSize;
     auto thread_mma = tiled_mma.get_slice(first_thread_in_sg_idx);
diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_epilogue.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_epilogue.hpp
index 7c72acdf6b..a52eddb2dc 100644
--- a/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_epilogue.hpp
+++ b/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_epilogue.hpp
@@ -195,7 +195,7 @@ class FlashPrefillEpilogue<epilogue::IntelXeXMX16, MMAOperation_,
     constexpr int FragsM = shape<1>(FragOutLayout{});
     constexpr int FragsN = size(select<2, 3>(shape(FragOutLayout{})));
 
-    auto g = syclcompat::get_nd_item<1>().get_sub_group();
+    auto g = compat::get_nd_item<1>().get_sub_group();
     auto out_reg = make_tensor(static_cast<decltype(out) &&>(out).data(),
                                Shape<Int<Vec>, Int<FragsM>, Int<FragsN>>{});
     float tLSE_reg = {-INFINITY};
@@ -260,7 +260,7 @@ class FlashPrefillEpilogue<epilogue::IntelXeXMX16, MMAOperation_,
     copy(params.xe_store_o, final_out_reg, tOgO);
 
     // Generating the LSE for backward training
-    auto sg = syclcompat::get_nd_item<1>().get_sub_group();
+    auto sg = compat::get_nd_item<1>().get_sub_group();
     int lane_id = static_cast<int>(sg.get_local_linear_id());
     int sub_group_id = get_sub_group_id();
     const int BLK_M = size(select<0>(TileShapeOutput{}));
diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_softmax_epilogue.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_softmax_epilogue.hpp
index 5b63fffe7f..e0bc3d7b83 100644
--- a/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_softmax_epilogue.hpp
+++ b/applications/flash_attention_v2/collective/xe_flash_attn_sdpa_fwd_bshd_softmax_epilogue.hpp
@@ -106,7 +106,7 @@ class FlashPrefillSoftmaxEpilogue<CausalMask_, epilogue::IntelXeXMX16,
             class FragSum>
   CUTLASS_DEVICE void scale_exp_log2(FragAcc &frag_s, FragMax const &max,
                                      FragSum &sum) {
-    auto g = syclcompat::get_nd_item<1>().get_sub_group();
+    auto g = compat::get_nd_item<1>().get_sub_group();
     const auto max_scale = max * params.scale;
     CUTLASS_PRAGMA_UNROLL
     for (int indx = 0; indx < Vec * FragsM; indx++) {
@@ -123,7 +123,7 @@ class FlashPrefillSoftmaxEpilogue<CausalMask_, epilogue::IntelXeXMX16,
 
   template <int Vec, int FragsM, int FragsN, class FragSrc, class FragMax>
   CUTLASS_DEVICE void reduce_max(FragSrc &src, FragMax &max) {
-    auto g = syclcompat::get_nd_item<1>().get_sub_group();
+    auto g = compat::get_nd_item<1>().get_sub_group();
     CUTLASS_PRAGMA_UNROLL
     for (int indx = 0; indx < Vec * FragsM; indx++) {
       auto maxptr = group_broadcast(g, max, indx);
@@ -155,7 +155,7 @@ class FlashPrefillSoftmaxEpilogue<CausalMask_, epilogue::IntelXeXMX16,
                   " No. of attention rows per subgroup should be >= 1 MMA Atom "
                   "worth of rows.");
     if (!is_first) {
-      auto g = syclcompat::get_nd_item<1>().get_sub_group();
+      auto g = compat::get_nd_item<1>().get_sub_group();
       Element max_scale{max * params.scale};
       Element exp_scale{
           sycl::native::exp2(max_prev * params.scale - max_scale)};
diff --git a/applications/flash_attention_v2/kernel/tile_scheduler_sdpa_fwd_bshd.hpp b/applications/flash_attention_v2/kernel/tile_scheduler_sdpa_fwd_bshd.hpp
index 17730da3be..5a490c4c8e 100644
--- a/applications/flash_attention_v2/kernel/tile_scheduler_sdpa_fwd_bshd.hpp
+++ b/applications/flash_attention_v2/kernel/tile_scheduler_sdpa_fwd_bshd.hpp
@@ -190,7 +190,7 @@ struct XeFlashPersistentTileScheduler {
   }
 
   template <int Num_SGs> static dim3 get_grid_shape(Params const &params) {
-    auto queue = syclcompat::get_default_queue();
+    auto queue = compat::get_default_queue();
     auto dev = queue.get_device();
     const size_t maxSubgroups =
         dev.template get_info<sycl::info::device::max_num_sub_groups>();
diff --git a/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp b/examples/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
similarity index 100%
rename from examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
rename to examples/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
diff --git a/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/CMakeLists.txt b/examples/06a_bmg_flash_attention_sdpa_fwd_bshd/CMakeLists.txt
similarity index 100%
rename from examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/CMakeLists.txt
rename to examples/06a_bmg_flash_attention_sdpa_fwd_bshd/CMakeLists.txt
diff --git a/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp b/examples/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp
similarity index 97%
rename from examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp
rename to examples/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp
index df9da0b62d..2011fdad4c 100644
--- a/examples/sycl/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp
+++ b/examples/06a_bmg_flash_attention_sdpa_fwd_bshd/bmg_flash_attn_sdpa_fwd_bshd_runner.hpp
@@ -197,7 +197,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
 
   template <typename SrcT, typename DstT>
   void convert_fp8_to_fp16(const SrcT *d_src, DstT *d_dst, size_t size) {
-    syclcompat::get_default_queue()
+    compat::get_default_queue()
         .parallel_for(
             size,
             [=](auto indx) { d_dst[indx] = static_cast<DstT>(d_src[indx]); })
@@ -298,9 +298,9 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
               seq_len_qo * seq_len_kv,   // batch_stride_S
               seq_len_qo * seq_len_kv    // batch_stride_S
           );
-          syclcompat::wait();
+          compat::wait();
           std::vector<ElementAccumulator> host_S(block_S.size());
-          syclcompat::memcpy<ElementAccumulator>(host_S.data(), block_S.get(),
+          compat::memcpy<ElementAccumulator>(host_S.data(), block_S.get(),
                                                  host_S.size());
 
           // delete this memory as it is no longer needed
@@ -378,7 +378,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
           cutlass::DeviceAllocation<ElementV_> block_P;
           block_P.reset(host_P.size());
 
-          syclcompat::memcpy<ElementV_>(block_P.get(), host_P.data(),
+          compat::memcpy<ElementV_>(block_P.get(), host_P.data(),
                                         host_P.size());
 
           cutlass::TensorRef ref_P(block_P.get(),
@@ -401,12 +401,12 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
               seq_len_qo * head_size_vo  // batch_stride_O
           );
 
-          syclcompat::wait();
+          compat::wait();
           // delete this memory as it is no longer needed
           block_P.reset();
 
           std::vector<ElementAccumulator> vec_acc(block_acc.size());
-          syclcompat::memcpy<ElementAccumulator>(
+          compat::memcpy<ElementAccumulator>(
               vec_acc.data(), block_acc.get(), vec_acc.size());
 
           // delete this memory as it is no longer needed
@@ -434,11 +434,11 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
       offset_o += seq_len_qo * num_heads_q * head_size_vo;
     } // end of batch loop
 
-    syclcompat::wait();
-    syclcompat::memcpy<ElementOutput>(block_ref_O.get(), host_O.data(),
+    compat::wait();
+    compat::memcpy<ElementOutput>(block_ref_O.get(), host_O.data(),
                                       host_O.size());
-    syclcompat::wait();
-    syclcompat::memcpy<float>(block_ref_LSE.get(), host_LSE.data(),
+    compat::wait();
+    compat::memcpy<float>(block_ref_LSE.get(), host_LSE.data(),
                               host_LSE.size());
 
     // Check if output from CUTLASS kernel and reference kernel are equal or not
@@ -613,13 +613,13 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
     // configure smem size and carveout
     int smem_size = FMHAPrefillKernel::SharedStorageSize;
 
-    const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
-    const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+    const auto sycl_block = compat::dim3(block.x, block.y, block.z);
+    const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z);
 
 // Launch parameters depend on whether SYCL compiler supports work-group scratch
 // memory extension
 #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY)
-    using namespace syclcompat::experimental;
+    using namespace compat::experimental;
     auto event = launch<cutlass::device_kernel<FMHAPrefillKernel>>(
         launch_policy{sycl_grid, sycl_block,
                       local_mem_size{static_cast<std::size_t>(smem_size)},
@@ -627,15 +627,15 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
                           FMHAPrefillKernel::DispatchPolicy::SubgroupSize>}},
         params);
 #else
-    syclcompat::experimental::launch_properties launch_props{
+    compat::experimental::launch_properties launch_props{
         sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size),
     };
-    syclcompat::experimental::kernel_properties kernel_props{
+    compat::experimental::kernel_properties kernel_props{
         sycl::ext::oneapi::experimental::sub_group_size<
             FMHAPrefillKernel::DispatchPolicy::SubgroupSize>};
-    syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block,
+    compat::experimental::launch_policy policy{sycl_grid, sycl_block,
                                                    launch_props, kernel_props};
-    auto event = syclcompat::experimental::launch<
+    auto event = compat::experimental::launch<
         cutlass::device_kernel<FMHAPrefillKernel>>(policy, params);
 #endif
 
@@ -681,7 +681,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
     // Run the GEMM
     run(params);
 
-    syclcompat::wait();
+    compat::wait();
 
     // Verify that the result is correct
     bool passed = verify(problem_size, options.is_causal, options.softmax_scale);
@@ -697,7 +697,7 @@ template <class FMHAPrefillKernel, bool isVarLen> struct ExampleRunner {
       for (int i = 0; i < options.iterations; ++i) {
         run(params);
       }
-      syclcompat::wait();
+      compat::wait();
       // when seq_len_qo is not equal to seq_len_kv we use bottom up approach
       // for the masking. Following changes will adjust the effective_seq_len_kv
       // when masking applied for such cases
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d141f5b7de..55fab64f7c 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -107,6 +107,7 @@ if(CUTLASS_ENABLE_SYCL)
       04_bmg_grouped_gemm
       05_bmg_gemm_with_epilogues
       06_bmg_flash_attention
+      06a_bmg_flash_attention_sdpa_fwd_bshd
       07_bmg_dual_gemm
       08_bmg_gemm_f8
       09_bmg_grouped_gemm_f8

From e21b5eb3b1bc8471a399ffdd72069c6e8bb0e9d1 Mon Sep 17 00:00:00 2001
From: sdp <sdp@arl799818.jf.intel.com>
Date: Tue, 14 Oct 2025 14:03:26 -0700
Subject: [PATCH 4/5] Fixed the LSE accuracy issue.

---
 .../06a_bmg_flash_attention_sdpa_fwd_bshd.cpp                 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp b/examples/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
index 239fc3f46b..721c562ffa 100644
--- a/examples/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
+++ b/examples/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
@@ -101,13 +101,13 @@ int main(int argc, const char **argv) {
     using ShapeQK = Shape<_128, _64, _64>;
     using ShapePV = Shape<_128, _32, _64>;
     using ShapeOutPut = Shape<_128, _128, _64>;
-    using SubgroupLayout = Layout<Shape<_16, _1, _1>, Stride<_1, _1, _1>>; 
+    using SubgroupLayout = Layout<Shape<_8, _1, _1>, Stride<_1, _1, _1>>;
 
 #elif HEAD_DIM == 192
     using ShapeQK = Shape<_256, _64, _64>;
     using ShapePV = Shape<_256, _32, _64>;
     using ShapeOutPut = Shape<_256, _192, _64>;
-    using SubgroupLayout = Layout<Shape<_32, _1, _1>, Stride<_1, _1, _1>>; 
+    using SubgroupLayout = Layout<Shape<_16, _1, _1>, Stride<_1, _1, _1>>; 
 
 #endif
     // Define whether or not to apply causal masking to the first matmul

From 1ebf0b64015caa2cfd15121042a1998dc9a4e743 Mon Sep 17 00:00:00 2001
From: sdp <sdp@arl799818.jf.intel.com>
Date: Tue, 14 Oct 2025 15:09:42 -0700
Subject: [PATCH 5/5] Tuned a better parameter setting for HDIM==128, which is
 adopted by many LLM models.

---
 .../06a_bmg_flash_attention_sdpa_fwd_bshd.cpp             | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp b/examples/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
index 721c562ffa..3c947488aa 100644
--- a/examples/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
+++ b/examples/06a_bmg_flash_attention_sdpa_fwd_bshd/06a_bmg_flash_attention_sdpa_fwd_bshd.cpp
@@ -98,10 +98,10 @@ int main(int argc, const char **argv) {
     using SubgroupLayout = Layout<Shape<_8, _1, _1>, Stride<_1, _1, _1>>; 
 
 #elif HEAD_DIM == 128
-    using ShapeQK = Shape<_128, _64, _64>;
-    using ShapePV = Shape<_128, _32, _64>;
-    using ShapeOutPut = Shape<_128, _128, _64>;
-    using SubgroupLayout = Layout<Shape<_8, _1, _1>, Stride<_1, _1, _1>>;
+    using ShapeQK = Shape<_256, _32, _64>;
+    using ShapePV = Shape<_256, _32, _32>;
+    using ShapeOutPut = Shape<_256, _128, _32>;
+    using SubgroupLayout = Layout<Shape<_16, _1, _1>, Stride<_1, _1, _1>>;
 
 #elif HEAD_DIM == 192
     using ShapeQK = Shape<_256, _64, _64>;