intel · CuiYifeng · Oct 20, 2025 · Oct 7, 2025 · Oct 7, 2025 · Oct 7, 2025
diff --git a/src/ATen/native/xpu/sycl/Dropout.cpp b/src/ATen/native/xpu/sycl/Dropout.cpp
@@ -165,7 +165,7 @@ struct FusedDropoutUnrollFunctor {
         if (li < total_elements_) {
           // Convert `linearIndex` into an offset of `a`
           const IndexType aOffset =
-              IndexToOffset<const scalar_t, IndexType>::get(li, a_);
+              IndexToOffset<const scalar_t, IndexType, ADims>::get(li, a_);
           src[ii] = a_.data[aOffset];
         }
       }
@@ -174,7 +174,7 @@ struct FusedDropoutUnrollFunctor {
         if (li < total_elements_) {
           // Convert `linearIndex` into an offset of `b`
           const IndexType bOffset =
-              IndexToOffset<scalar_t, IndexType>::get(li, b_);
+              IndexToOffset<scalar_t, IndexType, BDims>::get(li, b_);
           b_.data[bOffset] = src[ii] * (&rand.x)[ii] * scale;
           c_.data[bOffset] = (mask_t)(&rand.x)[ii];
         }

diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp
diff --git a/src/ATen/native/xpu/sycl/Indexing.h b/src/ATen/native/xpu/sycl/Indexing.h
@@ -211,10 +211,8 @@ class IndexKernel {
     if constexpr (TrivialOffCal) {
       idx_off = idx_logical_off;
     } else {
-      idx_off = IndexToOffset<IdxType, int64_t>::get(
-          idx_logical_off,
-          cfg_.iinfo_,
-          IndexToOffset<IdxType, int64_t>::NON_STRICT_CONTIGUOUS);
+      idx_off = IndexToOffset<IdxType, int64_t, -1>::get(
+          idx_logical_off, cfg_.iinfo_);
     }
     glb_batch_group = id.glb_batch / cfg_.index_num_;
     glb_batch_group_loc_off = cfg_.iinfo_.data[idx_off];
@@ -322,26 +320,18 @@ class IndexKernel {
     } else {
       if (cfg_.indexing_dst_) {
         // index_copy, index_add, index_fill
-        dst_off = IndexToOffset<ValType, int64_t>::get(
-            glb_indexing_logical_off,
-            cfg_.dinfo_,
-            IndexToOffset<ValType, int64_t>::NON_STRICT_CONTIGUOUS);
+        dst_off = IndexToOffset<ValType, int64_t, -1>::get(
+            glb_indexing_logical_off, cfg_.dinfo_);
         if (cfg_.sinfo_.data != nullptr) {
-          src_off = IndexToOffset<const ValType, int64_t>::get(
-              glb_fixing_logical_off,
-              cfg_.sinfo_,
-              IndexToOffset<const ValType, int64_t>::NON_STRICT_CONTIGUOUS);
+          src_off = IndexToOffset<const ValType, int64_t, -1>::get(
+              glb_fixing_logical_off, cfg_.sinfo_);
         }
       } else {
         // index_select
-        src_off = IndexToOffset<const ValType, int64_t>::get(
-            glb_indexing_logical_off,
-            cfg_.sinfo_,
-            IndexToOffset<const ValType, int64_t>::NON_STRICT_CONTIGUOUS);
-        dst_off = IndexToOffset<ValType, int64_t>::get(
-            glb_fixing_logical_off,
-            cfg_.dinfo_,
-            IndexToOffset<ValType, int64_t>::NON_STRICT_CONTIGUOUS);
+        src_off = IndexToOffset<const ValType, int64_t, -1>::get(
+            glb_indexing_logical_off, cfg_.sinfo_);
+        dst_off = IndexToOffset<ValType, int64_t, -1>::get(
+            glb_fixing_logical_off, cfg_.dinfo_);
       }
     }
     cfg_.func_(

diff --git a/src/ATen/native/xpu/sycl/RNNKernels.cpp b/src/ATen/native/xpu/sycl/RNNKernels.cpp
@@ -77,12 +77,13 @@ void collapseDims(TensorInfo<T, T2>& info, Args&... infos) {
   collapseDims(infos...);
 }
 
-#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) \
-  D_TENSOR.data[IndexToOffset<scalar_t, index_type>::get(INDEX, D_TENSOR)]
+#define DEVICE_LINEAR_GET(D_TENSOR, INDEX)                               \
+  D_TENSOR.data[IndexToOffset<scalar_t, index_type, indexing_kind>::get( \
+      INDEX, D_TENSOR)]
 
 // Biases are always 1D
 #define DEVICE_BIAS_GET(D_TENSOR, INDEX) \
-  D_TENSOR.data[IndexToOffset<scalar_t, index_type>::get(INDEX, D_TENSOR)]
+  D_TENSOR.data[IndexToOffset<scalar_t, index_type, 1>::get(INDEX, D_TENSOR)]
 
 #define H2F(input) static_cast<accscalar_t>(input)
 #define F2H(input) static_cast<scalar_t>(input)
@@ -93,7 +94,11 @@ inline T sigmoid(T in) {
   return one / (one + std::exp(-in));
 }
 
-template <typename scalar_t, typename accscalar_t, typename index_type>
+template <
+    typename scalar_t,
+    typename accscalar_t,
+    typename index_type,
+    int indexing_kind>
 struct LstmCellForwardFunctor {
   void operator()(sycl::nd_item<1> item) const {
     bool has_bias = bias1_.data != nullptr;
@@ -205,7 +210,11 @@ struct LstmCellForwardFunctor {
   index_type totalElements_;
 };
 
-template <typename scalar_t, typename accscalar_t, typename index_type>
+template <
+    typename scalar_t,
+    typename accscalar_t,
+    typename index_type,
+    int indexing_kind>
 struct LstmCellBackwardFunctor {
   void operator()(sycl::nd_item<1> item) const {
     bool has_gradoutput = gradoutput_.data != nullptr;
@@ -296,7 +305,11 @@ struct LstmCellBackwardFunctor {
   index_type totalElements_;
 };
 
-template <typename scalar_t, typename accscalar_t, typename index_type>
+template <
+    typename scalar_t,
+    typename accscalar_t,
+    typename index_type,
+    int indexing_kind>
 struct GruCellForwardFunctor {
   void operator()(sycl::nd_item<1> item) const {
     bool has_bias = Bias1_.data != nullptr;
@@ -387,7 +400,11 @@ struct GruCellForwardFunctor {
   const index_type totalElements_;
 };
 
-template <typename scalar_t, typename accscalar_t, typename index_type>
+template <
+    typename scalar_t,
+    typename accscalar_t,
+    typename index_type,
+    int indexing_kind>
 struct GruCellBackwardFunctor {
   void operator()(sycl::nd_item<1> item) const {
     for (index_type linearIndex = item.get_global_id(0);
@@ -469,12 +486,6 @@ void lstm_forward_impl(
   if (numel == 0)
     return;
 
-  using KernelT = LstmCellForwardFunctor<scalar_t, accscalar_t, index_type>;
-  auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
-  auto config = rnn_get_launch_config(max_wg_size, numel);
-  auto nwg = std::get<0>(config);
-  auto local_range = std::get<1>(config);
-
   auto input_gatesI = getTensorInfo<scalar_t, index_type>(input_gates);
   auto hidden_gatesI = getTensorInfo<scalar_t, index_type>(hidden_gates);
   auto input_biasI = tryGetTensorInfo<scalar_t, index_type>(input_bias);
@@ -503,6 +514,12 @@ void lstm_forward_impl(
         hyI,
         cyI,
         workspaceI);
+    using KernelT =
+        LstmCellForwardFunctor<scalar_t, accscalar_t, index_type, 1>;
+    auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
+    auto config = rnn_get_launch_config(max_wg_size, numel);
+    auto nwg = std::get<0>(config);
+    auto local_range = std::get<1>(config);
     KernelT kfn(
         input_gatesI,
         hidden_gatesI,
@@ -517,6 +534,12 @@ void lstm_forward_impl(
     sycl_kernel_submit(
         nwg * local_range, local_range, getCurrentSYCLQueue(), kfn);
   } else {
+    using KernelT =
+        LstmCellForwardFunctor<scalar_t, accscalar_t, index_type, 2>;
+    auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
+    auto config = rnn_get_launch_config(max_wg_size, numel);
+    auto nwg = std::get<0>(config);
+    auto local_range = std::get<1>(config);
     KernelT kfn(
         input_gatesI,
         hidden_gatesI,
@@ -548,12 +571,6 @@ void lstm_backward_impl(
   if (numel == 0)
     return;
 
-  using KernelT = LstmCellBackwardFunctor<scalar_t, accscalar_t, index_type>;
-  auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
-  auto config = rnn_get_launch_config(max_wg_size, numel);
-  auto nwg = std::get<0>(config);
-  auto local_range = std::get<1>(config);
-
   auto grad_hyI = tryGetTensorInfo<scalar_t, index_type>(grad_hy);
   auto grad_cyI = tryGetTensorInfo<scalar_t, index_type>(grad_cy);
   auto cxI = getTensorInfo<scalar_t, index_type>(cx);
@@ -567,6 +584,12 @@ void lstm_backward_impl(
           {grad_hy, grad_cy, cx, cy, workspace, grad_gates, grad_cx})) {
     collapseDims(
         grad_hyI, grad_cyI, cxI, cyI, workspaceI, grad_gatesI, grad_cxI);
+    using KernelT =
+        LstmCellBackwardFunctor<scalar_t, accscalar_t, index_type, 1>;
+    auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
+    auto config = rnn_get_launch_config(max_wg_size, numel);
+    auto nwg = std::get<0>(config);
+    auto local_range = std::get<1>(config);
     KernelT kfn(
         workspaceI,
         grad_gatesI,
@@ -580,6 +603,12 @@ void lstm_backward_impl(
     sycl_kernel_submit(
         nwg * local_range, local_range, getCurrentSYCLQueue(), kfn);
   } else {
+    using KernelT =
+        LstmCellBackwardFunctor<scalar_t, accscalar_t, index_type, 2>;
+    auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
+    auto config = rnn_get_launch_config(max_wg_size, numel);
+    auto nwg = std::get<0>(config);
+    auto local_range = std::get<1>(config);
     KernelT kfn(
         workspaceI,
         grad_gatesI,
@@ -610,12 +639,6 @@ void gru_forward_impl(
   if (numel == 0)
     return;
 
-  using KernelT = GruCellForwardFunctor<scalar_t, accscalar_t, index_type>;
-  auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
-  auto config = rnn_get_launch_config(max_wg_size, numel);
-  auto nwg = std::get<0>(config);
-  auto local_range = std::get<1>(config);
-
   auto input_gatesI = getTensorInfo<scalar_t, index_type>(input_gates);
   auto hidden_gatesI = getTensorInfo<scalar_t, index_type>(hidden_gates);
   auto input_biasI = tryGetTensorInfo<scalar_t, index_type>(input_bias);
@@ -641,6 +664,11 @@ void gru_forward_impl(
         hxI,
         hyI,
         workspaceI);
+    using KernelT = GruCellForwardFunctor<scalar_t, accscalar_t, index_type, 1>;
+    auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
+    auto config = rnn_get_launch_config(max_wg_size, numel);
+    auto nwg = std::get<0>(config);
+    auto local_range = std::get<1>(config);
     KernelT kfn(
         input_gatesI,
         hidden_gatesI,
@@ -654,6 +682,11 @@ void gru_forward_impl(
     sycl_kernel_submit(
         nwg * local_range, local_range, getCurrentSYCLQueue(), kfn);
   } else {
+    using KernelT = GruCellForwardFunctor<scalar_t, accscalar_t, index_type, 2>;
+    auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
+    auto config = rnn_get_launch_config(max_wg_size, numel);
+    auto nwg = std::get<0>(config);
+    auto local_range = std::get<1>(config);
     KernelT kfn(
         input_gatesI,
         hidden_gatesI,
@@ -682,12 +715,6 @@ void gru_backward_impl(
   if (numel == 0)
     return;
 
-  using KernelT = GruCellBackwardFunctor<scalar_t, accscalar_t, index_type>;
-  auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
-  auto config = rnn_get_launch_config(max_wg_size, numel);
-  auto nwg = std::get<0>(config);
-  auto local_range = std::get<1>(config);
-
   auto grad_hyI = getTensorInfo<scalar_t, index_type>(grad_hy);
   auto workspaceI = getTensorInfo<scalar_t, index_type>(workspace);
   auto grad_input_gatesI =
@@ -701,6 +728,12 @@ void gru_backward_impl(
           {grad_hy, workspace, grad_input_gates, grad_hidden_gates, grad_hx})) {
     collapseDims(
         grad_hyI, workspaceI, grad_input_gatesI, grad_hidden_gatesI, grad_hxI);
+    using KernelT =
+        GruCellBackwardFunctor<scalar_t, accscalar_t, index_type, 1>;
+    auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
+    auto config = rnn_get_launch_config(max_wg_size, numel);
+    auto nwg = std::get<0>(config);
+    auto local_range = std::get<1>(config);
     KernelT kfn(
         grad_input_gatesI,
         grad_hidden_gatesI,
@@ -712,6 +745,12 @@ void gru_backward_impl(
     sycl_kernel_submit(
         nwg * local_range, local_range, getCurrentSYCLQueue(), kfn);
   } else {
+    using KernelT =
+        GruCellBackwardFunctor<scalar_t, accscalar_t, index_type, 2>;
+    auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
+    auto config = rnn_get_launch_config(max_wg_size, numel);
+    auto nwg = std::get<0>(config);
+    auto local_range = std::get<1>(config);
     KernelT kfn(
         grad_input_gatesI,
         grad_hidden_gatesI,