From 861fd9cd0555c7a6792c7d498f3d3d4d6e6134cc Mon Sep 17 00:00:00 2001
From: john bowen <bowen36@llnl.gov>
Date: Fri, 12 Dec 2025 14:58:55 -0800
Subject: [PATCH 1/4] Cleanup sycl and openmp target forall implementations

---
 include/RAJA/policy/openmp_target/forall.hpp | 150 +++-----
 include/RAJA/policy/sequential/launch.hpp    |  52 +--
 include/RAJA/policy/sycl/forall.hpp          | 344 ++++---------------
 3 files changed, 144 insertions(+), 402 deletions(-)
diff --git a/include/RAJA/policy/openmp_target/forall.hpp b/include/RAJA/policy/openmp_target/forall.hpp
index 3e2b256c51..de4045378c 100644
--- a/include/RAJA/policy/openmp_target/forall.hpp
+++ b/include/RAJA/policy/openmp_target/forall.hpp
@@ -39,9 +39,7 @@ template<size_t ThreadsPerTeam,
          typename ForallParam>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<resources::Omp>,
-    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-    concepts::negate<
-        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec<ThreadsPerTeam>& p,
             Iterable&& iter,
@@ -49,65 +47,14 @@ forall_impl(resources::Omp omp_res,
             ForallParam f_params)
 {
   using EXEC_POL = camp::decay<decltype(p)>;
-
-  RAJA::expt::ParamMultiplexer::parampack_init(p, f_params);
-  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-  using Body = typename std::remove_reference<decltype(loop_body)>::type;
-  Body body  = loop_body;
-
-  RAJA_EXTRACT_BED_IT(iter);
-
-  // Reset if exceed CUDA threads per block limit.
-  int tperteam = ThreadsPerTeam;
-  if (tperteam > omp::MAXNUMTHREADS)
-  {
-    tperteam = omp::MAXNUMTHREADS;
-  }
-
-  // calculate number of teams based on user defined threads per team
-  // datasize is distance between begin() and end() of iterable
-  auto numteams = RAJA_DIVIDE_CEILING_INT(distance_it, tperteam);
-  if (numteams > tperteam)
-  {
-    // Omp target reducers will write team # results, into Threads-sized array.
-    // Need to insure NumTeams <= Threads to prevent array out of bounds access.
-    numteams = tperteam;
-  }
-
-  // thread_limit(tperteam) unused due to XL seg fault (when tperteam !=
-  // distance)
-  auto i = distance_it;
-
-#pragma omp target teams distribute parallel for num_teams(numteams)           \
-    schedule(static, 1) map(to                                                 \
-                            : body, begin_it) reduction(combine                \
-                                                        : f_params)
-  for (i = 0; i < distance_it; ++i)
+  constexpr bool is_forall_param_empty =
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>::value;
+  if constexpr (!is_forall_param_empty)
   {
-    Body ib = body;
-    RAJA::expt::invoke_body(f_params, ib, begin_it[i]);
+    RAJA::expt::ParamMultiplexer::parampack_init(p, f_params);
+    RAJA_OMP_DECLARE_REDUCTION_COMBINE;
   }
 
-  RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params);
-
-  return resources::EventProxy<resources::Omp>(omp_res);
-}
-
-template<size_t ThreadsPerTeam,
-         typename Iterable,
-         typename Func,
-         typename ForallParam>
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<resources::Omp>,
-    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-forall_impl(resources::Omp omp_res,
-            const omp_target_parallel_for_exec<ThreadsPerTeam>&,
-            Iterable&& iter,
-            Func&& loop_body,
-            ForallParam)
-{
   using Body = typename std::remove_reference<decltype(loop_body)>::type;
   Body body  = loop_body;
 
@@ -133,14 +80,30 @@ forall_impl(resources::Omp omp_res,
   // thread_limit(tperteam) unused due to XL seg fault (when tperteam !=
   // distance)
   auto i = distance_it;
-
+  if constexpr (is_forall_param_empty)
+  {
 #pragma omp target teams distribute parallel for num_teams(numteams)           \
     schedule(static, 1) map(to                                                 \
                             : body, begin_it)
-  for (i = 0; i < distance_it; ++i)
+    for (i = 0; i < distance_it; ++i)
+    {
+      Body ib = body;
+      ib(begin_it[i]);
+    }
+  }
+  else
   {
-    Body ib = body;
-    ib(begin_it[i]);
+#pragma omp target teams distribute parallel for num_teams(numteams)           \
+    schedule(static, 1) map(to                                                 \
+                            : body, begin_it) reduction(combine                \
+                                                        : f_params)
+    for (i = 0; i < distance_it; ++i)
+    {
+      Body ib = body;
+      RAJA::expt::invoke_body(f_params, ib, begin_it[i]);
+    }
+
+    RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params);
   }
 
   return resources::EventProxy<resources::Omp>(omp_res);
@@ -149,9 +112,7 @@ forall_impl(resources::Omp omp_res,
 template<typename Iterable, typename Func, typename ForallParam>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<resources::Omp>,
-    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-    concepts::negate<
-        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec_nt& p,
             Iterable&& iter,
@@ -159,53 +120,42 @@ forall_impl(resources::Omp omp_res,
             ForallParam f_params)
 {
   using EXEC_POL = camp::decay<decltype(p)>;
-
-  RAJA::expt::ParamMultiplexer::parampack_init(p, f_params);
-  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  constexpr bool is_forall_param_empty =
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>::value;
+  if constexpr (!is_forall_param_empty)
+  {
+    RAJA::expt::ParamMultiplexer::parampack_init(p, f_params);
+    RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  }
 
   using Body = typename std::remove_reference<decltype(loop_body)>::type;
   Body body  = loop_body;
 
   RAJA_EXTRACT_BED_IT(iter);
 
+  if constexpr (!is_forall_param_empty)
+  {
 #pragma omp target teams distribute parallel for schedule(static, 1)           \
     firstprivate(body, begin_it) reduction(combine                             \
                                            : f_params)
-  for (decltype(distance_it) i = 0; i < distance_it; ++i)
-  {
-    Body ib = body;
-    RAJA::expt::invoke_body(f_params, ib, begin_it[i]);
-  }
-
-  RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params);
-
-  return resources::EventProxy<resources::Omp>(omp_res);
-}
-
-template<typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<resources::Omp>,
-    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-forall_impl(resources::Omp omp_res,
-            const omp_target_parallel_for_exec_nt&,
-            Iterable&& iter,
-            Func&& loop_body,
-            ForallParam)
-{
-  using Body = typename std::remove_reference<decltype(loop_body)>::type;
-  Body body  = loop_body;
-
-  RAJA_EXTRACT_BED_IT(iter);
+    for (decltype(distance_it) i = 0; i < distance_it; ++i)
+    {
+      Body ib = body;
+      RAJA::expt::invoke_body(f_params, ib, begin_it[i]);
+    }
 
+    RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params);
+  }
+  else
+  {
 #pragma omp target teams distribute parallel for schedule(static, 1)           \
     firstprivate(body, begin_it)
-  for (decltype(distance_it) i = 0; i < distance_it; ++i)
-  {
-    Body ib = body;
-    ib(begin_it[i]);
+    for (decltype(distance_it) i = 0; i < distance_it; ++i)
+    {
+      Body ib = body;
+      ib(begin_it[i]);
+    }
   }
-
   return resources::EventProxy<resources::Omp>(omp_res);
 }
 
diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp
index ee98804ecf..2522ea8b98 100644
--- a/include/RAJA/policy/sequential/launch.hpp
+++ b/include/RAJA/policy/sequential/launch.hpp
@@ -39,37 +39,10 @@ struct LaunchExecute<RAJA::null_launch_t>
 template<>
 struct LaunchExecute<RAJA::seq_launch_t>
 {
-
   template<typename BODY, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
-      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res,
-       LaunchParams const& params,
-       BODY const& body,
-       ReduceParams& RAJA_UNUSED_ARG(ReduceParams))
-  {
-
-    LaunchContext ctx;
-
-    char* kernel_local_mem = new char[params.shared_mem_size];
-    ctx.shared_mem_ptr     = kernel_local_mem;
-
-    body(ctx);
-
-    delete[] kernel_local_mem;
-    ctx.shared_mem_ptr = nullptr;
-
-    return resources::EventProxy<resources::Resource>(res);
-  }
-
-  template<typename BODY, typename ReduceParams>
-  static concepts::enable_if_t<
-      resources::EventProxy<resources::Resource>,
-      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-      concepts::negate<
-          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>>
   exec(RAJA::resources::Resource res,
        LaunchParams const& launch_params,
        BODY const& body,
@@ -77,19 +50,32 @@ struct LaunchExecute<RAJA::seq_launch_t>
   {
     using EXEC_POL = RAJA::seq_exec;
     EXEC_POL pol {};
-
-    expt::ParamMultiplexer::parampack_init(pol, launch_reducers);
+    constexpr bool is_parampack_empty =
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>::value;
+    if constexpr (!is_parampack_empty)
+    {
+      expt::ParamMultiplexer::parampack_init(pol, launch_reducers);
+    }
 
     LaunchContext ctx;
     char* kernel_local_mem = new char[launch_params.shared_mem_size];
     ctx.shared_mem_ptr     = kernel_local_mem;
 
-    expt::invoke_body(launch_reducers, body, ctx);
+    if constexpr (!is_parampack_empty)
+    {
+      expt::invoke_body(launch_reducers, body, ctx);
+    }
+    else
+    {
+      body(ctx);
+    }
 
     delete[] kernel_local_mem;
     ctx.shared_mem_ptr = nullptr;
-
-    expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers);
+    if constexpr (!is_parampack_empty)
+    {
+      expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers);
+    }
 
     return resources::EventProxy<resources::Resource>(res);
   }
diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp
index fd719d68c9..b6d935020f 100644
--- a/include/RAJA/policy/sycl/forall.hpp
+++ b/include/RAJA/policy/sycl/forall.hpp
@@ -84,27 +84,26 @@ ::sycl::range<1> getGridDim(size_t len, size_t block_size)
 ////////////////////////////////////////////////////////////////////////
 //
 
+
 template<typename Iterable,
          typename LoopBody,
          size_t BlockSize,
          bool Async,
-         typename ForallParam,
-         typename std::enable_if<std::is_trivially_copyable<LoopBody> {},
-                                 bool>::type = true>
+         typename ForallParam>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<resources::Sycl>,
-    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>>
 forall_impl(resources::Sycl& sycl_res,
-            sycl_exec<BlockSize, Async>,
+            sycl_exec<BlockSize, Async> const& pol,
             Iterable&& iter,
             LoopBody&& loop_body,
-            ForallParam)
-{
+            ForallParam f_params)
 
+{
   using Iterator = camp::decay<decltype(std::begin(iter))>;
   using IndexType =
       camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL = camp::decay<decltype(pol)>;
 
   //
   // Compute the requested iteration space size
@@ -112,95 +111,22 @@ forall_impl(resources::Sycl& sycl_res,
   Iterator begin = std::begin(iter);
   Iterator end   = std::end(iter);
   IndexType len  = std::distance(begin, end);
-
-  // Only launch kernel if we have something to iterate over
-  if (len > 0 && BlockSize > 0)
+  constexpr bool is_parampack_empty =
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>::value;
+  constexpr bool is_lbody_trivially_copyable =
+      std::is_trivially_copyable<LoopBody>::value;
+  LOOP_BODY* lbody = loop_body;
+  RAJA_FT_BEGIN;
+  if constexpr (!is_parampack_empty)
   {
-    // Note: We could fix an incorrect workgroup size.
-    //       It would change what was specified.
-    //       For now, leave the device compiler to error with invalid WG size.
-
-    //
-    // Compute the number of blocks
-    //
-    sycl_dim_t blockSize {BlockSize};
-    sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
-
-    ::sycl::queue* q = sycl_res.get_queue();
-
-    q->submit([&](::sycl::handler& h) {
-      h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize},
-                     [=](::sycl::nd_item<1> it) {
-                       IndexType ii = it.get_global_id(0);
-                       if (ii < len)
-                       {
-                         loop_body(begin[ii]);
-                       }
-                     });
-    });
-
-    if (!Async)
-    {
-      q->wait();
-    }
+    RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params);
   }
-
-  return resources::EventProxy<resources::Sycl>(sycl_res);
-}
-
-//
-// Define if parampack is empty, avoids ambigous definitions.
-//
-
-template<typename Iterable,
-         typename LoopBody,
-         size_t BlockSize,
-         bool Async,
-         typename ForallParam,
-         typename std::enable_if<!std::is_trivially_copyable<LoopBody> {},
-                                 bool>::type = true>
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<resources::Sycl>,
-    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-forall_impl(resources::Sycl& sycl_res,
-            sycl_exec<BlockSize, Async>,
-            Iterable&& iter,
-            LoopBody&& loop_body,
-            ForallParam)
-{
-  using Iterator  = camp::decay<decltype(std::begin(iter))>;
-  using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType =
-      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-
-  //
-  // Compute the requested iteration space size
-  //
-  Iterator begin = std::begin(iter);
-  Iterator end   = std::end(iter);
-  IndexType len  = std::distance(begin, end);
-
-  // Only launch kernel if we have something to iterate over
-  if (len > 0 && BlockSize > 0)
+  if constexpr (is_lbody_trivially_copyable)
+  {
+    lbody = &loop_body;
+  }
+  else
   {
-
-    // Note: We could fix an incorrect workgroup size.
-    //       It would change what was specified.
-    //       For now, leave the device compiler to error with invalid WG size.
-
-    //
-    // Compute the number of blocks
-    //
-    sycl_dim_t blockSize {BlockSize};
-    sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
-
-    ::sycl::queue* q = sycl_res.get_queue();
-
-    LOOP_BODY* lbody;
-    Iterator* beg;
-
-    RAJA_FT_BEGIN;
     //
     // Setup shared memory buffers
     // Kernel body is nontrivially copyable, create space on device and copy to
@@ -211,62 +137,8 @@ forall_impl(resources::Sycl& sycl_res,
 
     beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q);
     q->memcpy(beg, &begin, sizeof(Iterator)).wait();
-
-    q->submit([&](::sycl::handler& h) {
-       h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize},
-                      [=](::sycl::nd_item<1> it) {
-                        Index_type ii = it.get_global_id(0);
-
-                        if (ii < len)
-                        {
-                          (*lbody)((*beg)[ii]);
-                        }
-                      });
-     }).wait();  // Need to wait for completion to free memory
-
-    // Free our device memory
-    ::sycl::free(lbody, *q);
-    ::sycl::free(beg, *q);
-
-    RAJA_FT_END;
   }
 
-  return resources::EventProxy<resources::Sycl>(sycl_res);
-}
-
-template<typename Iterable,
-         typename LoopBody,
-         size_t BlockSize,
-         bool Async,
-         typename ForallParam,
-         typename std::enable_if<std::is_trivially_copyable<LoopBody> {},
-                                 bool>::type = true>
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<resources::Sycl>,
-    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-    concepts::negate<
-        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
-forall_impl(resources::Sycl& sycl_res,
-            sycl_exec<BlockSize, Async> const& pol,
-            Iterable&& iter,
-            LoopBody&& loop_body,
-            ForallParam f_params)
-
-{
-  using Iterator = camp::decay<decltype(std::begin(iter))>;
-  using IndexType =
-      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = camp::decay<decltype(pol)>;
-
-  //
-  // Compute the requested iteration space size
-  //
-  Iterator begin = std::begin(iter);
-  Iterator end   = std::end(iter);
-  IndexType len  = std::distance(begin, end);
-
-  RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params);
-
   // Only launch kernel if we have something to iterate over
   if (len > 0 && BlockSize > 0)
   {
@@ -276,135 +148,69 @@ forall_impl(resources::Sycl& sycl_res,
     //
     sycl_dim_t blockSize {BlockSize};
     sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
-    RAJA_UNUSED_VAR(gridSize);
-
-    ::sycl::queue* q = sycl_res.get_queue();
-
-    auto combiner = [](ForallParam x, ForallParam y) {
-      RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y);
-      return x;
-    };
-
-    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1, *q);
-    RAJA::expt::ParamMultiplexer::parampack_init(pol, *res);
-    auto reduction = ::sycl::reduction(res, f_params, combiner);
-
-    q->submit([&](::sycl::handler& h) {
-      h.parallel_for(::sycl::range<1>(len), reduction,
-                     [=](::sycl::item<1> it, auto& red) {
-                       ForallParam fp;
-                       RAJA::expt::ParamMultiplexer::parampack_init(pol, fp);
-                       IndexType ii = it.get_id(0);
-                       if (ii < len)
-                       {
-                         RAJA::expt::invoke_body(fp, loop_body, begin[ii]);
-                       }
-                       red.combine(fp);
-                     });
-    });
-
-    q->wait();
-    RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res);
-    ::sycl::free(res, *q);
-  }
-  RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params);
-
-  return resources::EventProxy<resources::Sycl>(sycl_res);
-}
-
-template<typename Iterable,
-         typename LoopBody,
-         size_t BlockSize,
-         bool Async,
-         typename ForallParam,
-         typename std::enable_if<!std::is_trivially_copyable<LoopBody> {},
-                                 bool>::type = true>
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<resources::Sycl>,
-    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-    concepts::negate<
-        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
-forall_impl(resources::Sycl& sycl_res,
-            sycl_exec<BlockSize, Async> const& pol,
-            Iterable&& iter,
-            LoopBody&& loop_body,
-            ForallParam f_params)
-
-{
-  using Iterator  = camp::decay<decltype(std::begin(iter))>;
-  using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType =
-      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = camp::decay<decltype(pol)>;
-
-  //
-  // Compute the requested iteration space size
-  //
-  Iterator begin = std::begin(iter);
-  Iterator end   = std::end(iter);
-  IndexType len  = std::distance(begin, end);
-
-  RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params);
-
-  // Only launch kernel if we have something to iterate over
-  if (len > 0 && BlockSize > 0)
-  {
-    //
-    // Compute the number of blocks
-    //
-    sycl_dim_t blockSize {BlockSize};
-    sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
-    RAJA_UNUSED_VAR(gridSize);
 
     ::sycl::queue* q = sycl_res.get_queue();
+    // Both the parallel_for call, combinations, and resolution are all
+    // unique to the parameter case, so we make a constexpr branch here
+    if constexpr (!is_parampack_empty)
+    {
 
-    auto combiner = [](ForallParam x, ForallParam y) {
-      RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y);
-      return x;
-    };
-
-    // START
-    //
-    LOOP_BODY* lbody;
-    Iterator* beg;
-    RAJA_FT_BEGIN;
-    //
-    // Setup shared memory buffers
-    // Kernel body is nontrivially copyable, create space on device and copy to
-    // Workaround until "is_device_copyable" is supported
-    //
-    lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q);
-    q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait();
-
-    beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q);
-    q->memcpy(beg, &begin, sizeof(Iterator)).wait();
+      auto combiner = [](ForallParam x, ForallParam y) {
+        RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y);
+        return x;
+      };
+
+      ForallParam* res = ::sycl::malloc_shared<ForallParam>(1, *q);
+      RAJA::expt::ParamMultiplexer::parampack_init(pol, *res);
+      auto reduction = ::sycl::reduction(res, f_params, combiner);
+
+      q->submit([&](::sycl::handler& h) {
+        h.parallel_for(::sycl::range<1>(len), reduction,
+                       [=](::sycl::item<1> it, auto& red) {
+                         ForallParam fp;
+                         RAJA::expt::ParamMultiplexer::parampack_init(pol, fp);
+                         IndexType ii = it.get_id(0);
+                         if (ii < len)
+                         {
+                           RAJA::expt::invoke_body(fp, loop_body, begin[ii]);
+                         }
+                         red.combine(fp);
+                       });
+      });
 
-    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1, *q);
-    RAJA::expt::ParamMultiplexer::parampack_init(pol, *res);
-    auto reduction = ::sycl::reduction(res, f_params, combiner);
-
-    q->submit([&](::sycl::handler& h) {
-       h.parallel_for(::sycl::range<1>(len), reduction,
-                      [=](::sycl::item<1> it, auto& red) {
-                        Index_type ii = it.get_id(0);
-                        ForallParam fp;
-                        RAJA::expt::ParamMultiplexer::parampack_init(pol, fp);
-                        if (ii < len)
-                        {
-                          RAJA::expt::invoke_body(fp, *lbody, (*beg)[ii]);
-                        }
-                        red.combine(fp);
-                      });
-     }).wait();  // Need to wait for completion to free memory
-    RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res);
-    // Free our device memory
+      q->wait();
+      RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res);
+      RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params);
+    }
+    else
+    {
+      q->submit([&](::sycl::handler& h) {
+        h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize},
+                       [=](::sycl::nd_item<1> it) {
+                         IndexType ii = it.get_global_id(0);
+                         if (ii < len)
+                         {
+                           loop_body(begin[ii]);
+                         }
+                       });
+      });
+
+      if (!Async)
+      {
+        q->wait();
+      }
+    }
     ::sycl::free(res, *q);
-    ::sycl::free(lbody, *q);
     ::sycl::free(beg, *q);
 
-    RAJA_FT_END;
+    // If we had to allocate device memory, free it
+    if constexpr (!is_lbody_trivially_copyable)
+    {
+      ::sycl::free(lbody, *q);
+      ;
+    }
+    RAJA_FT_END
   }
-  RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params);
 
   return resources::EventProxy<resources::Sycl>(sycl_res);
 }

From 46b98c26e9229f927132bb78895584c9a42b644d Mon Sep 17 00:00:00 2001
From: john bowen <bowen36@llnl.gov>
Date: Wed, 17 Dec 2025 11:40:30 -0800
Subject: [PATCH 2/4] Fix declaration issues in merged forall implementations

---
 include/RAJA/policy/openmp_target/forall.hpp |   4 +-
 include/RAJA/policy/sycl/forall.hpp          | 160 ++++++++++---------
 2 files changed, 83 insertions(+), 81 deletions(-)

diff --git a/include/RAJA/policy/openmp_target/forall.hpp b/include/RAJA/policy/openmp_target/forall.hpp
index de4045378c..968a36a4b9 100644
--- a/include/RAJA/policy/openmp_target/forall.hpp
+++ b/include/RAJA/policy/openmp_target/forall.hpp
@@ -52,7 +52,6 @@ forall_impl(resources::Omp omp_res,
   if constexpr (!is_forall_param_empty)
   {
     RAJA::expt::ParamMultiplexer::parampack_init(p, f_params);
-    RAJA_OMP_DECLARE_REDUCTION_COMBINE;
   }
 
   using Body = typename std::remove_reference<decltype(loop_body)>::type;
@@ -93,6 +92,7 @@ forall_impl(resources::Omp omp_res,
   }
   else
   {
+    RAJA_OMP_DECLARE_REDUCTION_COMBINE
 #pragma omp target teams distribute parallel for num_teams(numteams)           \
     schedule(static, 1) map(to                                                 \
                             : body, begin_it) reduction(combine                \
@@ -125,7 +125,6 @@ forall_impl(resources::Omp omp_res,
   if constexpr (!is_forall_param_empty)
   {
     RAJA::expt::ParamMultiplexer::parampack_init(p, f_params);
-    RAJA_OMP_DECLARE_REDUCTION_COMBINE;
   }
 
   using Body = typename std::remove_reference<decltype(loop_body)>::type;
@@ -135,6 +134,7 @@ forall_impl(resources::Omp omp_res,
 
   if constexpr (!is_forall_param_empty)
   {
+    RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 #pragma omp target teams distribute parallel for schedule(static, 1)           \
     firstprivate(body, begin_it) reduction(combine                             \
                                            : f_params)
diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp
index b6d935020f..a65fa97bd8 100644
--- a/include/RAJA/policy/sycl/forall.hpp
+++ b/include/RAJA/policy/sycl/forall.hpp
@@ -104,6 +104,13 @@ forall_impl(resources::Sycl& sycl_res,
   using IndexType =
       camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
   using EXEC_POL = camp::decay<decltype(pol)>;
+  // Deduce at compile time if lbody is trivially constructible and if user
+  // has supplied parameters.  These will be used to determine which sycl launch
+  // to configure below.
+  constexpr bool is_parampack_empty =
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>::value;
+  constexpr bool is_lbody_trivially_copyable =
+      std::is_trivially_copyable<LoopBody>::value;
 
   //
   // Compute the requested iteration space size
@@ -111,107 +118,102 @@ forall_impl(resources::Sycl& sycl_res,
   Iterator begin = std::begin(iter);
   Iterator end   = std::end(iter);
   IndexType len  = std::distance(begin, end);
-  constexpr bool is_parampack_empty =
-      RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>::value;
-  constexpr bool is_lbody_trivially_copyable =
-      std::is_trivially_copyable<LoopBody>::value;
-  LOOP_BODY* lbody = loop_body;
-  RAJA_FT_BEGIN;
-  if constexpr (!is_parampack_empty)
+
+  // Return immediately if there is no work to be done
+  if (len <= 0 || BlockSize <= 0)
   {
-    RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params);
+    return resources::EventProxy<resources::Sycl>(sycl_res);
   }
-  if constexpr (is_lbody_trivially_copyable)
+
+  //
+  // Compute the number of blocks
+  //
+  sycl_dim_t blockSize {BlockSize};
+  sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
+
+  ::sycl::queue* q  = sycl_res.get_queue();
+  LoopBody* lbody   = &loop_body;
+  Iterator* d_begin = &begin;
+
+  if constexpr (!is_parampack_empty)
   {
-    lbody = &loop_body;
+    RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params);
   }
-  else
+  if constexpr (!is_lbody_trivially_copyable)
   {
     //
     // Setup shared memory buffers
     // Kernel body is nontrivially copyable, create space on device and copy to
     // Workaround until "is_device_copyable" is supported
     //
-    lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q);
-    q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait();
+    lbody = (LoopBody*)::sycl::malloc_device(sizeof(LoopBody), *q);
+    q->memcpy(lbody, &loop_body, sizeof(LoopBody)).wait();
 
-    beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q);
-    q->memcpy(beg, &begin, sizeof(Iterator)).wait();
+    d_begin = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q);
+    q->memcpy(d_begin, &begin, sizeof(Iterator)).wait();
   }
 
-  // Only launch kernel if we have something to iterate over
-  if (len > 0 && BlockSize > 0)
+
+  // Both the parallel_for call, combinations, and resolution are all
+  // unique to the parameter case, so we make a constexpr branch here
+  if constexpr (!is_parampack_empty)
   {
 
-    //
-    // Compute the number of blocks
-    //
-    sycl_dim_t blockSize {BlockSize};
-    sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
+    auto combiner = [](ForallParam x, ForallParam y) {
+      RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y);
+      return x;
+    };
+
+    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1, *q);
+    RAJA::expt::ParamMultiplexer::parampack_init(pol, *res);
+    auto reduction = ::sycl::reduction(res, f_params, combiner);
+
+    q->submit([&](::sycl::handler& h) {
+      h.parallel_for(::sycl::range<1>(len), reduction,
+                     [=](::sycl::item<1> it, auto& red) {
+                       ForallParam fp;
+                       RAJA::expt::ParamMultiplexer::parampack_init(pol, fp);
+                       IndexType ii = it.get_id(0);
+                       if (ii < len)
+                       {
+                         RAJA::expt::invoke_body(fp, loop_body, (*d_begin)[ii]);
+                       }
+                       red.combine(fp);
+                     });
+    });
 
-    ::sycl::queue* q = sycl_res.get_queue();
-    // Both the parallel_for call, combinations, and resolution are all
-    // unique to the parameter case, so we make a constexpr branch here
-    if constexpr (!is_parampack_empty)
+    q->wait();
+    RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res);
+    RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params);
+    ::sycl::free(res, *q);
+  }
+  else
+  {
+    q->submit([&](::sycl::handler& h) {
+      h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize},
+                     [=](::sycl::nd_item<1> it) {
+                       IndexType ii = it.get_global_id(0);
+                       if (ii < len)
+                       {
+                         loop_body((*d_begin)[ii]);
+                       }
+                     });
+    });
+
+    if (!Async)
     {
-
-      auto combiner = [](ForallParam x, ForallParam y) {
-        RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y);
-        return x;
-      };
-
-      ForallParam* res = ::sycl::malloc_shared<ForallParam>(1, *q);
-      RAJA::expt::ParamMultiplexer::parampack_init(pol, *res);
-      auto reduction = ::sycl::reduction(res, f_params, combiner);
-
-      q->submit([&](::sycl::handler& h) {
-        h.parallel_for(::sycl::range<1>(len), reduction,
-                       [=](::sycl::item<1> it, auto& red) {
-                         ForallParam fp;
-                         RAJA::expt::ParamMultiplexer::parampack_init(pol, fp);
-                         IndexType ii = it.get_id(0);
-                         if (ii < len)
-                         {
-                           RAJA::expt::invoke_body(fp, loop_body, begin[ii]);
-                         }
-                         red.combine(fp);
-                       });
-      });
-
       q->wait();
-      RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res);
-      RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params);
-    }
-    else
-    {
-      q->submit([&](::sycl::handler& h) {
-        h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize},
-                       [=](::sycl::nd_item<1> it) {
-                         IndexType ii = it.get_global_id(0);
-                         if (ii < len)
-                         {
-                           loop_body(begin[ii]);
-                         }
-                       });
-      });
-
-      if (!Async)
-      {
-        q->wait();
-      }
     }
-    ::sycl::free(res, *q);
-    ::sycl::free(beg, *q);
+  }
+  ::sycl::free(d_begin, *q);
 
-    // If we had to allocate device memory, free it
-    if constexpr (!is_lbody_trivially_copyable)
-    {
-      ::sycl::free(lbody, *q);
-      ;
-    }
-    RAJA_FT_END
+  // If we had to allocate device memory, free it
+  if constexpr (!is_lbody_trivially_copyable)
+  {
+    ::sycl::free(lbody, *q);
   }
 
+
   return resources::EventProxy<resources::Sycl>(sycl_res);
 }
 

From 1b69f61aecef7e7dea9c320441932a16335b0e73 Mon Sep 17 00:00:00 2001
From: john bowen <bowen36@llnl.gov>
Date: Thu, 18 Dec 2025 14:25:48 -0800
Subject: [PATCH 3/4] Sycl forall implementation passing unit tests

---
 include/RAJA/policy/sycl/forall.hpp | 37 ++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp
index a65fa97bd8..061c1b47f5 100644
--- a/include/RAJA/policy/sycl/forall.hpp
+++ b/include/RAJA/policy/sycl/forall.hpp
@@ -103,7 +103,8 @@ forall_impl(resources::Sycl& sycl_res,
   using Iterator = camp::decay<decltype(std::begin(iter))>;
   using IndexType =
       camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = camp::decay<decltype(pol)>;
+  using EXEC_POL  = camp::decay<decltype(pol)>;
+  using LOOP_BODY = camp::decay<LoopBody>;
   // Deduce at compile time if lbody is trivially constructible and if user
   // has supplied parameters.  These will be used to determine which sycl launch
   // to configure below.
@@ -132,8 +133,8 @@ forall_impl(resources::Sycl& sycl_res,
   sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
 
   ::sycl::queue* q  = sycl_res.get_queue();
-  LoopBody* lbody   = &loop_body;
-  Iterator* d_begin = &begin;
+  LOOP_BODY* lbody  = nullptr;
+  Iterator* d_begin = nullptr;
 
   if constexpr (!is_parampack_empty)
   {
@@ -146,19 +147,17 @@ forall_impl(resources::Sycl& sycl_res,
     // Kernel body is nontrivially copyable, create space on device and copy to
     // Workaround until "is_device_copyable" is supported
     //
-    lbody = (LoopBody*)::sycl::malloc_device(sizeof(LoopBody), *q);
-    q->memcpy(lbody, &loop_body, sizeof(LoopBody)).wait();
+    lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LoopBody), *q);
+    q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait();
 
     d_begin = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q);
     q->memcpy(d_begin, &begin, sizeof(Iterator)).wait();
   }
 
-
   // Both the parallel_for call, combinations, and resolution are all
   // unique to the parameter case, so we make a constexpr branch here
   if constexpr (!is_parampack_empty)
   {
-
     auto combiner = [](ForallParam x, ForallParam y) {
       RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y);
       return x;
@@ -176,7 +175,14 @@ forall_impl(resources::Sycl& sycl_res,
                        IndexType ii = it.get_id(0);
                        if (ii < len)
                        {
-                         RAJA::expt::invoke_body(fp, loop_body, (*d_begin)[ii]);
+                         if constexpr (is_lbody_trivially_copyable)
+                         {
+                           RAJA::expt::invoke_body(fp, loop_body, begin[ii]);
+                         }
+                         else
+                         {
+                           RAJA::expt::invoke_body(fp, *lbody, (*d_begin)[ii]);
+                         }
                        }
                        red.combine(fp);
                      });
@@ -184,9 +190,10 @@ forall_impl(resources::Sycl& sycl_res,
 
     q->wait();
     RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res);
-    RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params);
     ::sycl::free(res, *q);
+    RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params);
   }
+  // Note: separate branches
   else
   {
     q->submit([&](::sycl::handler& h) {
@@ -195,7 +202,14 @@ forall_impl(resources::Sycl& sycl_res,
                        IndexType ii = it.get_global_id(0);
                        if (ii < len)
                        {
-                         loop_body((*d_begin)[ii]);
+                         if constexpr (is_lbody_trivially_copyable)
+                         {
+                           loop_body(begin[ii]);
+                         }
+                         else
+                         {
+                           (*lbody)((*d_begin)[ii]);
+                         }
                        }
                      });
     });
@@ -205,12 +219,13 @@ forall_impl(resources::Sycl& sycl_res,
       q->wait();
     }
   }
-  ::sycl::free(d_begin, *q);
+
 
   // If we had to allocate device memory, free it
   if constexpr (!is_lbody_trivially_copyable)
   {
     ::sycl::free(lbody, *q);
+    ::sycl::free(d_begin, *q);
   }
 
 

From 7fa8730bd53bc9f28ce2edfa3628cec36e7c7411 Mon Sep 17 00:00:00 2001
From: john bowen <bowen36@llnl.gov>
Date: Mon, 22 Dec 2025 09:49:23 -0800
Subject: [PATCH 4/4] Simplify SYCL launch implementation

---
 include/RAJA/policy/sycl/launch.hpp | 334 ++++++----------------------
 1 file changed, 73 insertions(+), 261 deletions(-)

diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp
index f69e2c4424..6ad06ded2c 100644
--- a/include/RAJA/policy/sycl/launch.hpp
+++ b/include/RAJA/policy/sycl/launch.hpp
@@ -31,98 +31,33 @@ namespace RAJA
 template<bool async>
 struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
 {
-
-  // If the launch lambda is trivially copyable
-  template<typename BODY_IN,
-           typename ReduceParams,
-           typename std::enable_if<std::is_trivially_copyable<BODY_IN> {},
-                                   bool>::type = true>
+  template<typename LoopBody, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
-      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res,
-       const LaunchParams& params,
-       BODY_IN&& body_in,
-       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
-  {
-
-    /*Get the queue from concrete resource */
-    ::sycl::queue* q = res.get<camp::resources::Sycl>().get_queue();
-
-    //
-    // Compute the number of blocks and threads
-    //
-
-    const ::sycl::range<3> blockSize(params.threads.value[2],
-                                     params.threads.value[1],
-                                     params.threads.value[0]);
-
-    const ::sycl::range<3> gridSize(
-        params.threads.value[2] * params.teams.value[2],
-        params.threads.value[1] * params.teams.value[1],
-        params.threads.value[0] * params.teams.value[0]);
-
-    // Only launch kernel if we have something to iterate over
-    constexpr int zero = 0;
-    if (params.threads.value[0] > zero && params.threads.value[1] > zero &&
-        params.threads.value[2] > zero && params.teams.value[0] > zero &&
-        params.teams.value[1] > zero && params.teams.value[2] > zero)
-    {
-
-      RAJA_FT_BEGIN;
-
-      q->submit([&](::sycl::handler& h) {
-        auto s_vec = ::sycl::local_accessor<char, 1>(params.shared_mem_size, h);
-
-        h.parallel_for(
-            ::sycl::nd_range<3>(gridSize, blockSize),
-            [=](::sycl::nd_item<3> itm) {
-              LaunchContext ctx;
-              ctx.itm = &itm;
-
-              // Point to shared memory
-              ctx.shared_mem_ptr =
-                  s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
-
-              body_in(ctx);
-            });
-      });
-
-      if (!async)
-      {
-        q->wait();
-      }
-
-      RAJA_FT_END;
-    }
-
-    return resources::EventProxy<resources::Resource>(res);
-  }
-
-  // If the launch lambda is trivially copyable and we have explcit reduction
-  // parameters
-  template<typename BODY_IN,
-           typename ReduceParams,
-           typename std::enable_if<std::is_trivially_copyable<BODY_IN> {},
-                                   bool>::type = true>
-  static concepts::enable_if_t<
-      resources::EventProxy<resources::Resource>,
-      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-      concepts::negate<
-          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>>
   exec(RAJA::resources::Resource res,
        const LaunchParams& launch_params,
-       BODY_IN&& body_in,
+       LoopBody&& loop_body,
        ReduceParams launch_reducers)
   {
-    using EXEC_POL = RAJA::sycl_launch_t<async, 0>;
+    using EXEC_POL  = RAJA::sycl_launch_t<async, 0>;
+    using LOOP_BODY = camp::decay<LoopBody>;
+    // Deduce at compile time if lbody is trivially constructible and if user
+    // has supplied parameters.  These will be used to determine which sycl
+    // launch to configure below.
+    constexpr bool is_parampack_empty =
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>::value;
+    constexpr bool is_lbody_trivially_copyable =
+        std::is_trivially_copyable<LOOP_BODY>::value;
     EXEC_POL pol {};
 
     /*Get the queue from concrete resource */
     ::sycl::queue* q = res.get<camp::resources::Sycl>().get_queue();
 
-    RAJA::expt::ParamMultiplexer::parampack_init(pol, launch_reducers);
+    if constexpr (!is_parampack_empty)
+    {
+      RAJA::expt::ParamMultiplexer::parampack_init(pol, launch_reducers);
+    }
 
     //
     // Compute the number of blocks and threads
@@ -138,198 +73,39 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
 
     // Only launch kernel if we have something to iterate over
     constexpr int zero = 0;
-    if (launch_params.threads.value[0] > zero &&
-        launch_params.threads.value[1] > zero &&
-        launch_params.threads.value[2] > zero &&
-        launch_params.teams.value[0] > zero &&
-        launch_params.teams.value[1] > zero &&
-        launch_params.teams.value[2] > zero)
+    if (launch_params.threads.value[0] <= zero ||
+        launch_params.threads.value[1] <= zero ||
+        launch_params.threads.value[2] <= zero ||
+        launch_params.teams.value[0] <= zero ||
+        launch_params.teams.value[1] <= zero ||
+        launch_params.teams.value[2] <= zero)
     {
-
-
-      auto combiner = [](ReduceParams x, ReduceParams y) {
-        RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y);
-        return x;
-      };
-
-      RAJA_FT_BEGIN;
-
-      ReduceParams* res = ::sycl::malloc_shared<ReduceParams>(1, *q);
-      RAJA::expt::ParamMultiplexer::parampack_init(pol, *res);
-      auto reduction = ::sycl::reduction(res, launch_reducers, combiner);
-
-      q->submit([&](::sycl::handler& h) {
-         auto s_vec =
-             ::sycl::local_accessor<char, 1>(launch_params.shared_mem_size, h);
-
-         h.parallel_for(
-             ::sycl::nd_range<3>(gridSize, blockSize), reduction,
-             [=](::sycl::nd_item<3> itm, auto& red) {
-               LaunchContext ctx;
-               ctx.itm = &itm;
-
-               // Point to shared memory
-               ctx.shared_mem_ptr =
-                   s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
-
-               ReduceParams fp;
-               RAJA::expt::ParamMultiplexer::parampack_init(pol, fp);
-
-               RAJA::expt::invoke_body(fp, body_in, ctx);
-
-               red.combine(fp);
-             });
-       }).wait();  // Need to wait for completion to free memory
-
-      RAJA::expt::ParamMultiplexer::parampack_combine(pol, launch_reducers,
-                                                      *res);
-      ::sycl::free(res, *q);
-
-      RAJA_FT_END;
+      return resources::EventProxy<resources::Resource>(res);
     }
 
-    RAJA::expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers);
 
-    return resources::EventProxy<resources::Resource>(res);
-  }
-
-  // If the launch lambda is not trivially copyable
-  template<typename BODY_IN,
-           typename ReduceParams,
-           typename std::enable_if<!std::is_trivially_copyable<BODY_IN> {},
-                                   bool>::type = true>
-  static concepts::enable_if_t<
-      resources::EventProxy<resources::Resource>,
-      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res,
-       const LaunchParams& params,
-       BODY_IN&& body_in,
-       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
-  {
-
-    /*Get the queue from concrete resource */
-    ::sycl::queue* q = res.get<camp::resources::Sycl>().get_queue();
+    RAJA_FT_BEGIN;
 
+    using LOOP_BODY  = camp::decay<LoopBody>;
+    LOOP_BODY* lbody = nullptr;
     //
-    // Compute the number of blocks and threads
+    // Kernel body is nontrivially copyable, create space on device and copy
+    // to Workaround until "is_device_copyable" is supported
     //
-
-    const ::sycl::range<3> blockSize(params.threads.value[2],
-                                     params.threads.value[1],
-                                     params.threads.value[0]);
-
-    const ::sycl::range<3> gridSize(
-        params.threads.value[2] * params.teams.value[2],
-        params.threads.value[1] * params.teams.value[1],
-        params.threads.value[0] * params.teams.value[0]);
-
-    // Only launch kernel if we have something to iterate over
-    constexpr int zero = 0;
-    if (params.threads.value[0] > zero && params.threads.value[1] > zero &&
-        params.threads.value[2] > zero && params.teams.value[0] > zero &&
-        params.teams.value[1] > zero && params.teams.value[2] > zero)
+    if constexpr (!is_lbody_trivially_copyable)
     {
-
-      RAJA_FT_BEGIN;
-
-      //
-      // Kernel body is nontrivially copyable, create space on device and copy
-      // to Workaround until "is_device_copyable" is supported
-      //
-      using LOOP_BODY = camp::decay<BODY_IN>;
-      LOOP_BODY* lbody;
       lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q);
-      q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait();
-
-      q->submit([&](::sycl::handler& h) {
-         auto s_vec =
-             ::sycl::local_accessor<char, 1>(params.shared_mem_size, h);
-
-         h.parallel_for(
-             ::sycl::nd_range<3>(gridSize, blockSize),
-             [=](::sycl::nd_item<3> itm) {
-               LaunchContext ctx;
-               ctx.itm = &itm;
-
-               // Point to shared memory
-               ctx.shared_mem_ptr =
-                   s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
-
-               (*lbody)(ctx);
-             });
-       }).wait();  // Need to wait for completion to free memory
-
-      ::sycl::free(lbody, *q);
-
-      RAJA_FT_END;
+      q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait();
     }
-
-    return resources::EventProxy<resources::Resource>(res);
-  }
-
-  // If the launch lambda is not trivially copyable
-  template<typename BODY_IN,
-           typename ReduceParams,
-           typename std::enable_if<!std::is_trivially_copyable<BODY_IN> {},
-                                   bool>::type = true>
-  static concepts::enable_if_t<
-      resources::EventProxy<resources::Resource>,
-      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-      concepts::negate<
-          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res,
-       const LaunchParams& launch_params,
-       BODY_IN&& body_in,
-       ReduceParams launch_reducers)
-  {
-    using EXEC_POL = RAJA::sycl_launch_t<async, 0>;
-    EXEC_POL pol {};
-
-    /*Get the queue from concrete resource */
-    ::sycl::queue* q = res.get<camp::resources::Sycl>().get_queue();
-
-    RAJA::expt::ParamMultiplexer::parampack_init(pol, launch_reducers);
-
-    //
-    // Compute the number of blocks and threads
-    //
-    const ::sycl::range<3> blockSize(launch_params.threads.value[2],
-                                     launch_params.threads.value[1],
-                                     launch_params.threads.value[0]);
-
-    const ::sycl::range<3> gridSize(
-        launch_params.threads.value[2] * launch_params.teams.value[2],
-        launch_params.threads.value[1] * launch_params.teams.value[1],
-        launch_params.threads.value[0] * launch_params.teams.value[0]);
-
-    // Only launch kernel if we have something to iterate over
-    constexpr int zero = 0;
-    if (launch_params.threads.value[0] > zero &&
-        launch_params.threads.value[1] > zero &&
-        launch_params.threads.value[2] > zero &&
-        launch_params.teams.value[0] > zero &&
-        launch_params.teams.value[1] > zero &&
-        launch_params.teams.value[2] > zero)
+    // Both the parallel_for call, combinations, and resolution are all
+    // unique to the parameter case, so we make a constexpr branch here
+    if constexpr (!is_parampack_empty)
     {
-
-
       auto combiner = [](ReduceParams x, ReduceParams y) {
         RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y);
         return x;
       };
 
-      RAJA_FT_BEGIN;
-
-      //
-      // Kernel body is nontrivially copyable, create space on device and copy
-      // to Workaround until "is_device_copyable" is supported
-      //
-      using LOOP_BODY = camp::decay<BODY_IN>;
-      LOOP_BODY* lbody;
-      lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q);
-      q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait();
-
       ReduceParams* res = ::sycl::malloc_shared<ReduceParams>(1, *q);
       RAJA::expt::ParamMultiplexer::parampack_init(pol, *res);
       auto reduction = ::sycl::reduction(res, launch_reducers, combiner);
@@ -350,8 +126,14 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
 
                ReduceParams fp;
                RAJA::expt::ParamMultiplexer::parampack_init(pol, fp);
-
-               RAJA::expt::invoke_body(fp, *lbody, ctx);
+               if constexpr (is_lbody_trivially_copyable)
+               {
+                 RAJA::expt::invoke_body(fp, loop_body, ctx);
+               }
+               else
+               {
+                 RAJA::expt::invoke_body(fp, *lbody, ctx);
+               }
 
                red.combine(fp);
              });
@@ -361,11 +143,41 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
                                                       *res);
       ::sycl::free(res, *q);
       ::sycl::free(lbody, *q);
+      RAJA::expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers);
+    }
+    else
+    {
+      q->submit([&](::sycl::handler& h) {
+        auto s_vec =
+            ::sycl::local_accessor<char, 1>(launch_params.shared_mem_size, h);
+
+        h.parallel_for(
+            ::sycl::nd_range<3>(gridSize, blockSize),
+            [=](::sycl::nd_item<3> itm) {
+              LaunchContext ctx;
+              ctx.itm = &itm;
 
-      RAJA_FT_END;
+              // Point to shared memory
+              ctx.shared_mem_ptr =
+                  s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+              if constexpr (is_lbody_trivially_copyable)
+              {
+                loop_body(ctx);
+              }
+              else
+              {
+                (*lbody)(ctx);
+              }
+            });
+      });
+
+      if (!async)
+      {
+        q->wait();
+      }
     }
+    RAJA_FT_END;
 
-    RAJA::expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers);
 
     return resources::EventProxy<resources::Resource>(res);
   }