From 861fd9cd0555c7a6792c7d498f3d3d4d6e6134cc Mon Sep 17 00:00:00 2001 From: john bowen Date: Fri, 12 Dec 2025 14:58:55 -0800 Subject: [PATCH 1/4] Cleanup sycl and openmp target forall implementations --- include/RAJA/policy/openmp_target/forall.hpp | 150 +++----- include/RAJA/policy/sequential/launch.hpp | 52 +-- include/RAJA/policy/sycl/forall.hpp | 344 ++++--------------- 3 files changed, 144 insertions(+), 402 deletions(-) diff --git a/include/RAJA/policy/openmp_target/forall.hpp b/include/RAJA/policy/openmp_target/forall.hpp index 3e2b256c51..de4045378c 100644 --- a/include/RAJA/policy/openmp_target/forall.hpp +++ b/include/RAJA/policy/openmp_target/forall.hpp @@ -39,9 +39,7 @@ template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> forall_impl(resources::Omp omp_res, const omp_target_parallel_for_exec& p, Iterable&& iter, @@ -49,65 +47,14 @@ forall_impl(resources::Omp omp_res, ForallParam f_params) { using EXEC_POL = camp::decay; - - RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); - RAJA_OMP_DECLARE_REDUCTION_COMBINE; - - using Body = typename std::remove_reference::type; - Body body = loop_body; - - RAJA_EXTRACT_BED_IT(iter); - - // Reset if exceed CUDA threads per block limit. - int tperteam = ThreadsPerTeam; - if (tperteam > omp::MAXNUMTHREADS) - { - tperteam = omp::MAXNUMTHREADS; - } - - // calculate number of teams based on user defined threads per team - // datasize is distance between begin() and end() of iterable - auto numteams = RAJA_DIVIDE_CEILING_INT(distance_it, tperteam); - if (numteams > tperteam) - { - // Omp target reducers will write team # results, into Threads-sized array. - // Need to insure NumTeams <= Threads to prevent array out of bounds access. - numteams = tperteam; - } - - // thread_limit(tperteam) unused due to XL seg fault (when tperteam != - // distance) - auto i = distance_it; - -#pragma omp target teams distribute parallel for num_teams(numteams) \ - schedule(static, 1) map(to \ - : body, begin_it) reduction(combine \ - : f_params) - for (i = 0; i < distance_it; ++i) + constexpr bool is_forall_param_empty = + RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if constexpr (!is_forall_param_empty) { - Body ib = body; - RAJA::expt::invoke_body(f_params, ib, begin_it[i]); + RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); + RAJA_OMP_DECLARE_REDUCTION_COMBINE; } - RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); - - return resources::EventProxy(omp_res); -} - -template -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> -forall_impl(resources::Omp omp_res, - const omp_target_parallel_for_exec&, - Iterable&& iter, - Func&& loop_body, - ForallParam) -{ using Body = typename std::remove_reference::type; Body body = loop_body; @@ -133,14 +80,30 @@ forall_impl(resources::Omp omp_res, // thread_limit(tperteam) unused due to XL seg fault (when tperteam != // distance) auto i = distance_it; - + if constexpr (is_forall_param_empty) + { #pragma omp target teams distribute parallel for num_teams(numteams) \ schedule(static, 1) map(to \ : body, begin_it) - for (i = 0; i < distance_it; ++i) + for (i = 0; i < distance_it; ++i) + { + Body ib = body; + ib(begin_it[i]); + } + } + else { - Body ib = body; - ib(begin_it[i]); +#pragma omp target teams distribute parallel for num_teams(numteams) \ + schedule(static, 1) map(to \ + : body, begin_it) reduction(combine \ + : f_params) + for (i = 0; i < distance_it; ++i) + { + Body ib = body; + RAJA::expt::invoke_body(f_params, ib, begin_it[i]); + } + + RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); } return resources::EventProxy(omp_res); @@ -149,9 +112,7 @@ forall_impl(resources::Omp omp_res, template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> forall_impl(resources::Omp omp_res, const omp_target_parallel_for_exec_nt& p, Iterable&& iter, @@ -159,53 +120,42 @@ forall_impl(resources::Omp omp_res, ForallParam f_params) { using EXEC_POL = camp::decay; - - RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); - RAJA_OMP_DECLARE_REDUCTION_COMBINE; + constexpr bool is_forall_param_empty = + RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if constexpr (!is_forall_param_empty) + { + RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); + RAJA_OMP_DECLARE_REDUCTION_COMBINE; + } using Body = typename std::remove_reference::type; Body body = loop_body; RAJA_EXTRACT_BED_IT(iter); + if constexpr (!is_forall_param_empty) + { #pragma omp target teams distribute parallel for schedule(static, 1) \ firstprivate(body, begin_it) reduction(combine \ : f_params) - for (decltype(distance_it) i = 0; i < distance_it; ++i) - { - Body ib = body; - RAJA::expt::invoke_body(f_params, ib, begin_it[i]); - } - - RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); - - return resources::EventProxy(omp_res); -} - -template -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> -forall_impl(resources::Omp omp_res, - const omp_target_parallel_for_exec_nt&, - Iterable&& iter, - Func&& loop_body, - ForallParam) -{ - using Body = typename std::remove_reference::type; - Body body = loop_body; - - RAJA_EXTRACT_BED_IT(iter); + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { + Body ib = body; + RAJA::expt::invoke_body(f_params, ib, begin_it[i]); + } + RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); + } + else + { #pragma omp target teams distribute parallel for schedule(static, 1) \ firstprivate(body, begin_it) - for (decltype(distance_it) i = 0; i < distance_it; ++i) - { - Body ib = body; - ib(begin_it[i]); + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { + Body ib = body; + ib(begin_it[i]); + } } - return resources::EventProxy(omp_res); } diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index ee98804ecf..2522ea8b98 100644 --- a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -39,37 +39,10 @@ struct LaunchExecute template<> struct LaunchExecute { - template static concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> - exec(RAJA::resources::Resource res, - LaunchParams const& params, - BODY const& body, - ReduceParams& RAJA_UNUSED_ARG(ReduceParams)) - { - - LaunchContext ctx; - - char* kernel_local_mem = new char[params.shared_mem_size]; - ctx.shared_mem_ptr = kernel_local_mem; - - body(ctx); - - delete[] kernel_local_mem; - ctx.shared_mem_ptr = nullptr; - - return resources::EventProxy(res); - } - - template - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> exec(RAJA::resources::Resource res, LaunchParams const& launch_params, BODY const& body, @@ -77,19 +50,32 @@ struct LaunchExecute { using EXEC_POL = RAJA::seq_exec; EXEC_POL pol {}; - - expt::ParamMultiplexer::parampack_init(pol, launch_reducers); + constexpr bool is_parampack_empty = + RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if constexpr (!is_parampack_empty) + { + expt::ParamMultiplexer::parampack_init(pol, launch_reducers); + } LaunchContext ctx; char* kernel_local_mem = new char[launch_params.shared_mem_size]; ctx.shared_mem_ptr = kernel_local_mem; - expt::invoke_body(launch_reducers, body, ctx); + if constexpr (!is_parampack_empty) + { + expt::invoke_body(launch_reducers, body, ctx); + } + else + { + body(ctx); + } delete[] kernel_local_mem; ctx.shared_mem_ptr = nullptr; - - expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers); + if constexpr (!is_parampack_empty) + { + expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers); + } return resources::EventProxy(res); } diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp index fd719d68c9..b6d935020f 100644 --- a/include/RAJA/policy/sycl/forall.hpp +++ b/include/RAJA/policy/sycl/forall.hpp @@ -84,27 +84,26 @@ ::sycl::range<1> getGridDim(size_t len, size_t block_size) //////////////////////////////////////////////////////////////////////// // + template {}, - bool>::type = true> + typename ForallParam> RAJA_INLINE concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> + RAJA::expt::type_traits::is_ForallParamPack> forall_impl(resources::Sycl& sycl_res, - sycl_exec, + sycl_exec const& pol, Iterable&& iter, LoopBody&& loop_body, - ForallParam) -{ + ForallParam f_params) +{ using Iterator = camp::decay; using IndexType = camp::decay; + using EXEC_POL = camp::decay; // // Compute the requested iteration space size @@ -112,95 +111,22 @@ forall_impl(resources::Sycl& sycl_res, Iterator begin = std::begin(iter); Iterator end = std::end(iter); IndexType len = std::distance(begin, end); - - // Only launch kernel if we have something to iterate over - if (len > 0 && BlockSize > 0) + constexpr bool is_parampack_empty = + RAJA::expt::type_traits::is_ForallParamPack_empty::value; + constexpr bool is_lbody_trivially_copyable = + std::is_trivially_copyable::value; + LOOP_BODY* lbody = loop_body; + RAJA_FT_BEGIN; + if constexpr (!is_parampack_empty) { - // Note: We could fix an incorrect workgroup size. - // It would change what was specified. - // For now, leave the device compiler to error with invalid WG size. - - // - // Compute the number of blocks - // - sycl_dim_t blockSize {BlockSize}; - sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - - ::sycl::queue* q = sycl_res.get_queue(); - - q->submit([&](::sycl::handler& h) { - h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize}, - [=](::sycl::nd_item<1> it) { - IndexType ii = it.get_global_id(0); - if (ii < len) - { - loop_body(begin[ii]); - } - }); - }); - - if (!Async) - { - q->wait(); - } + RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params); } - - return resources::EventProxy(sycl_res); -} - -// -// Define if parampack is empty, avoids ambigous definitions. -// - -template {}, - bool>::type = true> -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> -forall_impl(resources::Sycl& sycl_res, - sycl_exec, - Iterable&& iter, - LoopBody&& loop_body, - ForallParam) -{ - using Iterator = camp::decay; - using LOOP_BODY = camp::decay; - using IndexType = - camp::decay; - - // - // Compute the requested iteration space size - // - Iterator begin = std::begin(iter); - Iterator end = std::end(iter); - IndexType len = std::distance(begin, end); - - // Only launch kernel if we have something to iterate over - if (len > 0 && BlockSize > 0) + if constexpr (is_lbody_trivially_copyable) + { + lbody = &loop_body; + } + else { - - // Note: We could fix an incorrect workgroup size. - // It would change what was specified. - // For now, leave the device compiler to error with invalid WG size. - - // - // Compute the number of blocks - // - sycl_dim_t blockSize {BlockSize}; - sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - - ::sycl::queue* q = sycl_res.get_queue(); - - LOOP_BODY* lbody; - Iterator* beg; - - RAJA_FT_BEGIN; // // Setup shared memory buffers // Kernel body is nontrivially copyable, create space on device and copy to @@ -211,62 +137,8 @@ forall_impl(resources::Sycl& sycl_res, beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q); q->memcpy(beg, &begin, sizeof(Iterator)).wait(); - - q->submit([&](::sycl::handler& h) { - h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize}, - [=](::sycl::nd_item<1> it) { - Index_type ii = it.get_global_id(0); - - if (ii < len) - { - (*lbody)((*beg)[ii]); - } - }); - }).wait(); // Need to wait for completion to free memory - - // Free our device memory - ::sycl::free(lbody, *q); - ::sycl::free(beg, *q); - - RAJA_FT_END; } - return resources::EventProxy(sycl_res); -} - -template {}, - bool>::type = true> -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> -forall_impl(resources::Sycl& sycl_res, - sycl_exec const& pol, - Iterable&& iter, - LoopBody&& loop_body, - ForallParam f_params) - -{ - using Iterator = camp::decay; - using IndexType = - camp::decay; - using EXEC_POL = camp::decay; - - // - // Compute the requested iteration space size - // - Iterator begin = std::begin(iter); - Iterator end = std::end(iter); - IndexType len = std::distance(begin, end); - - RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params); - // Only launch kernel if we have something to iterate over if (len > 0 && BlockSize > 0) { @@ -276,135 +148,69 @@ forall_impl(resources::Sycl& sycl_res, // sycl_dim_t blockSize {BlockSize}; sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - RAJA_UNUSED_VAR(gridSize); - - ::sycl::queue* q = sycl_res.get_queue(); - - auto combiner = [](ForallParam x, ForallParam y) { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y); - return x; - }; - - ForallParam* res = ::sycl::malloc_shared(1, *q); - RAJA::expt::ParamMultiplexer::parampack_init(pol, *res); - auto reduction = ::sycl::reduction(res, f_params, combiner); - - q->submit([&](::sycl::handler& h) { - h.parallel_for(::sycl::range<1>(len), reduction, - [=](::sycl::item<1> it, auto& red) { - ForallParam fp; - RAJA::expt::ParamMultiplexer::parampack_init(pol, fp); - IndexType ii = it.get_id(0); - if (ii < len) - { - RAJA::expt::invoke_body(fp, loop_body, begin[ii]); - } - red.combine(fp); - }); - }); - - q->wait(); - RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res); - ::sycl::free(res, *q); - } - RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params); - - return resources::EventProxy(sycl_res); -} - -template {}, - bool>::type = true> -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> -forall_impl(resources::Sycl& sycl_res, - sycl_exec const& pol, - Iterable&& iter, - LoopBody&& loop_body, - ForallParam f_params) - -{ - using Iterator = camp::decay; - using LOOP_BODY = camp::decay; - using IndexType = - camp::decay; - using EXEC_POL = camp::decay; - - // - // Compute the requested iteration space size - // - Iterator begin = std::begin(iter); - Iterator end = std::end(iter); - IndexType len = std::distance(begin, end); - - RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params); - - // Only launch kernel if we have something to iterate over - if (len > 0 && BlockSize > 0) - { - // - // Compute the number of blocks - // - sycl_dim_t blockSize {BlockSize}; - sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - RAJA_UNUSED_VAR(gridSize); ::sycl::queue* q = sycl_res.get_queue(); + // Both the parallel_for call, combinations, and resolution are all + // unique to the parameter case, so we make a constexpr branch here + if constexpr (!is_parampack_empty) + { - auto combiner = [](ForallParam x, ForallParam y) { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y); - return x; - }; - - // START - // - LOOP_BODY* lbody; - Iterator* beg; - RAJA_FT_BEGIN; - // - // Setup shared memory buffers - // Kernel body is nontrivially copyable, create space on device and copy to - // Workaround until "is_device_copyable" is supported - // - lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q); - q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait(); - - beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q); - q->memcpy(beg, &begin, sizeof(Iterator)).wait(); + auto combiner = [](ForallParam x, ForallParam y) { + RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y); + return x; + }; + + ForallParam* res = ::sycl::malloc_shared(1, *q); + RAJA::expt::ParamMultiplexer::parampack_init(pol, *res); + auto reduction = ::sycl::reduction(res, f_params, combiner); + + q->submit([&](::sycl::handler& h) { + h.parallel_for(::sycl::range<1>(len), reduction, + [=](::sycl::item<1> it, auto& red) { + ForallParam fp; + RAJA::expt::ParamMultiplexer::parampack_init(pol, fp); + IndexType ii = it.get_id(0); + if (ii < len) + { + RAJA::expt::invoke_body(fp, loop_body, begin[ii]); + } + red.combine(fp); + }); + }); - ForallParam* res = ::sycl::malloc_shared(1, *q); - RAJA::expt::ParamMultiplexer::parampack_init(pol, *res); - auto reduction = ::sycl::reduction(res, f_params, combiner); - - q->submit([&](::sycl::handler& h) { - h.parallel_for(::sycl::range<1>(len), reduction, - [=](::sycl::item<1> it, auto& red) { - Index_type ii = it.get_id(0); - ForallParam fp; - RAJA::expt::ParamMultiplexer::parampack_init(pol, fp); - if (ii < len) - { - RAJA::expt::invoke_body(fp, *lbody, (*beg)[ii]); - } - red.combine(fp); - }); - }).wait(); // Need to wait for completion to free memory - RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res); - // Free our device memory + q->wait(); + RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res); + RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params); + } + else + { + q->submit([&](::sycl::handler& h) { + h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize}, + [=](::sycl::nd_item<1> it) { + IndexType ii = it.get_global_id(0); + if (ii < len) + { + loop_body(begin[ii]); + } + }); + }); + + if (!Async) + { + q->wait(); + } + } ::sycl::free(res, *q); - ::sycl::free(lbody, *q); ::sycl::free(beg, *q); - RAJA_FT_END; + // If we had to allocate device memory, free it + if constexpr (!is_lbody_trivially_copyable) + { + ::sycl::free(lbody, *q); + ; + } + RAJA_FT_END } - RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params); return resources::EventProxy(sycl_res); } From 46b98c26e9229f927132bb78895584c9a42b644d Mon Sep 17 00:00:00 2001 From: john bowen Date: Wed, 17 Dec 2025 11:40:30 -0800 Subject: [PATCH 2/4] Fix declaration issues in merged forall implementations --- include/RAJA/policy/openmp_target/forall.hpp | 4 +- include/RAJA/policy/sycl/forall.hpp | 160 ++++++++++--------- 2 files changed, 83 insertions(+), 81 deletions(-) diff --git a/include/RAJA/policy/openmp_target/forall.hpp b/include/RAJA/policy/openmp_target/forall.hpp index de4045378c..968a36a4b9 100644 --- a/include/RAJA/policy/openmp_target/forall.hpp +++ b/include/RAJA/policy/openmp_target/forall.hpp @@ -52,7 +52,6 @@ forall_impl(resources::Omp omp_res, if constexpr (!is_forall_param_empty) { RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); - RAJA_OMP_DECLARE_REDUCTION_COMBINE; } using Body = typename std::remove_reference::type; @@ -93,6 +92,7 @@ forall_impl(resources::Omp omp_res, } else { + RAJA_OMP_DECLARE_REDUCTION_COMBINE #pragma omp target teams distribute parallel for num_teams(numteams) \ schedule(static, 1) map(to \ : body, begin_it) reduction(combine \ @@ -125,7 +125,6 @@ forall_impl(resources::Omp omp_res, if constexpr (!is_forall_param_empty) { RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); - RAJA_OMP_DECLARE_REDUCTION_COMBINE; } using Body = typename std::remove_reference::type; @@ -135,6 +134,7 @@ forall_impl(resources::Omp omp_res, if constexpr (!is_forall_param_empty) { + RAJA_OMP_DECLARE_REDUCTION_COMBINE; #pragma omp target teams distribute parallel for schedule(static, 1) \ firstprivate(body, begin_it) reduction(combine \ : f_params) diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp index b6d935020f..a65fa97bd8 100644 --- a/include/RAJA/policy/sycl/forall.hpp +++ b/include/RAJA/policy/sycl/forall.hpp @@ -104,6 +104,13 @@ forall_impl(resources::Sycl& sycl_res, using IndexType = camp::decay; using EXEC_POL = camp::decay; + // Deduce at compile time if lbody is trivially constructible and if user + // has supplied parameters. These will be used to determine which sycl launch + // to configure below. + constexpr bool is_parampack_empty = + RAJA::expt::type_traits::is_ForallParamPack_empty::value; + constexpr bool is_lbody_trivially_copyable = + std::is_trivially_copyable::value; // // Compute the requested iteration space size @@ -111,107 +118,102 @@ forall_impl(resources::Sycl& sycl_res, Iterator begin = std::begin(iter); Iterator end = std::end(iter); IndexType len = std::distance(begin, end); - constexpr bool is_parampack_empty = - RAJA::expt::type_traits::is_ForallParamPack_empty::value; - constexpr bool is_lbody_trivially_copyable = - std::is_trivially_copyable::value; - LOOP_BODY* lbody = loop_body; - RAJA_FT_BEGIN; - if constexpr (!is_parampack_empty) + + // Return immediately if there is no work to be done + if (len <= 0 || BlockSize <= 0) { - RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params); + return resources::EventProxy(sycl_res); } - if constexpr (is_lbody_trivially_copyable) + + // + // Compute the number of blocks + // + sycl_dim_t blockSize {BlockSize}; + sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); + + ::sycl::queue* q = sycl_res.get_queue(); + LoopBody* lbody = &loop_body; + Iterator* d_begin = &begin; + + if constexpr (!is_parampack_empty) { - lbody = &loop_body; + RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params); } - else + if constexpr (!is_lbody_trivially_copyable) { // // Setup shared memory buffers // Kernel body is nontrivially copyable, create space on device and copy to // Workaround until "is_device_copyable" is supported // - lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q); - q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait(); + lbody = (LoopBody*)::sycl::malloc_device(sizeof(LoopBody), *q); + q->memcpy(lbody, &loop_body, sizeof(LoopBody)).wait(); - beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q); - q->memcpy(beg, &begin, sizeof(Iterator)).wait(); + d_begin = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q); + q->memcpy(d_begin, &begin, sizeof(Iterator)).wait(); } - // Only launch kernel if we have something to iterate over - if (len > 0 && BlockSize > 0) + + // Both the parallel_for call, combinations, and resolution are all + // unique to the parameter case, so we make a constexpr branch here + if constexpr (!is_parampack_empty) { - // - // Compute the number of blocks - // - sycl_dim_t blockSize {BlockSize}; - sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); + auto combiner = [](ForallParam x, ForallParam y) { + RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y); + return x; + }; + + ForallParam* res = ::sycl::malloc_shared(1, *q); + RAJA::expt::ParamMultiplexer::parampack_init(pol, *res); + auto reduction = ::sycl::reduction(res, f_params, combiner); + + q->submit([&](::sycl::handler& h) { + h.parallel_for(::sycl::range<1>(len), reduction, + [=](::sycl::item<1> it, auto& red) { + ForallParam fp; + RAJA::expt::ParamMultiplexer::parampack_init(pol, fp); + IndexType ii = it.get_id(0); + if (ii < len) + { + RAJA::expt::invoke_body(fp, loop_body, (*d_begin)[ii]); + } + red.combine(fp); + }); + }); - ::sycl::queue* q = sycl_res.get_queue(); - // Both the parallel_for call, combinations, and resolution are all - // unique to the parameter case, so we make a constexpr branch here - if constexpr (!is_parampack_empty) + q->wait(); + RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res); + RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params); + ::sycl::free(res, *q); + } + else + { + q->submit([&](::sycl::handler& h) { + h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize}, + [=](::sycl::nd_item<1> it) { + IndexType ii = it.get_global_id(0); + if (ii < len) + { + loop_body((*d_begin)[ii]); + } + }); + }); + + if (!Async) { - - auto combiner = [](ForallParam x, ForallParam y) { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y); - return x; - }; - - ForallParam* res = ::sycl::malloc_shared(1, *q); - RAJA::expt::ParamMultiplexer::parampack_init(pol, *res); - auto reduction = ::sycl::reduction(res, f_params, combiner); - - q->submit([&](::sycl::handler& h) { - h.parallel_for(::sycl::range<1>(len), reduction, - [=](::sycl::item<1> it, auto& red) { - ForallParam fp; - RAJA::expt::ParamMultiplexer::parampack_init(pol, fp); - IndexType ii = it.get_id(0); - if (ii < len) - { - RAJA::expt::invoke_body(fp, loop_body, begin[ii]); - } - red.combine(fp); - }); - }); - q->wait(); - RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res); - RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params); - } - else - { - q->submit([&](::sycl::handler& h) { - h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize}, - [=](::sycl::nd_item<1> it) { - IndexType ii = it.get_global_id(0); - if (ii < len) - { - loop_body(begin[ii]); - } - }); - }); - - if (!Async) - { - q->wait(); - } } - ::sycl::free(res, *q); - ::sycl::free(beg, *q); + } + ::sycl::free(d_begin, *q); - // If we had to allocate device memory, free it - if constexpr (!is_lbody_trivially_copyable) - { - ::sycl::free(lbody, *q); - ; - } - RAJA_FT_END + // If we had to allocate device memory, free it + if constexpr (!is_lbody_trivially_copyable) + { + ::sycl::free(lbody, *q); } + return resources::EventProxy(sycl_res); } From 1b69f61aecef7e7dea9c320441932a16335b0e73 Mon Sep 17 00:00:00 2001 From: john bowen Date: Thu, 18 Dec 2025 14:25:48 -0800 Subject: [PATCH 3/4] Sycl forall implementation passing unit tests --- include/RAJA/policy/sycl/forall.hpp | 37 ++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp index a65fa97bd8..061c1b47f5 100644 --- a/include/RAJA/policy/sycl/forall.hpp +++ b/include/RAJA/policy/sycl/forall.hpp @@ -103,7 +103,8 @@ forall_impl(resources::Sycl& sycl_res, using Iterator = camp::decay; using IndexType = camp::decay; - using EXEC_POL = camp::decay; + using EXEC_POL = camp::decay; + using LOOP_BODY = camp::decay; // Deduce at compile time if lbody is trivially constructible and if user // has supplied parameters. These will be used to determine which sycl launch // to configure below. @@ -132,8 +133,8 @@ forall_impl(resources::Sycl& sycl_res, sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); ::sycl::queue* q = sycl_res.get_queue(); - LoopBody* lbody = &loop_body; - Iterator* d_begin = &begin; + LOOP_BODY* lbody = nullptr; + Iterator* d_begin = nullptr; if constexpr (!is_parampack_empty) { @@ -146,19 +147,17 @@ forall_impl(resources::Sycl& sycl_res, // Kernel body is nontrivially copyable, create space on device and copy to // Workaround until "is_device_copyable" is supported // - lbody = (LoopBody*)::sycl::malloc_device(sizeof(LoopBody), *q); - q->memcpy(lbody, &loop_body, sizeof(LoopBody)).wait(); + lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LoopBody), *q); + q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait(); d_begin = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q); q->memcpy(d_begin, &begin, sizeof(Iterator)).wait(); } - // Both the parallel_for call, combinations, and resolution are all // unique to the parameter case, so we make a constexpr branch here if constexpr (!is_parampack_empty) { - auto combiner = [](ForallParam x, ForallParam y) { RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y); return x; @@ -176,7 +175,14 @@ forall_impl(resources::Sycl& sycl_res, IndexType ii = it.get_id(0); if (ii < len) { - RAJA::expt::invoke_body(fp, loop_body, (*d_begin)[ii]); + if constexpr (is_lbody_trivially_copyable) + { + RAJA::expt::invoke_body(fp, loop_body, begin[ii]); + } + else + { + RAJA::expt::invoke_body(fp, *lbody, (*d_begin)[ii]); + } } red.combine(fp); }); @@ -184,9 +190,10 @@ forall_impl(resources::Sycl& sycl_res, q->wait(); RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res); - RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params); ::sycl::free(res, *q); + RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params); } + // Note: separate branches else { q->submit([&](::sycl::handler& h) { @@ -195,7 +202,14 @@ forall_impl(resources::Sycl& sycl_res, IndexType ii = it.get_global_id(0); if (ii < len) { - loop_body((*d_begin)[ii]); + if constexpr (is_lbody_trivially_copyable) + { + loop_body(begin[ii]); + } + else + { + (*lbody)((*d_begin)[ii]); + } } }); }); @@ -205,12 +219,13 @@ forall_impl(resources::Sycl& sycl_res, q->wait(); } } - ::sycl::free(d_begin, *q); + // If we had to allocate device memory, free it if constexpr (!is_lbody_trivially_copyable) { ::sycl::free(lbody, *q); + ::sycl::free(d_begin, *q); } From 7fa8730bd53bc9f28ce2edfa3628cec36e7c7411 Mon Sep 17 00:00:00 2001 From: john bowen Date: Mon, 22 Dec 2025 09:49:23 -0800 Subject: [PATCH 4/4] Simplify SYCL launch implementation --- include/RAJA/policy/sycl/launch.hpp | 334 ++++++---------------------- 1 file changed, 73 insertions(+), 261 deletions(-) diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index f69e2c4424..6ad06ded2c 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -31,98 +31,33 @@ namespace RAJA template struct LaunchExecute> { - - // If the launch lambda is trivially copyable - template {}, - bool>::type = true> + template static concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> - exec(RAJA::resources::Resource res, - const LaunchParams& params, - BODY_IN&& body_in, - ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) - { - - /*Get the queue from concrete resource */ - ::sycl::queue* q = res.get().get_queue(); - - // - // Compute the number of blocks and threads - // - - const ::sycl::range<3> blockSize(params.threads.value[2], - params.threads.value[1], - params.threads.value[0]); - - const ::sycl::range<3> gridSize( - params.threads.value[2] * params.teams.value[2], - params.threads.value[1] * params.teams.value[1], - params.threads.value[0] * params.teams.value[0]); - - // Only launch kernel if we have something to iterate over - constexpr int zero = 0; - if (params.threads.value[0] > zero && params.threads.value[1] > zero && - params.threads.value[2] > zero && params.teams.value[0] > zero && - params.teams.value[1] > zero && params.teams.value[2] > zero) - { - - RAJA_FT_BEGIN; - - q->submit([&](::sycl::handler& h) { - auto s_vec = ::sycl::local_accessor(params.shared_mem_size, h); - - h.parallel_for( - ::sycl::nd_range<3>(gridSize, blockSize), - [=](::sycl::nd_item<3> itm) { - LaunchContext ctx; - ctx.itm = &itm; - - // Point to shared memory - ctx.shared_mem_ptr = - s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); - - body_in(ctx); - }); - }); - - if (!async) - { - q->wait(); - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } - - // If the launch lambda is trivially copyable and we have explcit reduction - // parameters - template {}, - bool>::type = true> - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> exec(RAJA::resources::Resource res, const LaunchParams& launch_params, - BODY_IN&& body_in, + LoopBody&& loop_body, ReduceParams launch_reducers) { - using EXEC_POL = RAJA::sycl_launch_t; + using EXEC_POL = RAJA::sycl_launch_t; + using LOOP_BODY = camp::decay; + // Deduce at compile time if lbody is trivially constructible and if user + // has supplied parameters. These will be used to determine which sycl + // launch to configure below. + constexpr bool is_parampack_empty = + RAJA::expt::type_traits::is_ForallParamPack_empty::value; + constexpr bool is_lbody_trivially_copyable = + std::is_trivially_copyable::value; EXEC_POL pol {}; /*Get the queue from concrete resource */ ::sycl::queue* q = res.get().get_queue(); - RAJA::expt::ParamMultiplexer::parampack_init(pol, launch_reducers); + if constexpr (!is_parampack_empty) + { + RAJA::expt::ParamMultiplexer::parampack_init(pol, launch_reducers); + } // // Compute the number of blocks and threads @@ -138,198 +73,39 @@ struct LaunchExecute> // Only launch kernel if we have something to iterate over constexpr int zero = 0; - if (launch_params.threads.value[0] > zero && - launch_params.threads.value[1] > zero && - launch_params.threads.value[2] > zero && - launch_params.teams.value[0] > zero && - launch_params.teams.value[1] > zero && - launch_params.teams.value[2] > zero) + if (launch_params.threads.value[0] <= zero || + launch_params.threads.value[1] <= zero || + launch_params.threads.value[2] <= zero || + launch_params.teams.value[0] <= zero || + launch_params.teams.value[1] <= zero || + launch_params.teams.value[2] <= zero) { - - - auto combiner = [](ReduceParams x, ReduceParams y) { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y); - return x; - }; - - RAJA_FT_BEGIN; - - ReduceParams* res = ::sycl::malloc_shared(1, *q); - RAJA::expt::ParamMultiplexer::parampack_init(pol, *res); - auto reduction = ::sycl::reduction(res, launch_reducers, combiner); - - q->submit([&](::sycl::handler& h) { - auto s_vec = - ::sycl::local_accessor(launch_params.shared_mem_size, h); - - h.parallel_for( - ::sycl::nd_range<3>(gridSize, blockSize), reduction, - [=](::sycl::nd_item<3> itm, auto& red) { - LaunchContext ctx; - ctx.itm = &itm; - - // Point to shared memory - ctx.shared_mem_ptr = - s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); - - ReduceParams fp; - RAJA::expt::ParamMultiplexer::parampack_init(pol, fp); - - RAJA::expt::invoke_body(fp, body_in, ctx); - - red.combine(fp); - }); - }).wait(); // Need to wait for completion to free memory - - RAJA::expt::ParamMultiplexer::parampack_combine(pol, launch_reducers, - *res); - ::sycl::free(res, *q); - - RAJA_FT_END; + return resources::EventProxy(res); } - RAJA::expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers); - return resources::EventProxy(res); - } - - // If the launch lambda is not trivially copyable - template {}, - bool>::type = true> - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> - exec(RAJA::resources::Resource res, - const LaunchParams& params, - BODY_IN&& body_in, - ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) - { - - /*Get the queue from concrete resource */ - ::sycl::queue* q = res.get().get_queue(); + RAJA_FT_BEGIN; + using LOOP_BODY = camp::decay; + LOOP_BODY* lbody = nullptr; // - // Compute the number of blocks and threads + // Kernel body is nontrivially copyable, create space on device and copy + // to Workaround until "is_device_copyable" is supported // - - const ::sycl::range<3> blockSize(params.threads.value[2], - params.threads.value[1], - params.threads.value[0]); - - const ::sycl::range<3> gridSize( - params.threads.value[2] * params.teams.value[2], - params.threads.value[1] * params.teams.value[1], - params.threads.value[0] * params.teams.value[0]); - - // Only launch kernel if we have something to iterate over - constexpr int zero = 0; - if (params.threads.value[0] > zero && params.threads.value[1] > zero && - params.threads.value[2] > zero && params.teams.value[0] > zero && - params.teams.value[1] > zero && params.teams.value[2] > zero) + if constexpr (!is_lbody_trivially_copyable) { - - RAJA_FT_BEGIN; - - // - // Kernel body is nontrivially copyable, create space on device and copy - // to Workaround until "is_device_copyable" is supported - // - using LOOP_BODY = camp::decay; - LOOP_BODY* lbody; lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q); - q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait(); - - q->submit([&](::sycl::handler& h) { - auto s_vec = - ::sycl::local_accessor(params.shared_mem_size, h); - - h.parallel_for( - ::sycl::nd_range<3>(gridSize, blockSize), - [=](::sycl::nd_item<3> itm) { - LaunchContext ctx; - ctx.itm = &itm; - - // Point to shared memory - ctx.shared_mem_ptr = - s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); - - (*lbody)(ctx); - }); - }).wait(); // Need to wait for completion to free memory - - ::sycl::free(lbody, *q); - - RAJA_FT_END; + q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait(); } - - return resources::EventProxy(res); - } - - // If the launch lambda is not trivially copyable - template {}, - bool>::type = true> - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> - exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, - BODY_IN&& body_in, - ReduceParams launch_reducers) - { - using EXEC_POL = RAJA::sycl_launch_t; - EXEC_POL pol {}; - - /*Get the queue from concrete resource */ - ::sycl::queue* q = res.get().get_queue(); - - RAJA::expt::ParamMultiplexer::parampack_init(pol, launch_reducers); - - // - // Compute the number of blocks and threads - // - const ::sycl::range<3> blockSize(launch_params.threads.value[2], - launch_params.threads.value[1], - launch_params.threads.value[0]); - - const ::sycl::range<3> gridSize( - launch_params.threads.value[2] * launch_params.teams.value[2], - launch_params.threads.value[1] * launch_params.teams.value[1], - launch_params.threads.value[0] * launch_params.teams.value[0]); - - // Only launch kernel if we have something to iterate over - constexpr int zero = 0; - if (launch_params.threads.value[0] > zero && - launch_params.threads.value[1] > zero && - launch_params.threads.value[2] > zero && - launch_params.teams.value[0] > zero && - launch_params.teams.value[1] > zero && - launch_params.teams.value[2] > zero) + // Both the parallel_for call, combinations, and resolution are all + // unique to the parameter case, so we make a constexpr branch here + if constexpr (!is_parampack_empty) { - - auto combiner = [](ReduceParams x, ReduceParams y) { RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y); return x; }; - RAJA_FT_BEGIN; - - // - // Kernel body is nontrivially copyable, create space on device and copy - // to Workaround until "is_device_copyable" is supported - // - using LOOP_BODY = camp::decay; - LOOP_BODY* lbody; - lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q); - q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait(); - ReduceParams* res = ::sycl::malloc_shared(1, *q); RAJA::expt::ParamMultiplexer::parampack_init(pol, *res); auto reduction = ::sycl::reduction(res, launch_reducers, combiner); @@ -350,8 +126,14 @@ struct LaunchExecute> ReduceParams fp; RAJA::expt::ParamMultiplexer::parampack_init(pol, fp); - - RAJA::expt::invoke_body(fp, *lbody, ctx); + if constexpr (is_lbody_trivially_copyable) + { + RAJA::expt::invoke_body(fp, loop_body, ctx); + } + else + { + RAJA::expt::invoke_body(fp, *lbody, ctx); + } red.combine(fp); }); @@ -361,11 +143,41 @@ struct LaunchExecute> *res); ::sycl::free(res, *q); ::sycl::free(lbody, *q); + RAJA::expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers); + } + else + { + q->submit([&](::sycl::handler& h) { + auto s_vec = + ::sycl::local_accessor(launch_params.shared_mem_size, h); + + h.parallel_for( + ::sycl::nd_range<3>(gridSize, blockSize), + [=](::sycl::nd_item<3> itm) { + LaunchContext ctx; + ctx.itm = &itm; - RAJA_FT_END; + // Point to shared memory + ctx.shared_mem_ptr = + s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + if constexpr (is_lbody_trivially_copyable) + { + loop_body(ctx); + } + else + { + (*lbody)(ctx); + } + }); + }); + + if (!async) + { + q->wait(); + } } + RAJA_FT_END; - RAJA::expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers); return resources::EventProxy(res); }