diff --git a/include/RAJA/policy/openmp_target/forall.hpp b/include/RAJA/policy/openmp_target/forall.hpp index 3e2b256c51..968a36a4b9 100644 --- a/include/RAJA/policy/openmp_target/forall.hpp +++ b/include/RAJA/policy/openmp_target/forall.hpp @@ -39,9 +39,7 @@ template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> forall_impl(resources::Omp omp_res, const omp_target_parallel_for_exec& p, Iterable&& iter, @@ -49,65 +47,13 @@ forall_impl(resources::Omp omp_res, ForallParam f_params) { using EXEC_POL = camp::decay; - - RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); - RAJA_OMP_DECLARE_REDUCTION_COMBINE; - - using Body = typename std::remove_reference::type; - Body body = loop_body; - - RAJA_EXTRACT_BED_IT(iter); - - // Reset if exceed CUDA threads per block limit. - int tperteam = ThreadsPerTeam; - if (tperteam > omp::MAXNUMTHREADS) - { - tperteam = omp::MAXNUMTHREADS; - } - - // calculate number of teams based on user defined threads per team - // datasize is distance between begin() and end() of iterable - auto numteams = RAJA_DIVIDE_CEILING_INT(distance_it, tperteam); - if (numteams > tperteam) - { - // Omp target reducers will write team # results, into Threads-sized array. - // Need to insure NumTeams <= Threads to prevent array out of bounds access. - numteams = tperteam; - } - - // thread_limit(tperteam) unused due to XL seg fault (when tperteam != - // distance) - auto i = distance_it; - -#pragma omp target teams distribute parallel for num_teams(numteams) \ - schedule(static, 1) map(to \ - : body, begin_it) reduction(combine \ - : f_params) - for (i = 0; i < distance_it; ++i) + constexpr bool is_forall_param_empty = + RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if constexpr (!is_forall_param_empty) { - Body ib = body; - RAJA::expt::invoke_body(f_params, ib, begin_it[i]); + RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); } - RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); - - return resources::EventProxy(omp_res); -} - -template -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> -forall_impl(resources::Omp omp_res, - const omp_target_parallel_for_exec&, - Iterable&& iter, - Func&& loop_body, - ForallParam) -{ using Body = typename std::remove_reference::type; Body body = loop_body; @@ -133,14 +79,31 @@ forall_impl(resources::Omp omp_res, // thread_limit(tperteam) unused due to XL seg fault (when tperteam != // distance) auto i = distance_it; - + if constexpr (is_forall_param_empty) + { #pragma omp target teams distribute parallel for num_teams(numteams) \ schedule(static, 1) map(to \ : body, begin_it) - for (i = 0; i < distance_it; ++i) + for (i = 0; i < distance_it; ++i) + { + Body ib = body; + ib(begin_it[i]); + } + } + else { - Body ib = body; - ib(begin_it[i]); + RAJA_OMP_DECLARE_REDUCTION_COMBINE +#pragma omp target teams distribute parallel for num_teams(numteams) \ + schedule(static, 1) map(to \ + : body, begin_it) reduction(combine \ + : f_params) + for (i = 0; i < distance_it; ++i) + { + Body ib = body; + RAJA::expt::invoke_body(f_params, ib, begin_it[i]); + } + + RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); } return resources::EventProxy(omp_res); @@ -149,9 +112,7 @@ forall_impl(resources::Omp omp_res, template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> forall_impl(resources::Omp omp_res, const omp_target_parallel_for_exec_nt& p, Iterable&& iter, @@ -159,53 +120,42 @@ forall_impl(resources::Omp omp_res, ForallParam f_params) { using EXEC_POL = camp::decay; - - RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); - RAJA_OMP_DECLARE_REDUCTION_COMBINE; + constexpr bool is_forall_param_empty = + RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if constexpr (!is_forall_param_empty) + { + RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); + } using Body = typename std::remove_reference::type; Body body = loop_body; RAJA_EXTRACT_BED_IT(iter); + if constexpr (!is_forall_param_empty) + { + RAJA_OMP_DECLARE_REDUCTION_COMBINE; #pragma omp target teams distribute parallel for schedule(static, 1) \ firstprivate(body, begin_it) reduction(combine \ : f_params) - for (decltype(distance_it) i = 0; i < distance_it; ++i) - { - Body ib = body; - RAJA::expt::invoke_body(f_params, ib, begin_it[i]); - } - - RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); - - return resources::EventProxy(omp_res); -} - -template -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> -forall_impl(resources::Omp omp_res, - const omp_target_parallel_for_exec_nt&, - Iterable&& iter, - Func&& loop_body, - ForallParam) -{ - using Body = typename std::remove_reference::type; - Body body = loop_body; - - RAJA_EXTRACT_BED_IT(iter); + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { + Body ib = body; + RAJA::expt::invoke_body(f_params, ib, begin_it[i]); + } + RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); + } + else + { #pragma omp target teams distribute parallel for schedule(static, 1) \ firstprivate(body, begin_it) - for (decltype(distance_it) i = 0; i < distance_it; ++i) - { - Body ib = body; - ib(begin_it[i]); + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { + Body ib = body; + ib(begin_it[i]); + } } - return resources::EventProxy(omp_res); } diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index ee98804ecf..2522ea8b98 100644 --- a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -39,37 +39,10 @@ struct LaunchExecute template<> struct LaunchExecute { - template static concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> - exec(RAJA::resources::Resource res, - LaunchParams const& params, - BODY const& body, - ReduceParams& RAJA_UNUSED_ARG(ReduceParams)) - { - - LaunchContext ctx; - - char* kernel_local_mem = new char[params.shared_mem_size]; - ctx.shared_mem_ptr = kernel_local_mem; - - body(ctx); - - delete[] kernel_local_mem; - ctx.shared_mem_ptr = nullptr; - - return resources::EventProxy(res); - } - - template - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> exec(RAJA::resources::Resource res, LaunchParams const& launch_params, BODY const& body, @@ -77,19 +50,32 @@ struct LaunchExecute { using EXEC_POL = RAJA::seq_exec; EXEC_POL pol {}; - - expt::ParamMultiplexer::parampack_init(pol, launch_reducers); + constexpr bool is_parampack_empty = + RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if constexpr (!is_parampack_empty) + { + expt::ParamMultiplexer::parampack_init(pol, launch_reducers); + } LaunchContext ctx; char* kernel_local_mem = new char[launch_params.shared_mem_size]; ctx.shared_mem_ptr = kernel_local_mem; - expt::invoke_body(launch_reducers, body, ctx); + if constexpr (!is_parampack_empty) + { + expt::invoke_body(launch_reducers, body, ctx); + } + else + { + body(ctx); + } delete[] kernel_local_mem; ctx.shared_mem_ptr = nullptr; - - expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers); + if constexpr (!is_parampack_empty) + { + expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers); + } return resources::EventProxy(res); } diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp index fd719d68c9..061c1b47f5 100644 --- a/include/RAJA/policy/sycl/forall.hpp +++ b/include/RAJA/policy/sycl/forall.hpp @@ -84,27 +84,34 @@ ::sycl::range<1> getGridDim(size_t len, size_t block_size) //////////////////////////////////////////////////////////////////////// // + template {}, - bool>::type = true> + typename ForallParam> RAJA_INLINE concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> + RAJA::expt::type_traits::is_ForallParamPack> forall_impl(resources::Sycl& sycl_res, - sycl_exec, + sycl_exec const& pol, Iterable&& iter, LoopBody&& loop_body, - ForallParam) -{ + ForallParam f_params) +{ using Iterator = camp::decay; using IndexType = camp::decay; + using EXEC_POL = camp::decay; + using LOOP_BODY = camp::decay; + // Deduce at compile time if lbody is trivially constructible and if user + // has supplied parameters. These will be used to determine which sycl launch + // to configure below. + constexpr bool is_parampack_empty = + RAJA::expt::type_traits::is_ForallParamPack_empty::value; + constexpr bool is_lbody_trivially_copyable = + std::is_trivially_copyable::value; // // Compute the requested iteration space size @@ -113,173 +120,44 @@ forall_impl(resources::Sycl& sycl_res, Iterator end = std::end(iter); IndexType len = std::distance(begin, end); - // Only launch kernel if we have something to iterate over - if (len > 0 && BlockSize > 0) + // Return immediately if there is no work to be done + if (len <= 0 || BlockSize <= 0) { - // Note: We could fix an incorrect workgroup size. - // It would change what was specified. - // For now, leave the device compiler to error with invalid WG size. - - // - // Compute the number of blocks - // - sycl_dim_t blockSize {BlockSize}; - sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - - ::sycl::queue* q = sycl_res.get_queue(); - - q->submit([&](::sycl::handler& h) { - h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize}, - [=](::sycl::nd_item<1> it) { - IndexType ii = it.get_global_id(0); - if (ii < len) - { - loop_body(begin[ii]); - } - }); - }); - - if (!Async) - { - q->wait(); - } + return resources::EventProxy(sycl_res); } - return resources::EventProxy(sycl_res); -} - -// -// Define if parampack is empty, avoids ambigous definitions. -// - -template {}, - bool>::type = true> -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> -forall_impl(resources::Sycl& sycl_res, - sycl_exec, - Iterable&& iter, - LoopBody&& loop_body, - ForallParam) -{ - using Iterator = camp::decay; - using LOOP_BODY = camp::decay; - using IndexType = - camp::decay; - // - // Compute the requested iteration space size + // Compute the number of blocks // - Iterator begin = std::begin(iter); - Iterator end = std::end(iter); - IndexType len = std::distance(begin, end); + sycl_dim_t blockSize {BlockSize}; + sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - // Only launch kernel if we have something to iterate over - if (len > 0 && BlockSize > 0) - { + ::sycl::queue* q = sycl_res.get_queue(); + LOOP_BODY* lbody = nullptr; + Iterator* d_begin = nullptr; - // Note: We could fix an incorrect workgroup size. - // It would change what was specified. - // For now, leave the device compiler to error with invalid WG size. - - // - // Compute the number of blocks - // - sycl_dim_t blockSize {BlockSize}; - sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - - ::sycl::queue* q = sycl_res.get_queue(); - - LOOP_BODY* lbody; - Iterator* beg; - - RAJA_FT_BEGIN; + if constexpr (!is_parampack_empty) + { + RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params); + } + if constexpr (!is_lbody_trivially_copyable) + { // // Setup shared memory buffers // Kernel body is nontrivially copyable, create space on device and copy to // Workaround until "is_device_copyable" is supported // - lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q); + lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LoopBody), *q); q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait(); - beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q); - q->memcpy(beg, &begin, sizeof(Iterator)).wait(); - - q->submit([&](::sycl::handler& h) { - h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize}, - [=](::sycl::nd_item<1> it) { - Index_type ii = it.get_global_id(0); - - if (ii < len) - { - (*lbody)((*beg)[ii]); - } - }); - }).wait(); // Need to wait for completion to free memory - - // Free our device memory - ::sycl::free(lbody, *q); - ::sycl::free(beg, *q); - - RAJA_FT_END; + d_begin = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q); + q->memcpy(d_begin, &begin, sizeof(Iterator)).wait(); } - return resources::EventProxy(sycl_res); -} - -template {}, - bool>::type = true> -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> -forall_impl(resources::Sycl& sycl_res, - sycl_exec const& pol, - Iterable&& iter, - LoopBody&& loop_body, - ForallParam f_params) - -{ - using Iterator = camp::decay; - using IndexType = - camp::decay; - using EXEC_POL = camp::decay; - - // - // Compute the requested iteration space size - // - Iterator begin = std::begin(iter); - Iterator end = std::end(iter); - IndexType len = std::distance(begin, end); - - RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params); - - // Only launch kernel if we have something to iterate over - if (len > 0 && BlockSize > 0) + // Both the parallel_for call, combinations, and resolution are all + // unique to the parameter case, so we make a constexpr branch here + if constexpr (!is_parampack_empty) { - - // - // Compute the number of blocks - // - sycl_dim_t blockSize {BlockSize}; - sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - RAJA_UNUSED_VAR(gridSize); - - ::sycl::queue* q = sycl_res.get_queue(); - auto combiner = [](ForallParam x, ForallParam y) { RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y); return x; @@ -297,7 +175,14 @@ forall_impl(resources::Sycl& sycl_res, IndexType ii = it.get_id(0); if (ii < len) { - RAJA::expt::invoke_body(fp, loop_body, begin[ii]); + if constexpr (is_lbody_trivially_copyable) + { + RAJA::expt::invoke_body(fp, loop_body, begin[ii]); + } + else + { + RAJA::expt::invoke_body(fp, *lbody, (*d_begin)[ii]); + } } red.combine(fp); }); @@ -306,105 +191,43 @@ forall_impl(resources::Sycl& sycl_res, q->wait(); RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res); ::sycl::free(res, *q); + RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params); } - RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params); - - return resources::EventProxy(sycl_res); -} - -template {}, - bool>::type = true> -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> -forall_impl(resources::Sycl& sycl_res, - sycl_exec const& pol, - Iterable&& iter, - LoopBody&& loop_body, - ForallParam f_params) - -{ - using Iterator = camp::decay; - using LOOP_BODY = camp::decay; - using IndexType = - camp::decay; - using EXEC_POL = camp::decay; - - // - // Compute the requested iteration space size - // - Iterator begin = std::begin(iter); - Iterator end = std::end(iter); - IndexType len = std::distance(begin, end); - - RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params); - - // Only launch kernel if we have something to iterate over - if (len > 0 && BlockSize > 0) + // Note: separate branches + else { - // - // Compute the number of blocks - // - sycl_dim_t blockSize {BlockSize}; - sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - RAJA_UNUSED_VAR(gridSize); - - ::sycl::queue* q = sycl_res.get_queue(); - - auto combiner = [](ForallParam x, ForallParam y) { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y); - return x; - }; - - // START - // - LOOP_BODY* lbody; - Iterator* beg; - RAJA_FT_BEGIN; - // - // Setup shared memory buffers - // Kernel body is nontrivially copyable, create space on device and copy to - // Workaround until "is_device_copyable" is supported - // - lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q); - q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait(); + q->submit([&](::sycl::handler& h) { + h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize}, + [=](::sycl::nd_item<1> it) { + IndexType ii = it.get_global_id(0); + if (ii < len) + { + if constexpr (is_lbody_trivially_copyable) + { + loop_body(begin[ii]); + } + else + { + (*lbody)((*d_begin)[ii]); + } + } + }); + }); - beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q); - q->memcpy(beg, &begin, sizeof(Iterator)).wait(); + if (!Async) + { + q->wait(); + } + } - ForallParam* res = ::sycl::malloc_shared(1, *q); - RAJA::expt::ParamMultiplexer::parampack_init(pol, *res); - auto reduction = ::sycl::reduction(res, f_params, combiner); - q->submit([&](::sycl::handler& h) { - h.parallel_for(::sycl::range<1>(len), reduction, - [=](::sycl::item<1> it, auto& red) { - Index_type ii = it.get_id(0); - ForallParam fp; - RAJA::expt::ParamMultiplexer::parampack_init(pol, fp); - if (ii < len) - { - RAJA::expt::invoke_body(fp, *lbody, (*beg)[ii]); - } - red.combine(fp); - }); - }).wait(); // Need to wait for completion to free memory - RAJA::expt::ParamMultiplexer::parampack_combine(pol, f_params, *res); - // Free our device memory - ::sycl::free(res, *q); + // If we had to allocate device memory, free it + if constexpr (!is_lbody_trivially_copyable) + { ::sycl::free(lbody, *q); - ::sycl::free(beg, *q); - - RAJA_FT_END; + ::sycl::free(d_begin, *q); } - RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params); + return resources::EventProxy(sycl_res); } diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index f69e2c4424..6ad06ded2c 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -31,98 +31,33 @@ namespace RAJA template struct LaunchExecute> { - - // If the launch lambda is trivially copyable - template {}, - bool>::type = true> + template static concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> - exec(RAJA::resources::Resource res, - const LaunchParams& params, - BODY_IN&& body_in, - ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) - { - - /*Get the queue from concrete resource */ - ::sycl::queue* q = res.get().get_queue(); - - // - // Compute the number of blocks and threads - // - - const ::sycl::range<3> blockSize(params.threads.value[2], - params.threads.value[1], - params.threads.value[0]); - - const ::sycl::range<3> gridSize( - params.threads.value[2] * params.teams.value[2], - params.threads.value[1] * params.teams.value[1], - params.threads.value[0] * params.teams.value[0]); - - // Only launch kernel if we have something to iterate over - constexpr int zero = 0; - if (params.threads.value[0] > zero && params.threads.value[1] > zero && - params.threads.value[2] > zero && params.teams.value[0] > zero && - params.teams.value[1] > zero && params.teams.value[2] > zero) - { - - RAJA_FT_BEGIN; - - q->submit([&](::sycl::handler& h) { - auto s_vec = ::sycl::local_accessor(params.shared_mem_size, h); - - h.parallel_for( - ::sycl::nd_range<3>(gridSize, blockSize), - [=](::sycl::nd_item<3> itm) { - LaunchContext ctx; - ctx.itm = &itm; - - // Point to shared memory - ctx.shared_mem_ptr = - s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); - - body_in(ctx); - }); - }); - - if (!async) - { - q->wait(); - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } - - // If the launch lambda is trivially copyable and we have explcit reduction - // parameters - template {}, - bool>::type = true> - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> exec(RAJA::resources::Resource res, const LaunchParams& launch_params, - BODY_IN&& body_in, + LoopBody&& loop_body, ReduceParams launch_reducers) { - using EXEC_POL = RAJA::sycl_launch_t; + using EXEC_POL = RAJA::sycl_launch_t; + using LOOP_BODY = camp::decay; + // Deduce at compile time if lbody is trivially constructible and if user + // has supplied parameters. These will be used to determine which sycl + // launch to configure below. + constexpr bool is_parampack_empty = + RAJA::expt::type_traits::is_ForallParamPack_empty::value; + constexpr bool is_lbody_trivially_copyable = + std::is_trivially_copyable::value; EXEC_POL pol {}; /*Get the queue from concrete resource */ ::sycl::queue* q = res.get().get_queue(); - RAJA::expt::ParamMultiplexer::parampack_init(pol, launch_reducers); + if constexpr (!is_parampack_empty) + { + RAJA::expt::ParamMultiplexer::parampack_init(pol, launch_reducers); + } // // Compute the number of blocks and threads @@ -138,198 +73,39 @@ struct LaunchExecute> // Only launch kernel if we have something to iterate over constexpr int zero = 0; - if (launch_params.threads.value[0] > zero && - launch_params.threads.value[1] > zero && - launch_params.threads.value[2] > zero && - launch_params.teams.value[0] > zero && - launch_params.teams.value[1] > zero && - launch_params.teams.value[2] > zero) + if (launch_params.threads.value[0] <= zero || + launch_params.threads.value[1] <= zero || + launch_params.threads.value[2] <= zero || + launch_params.teams.value[0] <= zero || + launch_params.teams.value[1] <= zero || + launch_params.teams.value[2] <= zero) { - - - auto combiner = [](ReduceParams x, ReduceParams y) { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y); - return x; - }; - - RAJA_FT_BEGIN; - - ReduceParams* res = ::sycl::malloc_shared(1, *q); - RAJA::expt::ParamMultiplexer::parampack_init(pol, *res); - auto reduction = ::sycl::reduction(res, launch_reducers, combiner); - - q->submit([&](::sycl::handler& h) { - auto s_vec = - ::sycl::local_accessor(launch_params.shared_mem_size, h); - - h.parallel_for( - ::sycl::nd_range<3>(gridSize, blockSize), reduction, - [=](::sycl::nd_item<3> itm, auto& red) { - LaunchContext ctx; - ctx.itm = &itm; - - // Point to shared memory - ctx.shared_mem_ptr = - s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); - - ReduceParams fp; - RAJA::expt::ParamMultiplexer::parampack_init(pol, fp); - - RAJA::expt::invoke_body(fp, body_in, ctx); - - red.combine(fp); - }); - }).wait(); // Need to wait for completion to free memory - - RAJA::expt::ParamMultiplexer::parampack_combine(pol, launch_reducers, - *res); - ::sycl::free(res, *q); - - RAJA_FT_END; + return resources::EventProxy(res); } - RAJA::expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers); - return resources::EventProxy(res); - } - - // If the launch lambda is not trivially copyable - template {}, - bool>::type = true> - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> - exec(RAJA::resources::Resource res, - const LaunchParams& params, - BODY_IN&& body_in, - ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) - { - - /*Get the queue from concrete resource */ - ::sycl::queue* q = res.get().get_queue(); + RAJA_FT_BEGIN; + using LOOP_BODY = camp::decay; + LOOP_BODY* lbody = nullptr; // - // Compute the number of blocks and threads + // Kernel body is nontrivially copyable, create space on device and copy + // to Workaround until "is_device_copyable" is supported // - - const ::sycl::range<3> blockSize(params.threads.value[2], - params.threads.value[1], - params.threads.value[0]); - - const ::sycl::range<3> gridSize( - params.threads.value[2] * params.teams.value[2], - params.threads.value[1] * params.teams.value[1], - params.threads.value[0] * params.teams.value[0]); - - // Only launch kernel if we have something to iterate over - constexpr int zero = 0; - if (params.threads.value[0] > zero && params.threads.value[1] > zero && - params.threads.value[2] > zero && params.teams.value[0] > zero && - params.teams.value[1] > zero && params.teams.value[2] > zero) + if constexpr (!is_lbody_trivially_copyable) { - - RAJA_FT_BEGIN; - - // - // Kernel body is nontrivially copyable, create space on device and copy - // to Workaround until "is_device_copyable" is supported - // - using LOOP_BODY = camp::decay; - LOOP_BODY* lbody; lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q); - q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait(); - - q->submit([&](::sycl::handler& h) { - auto s_vec = - ::sycl::local_accessor(params.shared_mem_size, h); - - h.parallel_for( - ::sycl::nd_range<3>(gridSize, blockSize), - [=](::sycl::nd_item<3> itm) { - LaunchContext ctx; - ctx.itm = &itm; - - // Point to shared memory - ctx.shared_mem_ptr = - s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); - - (*lbody)(ctx); - }); - }).wait(); // Need to wait for completion to free memory - - ::sycl::free(lbody, *q); - - RAJA_FT_END; + q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait(); } - - return resources::EventProxy(res); - } - - // If the launch lambda is not trivially copyable - template {}, - bool>::type = true> - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> - exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, - BODY_IN&& body_in, - ReduceParams launch_reducers) - { - using EXEC_POL = RAJA::sycl_launch_t; - EXEC_POL pol {}; - - /*Get the queue from concrete resource */ - ::sycl::queue* q = res.get().get_queue(); - - RAJA::expt::ParamMultiplexer::parampack_init(pol, launch_reducers); - - // - // Compute the number of blocks and threads - // - const ::sycl::range<3> blockSize(launch_params.threads.value[2], - launch_params.threads.value[1], - launch_params.threads.value[0]); - - const ::sycl::range<3> gridSize( - launch_params.threads.value[2] * launch_params.teams.value[2], - launch_params.threads.value[1] * launch_params.teams.value[1], - launch_params.threads.value[0] * launch_params.teams.value[0]); - - // Only launch kernel if we have something to iterate over - constexpr int zero = 0; - if (launch_params.threads.value[0] > zero && - launch_params.threads.value[1] > zero && - launch_params.threads.value[2] > zero && - launch_params.teams.value[0] > zero && - launch_params.teams.value[1] > zero && - launch_params.teams.value[2] > zero) + // Both the parallel_for call, combinations, and resolution are all + // unique to the parameter case, so we make a constexpr branch here + if constexpr (!is_parampack_empty) { - - auto combiner = [](ReduceParams x, ReduceParams y) { RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, x, y); return x; }; - RAJA_FT_BEGIN; - - // - // Kernel body is nontrivially copyable, create space on device and copy - // to Workaround until "is_device_copyable" is supported - // - using LOOP_BODY = camp::decay; - LOOP_BODY* lbody; - lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q); - q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait(); - ReduceParams* res = ::sycl::malloc_shared(1, *q); RAJA::expt::ParamMultiplexer::parampack_init(pol, *res); auto reduction = ::sycl::reduction(res, launch_reducers, combiner); @@ -350,8 +126,14 @@ struct LaunchExecute> ReduceParams fp; RAJA::expt::ParamMultiplexer::parampack_init(pol, fp); - - RAJA::expt::invoke_body(fp, *lbody, ctx); + if constexpr (is_lbody_trivially_copyable) + { + RAJA::expt::invoke_body(fp, loop_body, ctx); + } + else + { + RAJA::expt::invoke_body(fp, *lbody, ctx); + } red.combine(fp); }); @@ -361,11 +143,41 @@ struct LaunchExecute> *res); ::sycl::free(res, *q); ::sycl::free(lbody, *q); + RAJA::expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers); + } + else + { + q->submit([&](::sycl::handler& h) { + auto s_vec = + ::sycl::local_accessor(launch_params.shared_mem_size, h); + + h.parallel_for( + ::sycl::nd_range<3>(gridSize, blockSize), + [=](::sycl::nd_item<3> itm) { + LaunchContext ctx; + ctx.itm = &itm; - RAJA_FT_END; + // Point to shared memory + ctx.shared_mem_ptr = + s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + if constexpr (is_lbody_trivially_copyable) + { + loop_body(ctx); + } + else + { + (*lbody)(ctx); + } + }); + }); + + if (!async) + { + q->wait(); + } } + RAJA_FT_END; - RAJA::expt::ParamMultiplexer::parampack_resolve(pol, launch_reducers); return resources::EventProxy(res); }