diff --git a/include/RAJA/pattern/launch/launch_context_policy.hpp b/include/RAJA/pattern/launch/launch_context_policy.hpp new file mode 100644 index 0000000000..acf707615d --- /dev/null +++ b/include/RAJA/pattern/launch/launch_context_policy.hpp @@ -0,0 +1,124 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file containing template types of RAJA::LaunchContextT + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_pattern_context_policy_HPP +#define RAJA_pattern_context_policy_HPP + +namespace RAJA +{ + +template +class LaunchContextT; + +class LaunchContextDefaultPolicy; + +#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) +class LaunchContextDim3Policy; +#endif + +namespace detail +{ + + +template +struct has_single_call_operator : std::false_type +{}; + +template +struct has_single_call_operator< + T, + std::enable_if_t< + !std::is_same_v::operator()), void>>> + : std::true_type +{}; + +template +struct function_traits +{}; + +template +struct function_traits +{ + using result_type = R; + static constexpr std::size_t arity = sizeof...(Args); + + template + struct arg + { + static_assert(N < arity, "argument index out of range"); + using type = typename std::tuple_element>::type; + }; +}; + +template +struct function_traits : function_traits +{}; + +template +struct function_traits : function_traits +{}; + +template +struct function_traits : function_traits +{ + using functional_type = C; +}; + +template +struct function_traits : function_traits +{ + using functional_type = C; +}; + +template>::value> +struct functional_traits : function_traits> +{}; + +template +struct functional_traits + : function_traits::operator())> +{}; + +template +struct has_arg0 : std::false_type +{}; + +template +struct has_arg0::template arg<0>::type, + void>>> : std::true_type +{}; + +template::value> +struct launch_context_type +{ + using type = LaunchContextT; +}; + +template +struct launch_context_type +{ + using type = typename functional_traits::template arg<0>::type; +}; + + +} // namespace detail + +} // namespace RAJA +#endif diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 67d9b74ce2..2739cca8d4 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -20,6 +20,7 @@ #include "RAJA/config.hpp" #include "RAJA/internal/get_platform.hpp" +#include "RAJA/pattern/launch/launch_context_policy.hpp" #include "RAJA/util/StaticLayout.hpp" #include "RAJA/util/macros.hpp" #include "RAJA/util/plugins.hpp" @@ -176,13 +177,12 @@ struct LaunchParams Threads apply(Threads const& a) { return (threads = a); } }; -class LaunchContext +class LaunchContextBase { public: // Bump style allocator used to // get memory from the pool size_t shared_mem_offset; - void* shared_mem_ptr; #if defined(RAJA_SYCL_ACTIVE) @@ -190,7 +190,7 @@ class LaunchContext mutable ::sycl::nd_item<3>* itm; #endif - RAJA_HOST_DEVICE LaunchContext() + RAJA_HOST_DEVICE LaunchContextBase() : shared_mem_offset(0), shared_mem_ptr(nullptr) {} @@ -209,20 +209,6 @@ class LaunchContext return static_cast(mem_ptr); } - /* - //Odd dependecy with atomics is breaking CI builds - template RAJA_HOST_DEVICE auto - getSharedMemoryView(size_t bytes, arg idx, args... idxs) - { - T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; - - shared_mem_offset += bytes*sizeof(T); - return RAJA::View>(mem_ptr, idx, - idxs...); - } - */ - RAJA_HOST_DEVICE void releaseSharedMemory() { // On the cpu/gpu we want to restart the count @@ -243,6 +229,39 @@ class LaunchContext } }; +template<> +class LaunchContextT : public LaunchContextBase +{ +public: + static constexpr bool hasDim3 = false; + + using LaunchContextBase::LaunchContextBase; +}; + +// Preserve backwards compatibility +using LaunchContext = LaunchContextT; + +#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) +template<> +class LaunchContextT : public LaunchContextBase +{ +public: + static constexpr bool hasDim3 = true; + + dim3 thread_id; + dim3 block_dim; + + LaunchContextT() : LaunchContextBase(), thread_id(), block_dim() {} + + RAJA_DEVICE + LaunchContextT(dim3 thread, dim3 block) + : LaunchContextBase(), + thread_id(thread), + block_dim(block) + {} +}; +#endif + template struct LaunchExecute; diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index d9cca09216..3368928d1b 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -33,17 +33,28 @@ __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx; - using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); auto& body = privatizer.get_priv(); // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + if constexpr (LaunchContextType::hasDim3) + { + LaunchContextType ctx(threadIdx, blockDim); + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } + else + { + LaunchContextType ctx; + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( @@ -143,7 +154,6 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ body_in, ReduceParams reduce_params) { - LaunchContext ctx; using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -151,9 +161,22 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + if constexpr (LaunchContextType::hasDim3) + { + LaunchContextType ctx(threadIdx, blockDim); + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } + else + { + LaunchContextType ctx; + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( @@ -245,6 +268,65 @@ struct LaunchExecute< } }; +/* + Loop methods which rely on a copy of threaIdx/BlockDim + for performance. In collaboration with AMD we have have this + to be more performat. +*/ + +namespace expt +{ + +template +struct cuda_ctx_thread_loop; + +using cuda_ctx_thread_loop_x = cuda_ctx_thread_loop; +using cuda_ctx_thread_loop_y = cuda_ctx_thread_loop; +using cuda_ctx_thread_loop_z = cuda_ctx_thread_loop; + +template +RAJA_INLINE RAJA_DEVICE int get_dim(Dim3Like const& d) +{ + if constexpr (DIM == named_dim::x) + { + return d.x; + } + else if constexpr (DIM == named_dim::y) + { + return d.y; + } + else + { + static_assert(DIM == named_dim::z, "Unsupported named_dim"); + return d.z; + } +} + +} // namespace expt + +template +struct LoopExecute, SEGMENT> +{ + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContextT const& ctx, + SEGMENT const& segment, + BODY const& body) + { + const int len = segment.end() - segment.begin(); + constexpr int int_dim = static_cast(DIM); + + const int thread_idx = expt::get_dim(ctx.thread_id); + const int stride = expt::get_dim(ctx.block_dim); + + for (int i = thread_idx; i < len; i += stride) + { + body(*(segment.begin() + i)); + } + } +}; + /* CUDA generic loop implementations */ @@ -341,9 +423,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -369,9 +451,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -405,9 +487,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -441,9 +523,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -471,9 +553,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -516,9 +598,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -567,9 +649,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -591,9 +673,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -621,9 +703,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -649,9 +731,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -677,9 +759,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -713,9 +795,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -749,9 +831,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -779,9 +861,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -824,9 +906,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -888,9 +970,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -920,9 +1002,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -962,9 +1044,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -999,9 +1081,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1050,9 +1132,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1087,9 +1169,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1125,9 +1207,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1151,9 +1233,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1186,9 +1268,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1221,9 +1303,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1251,9 +1333,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1292,9 +1374,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1335,9 +1417,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1368,9 +1450,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1418,9 +1500,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1476,9 +1558,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1502,9 +1584,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1538,9 +1620,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1574,9 +1656,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1604,9 +1686,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1646,9 +1728,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1690,9 +1772,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1723,9 +1805,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1777,9 +1859,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index f3ae8f87c1..3bf98d022e 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -32,7 +32,6 @@ template __global__ void launch_new_reduce_global_fcn(const BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx; using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -40,9 +39,22 @@ __global__ void launch_new_reduce_global_fcn(const BODY body_in, // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + if constexpr (LaunchContextType::hasDim3) + { + LaunchContextType ctx(threadIdx, blockDim); + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } + else + { + LaunchContextType ctx; + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( @@ -137,7 +149,6 @@ __launch_bounds__(num_threads, 1) __global__ void launch_new_reduce_global_fcn_fixed(const BODY body_in, ReduceParams reduce_params) { - LaunchContext ctx; using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(body_in); @@ -145,9 +156,22 @@ __launch_bounds__(num_threads, 1) __global__ // Set pointer to shared memory extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - RAJA::expt::invoke_body(reduce_params, body, ctx); + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + if constexpr (LaunchContextType::hasDim3) + { + LaunchContextType ctx(threadIdx, blockDim); + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } + else + { + LaunchContextType ctx; + ctx.shared_mem_ptr = raja_shmem_ptr; + RAJA::expt::invoke_body(reduce_params, body, ctx); + } // Using a flatten global policy as we may use all dimensions RAJA::expt::ParamMultiplexer::parampack_combine( @@ -239,6 +263,70 @@ struct LaunchExecute> } }; +/* + Loop methods which rely on a copy of threaIdx/BlockDim + for performance. In collaboration with AMD we have have this + to be more performant. +*/ + +namespace expt +{ + +template +struct hip_ctx_thread_loop; + +using hip_ctx_thread_loop_x = hip_ctx_thread_loop; +using hip_ctx_thread_loop_y = hip_ctx_thread_loop; +using hip_ctx_thread_loop_z = hip_ctx_thread_loop; + +template +RAJA_INLINE RAJA_DEVICE int get_dim(Dim3Like const& d) +{ + if constexpr (DIM == named_dim::x) + { + return d.x; + } + else if constexpr (DIM == named_dim::y) + { + return d.y; + } + else + { + static_assert(DIM == named_dim::z, "Unsupported named_dim"); + return d.z; + } +} + +} // namespace expt + +/* + Loop exec methods will have to be reworked to be hasDim3 aware +*/ + +template +struct LoopExecute, SEGMENT> +{ + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContextT const& ctx, + SEGMENT const& segment, + BODY const& body) + { + + const int len = segment.end() - segment.begin(); + constexpr int int_dim = static_cast(DIM); + + const int thread_idx = expt::get_dim(ctx.thread_id); + const int stride = expt::get_dim(ctx.block_dim); + + for (int i = thread_idx; i < len; i += stride) + { + body(*(segment.begin() + i)); + } + } +}; + /* HIP generic loop implementations */ @@ -253,9 +341,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -277,9 +365,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -307,9 +395,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -335,9 +423,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -363,9 +451,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -399,9 +487,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -435,9 +523,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -465,9 +553,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -510,9 +598,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -561,9 +649,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -585,9 +673,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -615,9 +703,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -643,9 +731,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -671,9 +759,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -707,9 +795,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -743,9 +831,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -773,9 +861,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -818,9 +906,9 @@ struct LoopICountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -882,9 +970,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -914,9 +1002,9 @@ struct LoopExecute::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -956,9 +1044,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -993,9 +1081,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1044,9 +1132,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1081,9 +1169,9 @@ struct LoopExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -1119,9 +1207,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1145,9 +1233,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1180,9 +1268,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1215,9 +1303,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1245,9 +1333,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1286,9 +1374,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1329,9 +1417,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1362,9 +1450,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1412,9 +1500,9 @@ struct TileExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1470,9 +1558,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1496,9 +1584,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1532,9 +1620,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1568,9 +1656,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1598,9 +1686,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1640,9 +1728,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, @@ -1684,9 +1772,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -1717,9 +1805,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, SEGMENT const& segment0, @@ -1771,9 +1859,9 @@ struct TileTCountExecute< using diff_t = typename std::iterator_traits< typename SEGMENT::iterator>::difference_type; - template + template static RAJA_INLINE RAJA_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size0, TILE_T tile_size1, TILE_T tile_size2, diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp index 2ba8066fe9..51cfaff7f9 100644 --- a/include/RAJA/policy/openmp/launch.hpp +++ b/include/RAJA/policy/openmp/launch.hpp @@ -28,7 +28,6 @@ template<> struct LaunchExecute { - template static concepts::enable_if_t< resources::EventProxy, @@ -45,11 +44,15 @@ struct LaunchExecute EXEC_POL pol {}; using BodyType = decltype(thread_privatize(body)); + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + auto parallel_section = [&](ReduceParams& f_params, auto func) { - LaunchContext ctx; + LaunchContextType ctx; + auto loop_body = thread_privatize(body); static_assert(std::is_invocable::value, + LaunchContextType&>::value, "Internal RAJA error: Check the parallel kernel passed to " "OpenMP Parallel section in openmp/launch.hpp"); @@ -74,7 +77,7 @@ struct LaunchExecute // pragma so that the reduction parameter pack it operates on is the // version tracked by the combine OpenMP syntax auto parallel_kernel = [&](ReduceParams& f_params, BodyType& body, - LaunchContext& ctx) { + LaunchContextType& ctx) { expt::invoke_body(f_params, body.get_priv(), ctx); }; parallel_section(f_params, parallel_kernel); @@ -84,7 +87,7 @@ struct LaunchExecute { RAJA::region([&]() { auto parallel_kernel = [&](ReduceParams&, BodyType& body, - LaunchContext& ctx) { + LaunchContextType& ctx) { body.get_priv()(ctx); }; parallel_section(f_params, parallel_kernel); @@ -101,9 +104,9 @@ template struct LoopExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -121,9 +124,9 @@ struct LoopExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -149,9 +152,9 @@ struct LoopExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -187,9 +190,9 @@ template struct LoopExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -203,9 +206,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -225,9 +228,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -260,9 +263,9 @@ template struct LoopICountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -276,9 +279,9 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -298,9 +301,9 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -333,9 +336,9 @@ template struct LoopExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -361,9 +364,9 @@ struct LoopExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -400,9 +403,9 @@ template struct LoopICountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -428,9 +431,9 @@ struct LoopICountExecute }); } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -466,9 +469,9 @@ template struct TileExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -493,9 +496,9 @@ template struct TileTCountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -522,9 +525,9 @@ template struct TileExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -543,9 +546,9 @@ template struct TileTCountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index ee98804ecf..26eb287eb0 100644 --- a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -51,7 +51,10 @@ struct LaunchExecute ReduceParams& RAJA_UNUSED_ARG(ReduceParams)) { - LaunchContext ctx; + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + LaunchContextType ctx; char* kernel_local_mem = new char[params.shared_mem_size]; ctx.shared_mem_ptr = kernel_local_mem; @@ -80,7 +83,10 @@ struct LaunchExecute expt::ParamMultiplexer::parampack_init(pol, launch_reducers); - LaunchContext ctx; + using LaunchContextType = + typename RAJA::detail::launch_context_type::type; + + LaunchContextType ctx; char* kernel_local_mem = new char[launch_params.shared_mem_size]; ctx.shared_mem_ptr = kernel_local_mem; @@ -113,9 +119,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -127,9 +133,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -149,9 +155,9 @@ struct LoopExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -181,9 +187,9 @@ template struct LoopICountExecute { - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment, BODY const& body) { @@ -194,9 +200,9 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, BODY const& body) @@ -216,9 +222,9 @@ struct LoopICountExecute } } - template + template static RAJA_INLINE RAJA_HOST_DEVICE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), SEGMENT const& segment0, SEGMENT const& segment1, SEGMENT const& segment2, @@ -250,9 +256,9 @@ template struct TileExecute { - template + template static RAJA_HOST_DEVICE RAJA_INLINE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body) @@ -271,9 +277,9 @@ template struct TileTCountExecute { - template + template static RAJA_HOST_DEVICE RAJA_INLINE void exec( - LaunchContext const RAJA_UNUSED_ARG(&ctx), + LaunchContextT const RAJA_UNUSED_ARG(&ctx), TILE_T tile_size, SEGMENT const& segment, BODY const& body)