diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 4138edd5821e6..eb99c0d124525 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -5,6 +5,7 @@ namespace sycl { inline namespace _V1 { +#define __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS #ifdef __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS namespace khr { @@ -153,27 +154,24 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit( - q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); }, - codeLoc); + (void)codeLoc; + q.parallel_for_no_handler(nd_range<1>(r, size), k); } template <typename KernelType> void launch_grouped(const queue &q, range<2> r, range<2> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit( - q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); }, - codeLoc); + (void)codeLoc; + q.parallel_for_no_handler(nd_range<2>(r, size), k); } template <typename KernelType> void launch_grouped(const queue &q, range<3> r, range<3> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit( - q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); }, - codeLoc); + (void)codeLoc; + q.parallel_for_no_handler(nd_range<3>(r, size), k); } template <typename... Args> @@ -283,7 +281,8 @@ template <typename KernelType> void launch_task(const sycl::queue &q, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit(q, [&](handler &h) { launch_task<KernelType>(h, k); }, codeLoc); + (void)codeLoc; + q.single_task_no_handler(k); } template <typename... Args> diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a0dbdf5c540e8..2ec6d2e46ddf4 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2784,6 +2784,112 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> { CodeLoc); } + // no_handler + +private: + // NOTE: the name of this function - "kernel_single_task" - is used by the + // Front End to determine kernel invocation kind. + template <typename KernelName, typename KernelType, typename... Props> +#ifdef __SYCL_DEVICE_ONLY__ + [[__sycl_detail__::add_ir_attributes_function( + "sycl-single-task", + ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name..., + nullptr, + ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]] +#endif + + __SYCL_KERNEL_ATTR__ static void + kernel_single_task(const KernelType &KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + KernelFunc(); +#else + (void)KernelFunc; +#endif + } + + // NOTE: the name of these functions - "kernel_parallel_for" - are used by the + // Front End to determine kernel invocation kind. + template <typename KernelName, typename ElementType, typename KernelType, + typename... Props> +#ifdef __SYCL_DEVICE_ONLY__ + [[__sycl_detail__::add_ir_attributes_function( + ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name..., + ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]] +#endif + __SYCL_KERNEL_ATTR__ static void + kernel_parallel_for(const KernelType &KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>())); +#else + (void)KernelFunc; +#endif + } + + template <int Dims> static sycl::range<3> padRange(sycl::range<Dims> Range) { + if constexpr (Dims == 3) { + return Range; + } else { + sycl::range<3> Res{0, 0, 0}; + for (int I = 0; I < Dims; ++I) + Res[I] = Range[I]; + return Res; + } + } + + template <int Dims> static sycl::id<3> padId(sycl::id<Dims> Id) { + if constexpr (Dims == 3) { + return Id; + } else { + sycl::id<3> Res{0, 0, 0}; + for (int I = 0; I < Dims; ++I) + Res[I] = Id[I]; + return Res; + } + } + + template <typename KernelName, typename KernelType, int Dims> + void submit_no_handler(nd_range<Dims> Range, const KernelType &KernelFunc) const { + + using NameT = + typename detail::get_kernel_name_t<KernelName, KernelType>::name; + + const char *KernelN = detail::getKernelName<NameT>(); + KernelType Kernel = KernelFunc; + void *KernelFuncPtr = reinterpret_cast<void *>(&Kernel); + int KernelNumParams = detail::getKernelNumParams<NameT>(); + detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = &(detail::getKernelParamDesc<NameT>); + bool IsKernelESIMD = detail::isKernelESIMD<NameT>(); + bool HasSpecialCapt = detail::hasSpecialCaptures<NameT>(); + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr = detail::getKernelNameBasedCache<NameT>(); + + assert(HasSpecialCapt == false); + assert(IsKernelESIMD == false); + + submit_no_handler_impl(Range, KernelN, KernelFuncPtr, KernelNumParams, KernelParamDescGetter, + KernelNameBasedCachePtr); + } + +public: + /// single_task version not using handler + template <typename KernelName = detail::auto_name, typename KernelType> + void single_task_no_handler(const KernelType &KernelFunc) const { + + kernel_single_task<KernelName, KernelType, + ext::oneapi::experimental::empty_properties_t>(KernelFunc); + submit_no_handler<KernelName, KernelType, 1>(nd_range<1>{}, KernelFunc); + } + + template <typename KernelName = detail::auto_name, int Dims, + typename KernelType> + void parallel_for_no_handler(nd_range<Dims> Range, const KernelType &KernelFunc) const { + + kernel_parallel_for<KernelName, sycl::nd_item<Dims>, KernelType, + ext::oneapi::experimental::empty_properties_t>(KernelFunc); + submit_no_handler<KernelName, KernelType, Dims>(Range, KernelFunc); + } + + + /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. /// @@ -3686,6 +3792,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + // no_handler + + template<int Dims> + void submit_no_handler_impl(nd_range<Dims> Range, const char *KernelName, void *KernelFunc, + int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const; + /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. /// diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 193eee39b5ce4..9c37faf84327a 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -415,6 +415,119 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, } #endif +// no_handler + +void queue_impl::extractArgsAndReqsFromLambda( + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams, std::vector<ArgDesc> &Args) { + size_t IndexShift = 0; + + Args.reserve(NumKernelParams); + + for (size_t I = 0; I < NumKernelParams; ++I) { + detail::kernel_param_desc_t ParamDesc = ParamDescGetter(I); + void *Ptr = LambdaPtr + ParamDesc.offset; + const detail::kernel_param_kind_t &Kind = ParamDesc.kind; + const int &Size = ParamDesc.info; + + Args.emplace_back(Kind, Ptr, Size, I + IndexShift); + } +} + +void queue_impl::submit_no_handler( + const std::shared_ptr<queue_impl> &Self, + detail::NDRDescT NDRDesc, const char *KernelName, + void *KernelFunc, int KernelNumParams, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) { + + std::vector<ur_event_handle_t> RawEvents; + std::vector<detail::ArgDesc> Args; + + assert(!MQueue->hasCommandGraph()); + + // TODO external event + + bool KernelFastPath = true; + + std::unique_lock<std::mutex> Lock(MMutex); + + EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr; + + if (isInOrder() && LastEvent && !Scheduler::CheckEventReadiness(MContext, LastEvent)) { + KernelFastPath = false; + } + + if (KernelFastPath) { + EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr; + + if (isInOrder() && LastEvent && LastEvent->getHandle()) { + RawEvents.push_back(LastEvent->getHandle()); + } + + enqueueImpKernel( + Self, + NDRDesc, // MNDRDesc + Args, + nullptr, // KernelBundleImpPtr + nullptr, // MKernel + KernelName, + KernelNameBasedCachePtr, // MKernelNameBasedCachePtr + RawEvents, + nullptr, // out event + nullptr, // getMemAllocationFunc + UR_KERNEL_CACHE_CONFIG_DEFAULT, // MKernelCacheConfig + false, // MKernelIsCooperative + false, // MKernelUsesClusterLaunch + 0, // MKernelWorkGroupMemorySize + nullptr, // BinImage + KernelFunc, // MKernelFuncPtr + KernelNumParams, // MKernelNumArgs + KernelParamDescGetter, // MKernelParamDescGetter + false); // MKernelHasSpecialCaptures + } else { + std::unique_ptr<detail::CG> CommandGroup; + detail::CG::StorageInitHelper CGData; + std::vector<detail::ArgDesc> Args; + std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage; + std::vector<std::shared_ptr<const void>> AuxiliaryResources; + detail::code_location CodeLoc = {}; + + extractArgsAndReqsFromLambda((char *)KernelFunc, KernelParamDescGetter, + KernelNumParams, Args); + + EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr; + CGData.MEvents.push_back(LastEvent); + + CommandGroup.reset(new detail::CGExecKernel( + std::move(NDRDesc), + nullptr, // MHostKernel + nullptr, // MKernel + nullptr, // MKernelBundle + std::move(CGData), // CGData + std::move(Args), // MArgs + KernelName, // MKernelName + KernelNameBasedCachePtr, // MKernelNameBasedCachePtr + std::move(StreamStorage), // MStreamStorage + std::move(AuxiliaryResources), // MAuxiliaryResources + detail::CGType::Kernel, + UR_KERNEL_CACHE_CONFIG_DEFAULT, // MKernelCacheConfig + false, // MKernelIsCooperative + false, // MKernelUsesClusterLaunch + 0, // MKernelWorkGroupMemorySize + CodeLoc)); // MCodeLoc + + detail::EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), + Self, // MQueue + false); // MEventNeeded + + if (isInOrder()) { + MDefaultGraphDeps.LastEventPtr = EventImpl; + } + } +} + template <typename HandlerFuncT> event queue_impl::submitWithHandler(const std::vector<event> &DepEvents, bool CallerNeedsEvent, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 99490ba2851c4..8461c64f83dc0 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -378,6 +378,18 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> { /*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo); } + // no_handler +private: + void extractArgsAndReqsFromLambda( + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams, std::vector<ArgDesc> &Args); + +public: + void submit_no_handler(const std::shared_ptr<queue_impl> &Self, + detail::NDRDescT NDRDesc, const char *KernelName, void *KernelFunc, int KernelNumParams, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr); + /// Performs a blocking wait for the completion of all enqueued tasks in the /// queue. /// diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 8b39334f5b432..ed2468fc2f8dd 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -319,6 +319,21 @@ void queue::submit_without_event_impl( impl->submit_without_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } +// no_handler + +template<int Dims> +void queue::submit_no_handler_impl(nd_range<Dims> Range, const char *KernelName, void *KernelFunc, + int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const { + + detail::NDRDescT NDRDesc{padRange(Range.get_global_range()), + padRange(Range.get_local_range()), + padId(Range.get_offset()), Dims}; + + impl->submit_no_handler(impl, NDRDesc, KernelName, KernelFunc, KernelNumParams, + KernelParamDescGetter, KernelNameBasedCachePtr); +} + void queue::wait_proxy(const detail::code_location &CodeLoc) { impl->wait(CodeLoc); } @@ -474,6 +489,18 @@ void queue::ext_oneapi_set_external_event(const event &external_event) { const property_list &queue::getPropList() const { return impl->getPropList(); } +template void queue::submit_no_handler_impl<1>(nd_range<1> Range, const char *KernelName, void *KernelFunc, + int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const; + +template void queue::submit_no_handler_impl<2>(nd_range<2> Range, const char *KernelName, void *KernelFunc, + int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const; + +template void queue::submit_no_handler_impl<3>(nd_range<3> Range, const char *KernelName, void *KernelFunc, + int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const; + } // namespace _V1 } // namespace sycl