[WIP] No handler submit #18842

base: sycl

Changes from all commits
@@ -2784,6 +2784,112 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
        CodeLoc);
  }

  // no_handler

private:
  // NOTE: the name of this function - "kernel_single_task" - is used by the
  // Front End to determine kernel invocation kind.

Review comment: Can this live outside […]?

  template <typename KernelName, typename KernelType, typename... Props>
#ifdef __SYCL_DEVICE_ONLY__
  [[__sycl_detail__::add_ir_attributes_function(
      "sycl-single-task",
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
      nullptr,
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
  __SYCL_KERNEL_ATTR__ static void
  kernel_single_task(const KernelType &KernelFunc) {
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc();
#else
    (void)KernelFunc;
#endif
  }

  // NOTE: the name of these functions - "kernel_parallel_for" - is used by the
  // Front End to determine kernel invocation kind.
  template <typename KernelName, typename ElementType, typename KernelType,
            typename... Props>
#ifdef __SYCL_DEVICE_ONLY__
  [[__sycl_detail__::add_ir_attributes_function(
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
      ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
  __SYCL_KERNEL_ATTR__ static void
  kernel_parallel_for(const KernelType &KernelFunc) {
#ifdef __SYCL_DEVICE_ONLY__
    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
#else
    (void)KernelFunc;
#endif
  }

  template <int Dims> static sycl::range<3> padRange(sycl::range<Dims> Range) {
    if constexpr (Dims == 3) {
      return Range;
    } else {
      sycl::range<3> Res{0, 0, 0};
      for (int I = 0; I < Dims; ++I)
        Res[I] = Range[I];
      return Res;
    }
  }

  template <int Dims> static sycl::id<3> padId(sycl::id<Dims> Id) {
    if constexpr (Dims == 3) {
      return Id;
    } else {
      sycl::id<3> Res{0, 0, 0};
      for (int I = 0; I < Dims; ++I)
        Res[I] = Id[I];
      return Res;
    }
  }
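As an aside on the two helpers above: they only widen a 1-D or 2-D range/id into
3-D form, leaving the unused components at 0. A minimal standalone illustration
(my own sketch, not part of the patch):

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::range<2> R{8, 4};
  // Equivalent to what padRange(R) computes for Dims == 2: {8, 4, 0}.
  sycl::range<3> Padded{R[0], R[1], 0};
  std::cout << Padded[0] << " " << Padded[1] << " " << Padded[2] << "\n";
  return 0;
}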
  template <typename KernelName, typename KernelType, int Dims>
  void submit_no_handler(nd_range<Dims> Range,
                         const KernelType &KernelFunc) const {
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;

    const char *KernelN = detail::getKernelName<NameT>();
    KernelType Kernel = KernelFunc;
    void *KernelFuncPtr = reinterpret_cast<void *>(&Kernel);
    int KernelNumParams = detail::getKernelNumParams<NameT>();
    detail::kernel_param_desc_t (*KernelParamDescGetter)(int) =
        &(detail::getKernelParamDesc<NameT>);
    bool IsKernelESIMD = detail::isKernelESIMD<NameT>();
    bool HasSpecialCapt = detail::hasSpecialCaptures<NameT>();
    detail::KernelNameBasedCacheT *KernelNameBasedCachePtr =
        detail::getKernelNameBasedCache<NameT>();

    assert(HasSpecialCapt == false);
    assert(IsKernelESIMD == false);

    submit_no_handler_impl(Range, KernelN, KernelFuncPtr, KernelNumParams,
                           KernelParamDescGetter, KernelNameBasedCachePtr);
  }

public:
  /// single_task version not using handler
  template <typename KernelName = detail::auto_name, typename KernelType>
  void single_task_no_handler(const KernelType &KernelFunc) const {
    kernel_single_task<KernelName, KernelType,
                       ext::oneapi::experimental::empty_properties_t>(
        KernelFunc);
    submit_no_handler<KernelName, KernelType, 1>(nd_range<1>{}, KernelFunc);
  }

  template <typename KernelName = detail::auto_name, int Dims,
            typename KernelType>
  void parallel_for_no_handler(nd_range<Dims> Range,
                               const KernelType &KernelFunc) const {
    kernel_parallel_for<KernelName, sycl::nd_item<Dims>, KernelType,
                        ext::oneapi::experimental::empty_properties_t>(
        KernelFunc);
    submit_no_handler<KernelName, KernelType, Dims>(Range, KernelFunc);
  }
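For context, a hedged usage sketch of the two new entry points added above
(assuming this WIP patch is applied; the API shape may still change). An
in-order queue is used so the two launches are ordered without returning events:

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q{sycl::property::queue::in_order{}};
  constexpr size_t N = 1024;
  int *Data = sycl::malloc_device<int>(N, Q);

  // nd_range launch that skips handler/command-group construction.
  Q.parallel_for_no_handler(
      sycl::nd_range<1>{sycl::range<1>{N}, sycl::range<1>{128}},
      [=](sycl::nd_item<1> It) { Data[It.get_global_id(0)] = 1; });

  // Single-task launch without a handler.
  Q.single_task_no_handler([=]() { Data[0] = 42; });

  Q.wait();
  sycl::free(Data, Q);
  return 0;
}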
  /// parallel_for version with a kernel represented as a lambda + range that
  /// specifies global size only.
  ///
@@ -3686,6 +3792,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
                     const detail::code_location &CodeLoc,
                     bool IsTopCodeLoc) const;

  // no_handler

  template <int Dims>
  void submit_no_handler_impl(
      nd_range<Dims> Range, const char *KernelName, void *KernelFunc,
      int KernelNumParams,
      detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
      detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const;

  /// Submits a command group function object to the queue, in order to be
  /// scheduled for execution on the device.
  ///
@@ -415,6 +415,116 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
}
#endif

// no_handler

void queue_impl::extractArgsAndReqsFromLambda(
    char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int),
    size_t NumKernelParams, std::vector<ArgDesc> &Args) {

Review comment: Why can't you just return […]?

  size_t IndexShift = 0;

  Args.reserve(NumKernelParams);

  for (size_t I = 0; I < NumKernelParams; ++I) {
    detail::kernel_param_desc_t ParamDesc = ParamDescGetter(I);
    void *Ptr = LambdaPtr + ParamDesc.offset;
    const detail::kernel_param_kind_t &Kind = ParamDesc.kind;
    const int &Size = ParamDesc.info;

    Args.emplace_back(Kind, Ptr, Size, I + IndexShift);
  }
}
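The truncated review question above seems to ask why the argument vector is not
simply returned. A sketch of that by-value shape (my reading of the comment, not
code from the PR; it reuses the runtime-internal types visible in this file, so
it would only compile inside the SYCL runtime sources):

std::vector<ArgDesc> extractArgsAndReqsFromLambda(
    char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int),
    size_t NumKernelParams) {
  std::vector<ArgDesc> Args;
  Args.reserve(NumKernelParams);
  for (size_t I = 0; I < NumKernelParams; ++I) {
    detail::kernel_param_desc_t ParamDesc = ParamDescGetter(I);
    // Each captured kernel argument lives at a fixed offset inside the lambda.
    Args.emplace_back(ParamDesc.kind, LambdaPtr + ParamDesc.offset,
                      ParamDesc.info, static_cast<int>(I));
  }
  return Args; // NRVO/move keeps this as cheap as the out-parameter version
}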
void queue_impl::submit_no_handler(
    const std::shared_ptr<queue_impl> &Self,

Review comment: Not necessary, see #18715.

    detail::NDRDescT NDRDesc, const char *KernelName,

Review comment (on lines +437 to +439): Can we template this on the number of
dimensions, like you did for […]? I think that would allow us to get rid of
[…]

Author reply: Thank you for the review. It is a good point that there is
potential for optimization related to NDRDescT. Currently, the enqueueImpKernel
scheduler function takes it as an argument and applies some transformations, so
I left it as is. I think there is work in progress to optimize this, and once
it is complete, I can update the flow.

Review comment: I don't agree. We have so many instances of the ugliest code
duplication added with a promise to refactor once the design settles. Please do
the right thing from the start.

    void *KernelFunc, int KernelNumParams,
    detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
    detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) {
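To make the templating discussion above concrete, here is a hedged sketch of the
direction the reviewer appears to suggest (my assumption, not code from the PR):
keep the submission templated on the dimensionality down to the impl layer and
convert to the type-erased NDRDescT in one place, which would also make the
padRange/padId helpers in queue.hpp unnecessary. It assumes NDRDescT can be
constructed directly from nd_range<Dims>:

template <int Dims>
void queue_impl::submit_no_handler(
    const std::shared_ptr<queue_impl> &Self, sycl::nd_range<Dims> Range,
    const char *KernelName, void *KernelFunc, int KernelNumParams,
    detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
    detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) {
  // Single conversion point; enqueueImpKernel continues to receive NDRDescT.
  detail::NDRDescT NDRDesc{Range};
  submit_no_handler(Self, std::move(NDRDesc), KernelName, KernelFunc,
                    KernelNumParams, KernelParamDescGetter,
                    KernelNameBasedCachePtr);
}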
Review comment (on lines +440 to +442): IMO, this should be a single argument.
Ideally, renamed […]
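A hedged sketch of the "single argument" shape suggested above; the struct name
and members are hypothetical, not taken from the PR:

struct KernelLaunchDesc {
  const char *Name;
  void *FuncPtr;
  int NumParams;
  detail::kernel_param_desc_t (*ParamDescGetter)(int);
  detail::KernelNameBasedCacheT *NameBasedCache;
};

void submit_no_handler(const std::shared_ptr<queue_impl> &Self,
                       detail::NDRDescT NDRDesc, const KernelLaunchDesc &Desc);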
  std::vector<ur_event_handle_t> RawEvents;
  std::vector<detail::ArgDesc> Args;

  // TODO exclude graphs

  // TODO external event

  bool KernelFastPath = true;

Review comment: This is a very bad variable name.

  {
    std::unique_lock<std::mutex> Lock(MMutex);
    EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr;

    if (isInOrder() && LastEvent &&
        !Scheduler::CheckEventReadiness(MContext, LastEvent)) {
      KernelFastPath = false;
      ur_event_handle_t LastEventHandle = LastEvent->getHandle();

Review comment: If the event is not ready, I think getHandle() can return NULL
(if the command is a host task or is not yet enqueued).

      if (LastEventHandle) {
        RawEvents.push_back(LastEventHandle);
      }
    }
  }
  if (KernelFastPath) {
    enqueueImpKernel(
        Self,
        NDRDesc, // MNDRDesc
        Args,
        nullptr, // KernelBundleImpPtr
        nullptr, // MKernel
        KernelName,
        KernelNameBasedCachePtr, // MKernelNameBasedCachePtr
        RawEvents,
        nullptr, // out event
        nullptr, // getMemAllocationFunc
        UR_KERNEL_CACHE_CONFIG_DEFAULT, // MKernelCacheConfig
        false,   // MKernelIsCooperative
        false,   // MKernelUsesClusterLaunch
        0,       // MKernelWorkGroupMemorySize
        nullptr, // BinImage
        KernelFunc,            // MKernelFuncPtr
        KernelNumParams,       // MKernelNumArgs
        KernelParamDescGetter, // MKernelParamDescGetter
        false);                // MKernelHasSpecialCaptures
  } else {
    std::unique_ptr<detail::CG> CommandGroup;
    detail::CG::StorageInitHelper CGData;
    std::vector<detail::ArgDesc> Args;
    std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
    std::vector<std::shared_ptr<const void>> AuxiliaryResources;
    detail::code_location CodeLoc = {};

    extractArgsAndReqsFromLambda((char *)KernelFunc, KernelParamDescGetter,
                                 KernelNumParams, Args);

    CommandGroup.reset(new detail::CGExecKernel(
        std::move(NDRDesc),
        nullptr,                        // MHostKernel
        nullptr,                        // MKernel
        nullptr,                        // MKernelBundle
        std::move(CGData),              // CGData
        Args,                           // MArgs
        KernelName,                     // MKernelName
        KernelNameBasedCachePtr,        // MKernelNameBasedCachePtr
        std::move(StreamStorage),       // MStreamStorage
        std::move(AuxiliaryResources),  // MAuxiliaryResources
        detail::CGType::Kernel,
        UR_KERNEL_CACHE_CONFIG_DEFAULT, // MKernelCacheConfig
        false,                          // MKernelIsCooperative
        false,                          // MKernelUsesClusterLaunch
        0,                              // MKernelWorkGroupMemorySize
        CodeLoc));                      // MCodeLoc

    detail::EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG(
        std::move(CommandGroup),
        Self,   // MQueue
        false); // MEventNeeded

    if (isInOrder()) {
      std::unique_lock<std::mutex> Lock(MMutex);
      MDefaultGraphDeps.LastEventPtr = EventImpl;
    }
  }
Review comment: Does this work with the existing […]?

Author reply: Yes, the intention is to support concurrent new and old
submissions.

}
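Regarding the interoperability question above, a hedged sketch (again assuming
the patch is applied) of mixing the existing handler-based path with the new
no-handler path on a single in-order queue; the LastEventPtr bookkeeping in this
function is what keeps the two kinds of submissions ordered:

#include <sycl/sycl.hpp>

int main() {
  constexpr size_t N = 256;
  sycl::queue Q{sycl::property::queue::in_order{}};
  int *Data = sycl::malloc_device<int>(N, Q);

  // Existing path: goes through a handler / command group.
  Q.submit([&](sycl::handler &CGH) {
    CGH.parallel_for(sycl::range<1>{N}, [=](sycl::id<1> I) { Data[I] = 0; });
  });

  // New path: no handler object is created.
  Q.parallel_for_no_handler(
      sycl::nd_range<1>{sycl::range<1>{N}, sycl::range<1>{64}},
      [=](sycl::nd_item<1> It) { Data[It.get_global_id(0)] += 1; });

  Q.wait();
  sycl::free(Data, Q);
  return 0;
}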
template <typename HandlerFuncT>
event queue_impl::submitWithHandler(const std::vector<event> &DepEvents,
                                    bool CallerNeedsEvent,
Review comment: Why don't we call it queue::launch_grouped?