
[WIP] No handler submit #18842

Draft: wants to merge 1 commit into base: sycl
19 changes: 9 additions & 10 deletions sycl/include/sycl/khr/free_function_commands.hpp
@@ -5,6 +5,7 @@
namespace sycl {
inline namespace _V1 {

#define __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS
#ifdef __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS
namespace khr {

@@ -153,27 +154,24 @@ void launch_grouped(const queue &q, range<1> r, range<1> size,
const KernelType &k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
(void)codeLoc;
q.parallel_for_no_handler(nd_range<1>(r, size), k);
Contributor:
Why don't we call it queue::launch_grouped?

}
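On the naming question above: if the member were called queue::launch_grouped as suggested (hypothetical; this PR names it parallel_for_no_handler), the free function above would reduce to a one-line forward, sketched here for the 1-D overload:

template <typename KernelType>
void launch_grouped(const queue &q, range<1> r, range<1> size,
                    const KernelType &k,
                    const sycl::detail::code_location &codeLoc =
                        sycl::detail::code_location::current()) {
  (void)codeLoc;
  // Hypothetical member name; the PR's actual entry point is
  // parallel_for_no_handler.
  q.launch_grouped(nd_range<1>(r, size), k);
}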
template <typename KernelType>
void launch_grouped(const queue &q, range<2> r, range<2> size,
const KernelType &k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
(void)codeLoc;
q.parallel_for_no_handler(nd_range<2>(r, size), k);
}
template <typename KernelType>
void launch_grouped(const queue &q, range<3> r, range<3> size,
const KernelType &k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
(void)codeLoc;
q.parallel_for_no_handler(nd_range<3>(r, size), k);
}

template <typename... Args>
@@ -283,7 +281,8 @@ template <typename KernelType>
void launch_task(const sycl::queue &q, const KernelType &k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
submit(q, [&](handler &h) { launch_task<KernelType>(h, k); }, codeLoc);
(void)codeLoc;
q.single_task_no_handler(k);
}

template <typename... Args>
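For context, a minimal sketch of how this file's new code paths are exercised from user code, assuming the headers from this PR (the exact include for the khr free functions may differ, and __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS is currently force-defined above):

#include <sycl/sycl.hpp>
#include <sycl/khr/free_function_commands.hpp>

int main() {
  sycl::queue q;
  // Forwards to q.single_task_no_handler(k) instead of going through a handler.
  sycl::khr::launch_task(q, [] {});
  // Forwards to q.parallel_for_no_handler(nd_range<1>(r, size), k).
  sycl::khr::launch_grouped(q, sycl::range<1>{64}, sycl::range<1>{16},
                            [](sycl::nd_item<1>) {});
  q.wait();
  return 0;
}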
113 changes: 113 additions & 0 deletions sycl/include/sycl/queue.hpp
@@ -2784,6 +2784,112 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
CodeLoc);
}

// no_handler

private:
// NOTE: the name of this function - "kernel_single_task" - is used by the
Contributor:
Can this live outside queue directly in sycl::detail? If private queue access is an issue, then maybe:

struct LaunchUtils {
  static blah-blah(...) {}
};

class queue {
  friend struct LaunchUtils;
};
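One possible expansion of the suggestion above (illustrative names only; nothing like this exists in the PR): the helpers live in sycl::detail and a befriended utility forwards into the queue's private no-handler machinery.

namespace detail {
struct LaunchUtils {
  template <typename KernelName, typename KernelType, int Dims>
  static void submitNoHandler(const queue &Q, nd_range<Dims> Range,
                              const KernelType &Func) {
    // Allowed because queue declares LaunchUtils a friend in this sketch.
    Q.submit_no_handler<KernelName, KernelType, Dims>(Range, Func);
  }
};
} // namespace detail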

// Front End to determine kernel invocation kind.
template <typename KernelName, typename KernelType, typename... Props>
#ifdef __SYCL_DEVICE_ONLY__
[[__sycl_detail__::add_ir_attributes_function(
"sycl-single-task",
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
nullptr,
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif

__SYCL_KERNEL_ATTR__ static void
kernel_single_task(const KernelType &KernelFunc) {
#ifdef __SYCL_DEVICE_ONLY__
KernelFunc();
#else
(void)KernelFunc;
#endif
}

// NOTE: the name of these functions - "kernel_parallel_for" - is used by the
// Front End to determine kernel invocation kind.
template <typename KernelName, typename ElementType, typename KernelType,
typename... Props>
#ifdef __SYCL_DEVICE_ONLY__
[[__sycl_detail__::add_ir_attributes_function(
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
__SYCL_KERNEL_ATTR__ static void
kernel_parallel_for(const KernelType &KernelFunc) {
#ifdef __SYCL_DEVICE_ONLY__
KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
#else
(void)KernelFunc;
#endif
}

template <int Dims> static sycl::range<3> padRange(sycl::range<Dims> Range) {
if constexpr (Dims == 3) {
return Range;
} else {
sycl::range<3> Res{0, 0, 0};
for (int I = 0; I < Dims; ++I)
Res[I] = Range[I];
return Res;
}
}

template <int Dims> static sycl::id<3> padId(sycl::id<Dims> Id) {
if constexpr (Dims == 3) {
return Id;
} else {
sycl::id<3> Res{0, 0, 0};
for (int I = 0; I < Dims; ++I)
Res[I] = Id[I];
return Res;
}
}
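A stand-alone illustration of the padding these helpers perform (the real functions are private members of queue; this copy only demonstrates the semantics: missing trailing dimensions become 0 and 3-D values pass through unchanged):

#include <sycl/sycl.hpp>
#include <cassert>

template <int Dims> sycl::range<3> pad_range_demo(sycl::range<Dims> Range) {
  sycl::range<3> Res{0, 0, 0};
  for (int I = 0; I < Dims; ++I)
    Res[I] = Range[I];
  return Res;
}

int main() {
  assert((pad_range_demo(sycl::range<2>{8, 4}) == sycl::range<3>{8, 4, 0}));
  assert((pad_range_demo(sycl::range<3>{2, 3, 4}) == sycl::range<3>{2, 3, 4}));
  return 0;
}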

template <typename KernelName, typename KernelType, int Dims>
void submit_no_handler(nd_range<Dims> Range, const KernelType &KernelFunc) const {

using NameT =
typename detail::get_kernel_name_t<KernelName, KernelType>::name;

const char *KernelN = detail::getKernelName<NameT>();
KernelType Kernel = KernelFunc;
void *KernelFuncPtr = reinterpret_cast<void *>(&Kernel);
int KernelNumParams = detail::getKernelNumParams<NameT>();
detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = &(detail::getKernelParamDesc<NameT>);
bool IsKernelESIMD = detail::isKernelESIMD<NameT>();
bool HasSpecialCapt = detail::hasSpecialCaptures<NameT>();
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr = detail::getKernelNameBasedCache<NameT>();

assert(HasSpecialCapt == false);
assert(IsKernelESIMD == false);

submit_no_handler_impl(Range, KernelN, KernelFuncPtr, KernelNumParams, KernelParamDescGetter,
KernelNameBasedCachePtr);
}

public:
/// single_task version that does not use a handler.
template <typename KernelName = detail::auto_name, typename KernelType>
void single_task_no_handler(const KernelType &KernelFunc) const {

kernel_single_task<KernelName, KernelType,
ext::oneapi::experimental::empty_properties_t>(KernelFunc);
submit_no_handler<KernelName, KernelType, 1>(nd_range<1>{}, KernelFunc);
}

template <typename KernelName = detail::auto_name, int Dims,
typename KernelType>
void parallel_for_no_handler(nd_range<Dims> Range, const KernelType &KernelFunc) const {

kernel_parallel_for<KernelName, sycl::nd_item<Dims>, KernelType,
ext::oneapi::experimental::empty_properties_t>(KernelFunc);
submit_no_handler<KernelName, KernelType, Dims>(Range, KernelFunc);
}
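The two public entry points above, called directly (a sketch assuming this PR; in the patch itself they are reached through the khr free functions):

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  q.single_task_no_handler([] {});
  q.parallel_for_no_handler(sycl::nd_range<2>{{16, 16}, {8, 8}},
                            [](sycl::nd_item<2>) {});
  q.wait();
  return 0;
}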



/// parallel_for version with a kernel represented as a lambda + range that
/// specifies global size only.
///
@@ -3686,6 +3792,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
const detail::code_location &CodeLoc,
bool IsTopCodeLoc) const;

// no_handler

template<int Dims>
void submit_no_handler_impl(nd_range<Dims> Range, const char *KernelName, void *KernelFunc,
int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const;

/// Submits a command group function object to the queue, in order to be
/// scheduled for execution on the device.
///
110 changes: 110 additions & 0 deletions sycl/source/detail/queue_impl.cpp
@@ -415,6 +415,116 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
}
#endif

// no_handler

void queue_impl::extractArgsAndReqsFromLambda(
char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int),
size_t NumKernelParams, std::vector<ArgDesc> &Args) {
Contributor:
Why can't you just return Args?

size_t IndexShift = 0;

Args.reserve(NumKernelParams);

for (size_t I = 0; I < NumKernelParams; ++I) {
detail::kernel_param_desc_t ParamDesc = ParamDescGetter(I);
void *Ptr = LambdaPtr + ParamDesc.offset;
const detail::kernel_param_kind_t &Kind = ParamDesc.kind;
const int &Size = ParamDesc.info;

Args.emplace_back(Kind, Ptr, Size, I + IndexShift);
}
}
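Following up on the question above about returning Args: a sketch of the return-by-value form, reading the same fields as the loop above (NRVO/move keeps it as cheap as the out-parameter version):

std::vector<detail::ArgDesc>
extractArgsAndReqsFromLambda(char *LambdaPtr,
                             detail::kernel_param_desc_t (*ParamDescGetter)(int),
                             size_t NumKernelParams) {
  std::vector<detail::ArgDesc> Args;
  Args.reserve(NumKernelParams);
  for (size_t I = 0; I < NumKernelParams; ++I) {
    detail::kernel_param_desc_t ParamDesc = ParamDescGetter(I);
    // Same composition as above: kind, pointer into the lambda, size, index.
    Args.emplace_back(ParamDesc.kind, LambdaPtr + ParamDesc.offset,
                      ParamDesc.info, I);
  }
  return Args;
}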

void queue_impl::submit_no_handler(
const std::shared_ptr<queue_impl> &Self,
Contributor:
Not necessary, see #18715.

detail::NDRDescT NDRDesc, const char *KernelName,
Comment on lines +437 to +439
Contributor:
Can we template this on the number of dimensions, like you did for queue::submit_no_handler?

I think that would allow us to get rid of detail::NDRDescT and just pass the nd_range<D> all the way down the stack. We could get rid of padId and padRange.

Contributor Author:
Thank you for the review. It is a good point that there is potential for optimization related to NDRDescT. Currently, the enqueueImpKernel scheduler function takes it as an argument and applies some transformations, so I left it as is. I think there is work in progress to optimize this, and once it is complete, I can update the flow.

Contributor:
I don't agree. We have so many instances of ugly code duplication added with a promise to refactor once the design settles. Please do the right thing from the start.

void *KernelFunc, int KernelNumParams,
detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) {
Comment on lines +440 to +442
Contributor:
IMO, this should be a single argument, ideally renamed KernelNameBasedCachePtr -> TypeErasedKernelInfo. I've had a discussion with @sergey-semenov about this; he was going to prototype that change. Can you sync with him about this?
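A hypothetical shape for the single-argument idea above (illustrative only; the fields mirror the parameters currently passed separately):

struct TypeErasedKernelInfo {
  const char *Name;
  void *FuncPtr;
  int NumParams;
  detail::kernel_param_desc_t (*ParamDescGetter)(int);
  detail::KernelNameBasedCacheT *NameBasedCache;
};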


std::vector<ur_event_handle_t> RawEvents;
std::vector<detail::ArgDesc> Args;

// TODO exclude graphs

// TODO external event

bool KernelFastPath = true;
Contributor:
This is a very bad variable name. "Fast" in legacy code as a hack is somewhat meaningful, but in new, highly optimized code we need to be very specific about what exactly is fast here and why it isn't true in general.


{
std::unique_lock<std::mutex> Lock(MMutex);
EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr;

if (isInOrder() && LastEvent && !Scheduler::CheckEventReadiness(MContext, LastEvent)) {
KernelFastPath = false;
ur_event_handle_t LastEventHandle = LastEvent->getHandle();
Member:
If the event is not ready, I think getHandle() can return NULL (if the command is a host task or has not been enqueued yet).

if (LastEventHandle) {
RawEvents.push_back(LastEventHandle);
}
}
}

if (KernelFastPath) {
enqueueImpKernel(
Self,
NDRDesc, // MNDRDesc
Args,
nullptr, // KernelBundleImpPtr
nullptr, // MKernel
KernelName,
KernelNameBasedCachePtr, // MKernelNameBasedCachePtr
RawEvents,
nullptr, // out event
nullptr, // getMemAllocationFunc
UR_KERNEL_CACHE_CONFIG_DEFAULT, // MKernelCacheConfig
false, // MKernelIsCooperative
false, // MKernelUsesClusterLaunch
0, // MKernelWorkGroupMemorySize
nullptr, // BinImage
KernelFunc, // MKernelFuncPtr
KernelNumParams, // MKernelNumArgs
KernelParamDescGetter, // MKernelParamDescGetter
false); // MKernelHasSpecialCaptures
} else {
std::unique_ptr<detail::CG> CommandGroup;
detail::CG::StorageInitHelper CGData;
std::vector<detail::ArgDesc> Args;
std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
std::vector<std::shared_ptr<const void>> AuxiliaryResources;
detail::code_location CodeLoc = {};

extractArgsAndReqsFromLambda((char *)KernelFunc, KernelParamDescGetter,
KernelNumParams, Args);

CommandGroup.reset(new detail::CGExecKernel(
std::move(NDRDesc),
nullptr, // MHostKernel
nullptr, // MKernel
nullptr, // MKernelBundle
std::move(CGData), // CGData
Args, // MArgs
KernelName, // MKernelName
KernelNameBasedCachePtr, // MKernelNameBasedCachePtr
std::move(StreamStorage), // MStreamStorage
std::move(AuxiliaryResources), // MAuxiliaryResources
detail::CGType::Kernel,
UR_KERNEL_CACHE_CONFIG_DEFAULT, // MKernelCacheConfig
false, // MKernelIsCooperative
false, // MKernelUsesClusterLaunch
0, // MKernelWorkGroupMemorySize
CodeLoc)); // MCodeLoc

detail::EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG(
std::move(CommandGroup),
Self, // MQueue
false); // MEventNeeded

if (isInOrder()) {
std::unique_lock<std::mutex> Lock(MMutex);
MDefaultGraphDeps.LastEventPtr = EventImpl;
}
}
Contributor:
Does this work with the existing queue_impl::MLastEvent if there are concurrent old-style submissions?

Contributor Author:
Yes, the intention is to support concurrent new and old submissions.

}
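On the concurrency question above, a sketch of the mixed usage the author says is intended to work, assuming this PR (on an in-order queue, ordering flows through MDefaultGraphDeps.LastEventPtr in both paths):

#include <sycl/sycl.hpp>
#include <sycl/khr/free_function_commands.hpp>

int main() {
  sycl::queue q{sycl::property::queue::in_order{}};
  // Old-style handler submission.
  q.submit([&](sycl::handler &h) { h.single_task([] {}); });
  // New no-handler submission on the same queue.
  sycl::khr::launch_task(q, [] {});
  q.wait();
  return 0;
}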

template <typename HandlerFuncT>
event queue_impl::submitWithHandler(const std::vector<event> &DepEvents,
bool CallerNeedsEvent,
12 changes: 12 additions & 0 deletions sycl/source/detail/queue_impl.hpp
@@ -378,6 +378,18 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
/*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo);
}

// no_handler
private:
void extractArgsAndReqsFromLambda(
char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int),
size_t NumKernelParams, std::vector<ArgDesc> &Args);

public:
void submit_no_handler(const std::shared_ptr<queue_impl> &Self,
detail::NDRDescT NDRDesc, const char *KernelName, void *KernelFunc, int KernelNumParams,
detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr);

/// Performs a blocking wait for the completion of all enqueued tasks in the
/// queue.
///
27 changes: 27 additions & 0 deletions sycl/source/queue.cpp
@@ -319,6 +319,21 @@ void queue::submit_without_event_impl(
impl->submit_without_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc);
}

// no_handler

template<int Dims>
void queue::submit_no_handler_impl(nd_range<Dims> Range, const char *KernelName, void *KernelFunc,
int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const {

detail::NDRDescT NDRDesc{padRange(Range.get_global_range()),
padRange(Range.get_local_range()),
padId(Range.get_offset()), Dims};

impl->submit_no_handler(impl, NDRDesc, KernelName, KernelFunc, KernelNumParams,
KernelParamDescGetter, KernelNameBasedCachePtr);
}

void queue::wait_proxy(const detail::code_location &CodeLoc) {
impl->wait(CodeLoc);
}
@@ -474,6 +489,18 @@ void queue::ext_oneapi_set_external_event(const event &external_event) {

const property_list &queue::getPropList() const { return impl->getPropList(); }

template void queue::submit_no_handler_impl<1>(nd_range<1> Range, const char *KernelName, void *KernelFunc,
int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const;

template void queue::submit_no_handler_impl<2>(nd_range<2> Range, const char *KernelName, void *KernelFunc,
int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const;

template void queue::submit_no_handler_impl<3>(nd_range<3> Range, const char *KernelName, void *KernelFunc,
int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const;

} // namespace _V1
} // namespace sycl
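Why the explicit instantiations above are needed (a generic illustration, not part of the change): submit_no_handler_impl is declared in queue.hpp but defined in this .cpp, so every dimensionality the header can request must be instantiated here or user code would fail to link.

// Minimal, self-contained version of the pattern.
struct S {
  template <int N> int f() const; // declared in a header
};
template <int N> int S::f() const { return N; } // defined in one .cpp
template int S::f<1>() const; // explicit instantiations emit the symbols
template int S::f<2>() const; // so other translation units can link
template int S::f<3>() const;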
