Commit 00a716a

[SYCL] Pass handler & instead of queue across ABI for reduction utils (#18834)
Queue might be `nullptr` in the case of graphs, but the information these utilities query is device-specific. By passing the entire `handler &` and having access to the graph information, we'd be able to return more precise results. Another positive side effect is that we eliminate the explicit `std::shared_ptr<queue_impl>`, which is a small step forward in the ongoing refactoring effort to prefer passing `*_impl` by raw pointer/reference, with an explicit `shared_from_this` whenever lifetimes need to be extended.
1 parent 0211dd1 commit 00a716a
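
To make the motivation concrete, here is a minimal sketch of the before/after call shape, using hypothetical stand-in types (`device_impl`, `queue_impl`, `handler`) rather than the real DPC++ runtime classes. The point it illustrates: during graph recording a handler may carry no queue, but it can still name a device, so a `handler &` parameter keeps the device-specific utilities answerable where a `shared_ptr<queue_impl>` forced a hardcoded fallback.

#include <cstddef>
#include <memory>

// Hypothetical stand-ins for the runtime's internal types.
struct device_impl { size_t max_wg_size = 256; };
struct queue_impl { device_impl Dev; };

struct handler {
  std::shared_ptr<queue_impl> MQueue; // may be null during graph recording
  device_impl &Dev;                   // but the device is always known
};

// Old shape: with a null queue the utility could only guess.
size_t getMaxWGSizeOld(const std::shared_ptr<queue_impl> &Queue) {
  if (!Queue)
    return 8; // hardcoded fallback, imprecise for graphs
  return Queue->Dev.max_wg_size;
}

// New shape: the handler crosses the ABI and the device query always works.
size_t getMaxWGSizeNew(handler &CGH) { return CGH.Dev.max_wg_size; }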

File tree

4 files changed, +89 -89 lines changed


sycl/include/sycl/reduction.hpp

Lines changed: 24 additions & 56 deletions
@@ -140,17 +140,10 @@ template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
   return sycl::detail::make_tuple(Elements...);
 }
 
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT size_t reduGetMaxWGSize(const std::shared_ptr<queue_impl> &Queue,
+__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
                                       size_t LocalMemBytesPerWorkItem);
-__SYCL_EXPORT size_t reduGetPreferredWGSize(
-    const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem);
-#else
-__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
-                                      size_t LocalMemBytesPerWorkItem);
-__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
+__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh,
                                             size_t LocalMemBytesPerWorkItem);
-#endif
 __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                        size_t &NWorkGroups);
 
@@ -1224,15 +1217,12 @@ template <>
 struct NDRangeReduction<reduction::strategy::local_atomic_and_atomic_cross_wg> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename KernelType, typename Reduction>
-  static void run(handler &CGH,
-                  const std::shared_ptr<detail::queue_impl> &Queue,
-                  nd_range<Dims> NDRange, PropertiesT &Properties,
+  static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
                   Reduction &Redu, KernelType &KernelFunc) {
     static_assert(Reduction::has_identity,
                   "Identityless reductions are not supported by the "
                   "local_atomic_and_atomic_cross_wg strategy.");
 
-    std::ignore = Queue;
     using Name = __sycl_reduction_kernel<
         reduction::MainKrn, KernelName,
         reduction::strategy::local_atomic_and_atomic_cross_wg>;
@@ -1276,15 +1266,12 @@ struct NDRangeReduction<
     reduction::strategy::group_reduce_and_last_wg_detection> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename KernelType, typename Reduction>
-  static void run(handler &CGH,
-                  const std::shared_ptr<detail::queue_impl> &Queue,
-                  nd_range<Dims> NDRange, PropertiesT &Properties,
+  static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
                   Reduction &Redu, KernelType &KernelFunc) {
     static_assert(Reduction::has_identity,
                   "Identityless reductions are not supported by the "
                   "group_reduce_and_last_wg_detection strategy.");
 
-    std::ignore = Queue;
     size_t NElements = Reduction::num_elements;
     size_t WGSize = NDRange.get_local_range().size();
     size_t NWorkGroups = NDRange.get_group_range().size();
@@ -1476,9 +1463,7 @@ void doTreeReductionOnTuple(size_t WorkSize, size_t LID,
 template <> struct NDRangeReduction<reduction::strategy::range_basic> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename KernelType, typename Reduction>
-  static void run(handler &CGH,
-                  const std::shared_ptr<detail::queue_impl> &Queue,
-                  nd_range<Dims> NDRange, PropertiesT &Properties,
+  static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
                   Reduction &Redu, KernelType &KernelFunc) {
     using reducer_type = typename Reduction::reducer_type;
     using element_type = typename ReducerTraits<reducer_type>::element_type;
@@ -1490,7 +1475,6 @@ template <> struct NDRangeReduction<reduction::strategy::range_basic> {
     constexpr bool UsePartialSumForOutput =
         !Reduction::is_usm && Reduction::has_identity;
 
-    std::ignore = Queue;
     size_t NElements = Reduction::num_elements;
     size_t WGSize = NDRange.get_local_range().size();
     size_t NWorkGroups = NDRange.get_group_range().size();
@@ -1588,15 +1572,12 @@ template <>
 struct NDRangeReduction<reduction::strategy::group_reduce_and_atomic_cross_wg> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename KernelType, typename Reduction>
-  static void run(handler &CGH,
-                  const std::shared_ptr<detail::queue_impl> &Queue,
-                  nd_range<Dims> NDRange, PropertiesT &Properties,
+  static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
                   Reduction &Redu, KernelType &KernelFunc) {
     static_assert(Reduction::has_identity,
                   "Identityless reductions are not supported by the "
                   "group_reduce_and_atomic_cross_wg strategy.");
 
-    std::ignore = Queue;
     using Name = __sycl_reduction_kernel<
         reduction::MainKrn, KernelName,
         reduction::strategy::group_reduce_and_atomic_cross_wg>;
@@ -1625,14 +1606,11 @@ struct NDRangeReduction<
     reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename KernelType, typename Reduction>
-  static void run(handler &CGH,
-                  const std::shared_ptr<detail::queue_impl> &Queue,
-                  nd_range<Dims> NDRange, PropertiesT &Properties,
+  static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
                   Reduction &Redu, KernelType &KernelFunc) {
     using reducer_type = typename Reduction::reducer_type;
     using element_type = typename ReducerTraits<reducer_type>::element_type;
 
-    std::ignore = Queue;
     using Name = __sycl_reduction_kernel<
         reduction::MainKrn, KernelName,
         reduction::strategy::local_mem_tree_and_atomic_cross_wg>;
@@ -1687,9 +1665,7 @@ struct NDRangeReduction<
     reduction::strategy::group_reduce_and_multiple_kernels> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename KernelType, typename Reduction>
-  static void run(handler &CGH,
-                  const std::shared_ptr<detail::queue_impl> &Queue,
-                  nd_range<Dims> NDRange, PropertiesT &Properties,
+  static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
                   Reduction &Redu, KernelType &KernelFunc) {
     static_assert(Reduction::has_identity,
                   "Identityless reductions are not supported by the "
@@ -1708,7 +1684,7 @@ struct NDRangeReduction<
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it may be safer to use queries to the kernel compiled
     // for the device.
-    size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
+    size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize);
     if (NDRange.get_local_range().size() > MaxWGSize)
       throw sycl::exception(make_error_code(errc::nd_range),
                             "The implementation handling parallel_for with"
@@ -1826,9 +1802,7 @@ struct NDRangeReduction<
 template <> struct NDRangeReduction<reduction::strategy::basic> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename KernelType, typename Reduction>
-  static void run(handler &CGH,
-                  const std::shared_ptr<detail::queue_impl> &Queue,
-                  nd_range<Dims> NDRange, PropertiesT &Properties,
+  static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
                   Reduction &Redu, KernelType &KernelFunc) {
     using element_type = typename Reduction::reducer_element_type;
 
@@ -1837,7 +1811,7 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it may be safer to use queries to the kernel
     // compiled for the device.
-    size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
+    size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize);
     if (NDRange.get_local_range().size() > MaxWGSize)
       throw sycl::exception(make_error_code(errc::nd_range),
                             "The implementation handling parallel_for with"
@@ -2602,9 +2576,8 @@ tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
 template <> struct NDRangeReduction<reduction::strategy::multi> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename... RestT>
-  static void
-  run(handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
-      nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
+  static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
+                  RestT... Rest) {
     std::tuple<RestT...> ArgsTuple(Rest...);
     constexpr size_t NumArgs = sizeof...(RestT);
     auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
@@ -2615,7 +2588,7 @@ template <> struct NDRangeReduction<reduction::strategy::multi> {
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it is safer to use queries to the kernel compiled
     // for the device.
-    size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem);
+    size_t MaxWGSize = reduGetMaxWGSize(CGH, LocalMemPerWorkItem);
     if (NDRange.get_local_range().size() > MaxWGSize)
       throw sycl::exception(make_error_code(errc::nd_range),
                             "The implementation handling parallel_for with"
@@ -2646,13 +2619,10 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {
 
   template <typename KernelName, int Dims, typename PropertiesT,
             typename KernelType, typename Reduction>
-  static void run(handler &CGH,
-                  const std::shared_ptr<detail::queue_impl> &Queue,
-                  nd_range<Dims> NDRange, PropertiesT &Properties,
+  static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
                   Reduction &Redu, KernelType &KernelFunc) {
     auto Delegate = [&](auto Impl) {
-      Impl.template run<KernelName>(CGH, Queue, NDRange, Properties, Redu,
-                                    KernelFunc);
+      Impl.template run<KernelName>(CGH, NDRange, Properties, Redu, KernelFunc);
     };
 
     if constexpr (Reduction::has_float64_atomics) {
@@ -2694,10 +2664,9 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {
   }
   template <typename KernelName, int Dims, typename PropertiesT,
             typename... RestT>
-  static void
-  run(handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
-      nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
-    return Impl<Strat::multi>::run<KernelName>(CGH, Queue, NDRange, Properties,
+  static void run(handler &CGH, nd_range<Dims> NDRange, PropertiesT &Properties,
+                  RestT... Rest) {
+    return Impl<Strat::multi>::run<KernelName>(CGH, NDRange, Properties,
                                                Rest...);
   }
 };
@@ -2706,12 +2675,11 @@ template <typename KernelName, reduction::strategy Strategy, int Dims,
           typename PropertiesT, typename... RestT>
 void reduction_parallel_for(handler &CGH, nd_range<Dims> NDRange,
                             PropertiesT Properties, RestT... Rest) {
-  NDRangeReduction<Strategy>::template run<KernelName>(CGH, CGH.MQueue, NDRange,
-                                                       Properties, Rest...);
+  NDRangeReduction<Strategy>::template run<KernelName>(CGH, NDRange, Properties,
+                                                       Rest...);
 }
 
-__SYCL_EXPORT uint32_t
-reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
+__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh);
 
 template <typename KernelName, reduction::strategy Strategy, int Dims,
           typename PropertiesT, typename... RestT>
@@ -2742,13 +2710,13 @@ void reduction_parallel_for(handler &CGH, range<Dims> Range,
 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
       __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
 #else
-      reduGetMaxNumConcurrentWorkGroups(CGH.MQueue);
+      reduGetMaxNumConcurrentWorkGroups(CGH);
 #endif
 
   // TODO: currently the preferred work group size is determined for the given
   // queue/device, while it is safer to use queries to the kernel pre-compiled
   // for the device.
-  size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize);
+  size_t PrefWGSize = reduGetPreferredWGSize(CGH, OneElemSize);
 
   size_t NWorkItems = Range.size();
   size_t WGSize = std::min(NWorkItems, PrefWGSize);
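
Taken together, the header changes give every reduction strategy the same `run` entry shape and let `reduction_parallel_for` forward the handler alone. A compact sketch of that dispatch pattern, with hypothetical stand-ins instead of the real reduction machinery:

#include <cstddef>
#include <tuple>

struct handler {};
template <int Dims> struct nd_range {};

// Hypothetical stand-in for NDRangeReduction<Strategy>: one run() per
// strategy, handler first, no queue parameter, so the strategies that
// previously only wrote `std::ignore = Queue;` lose that dead weight.
template <int Strategy> struct StrategyImpl {
  template <typename KernelName, int Dims, typename... RestT>
  static void run(handler &CGH, nd_range<Dims> NDRange, RestT... Rest) {
    (void)CGH;
    (void)NDRange;
    std::tuple<RestT...> Args(Rest...); // reductions + kernel forwarded as-is
    (void)Args;
  }
};

template <typename KernelName, int Strategy, int Dims, typename... RestT>
void reduction_parallel_for(handler &CGH, nd_range<Dims> NDRange,
                            RestT... Rest) {
  // Before: run<KernelName>(CGH, CGH.MQueue, NDRange, Rest...)
  // After:  the handler alone carries everything the strategies need.
  StrategyImpl<Strategy>::template run<KernelName>(CGH, NDRange, Rest...);
}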

sycl/source/detail/reduction.cpp

Lines changed: 59 additions & 33 deletions
@@ -49,8 +49,24 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
   return WGSize;
 }
 
+#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
+// Inline this helper:
+#endif
+uint32_t reduGetMaxNumConcurrentWorkGroups(device_impl &Dev) {
+  uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
+  // TODO: The heuristics here require additional tuning for various devices
+  // and vendors. Also, it would be better to check vendor/generation/etc.
+  if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
+    NumThreads *= 8;
+  return NumThreads;
+}
 // Returns the estimated number of physical threads on the device associated
 // with the given queue.
+__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh) {
+  return reduGetMaxNumConcurrentWorkGroups(getSyclObjImpl(cgh)->get_device());
+}
+
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
     std::shared_ptr<sycl::detail::queue_impl> Queue) {
@@ -63,25 +79,14 @@ __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
   if (Queue == nullptr) {
     return 8;
   }
-  device Dev = Queue->get_device();
-  uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
-  // TODO: The heuristics here require additional tuning for various devices
-  // and vendors. Also, it would be better to check vendor/generation/etc.
-  if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
-    NumThreads *= 8;
-  return NumThreads;
+  return reduGetMaxNumConcurrentWorkGroups(Queue->getDeviceImpl());
 }
+#endif
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT size_t
-reduGetMaxWGSize(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
-                 size_t LocalMemBytesPerWorkItem) {
-#else
-__SYCL_EXPORT size_t
-reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
-                 size_t LocalMemBytesPerWorkItem) {
+// Inline this helper:
 #endif
-  device Dev = Queue->get_device();
+size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem) {
   size_t MaxWGSize = Dev.get_info<sycl::info::device::max_work_group_size>();
 
   size_t WGSizePerMem = MaxWGSize * 2;
@@ -118,26 +123,24 @@ reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
 
   return WGSize;
 }
+__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
+                                      size_t LocalMemBytesPerWorkItem) {
+  return reduGetMaxWGSize(getSyclObjImpl(cgh)->get_device(),
+                          LocalMemBytesPerWorkItem);
+}
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+__SYCL_EXPORT
+size_t reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
+                        size_t LocalMemBytesPerWorkItem) {
+  return reduGetMaxWGSize(Queue->getDeviceImpl(), LocalMemBytesPerWorkItem);
+}
+#endif
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT size_t reduGetPreferredWGSize(
-    const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem) {
-#else
-__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
-                                            size_t LocalMemBytesPerWorkItem) {
+// Inline this helper:
 #endif
-  // TODO: Graphs extension explicit API uses a handler with a null queue to
-  // process CGFs, in future we should have access to the device so we can
-  // correctly calculate this.
-  //
-  // The 32 value was chosen as the hardcoded value as it is the returned
-  // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on
-  // Intel HD Graphics devices used as a L0 backend during development.
-  if (Queue == nullptr) {
-    return 32;
-  }
-  device Dev = Queue->get_device();
-
+size_t reduGetPreferredWGSize(device_impl &Dev,
+                              size_t LocalMemBytesPerWorkItem) {
   // The maximum WGSize returned by CPU devices is very large and does not
   // help the reduction implementation: since all work associated with a
   // work-group is typically assigned to one CPU thread, selecting a large
@@ -174,8 +177,31 @@ __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
   }
 
   // Use the maximum work-group size otherwise.
-  return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem);
+  return reduGetMaxWGSize(Dev, LocalMemBytesPerWorkItem);
+}
+__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh,
+                                            size_t LocalMemBytesPerWorkItem) {
+  return reduGetPreferredWGSize(getSyclObjImpl(cgh)->get_device(),
+                                LocalMemBytesPerWorkItem);
 }
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
+                                            size_t LocalMemBytesPerWorkItem) {
+  // TODO: Graphs extension explicit API uses a handler with a null queue to
+  // process CGFs, in future we should have access to the device so we can
+  // correctly calculate this.
+  //
+  // The 32 value was chosen as the hardcoded value as it is the returned
+  // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on
+  // Intel HD Graphics devices used as a L0 backend during development.
+  if (Queue == nullptr) {
+    return 32;
+  }
+  device_impl &Dev = Queue->getDeviceImpl();
+
+  return reduGetPreferredWGSize(Dev, LocalMemBytesPerWorkItem);
+}
+#endif
 
 #ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 __SYCL_EXPORT void