Skip to content

Commit 898cbbb

Browse files
[SYCL] [Graph] Reduce number of queue_impl creations (#18223)
Previously, a new queue_impl was created every time a node was enqueued or updated. Instead, cache a single queue_impl as a private member of `exec_graph_impl` and reuse it.
1 parent 34c6e0d commit 898cbbb

File tree

2 files changed

+15
-25
lines changed

2 files changed

+15
-25
lines changed

sycl/source/detail/graph_impl.cpp

+11-21
Original file line numberDiff line numberDiff line change
@@ -833,17 +833,9 @@ exec_graph_impl::enqueueNodeDirect(sycl::context Ctx,
833833
return NewSyncPoint;
834834
}
835835

836-
ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNode(
837-
sycl::context Ctx, std::shared_ptr<sycl::detail::device_impl> DeviceImpl,
838-
ur_exp_command_buffer_handle_t CommandBuffer,
839-
std::shared_ptr<node_impl> Node) {
840-
841-
// Queue which will be used for allocation operations for accessors.
842-
// Will also be used in native commands to return to the user in
843-
// `interop_handler::get_native_queue()` calls.
844-
auto AllocaQueue = std::make_shared<sycl::detail::queue_impl>(
845-
DeviceImpl, sycl::detail::getSyclObjImpl(Ctx), sycl::async_handler{},
846-
sycl::property_list{});
836+
ur_exp_command_buffer_sync_point_t
837+
exec_graph_impl::enqueueNode(ur_exp_command_buffer_handle_t CommandBuffer,
838+
std::shared_ptr<node_impl> Node) {
847839

848840
std::vector<ur_exp_command_buffer_sync_point_t> Deps;
849841
for (auto &N : Node->MPredecessors) {
@@ -852,8 +844,8 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNode(
852844

853845
sycl::detail::EventImplPtr Event =
854846
sycl::detail::Scheduler::getInstance().addCG(
855-
Node->getCGCopy(), AllocaQueue, /*EventNeeded=*/true, CommandBuffer,
856-
Deps);
847+
Node->getCGCopy(), MQueueImpl,
848+
/*EventNeeded=*/true, CommandBuffer, Deps);
857849

858850
if (MIsUpdatable) {
859851
MCommandMap[Node] = Event->getCommandBufferCommand();
@@ -898,8 +890,7 @@ void exec_graph_impl::createCommandBuffers(
898890
MSyncPoints[Node] =
899891
enqueueNodeDirect(MContext, DeviceImpl, OutCommandBuffer, Node);
900892
} else {
901-
MSyncPoints[Node] =
902-
enqueueNode(MContext, DeviceImpl, OutCommandBuffer, Node);
893+
MSyncPoints[Node] = enqueueNode(OutCommandBuffer, Node);
903894
}
904895

905896
// Append Node requirements to overall graph requirements
@@ -926,6 +917,10 @@ exec_graph_impl::exec_graph_impl(sycl::context Context,
926917
const std::shared_ptr<graph_impl> &GraphImpl,
927918
const property_list &PropList)
928919
: MSchedule(), MGraphImpl(GraphImpl), MSyncPoints(),
920+
MQueueImpl(std::make_shared<sycl::detail::queue_impl>(
921+
sycl::detail::getSyclObjImpl(GraphImpl->getDevice()),
922+
sycl::detail::getSyclObjImpl(Context), sycl::async_handler{},
923+
sycl::property_list{})),
929924
MDevice(GraphImpl->getDevice()), MContext(Context), MRequirements(),
930925
MExecutionEvents(),
931926
MIsUpdatable(PropList.has_property<property::graph::updatable>()),
@@ -1369,16 +1364,11 @@ void exec_graph_impl::update(
13691364
++It;
13701365
}
13711366

1372-
auto AllocaQueue = std::make_shared<sycl::detail::queue_impl>(
1373-
sycl::detail::getSyclObjImpl(MGraphImpl->getDevice()),
1374-
sycl::detail::getSyclObjImpl(MGraphImpl->getContext()),
1375-
sycl::async_handler{}, sycl::property_list{});
1376-
13771367
// Track the event for the update command since execution may be blocked by
13781368
// other scheduler commands
13791369
auto UpdateEvent =
13801370
sycl::detail::Scheduler::getInstance().addCommandGraphUpdate(
1381-
this, Nodes, AllocaQueue, std::move(UpdateRequirements),
1371+
this, Nodes, MQueueImpl, std::move(UpdateRequirements),
13821372
MExecutionEvents);
13831373

13841374
MExecutionEvents.push_back(UpdateEvent);

sycl/source/detail/graph_impl.hpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#pragma once
1010

11+
#include "detail/queue_impl.hpp"
1112
#include <sycl/detail/cg_types.hpp>
1213
#include <sycl/detail/os_util.hpp>
1314
#include <sycl/ext/oneapi/experimental/graph.hpp>
@@ -1405,14 +1406,11 @@ class exec_graph_impl {
14051406
private:
14061407
/// Create a command-group for the node and add it to command-buffer by going
14071408
/// through the scheduler.
1408-
/// @param Ctx Context to use.
1409-
/// @param DeviceImpl Device associated with the enqueue.
14101409
/// @param CommandBuffer Command-buffer to add node to as a command.
14111410
/// @param Node The node being enqueued.
14121411
/// @return UR sync point created for this node in the command-buffer.
14131412
ur_exp_command_buffer_sync_point_t
1414-
enqueueNode(sycl::context Ctx, sycl::detail::DeviceImplPtr DeviceImpl,
1415-
ur_exp_command_buffer_handle_t CommandBuffer,
1413+
enqueueNode(ur_exp_command_buffer_handle_t CommandBuffer,
14161414
std::shared_ptr<node_impl> Node);
14171415

14181416
/// Enqueue a node directly to the command-buffer without going through the
@@ -1510,6 +1508,8 @@ class exec_graph_impl {
15101508
std::unordered_map<std::shared_ptr<node_impl>,
15111509
ur_exp_command_buffer_sync_point_t>
15121510
MSyncPoints;
1511+
/// SYCL queue_impl associated with this graph; created once in the constructor and reused when enqueueing and updating nodes.
1512+
std::shared_ptr<sycl::detail::queue_impl> MQueueImpl;
15131513
/// Map of nodes in the exec graph to the partition number to which they
15141514
/// belong.
15151515
std::unordered_map<std::shared_ptr<node_impl>, int> MPartitionNodes;

0 commit comments

Comments
 (0)