diff --git a/sycl/test-e2e/Basic/multi_context_wait.cpp b/sycl/test-e2e/Basic/multi_context_wait.cpp new file mode 100644 index 0000000000000..96de68897d7b1 --- /dev/null +++ b/sycl/test-e2e/Basic/multi_context_wait.cpp @@ -0,0 +1,100 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include +#include + +#include +#include + +std::vector submit_dependencies(sycl::queue q1, sycl::queue q2, + int *mem1, int *mem2) { + int delay_ops = 1024 * 1024; + auto delay = [=] { + volatile int value = delay_ops; + while (--value) + ; + }; + + auto ev1 = + q1.parallel_for(sycl::range<1>(1024), [=]([[maybe_unused]] auto u) { + delay(); + mem1[u.get_id()] = 1; + }); + auto ev2 = + q2.parallel_for(sycl::range<1>(1024), [=]([[maybe_unused]] auto u) { + delay(); + mem2[u.get_id()] = 2; + }); + + return {ev1, ev2}; +} + +void test_host_task() { + sycl::context c1{}; + sycl::context c2{}; + + sycl::queue q1(c1, sycl::default_selector_v); + sycl::queue q2(c2, sycl::default_selector_v); + + auto mem1 = sycl::malloc_host(1024, q1); + auto mem2 = sycl::malloc_host(1024, q2); + + auto events = submit_dependencies(q1, q2, mem1, mem2); + + q2.submit([&](sycl::handler &cgh) { + cgh.depends_on(events[0]); + cgh.depends_on(events[1]); + cgh.host_task([=]() { + for (int i = 0; i < 1024; i++) { + assert(mem1[i] == 1); + assert(mem2[i] == 2); + } + }); + }); + + q2.wait(); + + sycl::free(mem1, c1); + sycl::free(mem2, c2); +} + +void test_kernel() { + sycl::context c1{}; + sycl::context c2{}; + + sycl::queue q1(c1, sycl::default_selector_v); + sycl::queue q2(c2, sycl::default_selector_v); + + auto mem1 = sycl::malloc_device(1024, q1); + auto mem2 = sycl::malloc_device(1024, q2); + + auto events = submit_dependencies(q1, q2, mem1, mem2); + + q1.submit([&](sycl::handler &cgh) { + cgh.depends_on(events[0]); + cgh.depends_on(events[1]); + cgh.parallel_for(sycl::range<1>(1024), + [=](auto item) { assert(mem1[item.get_id()] == 1); }); + }); + + q2.submit([&](sycl::handler &cgh) { + cgh.depends_on(events[0]); + cgh.depends_on(events[1]); + cgh.parallel_for(sycl::range<1>(1024), + [=](auto item) { assert(mem2[item.get_id()] == 2); }); + }); + + q1.wait(); + q2.wait(); + + sycl::free(mem1, c1); + sycl::free(mem2, c2); +} + +int main() { + test_host_task(); + test_kernel(); + + return 0; +} diff --git a/unified-runtime/source/adapters/opencl/event.cpp b/unified-runtime/source/adapters/opencl/event.cpp index dc017ee3947f2..bb13f297b60bf 100644 --- a/unified-runtime/source/adapters/opencl/event.cpp +++ b/unified-runtime/source/adapters/opencl/event.cpp @@ -149,12 +149,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { UR_APIEXPORT ur_result_t UR_APICALL urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { - std::vector CLEvents(numEvents); + ur_context_handle_t hContext = phEventWaitList[0]->Context; + std::vector CLEvents; + CLEvents.reserve(numEvents); + + // clWaitForEvents can only be called on events from the same context. + // If the events are from different contexts, we need to wait for each + // set of events separately. for (uint32_t i = 0; i < numEvents; i++) { - CLEvents[i] = phEventWaitList[i]->CLEvent; + if (phEventWaitList[i]->Context != hContext) { + CL_RETURN_ON_FAILURE(clWaitForEvents(CLEvents.size(), CLEvents.data())); + CLEvents.clear(); + } + + CLEvents.push_back(phEventWaitList[i]->CLEvent); + hContext = phEventWaitList[i]->Context; + } + if (CLEvents.size()) { + CL_RETURN_ON_FAILURE(clWaitForEvents(CLEvents.size(), CLEvents.data())); } - cl_int RetErr = clWaitForEvents(numEvents, CLEvents.data()); - CL_RETURN_ON_FAILURE(RetErr); return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/test/conformance/event/urEventWait.cpp b/unified-runtime/test/conformance/event/urEventWait.cpp index 74a5ae5eff116..769faf3206675 100644 --- a/unified-runtime/test/conformance/event/urEventWait.cpp +++ b/unified-runtime/test/conformance/event/urEventWait.cpp @@ -7,56 +7,78 @@ #include #include -struct urEventWaitTest : uur::urQueueTest { +struct urEventWaitTest : uur::urDeviceTest { void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); - ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_ONLY, size, - nullptr, &src_buffer)); - ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_WRITE_ONLY, size, - nullptr, &dst_buffer)); - input.assign(count, 42); - ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue, src_buffer, false, 0, size, - input.data(), 0, nullptr, &event)); - ASSERT_SUCCESS(urEventWait(1, &event)); + UUR_RETURN_ON_FATAL_FAILURE(urDeviceTest::SetUp()); + + for (size_t i = 0; i < maxNumContexts; ++i) { + ur_context_handle_t context = nullptr; + ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context)); + ASSERT_NE(context, nullptr); + contexts.push_back(context); + + ur_queue_handle_t queue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, 0, &queue)); + ASSERT_NE(queue, nullptr); + queues.push_back(queue); + + src_buffer.emplace_back(); + dst_buffer.emplace_back(); + + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size, + nullptr, &src_buffer[i])); + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size, + nullptr, &dst_buffer[i])); + input.emplace_back(); + input[i].assign(count, uint32_t(99 + i)); + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue, src_buffer[i], true, 0, + size, input[i].data(), 0, nullptr, + nullptr)); + } } void TearDown() override { - if (src_buffer) { - EXPECT_SUCCESS(urMemRelease(src_buffer)); + for (size_t i = 0; i < src_buffer.size(); ++i) { + EXPECT_SUCCESS(urMemRelease(src_buffer[i])); + EXPECT_SUCCESS(urMemRelease(dst_buffer[i])); } - if (dst_buffer) { - EXPECT_SUCCESS(urMemRelease(dst_buffer)); + for (size_t i = 0; i < queues.size(); ++i) { + EXPECT_SUCCESS(urQueueRelease(queues[i])); } - if (event) { - EXPECT_SUCCESS(urEventRelease(event)); + for (size_t i = 0; i < contexts.size(); ++i) { + EXPECT_SUCCESS(urContextRelease(contexts[i])); } - urQueueTest::TearDown(); + UUR_RETURN_ON_FATAL_FAILURE(urDeviceTest::TearDown()); } + const size_t maxNumContexts = 5; + std::vector contexts; + std::vector queues; + std::vector src_buffer; + std::vector dst_buffer; const size_t count = 1024; const size_t size = sizeof(uint32_t) * count; - ur_mem_handle_t src_buffer = nullptr; - ur_mem_handle_t dst_buffer = nullptr; - ur_event_handle_t event = nullptr; - std::vector input; + std::vector> input; }; + UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEventWaitTest); TEST_P(urEventWaitTest, Success) { UUR_KNOWN_FAILURE_ON(uur::NativeCPU{}); ur_event_handle_t event1 = nullptr; - ASSERT_SUCCESS(urEnqueueMemBufferCopy(queue, src_buffer, dst_buffer, 0, 0, - size, 0, nullptr, &event1)); + ASSERT_SUCCESS(urEnqueueMemBufferCopy(queues[0], src_buffer[0], dst_buffer[0], + 0, 0, size, 0, nullptr, &event1)); std::vector output(count, 1); ur_event_handle_t event2 = nullptr; - ASSERT_SUCCESS(urEnqueueMemBufferRead(queue, dst_buffer, false, 0, size, - output.data(), 0, nullptr, &event2)); + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], dst_buffer[0], false, 0, + size, output.data(), 0, nullptr, + &event2)); std::vector events{event1, event2}; - EXPECT_SUCCESS(urQueueFlush(queue)); + EXPECT_SUCCESS(urQueueFlush(queues[0])); ASSERT_SUCCESS( urEventWait(static_cast(events.size()), events.data())); - ASSERT_EQ(input, output); + ASSERT_EQ(input[0], output); EXPECT_SUCCESS(urEventRelease(event1)); EXPECT_SUCCESS(urEventRelease(event2)); @@ -75,3 +97,71 @@ TEST_P(urEventWaitNegativeTest, InvalidNullPointerEventList) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, urEventWait(1, nullptr)); } + +TEST_P(urEventWaitTest, WaitWithMultipleContexts) { + UUR_KNOWN_FAILURE_ON(uur::NativeCPU{}); + + for (size_t i = 0; i < maxNumContexts; i++) { + ASSERT_SUCCESS(urEnqueueMemBufferCopy(queues[i], src_buffer[i], + dst_buffer[i], 0, 0, size, 0, nullptr, + nullptr)); + } + + std::vector events; + std::vector> output; + for (size_t i = 0; i < maxNumContexts; i++) { + output.emplace_back(count, 1); + events.emplace_back(); + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[i], dst_buffer[i], false, 0, + size, output[i].data(), 0, nullptr, + &events.back())); + } + + ASSERT_SUCCESS( + urEventWait(static_cast(events.size()), events.data())); + + for (size_t i = 0; i < maxNumContexts; i++) { + ASSERT_EQ(input[i], output[i]); + } + + for (auto &event : events) { + EXPECT_SUCCESS(urEventRelease(event)); + } +} + +TEST_P(urEventWaitTest, WithCrossContextDependencies) { + // OpenCL: https://github.com/intel/llvm/issues/18765 + UUR_KNOWN_FAILURE_ON(uur::NativeCPU{}, uur::OpenCL{}); + + std::vector output(count, 1); + + std::vector events; + for (size_t i = 0; i < maxNumContexts - 1; i++) { + auto waitEvent = events.size() ? &events.back() : nullptr; + ur_event_handle_t event = nullptr; + ASSERT_SUCCESS( + urEnqueueMemBufferCopy(queues[i], src_buffer[i], src_buffer[i + 1], 0, + 0, size, waitEvent ? 1 : 0, waitEvent, &event)); + events.push_back(event); + } + + ur_event_handle_t event1 = nullptr; + ASSERT_SUCCESS(urEnqueueMemBufferCopy(queues.back(), src_buffer.back(), + dst_buffer.back(), 0, 0, size, 1, + &events.back(), &event1)); + + ur_event_handle_t event2 = nullptr; + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues.back(), dst_buffer.back(), false, + 0, size, output.data(), 0, nullptr, + &event2)); + + events.push_back(event1); + events.push_back(event2); + + ASSERT_SUCCESS( + urEventWait(static_cast(events.size()), events.data())); + ASSERT_EQ(input.front(), output); + + EXPECT_SUCCESS(urEventRelease(event1)); + EXPECT_SUCCESS(urEventRelease(event2)); +}