Skip to content

Commit 7f729b7

Browse files
Detect GPU hang in clWaitForEvents
This change:
- moves NEO::WaitStatus to a separate file
- enables detection of GPU hang in clWaitForEvents
- adjusts most of the blocking calls in CommandStreamReceiver to return WaitStatus
- adds ULTs to cover the new code

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <[email protected]>
1 parent f2e1361 commit 7f729b7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+486
-94
lines changed

level_zero/core/source/cmdqueue/cmdqueue.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "shared/source/command_stream/command_stream_receiver.h"
99
#include "shared/source/command_stream/csr_definitions.h"
1010
#include "shared/source/command_stream/linear_stream.h"
11+
#include "shared/source/command_stream/wait_status.h"
1112
#include "shared/source/debug_settings/debug_settings_manager.h"
1213
#include "shared/source/memory_manager/memory_manager.h"
1314

level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
*/
77

88
#include "shared/source/command_stream/scratch_space_controller_xehp_and_later.h"
9+
#include "shared/source/command_stream/wait_status.h"
910
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
1011
#include "shared/test/common/libult/ult_command_stream_receiver.h"
1112
#include "shared/test/common/mocks/mock_command_stream_receiver.h"

opencl/source/command_queue/command_queue.cpp

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -229,19 +229,25 @@ bool CommandQueue::isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState
229229
return false;
230230
}
231231

232-
void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) {
232+
WaitStatus CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) {
233233
WAIT_ENTER()
234234

235+
WaitStatus waitStatus{WaitStatus::Ready};
236+
235237
DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", gpgpuTaskCountToWait);
236238
DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag());
237239

238240
if (!skipWait) {
239241
bool forcePowerSavingMode = this->throttle == QueueThrottle::LOW;
240242

241-
getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait,
242-
flushStampToWait,
243-
useQuickKmdSleep,
244-
forcePowerSavingMode);
243+
waitStatus = getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait,
244+
flushStampToWait,
245+
useQuickKmdSleep,
246+
forcePowerSavingMode);
247+
if (waitStatus == WaitStatus::GpuHang) {
248+
return WaitStatus::GpuHang;
249+
}
250+
245251
DEBUG_BREAK_IF(getHwTag() < gpgpuTaskCountToWait);
246252

247253
if (gtpinIsGTPinInitialized()) {
@@ -251,17 +257,25 @@ void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEn
251257

252258
for (const CopyEngineState &copyEngine : copyEnginesToWait) {
253259
auto bcsCsr = getBcsCommandStreamReceiver(copyEngine.engineType);
254-
bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false);
255-
bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount);
256-
}
257260

258-
if (cleanTemporaryAllocationList) {
259-
getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait);
260-
} else {
261-
getGpgpuCommandStreamReceiver().waitForTaskCount(gpgpuTaskCountToWait);
261+
waitStatus = bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false);
262+
if (waitStatus == WaitStatus::GpuHang) {
263+
return WaitStatus::GpuHang;
264+
}
265+
266+
waitStatus = bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount);
267+
if (waitStatus == WaitStatus::GpuHang) {
268+
return WaitStatus::GpuHang;
269+
}
262270
}
263271

272+
waitStatus = cleanTemporaryAllocationList
273+
? getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait)
274+
: getGpgpuCommandStreamReceiver().waitForTaskCount(gpgpuTaskCountToWait);
275+
264276
WAIT_LEAVE()
277+
278+
return waitStatus;
265279
}
266280

267281
bool CommandQueue::isQueueBlocked() {

opencl/source/command_queue/command_queue.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018-2021 Intel Corporation
2+
* Copyright (C) 2018-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -209,9 +209,9 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
209209

210210
MOCKABLE_VIRTUAL bool isQueueBlocked();
211211

212-
MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait);
213-
MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
214-
this->waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, true, false);
212+
MOCKABLE_VIRTUAL WaitStatus waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait);
213+
MOCKABLE_VIRTUAL WaitStatus waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
214+
return this->waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, true, false);
215215
}
216216
MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList);
217217
MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler) {
@@ -223,7 +223,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
223223
const cl_event *eventWaitList);
224224

225225
MOCKABLE_VIRTUAL CommandStreamReceiver &getGpgpuCommandStreamReceiver() const;
226-
CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const;
226+
MOCKABLE_VIRTUAL CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const;
227227
CommandStreamReceiver *getBcsForAuxTranslation() const;
228228
MOCKABLE_VIRTUAL CommandStreamReceiver &selectCsrForBuiltinOperation(const CsrSelectionArgs &args) const;
229229
Device &getDevice() const noexcept;

opencl/source/event/event.cpp

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
#include "opencl/source/helpers/hardware_commands_helper.h"
2828
#include "opencl/source/mem_obj/mem_obj.h"
2929

30+
#include <algorithm>
31+
3032
namespace NEO {
3133

3234
Event::Event(
@@ -417,23 +419,26 @@ void Event::getBoundaryTimestampValues(TimestampPacketContainer *timestampContai
417419
}
418420
}
419421

420-
inline bool Event::wait(bool blocking, bool useQuickKmdSleep) {
422+
inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) {
421423
while (this->taskCount == CompletionStamp::notReady) {
422424
if (blocking == false) {
423-
return false;
425+
return WaitStatus::NotReady;
424426
}
425427
}
426428

427429
Range<CopyEngineState> states{&bcsState, bcsState.isValid() ? 1u : 0u};
428-
cmdQueue->waitUntilComplete(taskCount.load(), states, flushStamp->peekStamp(), useQuickKmdSleep);
430+
const auto waitStatus = cmdQueue->waitUntilComplete(taskCount.load(), states, flushStamp->peekStamp(), useQuickKmdSleep);
431+
if (waitStatus == WaitStatus::GpuHang) {
432+
return WaitStatus::GpuHang;
433+
}
429434
updateExecutionStatus();
430435

431436
DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);
432437

433438
auto *allocationStorage = cmdQueue->getGpgpuCommandStreamReceiver().getInternalAllocationStorage();
434439
allocationStorage->cleanAllocationList(this->taskCount, TEMPORARY_ALLOCATION);
435440

436-
return true;
441+
return WaitStatus::Ready;
437442
}
438443

439444
void Event::updateExecutionStatus() {
@@ -630,16 +635,23 @@ cl_int Event::waitForEvents(cl_uint numEvents,
630635
// pointers to workerLists - for fast swap operations
631636
WorkerListT *currentlyPendingEvents = &workerList1;
632637
WorkerListT *pendingEventsLeft = &workerList2;
638+
WaitStatus eventWaitStatus = WaitStatus::NotReady;
633639

634640
while (currentlyPendingEvents->size() > 0) {
635-
for (auto &e : *currentlyPendingEvents) {
636-
Event *event = castToObjectOrAbort<Event>(e);
641+
for (auto current = currentlyPendingEvents->begin(), end = currentlyPendingEvents->end(); current != end; ++current) {
642+
Event *event = castToObjectOrAbort<Event>(*current);
637643
if (event->peekExecutionStatus() < CL_COMPLETE) {
638644
return CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
639645
}
640646

641-
if (event->wait(false, false) == false) {
647+
eventWaitStatus = event->wait(false, false);
648+
if (eventWaitStatus == WaitStatus::NotReady) {
642649
pendingEventsLeft->push_back(event);
650+
} else if (eventWaitStatus == WaitStatus::GpuHang) {
651+
setExecutionStatusToAbortedDueToGpuHang(pendingEventsLeft->begin(), pendingEventsLeft->end());
652+
setExecutionStatusToAbortedDueToGpuHang(current, end);
653+
654+
return CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
643655
}
644656
}
645657

@@ -650,6 +662,13 @@ cl_int Event::waitForEvents(cl_uint numEvents,
650662
return CL_SUCCESS;
651663
}
652664

665+
inline void Event::setExecutionStatusToAbortedDueToGpuHang(cl_event *first, cl_event *last) {
666+
std::for_each(first, last, [](cl_event &e) {
667+
Event *event = castToObjectOrAbort<Event>(e);
668+
event->transitionExecutionStatus(executionAbortedDueToGpuHang);
669+
});
670+
}
671+
653672
uint32_t Event::getTaskLevel() {
654673
return taskLevel;
655674
}

opencl/source/event/event.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
/*
2-
* Copyright (C) 2018-2021 Intel Corporation
2+
* Copyright (C) 2018-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
66
*/
77

88
#pragma once
9+
#include "shared/source/command_stream/wait_status.h"
910
#include "shared/source/helpers/flush_stamp.h"
1011
#include "shared/source/os_interface/os_time.h"
1112
#include "shared/source/os_interface/performance_counters.h"
@@ -80,6 +81,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
8081
};
8182

8283
static const cl_ulong objectMagic = 0x80134213A43C981ALL;
84+
static constexpr cl_int executionAbortedDueToGpuHang = -777;
8385

8486
Event(CommandQueue *cmdQueue, cl_command_type cmdType,
8587
uint32_t taskLevel, uint32_t taskCount);
@@ -206,9 +208,8 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
206208
// adds a callback (execution state change listener) to this event's list of callbacks
207209
void addCallback(Callback::ClbFuncT fn, cl_int type, void *data);
208210

209-
//returns true on success
210-
//if(blocking==false), will return with false instead of blocking while waiting for completion
211-
virtual bool wait(bool blocking, bool useQuickKmdSleep);
211+
//if(blocking==false), will return with WaitStatus::NotReady instead of blocking while waiting for completion
212+
virtual WaitStatus wait(bool blocking, bool useQuickKmdSleep);
212213

213214
bool isUserEvent() const {
214215
return (CL_COMMAND_USER == cmdType);
@@ -347,6 +348,8 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
347348
void unblockEventsBlockedByThis(int32_t transitionStatus);
348349
void submitCommand(bool abortBlockedTasks);
349350

351+
static void setExecutionStatusToAbortedDueToGpuHang(cl_event *first, cl_event *last);
352+
350353
bool currentCmdQVirtualEvent;
351354
std::atomic<Command *> cmdToSubmit;
352355
std::atomic<Command *> submittedCmd;

opencl/source/event/user_event.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018-2021 Intel Corporation
2+
* Copyright (C) 2018-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -24,13 +24,13 @@ void UserEvent::updateExecutionStatus() {
2424
return;
2525
}
2626

27-
bool UserEvent::wait(bool blocking, bool useQuickKmdSleep) {
27+
WaitStatus UserEvent::wait(bool blocking, bool useQuickKmdSleep) {
2828
while (updateStatusAndCheckCompletion() == false) {
2929
if (blocking == false) {
30-
return false;
30+
return WaitStatus::NotReady;
3131
}
3232
}
33-
return true;
33+
return WaitStatus::Ready;
3434
}
3535

3636
uint32_t UserEvent::getTaskLevel() {
@@ -53,16 +53,15 @@ VirtualEvent::VirtualEvent(CommandQueue *cmdQ, Context *ctx)
5353
}
5454

5555
void VirtualEvent::updateExecutionStatus() {
56-
;
5756
}
5857

59-
bool VirtualEvent::wait(bool blocking, bool useQuickKmdSleep) {
58+
WaitStatus VirtualEvent::wait(bool blocking, bool useQuickKmdSleep) {
6059
while (updateStatusAndCheckCompletion() == false) {
6160
if (blocking == false) {
62-
return false;
61+
return WaitStatus::NotReady;
6362
}
6463
}
65-
return true;
64+
return WaitStatus::Ready;
6665
}
6766

6867
uint32_t VirtualEvent::getTaskLevel() {

opencl/source/event/user_event.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018-2021 Intel Corporation
2+
* Copyright (C) 2018-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -18,7 +18,7 @@ class UserEvent : public Event {
1818

1919
~UserEvent() override = default;
2020

21-
bool wait(bool blocking, bool useQuickKmdSleep) override;
21+
WaitStatus wait(bool blocking, bool useQuickKmdSleep) override;
2222

2323
void updateExecutionStatus() override;
2424

@@ -33,7 +33,7 @@ class VirtualEvent : public Event {
3333

3434
~VirtualEvent() override = default;
3535

36-
bool wait(bool blocking, bool useQuickKmdSleep) override;
36+
WaitStatus wait(bool blocking, bool useQuickKmdSleep) override;
3737

3838
bool setStatus(cl_int status) override;
3939

opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
/*
2-
* Copyright (C) 2018-2021 Intel Corporation
2+
* Copyright (C) 2018-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
66
*/
77

8+
#include "shared/source/command_stream/wait_status.h"
89
#include "shared/source/helpers/array_count.h"
910

1011
#include "opencl/source/command_queue/command_queue.h"
@@ -60,9 +61,9 @@ TEST_F(clEnqueueWaitForEventsTests, GivenProperParamsWhenClEnqueueWaitForEventsI
6061
MyEvent(Context *context)
6162
: UserEvent(context) {
6263
}
63-
bool wait(bool blocking, bool quickKmdSleep) override {
64+
WaitStatus wait(bool blocking, bool quickKmdSleep) override {
6465
wasWaitCalled = true;
65-
return true;
66+
return WaitStatus::Ready;
6667
};
6768
bool wasWaitCalled = false;
6869
};

0 commit comments

Comments
 (0)