Skip to content

Commit 50841bf

Browse files
Set arg local for all devices in Kernel
Related-To: NEO-5001
Signed-off-by: Mateusz Jablonski <[email protected]>
1 parent 2e346b5 commit 50841bf

File tree

2 files changed

+97
-76
lines changed

2 files changed

+97
-76
lines changed

opencl/source/kernel/kernel.cpp

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1271,47 +1271,57 @@ bool Kernel::requiresCoherency() {
12711271
return false;
12721272
}
12731273

1274-
cl_int Kernel::setArgLocal(uint32_t argIndex,
1274+
cl_int Kernel::setArgLocal(uint32_t argIndexIn,
12751275
size_t argSize,
12761276
const void *argVal) {
1277-
auto rootDeviceIndex = getDevice().getRootDeviceIndex();
1278-
auto crossThreadData = reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex));
1279-
auto &defaultKernelInfo = getDefaultKernelInfo();
1277+
std::bitset<64> isArgSet{};
1278+
storeKernelArg(argIndexIn, SLM_OBJ, nullptr, argVal, argSize);
12801279

1281-
storeKernelArg(argIndex, SLM_OBJ, nullptr, argVal, argSize);
1280+
for (auto &pClDevice : getDevices()) {
1281+
auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
1282+
if (isArgSet.test(rootDeviceIndex)) {
1283+
continue;
1284+
}
1285+
auto crossThreadData = reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex));
1286+
auto &kernelInfo = *kernelInfos[rootDeviceIndex];
1287+
auto &kernelDeviceInfo = kernelDeviceInfos[rootDeviceIndex];
12821288

1283-
kernelDeviceInfos[rootDeviceIndex].slmSizes[argIndex] = argSize;
1289+
uint32_t argIndex = argIndexIn;
12841290

1285-
// Extract our current slmOffset
1286-
auto slmOffset = *ptrOffset(crossThreadData,
1287-
defaultKernelInfo.kernelArgInfo[argIndex].kernelArgPatchInfoVector[0].crossthreadOffset);
1291+
kernelDeviceInfo.slmSizes[argIndex] = argSize;
12881292

1289-
// Add our size
1290-
slmOffset += static_cast<uint32_t>(argSize);
1293+
// Extract our current slmOffset
1294+
auto slmOffset = *ptrOffset(crossThreadData,
1295+
kernelInfo.kernelArgInfo[argIndex].kernelArgPatchInfoVector[0].crossthreadOffset);
12911296

1292-
// Update all slm offsets after this argIndex
1293-
++argIndex;
1294-
while (argIndex < kernelDeviceInfos[rootDeviceIndex].slmSizes.size()) {
1295-
const auto &kernelArgInfo = defaultKernelInfo.kernelArgInfo[argIndex];
1296-
auto slmAlignment = kernelArgInfo.slmAlignment;
1297+
// Add our size
1298+
slmOffset += static_cast<uint32_t>(argSize);
12971299

1298-
// If a local argument, alignment should be non-zero
1299-
if (slmAlignment) {
1300-
// Align to specified alignment
1301-
slmOffset = alignUp(slmOffset, slmAlignment);
1300+
// Update all slm offsets after this argIndex
1301+
++argIndex;
1302+
while (argIndex < kernelDeviceInfo.slmSizes.size()) {
1303+
const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
1304+
auto slmAlignment = kernelArgInfo.slmAlignment;
1305+
1306+
// If a local argument, alignment should be non-zero
1307+
if (slmAlignment) {
1308+
// Align to specified alignment
1309+
slmOffset = alignUp(slmOffset, slmAlignment);
1310+
1311+
// Patch our new offset into cross thread data
1312+
auto patchLocation = ptrOffset(crossThreadData,
1313+
kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset);
1314+
*patchLocation = slmOffset;
1315+
}
13021316

1303-
// Patch our new offset into cross thread data
1304-
auto patchLocation = ptrOffset(crossThreadData,
1305-
kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset);
1306-
*patchLocation = slmOffset;
1317+
slmOffset += static_cast<uint32_t>(kernelDeviceInfo.slmSizes[argIndex]);
1318+
++argIndex;
13071319
}
13081320

1309-
slmOffset += static_cast<uint32_t>(kernelDeviceInfos[rootDeviceIndex].slmSizes[argIndex]);
1310-
++argIndex;
1321+
kernelDeviceInfo.slmTotalSize = kernelInfo.workloadInfo.slmStaticSize + alignUp(slmOffset, KB);
1322+
isArgSet.set(rootDeviceIndex);
13111323
}
13121324

1313-
kernelDeviceInfos[rootDeviceIndex].slmTotalSize = defaultKernelInfo.workloadInfo.slmStaticSize + alignUp(slmOffset, KB);
1314-
13151325
return CL_SUCCESS;
13161326
}
13171327

opencl/test/unit_test/kernel/kernel_slm_arg_tests.cpp

Lines changed: 59 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
#include "shared/source/helpers/ptr_math.h"
1010

1111
#include "opencl/source/kernel/kernel.h"
12-
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
12+
#include "opencl/test/unit_test/fixtures/multi_root_device_fixture.h"
1313
#include "opencl/test/unit_test/mocks/mock_context.h"
1414
#include "opencl/test/unit_test/mocks/mock_kernel.h"
1515
#include "opencl/test/unit_test/mocks/mock_program.h"
@@ -19,86 +19,97 @@
1919

2020
using namespace NEO;
2121

22-
class KernelSlmArgTest : public Test<ClDeviceFixture> {
22+
class KernelSlmArgTest : public MultiRootDeviceWithSubDevicesFixture {
2323
protected:
2424
void SetUp() override {
25-
ClDeviceFixture::SetUp();
26-
pKernelInfo = std::make_unique<KernelInfo>();
27-
pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 1;
28-
29-
KernelArgPatchInfo kernelArgPatchInfo;
30-
31-
pKernelInfo->kernelArgInfo.resize(3);
32-
pKernelInfo->kernelArgInfo[2].kernelArgPatchInfoVector.push_back(kernelArgPatchInfo);
33-
pKernelInfo->kernelArgInfo[1].kernelArgPatchInfoVector.push_back(kernelArgPatchInfo);
34-
pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector.push_back(kernelArgPatchInfo);
35-
36-
pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset = 0x10;
37-
pKernelInfo->kernelArgInfo[0].slmAlignment = 0x1;
38-
pKernelInfo->kernelArgInfo[1].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20;
39-
pKernelInfo->kernelArgInfo[1].kernelArgPatchInfoVector[0].size = sizeof(void *);
40-
pKernelInfo->kernelArgInfo[2].kernelArgPatchInfoVector[0].crossthreadOffset = 0x30;
41-
pKernelInfo->kernelArgInfo[2].slmAlignment = 0x400;
42-
pKernelInfo->workloadInfo.slmStaticSize = 3 * KB;
43-
44-
program = std::make_unique<MockProgram>(toClDeviceVector(*pClDevice));
45-
pKernel = new MockKernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
25+
MultiRootDeviceWithSubDevicesFixture::SetUp();
26+
27+
program = std::make_unique<MockProgram>(context.get(), false, context->getDevices());
28+
KernelInfoContainer kernelInfos;
29+
kernelInfos.resize(3);
30+
for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
31+
32+
pKernelInfo[rootDeviceIndex] = std::make_unique<KernelInfo>();
33+
pKernelInfo[rootDeviceIndex]->kernelDescriptor.kernelAttributes.simdSize = 1;
34+
35+
KernelArgPatchInfo kernelArgPatchInfo;
36+
37+
pKernelInfo[rootDeviceIndex]->kernelArgInfo.resize(3);
38+
pKernelInfo[rootDeviceIndex]->kernelArgInfo[1].kernelArgPatchInfoVector.push_back(kernelArgPatchInfo);
39+
pKernelInfo[rootDeviceIndex]->kernelArgInfo[2].kernelArgPatchInfoVector.push_back(kernelArgPatchInfo);
40+
pKernelInfo[rootDeviceIndex]->kernelArgInfo[0].kernelArgPatchInfoVector.push_back(kernelArgPatchInfo);
41+
42+
pKernelInfo[rootDeviceIndex]->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset = 0x10;
43+
pKernelInfo[rootDeviceIndex]->kernelArgInfo[0].slmAlignment = 0x1;
44+
pKernelInfo[rootDeviceIndex]->kernelArgInfo[0].metadata.addressQualifier = KernelArgMetadata::AddrLocal;
45+
pKernelInfo[rootDeviceIndex]->kernelArgInfo[1].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20;
46+
pKernelInfo[rootDeviceIndex]->kernelArgInfo[1].kernelArgPatchInfoVector[0].size = sizeof(void *);
47+
pKernelInfo[rootDeviceIndex]->kernelArgInfo[2].kernelArgPatchInfoVector[0].crossthreadOffset = 0x30;
48+
pKernelInfo[rootDeviceIndex]->kernelArgInfo[2].slmAlignment = 0x400;
49+
pKernelInfo[rootDeviceIndex]->kernelArgInfo[2].metadata.addressQualifier = KernelArgMetadata::AddrLocal;
50+
pKernelInfo[rootDeviceIndex]->workloadInfo.slmStaticSize = 3 * KB;
51+
52+
kernelInfos[rootDeviceIndex] = pKernelInfo[rootDeviceIndex].get();
53+
}
54+
pKernel = new MockKernel(program.get(), kernelInfos);
4655
ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
4756

48-
pKernel->setKernelArgHandler(0, &Kernel::setArgLocal);
49-
pKernel->setKernelArgHandler(1, &Kernel::setArgImmediate);
50-
pKernel->setKernelArgHandler(2, &Kernel::setArgLocal);
51-
52-
uint32_t crossThreadData[0x40] = {};
53-
crossThreadData[0x20 / sizeof(uint32_t)] = 0x12344321;
54-
pKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData));
57+
for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
58+
crossThreadData[rootDeviceIndex][0x20 / sizeof(uint32_t)] = 0x12344321;
59+
pKernel->setCrossThreadDataForRootDeviceIndex(rootDeviceIndex, &crossThreadData[rootDeviceIndex], sizeof(crossThreadData[rootDeviceIndex]));
60+
}
5561
}
5662

5763
void TearDown() override {
5864
delete pKernel;
5965

60-
ClDeviceFixture::TearDown();
66+
MultiRootDeviceWithSubDevicesFixture::TearDown();
6167
}
6268

6369
cl_int retVal = CL_SUCCESS;
6470
std::unique_ptr<MockProgram> program;
6571
MockKernel *pKernel = nullptr;
66-
std::unique_ptr<KernelInfo> pKernelInfo;
72+
std::unique_ptr<KernelInfo> pKernelInfo[3];
6773

6874
static const size_t slmSize0 = 0x200;
6975
static const size_t slmSize2 = 0x30;
76+
uint32_t crossThreadData[3][0x40]{};
7077
};
7178

7279
TEST_F(KernelSlmArgTest, WhenSettingSizeThenAlignmentOfHigherSlmArgsIsUpdated) {
7380
pKernel->setArg(0, slmSize0, nullptr);
7481
pKernel->setArg(2, slmSize2, nullptr);
7582

76-
auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
77-
auto slmOffset = ptrOffset(crossThreadData, 0x10);
78-
EXPECT_EQ(0u, *slmOffset);
83+
for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
84+
auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
85+
auto slmOffset = ptrOffset(crossThreadData, 0x10);
86+
EXPECT_EQ(0u, *slmOffset);
7987

80-
slmOffset = ptrOffset(crossThreadData, 0x20);
81-
EXPECT_EQ(0x12344321u, *slmOffset);
88+
slmOffset = ptrOffset(crossThreadData, 0x20);
89+
EXPECT_EQ(0x12344321u, *slmOffset);
8290

83-
slmOffset = ptrOffset(crossThreadData, 0x30);
84-
EXPECT_EQ(0x400u, *slmOffset);
91+
slmOffset = ptrOffset(crossThreadData, 0x30);
92+
EXPECT_EQ(0x400u, *slmOffset);
8593

86-
EXPECT_EQ(5 * KB, pKernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize);
94+
EXPECT_EQ(5 * KB, pKernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize);
95+
}
8796
}
8897

8998
TEST_F(KernelSlmArgTest, GivenReverseOrderWhenSettingSizeThenAlignmentOfHigherSlmArgsIsUpdated) {
9099
pKernel->setArg(2, slmSize2, nullptr);
91100
pKernel->setArg(0, slmSize0, nullptr);
92101

93-
auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
94-
auto slmOffset = ptrOffset(crossThreadData, 0x10);
95-
EXPECT_EQ(0u, *slmOffset);
102+
for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
103+
auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
104+
auto slmOffset = ptrOffset(crossThreadData, 0x10);
105+
EXPECT_EQ(0u, *slmOffset);
96106

97-
slmOffset = ptrOffset(crossThreadData, 0x20);
98-
EXPECT_EQ(0x12344321u, *slmOffset);
107+
slmOffset = ptrOffset(crossThreadData, 0x20);
108+
EXPECT_EQ(0x12344321u, *slmOffset);
99109

100-
slmOffset = ptrOffset(crossThreadData, 0x30);
101-
EXPECT_EQ(0x400u, *slmOffset);
110+
slmOffset = ptrOffset(crossThreadData, 0x30);
111+
EXPECT_EQ(0x400u, *slmOffset);
102112

103-
EXPECT_EQ(5 * KB, pKernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize);
113+
EXPECT_EQ(5 * KB, pKernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize);
114+
}
104115
}

0 commit comments

Comments
 (0)