Skip to content

Commit a558763

Browse files
committed
test(kernel): 添加 attention 单测
Signed-off-by: YdrMaster <[email protected]>
1 parent 6e66bf2 commit a558763

File tree

3 files changed

+54
-4
lines changed

3 files changed

+54
-4
lines changed

src/04kernel/src/kernels/attention/cuda_kernel.cu

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ namespace refactor::kernel {
140140
auto att = reinterpret_cast<half *>(workspace);
141141
auto workspaceQK = reinterpret_cast<uint8_t *>(workspace) + hardware::alignBytes(info.attSize(0), 256);
142142
auto workspaceAV = workspaceQK + hardware::alignBytes(d->workspaceSizeQK, 256);
143+
auto stream = cudaStreamLegacy;
143144
{
144145
half alpha = rsqrtf(info.headDim), beta = 0;
145146
cublasLtMatmul(
@@ -152,13 +153,14 @@ namespace refactor::kernel {
152153
att, d->att.get(),
153154
&d->algoQK,
154155
workspaceQK, d->workspaceSizeQK,
155-
cudaStreamLegacy);
156+
stream);
156157
}
157158
auto attLen = info.attLen(0);
158159
auto bufLen = attLen;
159160
softmax<<<dim3(info.batch * info.nHead, info.seqLen),
160161
std::min(1024u, attLen),
161-
attLen * sizeof(float)>>>(
162+
attLen * sizeof(float),
163+
stream>>>(
162164
att, causualMask, attLen, bufLen);
163165
{
164166
half alpha = 1, beta = 0;
@@ -172,7 +174,7 @@ namespace refactor::kernel {
172174
o, d->q.get(),
173175
&d->algoAV,
174176
workspaceAV, d->workspaceSizeAV,
175-
cudaStreamLegacy);
177+
stream);
176178
};
177179
};
178180

src/04kernel/src/utilities/cuda/cublaslt_utils.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ namespace refactor::kernel::cublas {
109109
CUDA_ASSERT(cudaGetDeviceProperties(&prop, device));
110110

111111
auto workspace = std::numeric_limits<uint64_t>::max();
112-
auto alignment = prop.textureAlignment;
112+
uint32_t alignment = prop.textureAlignment;
113113

114114
cublasLtMatmulPreference_t preference;
115115
CUBLASLT_ASSERT(cublasLtMatmulPreferenceCreate(&preference));
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#ifdef USE_CUDA
2+
3+
#include "../../../src/kernels/attention/cuda_kernel.hh"
4+
#include "hardware/device_manager.h"
5+
#include <gtest/gtest.h>
6+
#include <numeric>
7+
8+
using namespace refactor;
9+
using namespace kernel;
10+
using namespace hardware;
11+
12+
TEST(kernel, AttentionCudaNoKvCache) {
13+
// build routine
14+
AttentionInfo info{
15+
.dataType = DataType::FP16,
16+
.batch = 1,
17+
.nHead = 4,
18+
.nKVHead = 4,
19+
.seqLen = 31,
20+
.headDim = 256,
21+
.cacheLen = 0,
22+
.concatCache = false,
23+
.resetCache = false,
24+
};
25+
auto q = Tensor::share(DataType::FP16, Shape{info.batch, info.nHead, info.seqLen, info.headDim}),
26+
k = Tensor::share(DataType::FP16, Shape{info.batch, info.nKVHead, info.seqLen, info.headDim}),
27+
v = Tensor::share(DataType::FP16, Shape{info.batch, info.nKVHead, info.seqLen, info.headDim}),
28+
o = q;
29+
auto kernel = AttentionCuda::build(info);
30+
ASSERT_TRUE(kernel);
31+
auto res = runtime::Resources();
32+
auto [routine, workspaceSize] = kernel->lower(res);
33+
// malloc
34+
auto &dev = *device::init(Device::Type::Nvidia, 0, "");
35+
auto qGpu = dev.malloc(q->bytesSize()),
36+
kGpu = dev.malloc(k->bytesSize()),
37+
vGpu = dev.malloc(v->bytesSize()),
38+
oGpu = dev.malloc(o->bytesSize()),
39+
workspace = dev.malloc(workspaceSize);
40+
// inference
41+
{
42+
void const *inputs[]{*qGpu, *kGpu, *vGpu};
43+
void *outputs[]{*oGpu};
44+
routine(res, *workspace, inputs, outputs);
45+
}
46+
}
47+
48+
#endif

0 commit comments

Comments
 (0)