Skip to content

random_sample_workspace #92

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions operatorspy/tests/random_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ class RandomSampleDescriptor(Structure):


def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
indices = torch.zeros([topk], dtype = torch.int64)
if(torch_device == "cuda"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

可以均使用int64临时替代,不用再判断cuda了

indices = torch.zeros([topk], dtype = torch.uint64)
else:
indices = torch.zeros([topk], dtype = torch.int64)
dataNp = data.clone().detach()
sorted_indices = torch.arange(voc)

Expand All @@ -52,7 +55,7 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):

globalM = dataNp[0]
dataNp = (dataNp - globalM) / temperature
dataNp = torch.softmax(dataNp.float(), dim = 0)
dataNp = torch.softmax(dataNp, dim = 0)
sum_s = 0
for end in range(topk):
sum_s += dataNp[end]
Expand Down Expand Up @@ -88,15 +91,15 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu")
else:
ans = random_sample_0(data)
if(torch_device == 'mlu' or torch_device == 'npu'):
if(torch_device != "cuda"):

indices = torch.zeros([1], dtype = torch.int64).to(torch_device)
else:

indices = torch.zeros([1], dtype = torch.uint64).to(torch_device)
x_tensor = to_tensor(data, lib)
indices_tensor = to_tensor(indices, lib)
if(torch_device == 'mlu' or torch_device == 'npu'):
if(torch_device != 'cuda'):
indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64


Expand Down
11 changes: 3 additions & 8 deletions src/ops/random_sample/cpu/random_sample.cc
Original file line number Diff line number Diff line change
Expand Up @@ -135,16 +135,11 @@ void random_sample_cpu_f16(RandomSampleCpuDescriptor_t desc,
auto index_ = reinterpret_cast<uint64_t *>(result);
auto source = reinterpret_cast<const uint16_t *>(probs);

char *origin = reinterpret_cast<char *>(workspace);
uint16_t *logits_ = (uint16_t *) origin;

std::copy(source, source + voc, logits_);

float M = f16_to_f32(logits_[0]);
float M = f16_to_f32(source[0]);
int index = 0;
for (int j = 1; j < voc; j++) {
if (M < f16_to_f32(logits_[j])) {
M = f16_to_f32(logits_[j]);
if (M < f16_to_f32(source[j])) {
M = f16_to_f32(source[j]);
index = j;
}
}
Expand Down
127 changes: 87 additions & 40 deletions src/ops/random_sample/cuda/random_sample.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,38 @@
#include "random_sample.cuh"
#include <cub/block/block_reduce.cuh>
#include <cub/cub.cuh>

template<class T, int BLOCK_DIM>
__global__ void argmaxKernel(T *val_out, int voc, uint64_t *result) {
float localM = -__FLT_MAX__;
uint64_t index = threadIdx.x;
for (int i = threadIdx.x; i < voc; i += BLOCK_DIM) {
if (localM < static_cast<float>(val_out[i])) {
localM = static_cast<float>(val_out[i]);
index = i;
}
}
__shared__ uint64_t globalInd[BLOCK_DIM];
__shared__ float globalM[BLOCK_DIM];
globalInd[threadIdx.x] = index;
globalM[threadIdx.x] = localM;
for (int strip = BLOCK_DIM / 2; strip > 0; strip /= 2) {
if (threadIdx.x < strip) {
if (globalM[threadIdx.x] < globalM[threadIdx.x + strip]) {
globalM[threadIdx.x] = globalM[threadIdx.x + strip];
globalInd[threadIdx.x] = globalInd[threadIdx.x + strip];
}
}
__syncthreads();
}
result[0] = globalInd[0];
}
template<class T, int BLOCK_DIM>
__global__ void softmax(
T *val_out,
int topk,
float temperature, int voc) {
float sum_s = 0.0f;
for (int i = threadIdx.x; i < topk; i += BLOCK_DIM) {
for (int i = threadIdx.x; i < voc; i += BLOCK_DIM) {
sum_s += __expf(static_cast<float>(val_out[i] - val_out[0]) / temperature);
}
__shared__ float sum_inverse_total;
Expand Down Expand Up @@ -84,26 +108,51 @@ void inclusive_sum(
data, data, voc,
stream);
}
template<class T, class I>
void random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
int voc, cudaStream_t stream) {

infiniopStatus_t random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
int voc, DT dtype) {
if (dtype_eq(dtype, F16)) {
sort_pairs_descending<half, uint64_t>(nullptr, size_radix_sort,
nullptr, nullptr,
nullptr, nullptr,
voc, nullptr);

sort_pairs_descending<T, I>(nullptr, size_radix_sort,
nullptr, nullptr,
nullptr, nullptr,
voc, stream);
inclusive_sum<half>(
nullptr, size_scan,
nullptr, voc,
nullptr);
return STATUS_SUCCESS;
} else if (dtype_eq(dtype, F32)) {
sort_pairs_descending<float, uint64_t>(nullptr, size_radix_sort,
nullptr, nullptr,
nullptr, nullptr,
voc, nullptr);

inclusive_sum<T>(
nullptr, size_scan,
nullptr, voc,
stream);
inclusive_sum<float>(
nullptr, size_scan,
nullptr, voc,
nullptr);
return STATUS_SUCCESS;
} else if (dtype_eq(dtype, F64)) {
sort_pairs_descending<double, uint64_t>(nullptr, size_radix_sort,
nullptr, nullptr,
nullptr, nullptr,
voc, nullptr);

inclusive_sum<double>(
nullptr, size_scan,
nullptr, voc,
nullptr);
return STATUS_SUCCESS;
} else {
return STATUS_BAD_TENSOR_DTYPE;
}
}
__global__ void random_sample_kernel(uint64_t *result,
uint64_t *key_out) {
result[0] = key_out[0];
}
void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace, void *result,
void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace, uint64_t workspace_size, void *result,
void const *probs,
float random_val,
float topp,
Expand All @@ -112,36 +161,34 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace
void *stream) {
int voc = desc->voc;
//下面这段代码在排序
char *origin = reinterpret_cast<char *>(workspace);
char *keyTmp = origin + voc * sizeof(half);
half *val_out = (half *) origin;

uint64_t *key_in = (uint64_t *) keyTmp;
uint64_t *key_out = key_in + voc;

index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc);
//下面开始计算workspace空间
size_t size_radix_sort;
size_t size_scan;
random_sample_workspace<half, uint64_t>(size_radix_sort, size_scan,
voc, (cudaStream_t) stream);
void *workspace_extra;
cudaMalloc(&workspace_extra, size_radix_sort + size_scan);
sort_pairs_descending<half, uint64_t>(
workspace_extra, size_radix_sort,
(half *) probs, val_out,
key_in, key_out,
voc, (cudaStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上
//排序结束,然后开始做softmax变换

if (topp > 0 && topk > 1) {
char *origin = reinterpret_cast<char *>(workspace);
char *keyTmp = origin + voc * sizeof(half);
half *val_out = (half *) origin;

uint64_t *key_in = (uint64_t *) keyTmp;
uint64_t *key_out = key_in + voc;

index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc);
//下面开始计算workspace空间

void *workspace_extra = reinterpret_cast<char *>(workspace) + desc->step;
uint64_t workspace_len = workspace_size - desc->step;
sort_pairs_descending<half, uint64_t>(
workspace_extra, workspace_len,
(half *) probs, val_out,
key_in, key_out,
voc, (cudaStream_t) stream);//该函数会把排序结果和对应索引保存在val_out和key_out上
//排序结束,然后开始做softmax变换
int BLOCK_DIM = 1024;
int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM;
softmax<half, 1024><<<num_blocks, BLOCK_DIM, 0, (cudaStream_t) stream>>>(val_out, topk,
temperature, voc);


inclusive_sum<half>(
workspace_extra, size_scan,
workspace_extra, workspace_len,
val_out, voc,
(cudaStream_t) stream);//该函数会实现scan功能不断累加结果
random_sample_kernel<half><<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result,
Expand All @@ -152,10 +199,10 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace
key_out);

} else {
random_sample_kernel<<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result,
key_out);
int BLOCK_DIM = 1024;
int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM;
argmaxKernel<half, 1024><<<num_blocks, BLOCK_DIM, 0, (cudaStream_t) stream>>>((half *) probs, voc, (uint64_t *) result);
}
cudaFree(workspace_extra);
}

infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc,
Expand All @@ -172,7 +219,7 @@ infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc,
return STATUS_BAD_DEVICE;
}
if (dtype_eq(desc->dtype, F16)) {
random_sample_nv_gpu_f16(desc, workspace, result, probs, random_val, topp, topk, temperature, stream);
random_sample_nv_gpu_f16(desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
return STATUS_SUCCESS;
}

Expand Down
6 changes: 4 additions & 2 deletions src/ops/random_sample/cuda/random_sample.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,17 @@ struct RandomSampleCudaDescriptor {
int voc;
DT rDtype;
int rLength;
int step;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

指针偏移请使用size_t, 命名使用offset一类。其余使用int的地方都需要修改,根据情况使用uint64或int64

};

typedef struct RandomSampleCudaDescriptor *RandomSampleCudaDescriptor_t;

infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle,
RandomSampleCudaDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result,
infiniopTensorDescriptor_t probs);

infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size);
infiniopStatus_t random_sample_workspace(size_t &size_radix_sort, size_t &size_scan,
int voc, DT dtype);
infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size);

infiniopStatus_t cudaRandomSample(RandomSampleCudaDescriptor_t desc,
void *workspace,
Expand Down
19 changes: 15 additions & 4 deletions src/ops/random_sample/cuda/random_sample_cuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,37 @@ infiniopStatus_t cudaCreateRandomSampleDescriptor(CudaHandle_t handle,
if (probs->ndim != 1) {
return STATUS_BAD_TENSOR_SHAPE;
}
if (!dtype_eq(result->dt, U64))
if (!dtype_eq(probs->dt, F16) && !dtype_eq(result->dt, U64)) {
return STATUS_BAD_TENSOR_DTYPE;
}

int voc = probs->shape[0];
int rLength = result->shape[0];
if (result->ndim != 1 && rLength != 1) {
return STATUS_BAD_TENSOR_SHAPE;
}
int step = 2 * voc * sizeof(uint64_t) + voc * sizeof(probs->dt);
*desc_ptr = new RandomSampleCudaDescriptor{
handle->device,
handle->device_id,
probs->dt,
voc,
result->dt,
rLength};
rLength,
step};

return STATUS_SUCCESS;
}

infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, uint64_t *size) {
*size = desc->voc * (2 * sizeof(uint64_t) + sizeof(desc->dtype));
infiniopStatus_t cudaGetRandomSampleWorkspaceSize(RandomSampleCudaDescriptor_t desc, unsigned long int *size) {
size_t size_radix_sort;
size_t size_scan;
infiniopStatus_t status = random_sample_workspace(size_radix_sort, size_scan,
desc->voc, desc->dtype);
if (status != STATUS_SUCCESS) {
return status;
}
*size = desc->step + std::max(size_radix_sort, size_scan);
return STATUS_SUCCESS;
}

Expand Down
2 changes: 1 addition & 1 deletion src/ops/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, uin
// split the dimension dim of a tensor descriptor into multiple dimensions
inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, uint64_t dim, const std::vector<uint64_t> &dims) {
uint64_t ndim = desc->ndim;
if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (uint64_t)1, std::multiplies{})) {
if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (uint64_t) 1, std::multiplies{})) {
return nullptr;
}
uint64_t new_ndim = ndim + dims.size() - 1;
Expand Down
Loading