
Teco (太初) platform causal_softmax operator #171


Merged (1 commit, Jul 9, 2025)
Changes from all commits
13 changes: 12 additions & 1 deletion operatorspy/tests/causal_softmax.py
@@ -68,6 +68,7 @@ def test(lib, handle, torch_device, x_shape, x_stride=None, x_dtype=torch.float1
            None,
        )
    )
    # print(x.flatten()[0], ans.flatten()[0])
    assert torch.allclose(x, ans, atol=0, rtol=1e-2)
    check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor))

@@ -106,6 +107,14 @@ def test_ascend(lib, test_cases):
test(lib, handle, "npu", x_shape, x_stride)

destroy_handle(lib, handle)
def test_teco(lib, test_cases):
import torch_sdaa
device = DeviceEnum.DEVICE_TECO
handle = create_handle(lib, device)
for x_shape, x_stride in test_cases:
test(lib, handle, "sdaa", x_shape, x_stride)

destroy_handle(lib, handle)

if __name__ == "__main__":
    test_cases = [
@@ -147,6 +156,8 @@ def test_ascend(lib, test_cases):
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
-    if not (args.cpu or args.cuda or args.bang or args.ascend):
+    if args.teco:
+        test_teco(lib, test_cases)
+    if not (args.cpu or args.cuda or args.bang or args.ascend or args.teco):
        test_cpu(lib, test_cases)
    print("Test passed!")
23 changes: 21 additions & 2 deletions src/ops/causal_softmax/operator.cc
@@ -7,8 +7,8 @@
#endif
#ifdef ENABLE_NV_GPU
#include "../../devices/cuda/common_cuda.h"
#include "cuda/causal_softmax.cuh"
#include "../../devices/cuda/cuda_handle.h"
#include "cuda/causal_softmax.cuh"
#endif
#ifdef ENABLE_CAMBRICON_MLU
#include "../../devices/bang/bang_handle.h"
@@ -18,6 +18,9 @@
#ifdef ENABLE_ASCEND_NPU
#include "ascend/causal_softmax_aclnn.h"
#endif
#ifdef ENABLE_TECO_SDAA
#include "teco/causal_softmax_sdaa.h"
#endif

__C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(
    infiniopHandle_t handle,
@@ -30,7 +33,7 @@ __C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu: {
-            return cudaCreateCausalSoftmaxDescriptor((CudaHandle_t)handle, (CausalSoftmaxCudaDescriptor_t *) desc_ptr, y_desc);
+            return cudaCreateCausalSoftmaxDescriptor((CudaHandle_t) handle, (CausalSoftmaxCudaDescriptor_t *) desc_ptr, y_desc);
        }

#endif
@@ -44,6 +47,10 @@
        case DevAscendNpu: {
            return aclnnCreateCausalSoftmaxDescriptor((AscendHandle_t) handle, (CausalSoftmaxAclnnDescriptor_t *) desc_ptr, y_desc);
        }
#endif
#ifdef ENABLE_TECO_SDAA
        case DevTecoSDAA:
            return tecoCreateCausalSoftmaxDescriptor((TecoHandle_t) handle, (CausalSoftmaxTecoDescriptor_t *) desc_ptr, y_desc);
#endif
    }
    return STATUS_BAD_DEVICE;
@@ -72,6 +79,10 @@ __C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmax
        case DevAscendNpu: {
            return aclnnGetCausalSoftmaxWorkspaceSize((CausalSoftmaxAclnnDescriptor_t) desc, size);
        }
#endif
#ifdef ENABLE_TECO_SDAA
        case DevTecoSDAA:
            return tecoGetCausalSoftmaxWorkspaceSize((CausalSoftmaxTecoDescriptor_t) desc, size);
#endif
    }
    return STATUS_BAD_DEVICE;
@@ -99,6 +110,10 @@ __C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t des
        case DevAscendNpu: {
            return aclnnCausalSoftmax((CausalSoftmaxAclnnDescriptor_t) desc, workspace, workspace_size, data, stream);
        }
#endif
#ifdef ENABLE_TECO_SDAA
        case DevTecoSDAA:
            return tecoCausalSoftmax((CausalSoftmaxTecoDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
    }
    return STATUS_BAD_DEVICE;
@@ -126,6 +141,10 @@ __C infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftma
        case DevAscendNpu: {
            return aclnnDestroyCausalSoftmaxDescriptor((CausalSoftmaxAclnnDescriptor_t) desc);
        }
#endif
#ifdef ENABLE_TECO_SDAA
        case DevTecoSDAA:
            return tecoDestroyCausalSoftmaxDescriptor((CausalSoftmaxTecoDescriptor_t) desc);
#endif
    }
    return STATUS_BAD_DEVICE;
34 changes: 34 additions & 0 deletions src/ops/causal_softmax/teco/causal_softmax_sdaa.h
@@ -0,0 +1,34 @@
#ifndef __SDAA_CAUSAL_SOFTMAX_H__
#define __SDAA_CAUSAL_SOFTMAX_H__
#include "../../../devices/teco/teco_handle.h"
#include "../../utils.h"
#include "operators.h"
#include <sdaa_runtime.h>
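// Host-side copy of the output tensor's layout; shape and stride are
// malloc'd in tecoCreateCausalSoftmaxDescriptor and freed on destroy.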
struct CausalSoftmaxTecoDescriptor {
    Device device;
    int device_id;
    DT dtype;
    int ndim;
    int *stride;
    int *shape;
};

typedef struct CausalSoftmaxTecoDescriptor *CausalSoftmaxTecoDescriptor_t;

infiniopStatus_t tecoCreateCausalSoftmaxDescriptor(TecoHandle_t handle,
                                                   CausalSoftmaxTecoDescriptor_t *desc_ptr,
                                                   infiniopTensorDescriptor_t y_desc);

infiniopStatus_t tecoGetCausalSoftmaxWorkspaceSize(CausalSoftmaxTecoDescriptor_t desc, uint64_t *size);

infiniopStatus_t tecoCausalSoftmax(CausalSoftmaxTecoDescriptor_t desc,
                                   void *workspace,
                                   uint64_t workspace_size,
                                   void *data,
                                   void *stream);

infiniopStatus_t tecoDestroyCausalSoftmaxDescriptor(CausalSoftmaxTecoDescriptor_t desc);
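// Call order matches the other backends: create -> get workspace size ->
// run -> destroy (dispatched from src/ops/causal_softmax/operator.cc).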


#endif
243 changes: 243 additions & 0 deletions src/ops/causal_softmax/teco/causal_softmax_sdaa.scpp
@@ -0,0 +1,243 @@
#include "causal_softmax_sdaa.h"

__local__ halfv16 h_local;
__local__ floatv16 f_local;
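// 16-lane SIMD scratch vectors used by the causalSoftmax kernel below.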

infiniopStatus_t tecoCreateCausalSoftmaxDescriptor(TecoHandle_t handle,
                                                   CausalSoftmaxTecoDescriptor_t *desc_ptr,
                                                   infiniopTensorDescriptor_t y_desc) {
    if (y_desc->ndim < 2 || y_desc->shape[y_desc->ndim - 1] < y_desc->shape[y_desc->ndim - 2]) {
        return STATUS_BAD_TENSOR_SHAPE;
    }

    int ndim = y_desc->ndim;
    int *shape = (int *) malloc(ndim * sizeof(int));
    int *stride = (int *) malloc(ndim * sizeof(int));

    for (int i = 0; i < ndim; i++) {
        stride[i] = static_cast<int>(y_desc->strides[i]);
        shape[i] = static_cast<int>(y_desc->shape[i]);
    }

    *desc_ptr = new CausalSoftmaxTecoDescriptor{
        handle->device,
        handle->device_id,
        y_desc->dt,
        ndim,
        stride,
        shape};
    return STATUS_SUCCESS;
}

infiniopStatus_t tecoGetCausalSoftmaxWorkspaceSize(CausalSoftmaxTecoDescriptor_t desc, uint64_t *size) {
    // Device-side scratch for the int stride and shape arrays (ndim each).
    *size = desc->ndim * sizeof(int) * 2;
    return STATUS_SUCCESS;
}

template<typename T>
__global__ void causalSoftmax(T *destination, int *shape, int *stride, int ndim, int mask) {
    int othersize = 1;
    for (int i = 0; i < ndim - 1; i++) {
        othersize *= shape[i];
    }
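    // Split the flattened leading dims as evenly as possible across the
    // threadDim cores: the first `remain` cores take step_easy + 1 rows,
    // the rest take step_easy.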
    int remain = othersize % threadDim;
    int step_easy = (othersize - remain) / threadDim;
    int step_hard = step_easy + 1;
    int step = (threadIdx < remain ? step_hard : step_easy);
    int ind_start = (threadIdx < remain ? threadIdx * step_hard : (remain * step_hard + (threadIdx - remain) * step_easy));

    int dimsize = shape[ndim - 1];
    int buf_size = 16;
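    // buf_size matches the 16 lanes of the halfv16/floatv16 registers.
    // Note: the last dim is addressed densely below, i.e. stride[ndim - 1]
    // is assumed to be 1.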

    for (int i = ind_start; i < ind_start + step; i++) {
        int ind_d = 0;
        int ind_i = i;
        int lastI = ind_i % shape[ndim - 2];

        int remain_dhead = (lastI + mask + 1) % buf_size;
        int repeat = (lastI + mask + 1 - remain_dhead) / buf_size;// softmax covers this leading (unmasked) part

        int length = dimsize - (lastI + mask + 1);
        int remainI = length % buf_size;
        int rI = (length - remainI) / buf_size;// the trailing (masked) part is zeroed

        for (int j = ndim - 2; j >= 0; --j) {
            ind_d += (ind_i % shape[j]) * stride[j];
            ind_i /= shape[j];
        }
        // First pass: running max and exp-sum; when a chunk raises the max,
        // the accumulated sum is rescaled by expf(old_max - new_max).

        float new_max = destination[ind_d];
        float old_max = new_max;
        float sum_value = 0.0f;
        for (int r = 0; r < repeat; r++) {
            int start = ind_d + r * buf_size;
            if constexpr (std::is_same<T, half>::value) {
                simd_load(h_local, destination + start);
                f_local = simd_cvt_h2f(h_local);
            } else if constexpr (std::is_same<T, float>::value) {
                simd_load(f_local, destination + start);
            }
            for (int k = 0; k < buf_size; k++) {
                if (new_max < f_local[k]) {
                    new_max = f_local[k];
                }
            }
            for (int k = 0; k < buf_size; k++) {
                f_local[k] = expf(f_local[k] - new_max);
            }
            if (r > 0) {
                sum_value = sum_value * expf(old_max - new_max);
            }
            sum_value += simd_redsum(f_local);
            old_max = new_max;
        }
        if (remain_dhead) {
            int start = ind_d + repeat * buf_size;
            for (int k = 0; k < remain_dhead; k++) {
                if constexpr (std::is_same<T, half>::value) {
                    if (new_max < static_cast<float>(destination[start + k])) {
                        new_max = static_cast<float>(destination[start + k]);
                    }
                } else if constexpr (std::is_same<T, float>::value) {
                    if (new_max < destination[start + k]) {
                        new_max = destination[start + k];
                    }
                }
            }
            if (repeat > 0) {
                sum_value = sum_value * expf(old_max - new_max);
            }
            for (int k = 0; k < remain_dhead; k++) {
                if constexpr (std::is_same<T, half>::value) {
                    sum_value += expf(static_cast<float>(destination[start + k]) - new_max);
                } else if constexpr (std::is_same<T, float>::value) {
                    sum_value += expf(destination[start + k] - new_max);
                }
            }
        }

        float sum_inv = 1.0f / sum_value;
        // Second pass: apply the softmax transform to the unmasked part.
        for (int r = 0; r < repeat; r++) {
            int start = ind_d + r * buf_size;
            if constexpr (std::is_same<T, half>::value) {
                simd_load(h_local, destination + start);
                f_local = simd_cvt_h2f(h_local);
            } else if constexpr (std::is_same<T, float>::value) {
                simd_load(f_local, destination + start);
            }

            for (int k = 0; k < buf_size; k++) {
                f_local[k] = expf(f_local[k] - new_max) * sum_inv;
            }
            if constexpr (std::is_same<T, half>::value) {
                h_local = simd_cvt_f2h(f_local);
                simd_store(h_local, destination + start);
            } else if constexpr (std::is_same<T, float>::value) {
                simd_store(f_local, destination + start);
            }
        }
        if (remain_dhead) {
            int start = ind_d + repeat * buf_size;
            for (int k = 0; k < remain_dhead; k++) {
                if constexpr (std::is_same<T, half>::value) {
                    destination[start + k] = static_cast<half>(expf(static_cast<float>(destination[start + k]) - new_max) * sum_inv);
                } else if constexpr (std::is_same<T, float>::value) {
                    destination[start + k] = expf(destination[start + k] - new_max) * sum_inv;
                }
            }
        }

        // Zero out the masked tail of the row.
        for (int r = 0; r < rI; r++) {
            int start = ind_d + mask + 1 + lastI + r * buf_size;
            if constexpr (std::is_same<T, half>::value) {
                for (int k = 0; k < buf_size; k++) {
                    destination[start + k] = static_cast<half>(0.0f);
                }
            } else if constexpr (std::is_same<T, float>::value) {
                for (int k = 0; k < buf_size; k++) {
                    destination[start + k] = 0.0f;
                }
            }
        }

        if (remainI) {
            int start = ind_d + mask + 1 + lastI + rI * buf_size;
            if constexpr (std::is_same<T, half>::value) {
                for (int k = 0; k < remainI; k++) {
                    destination[start + k] = static_cast<half>(0.0f);
                }
            } else if constexpr (std::is_same<T, float>::value) {
                for (int k = 0; k < remainI; k++) {
                    destination[start + k] = 0.0f;
                }
            }
        }
    }
}

infiniopStatus_t tecoCausalSoftmax(CausalSoftmaxTecoDescriptor_t desc,
                                   void *workspace,
                                   uint64_t workspace_size,
                                   void *data,
                                   void *stream) {
    int ndim = desc->ndim;
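    // Right-aligned causal band: row i of the last two dims keeps
    // columns 0..(i + mask), where mask = cols - rows.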
    int mask = desc->shape[ndim - 1] - desc->shape[ndim - 2];

    int *teco_stride = reinterpret_cast<int *>(workspace);
    int *teco_shape = teco_stride + ndim;
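    // The workspace holds device-side copies of the stride and shape
    // arrays, ndim ints each (see tecoGetCausalSoftmaxWorkspaceSize).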

    sdaaMemcpy(teco_stride, desc->stride, ndim * sizeof(int), sdaaMemcpyHostToDevice);
    sdaaMemcpy(teco_shape, desc->shape, ndim * sizeof(int), sdaaMemcpyHostToDevice);
    sdaaDeviceSynchronize();
    if (dtype_eq(desc->dtype, F16)) {
        auto destination = reinterpret_cast<half *>(data);
        causalSoftmax<half><<<1, (sdaaStream_t) stream>>>(destination, teco_shape, teco_stride, ndim, mask);
        sdaaDeviceSynchronize();
        return STATUS_SUCCESS;
    } else if (dtype_eq(desc->dtype, F32)) {
        auto destination = reinterpret_cast<float *>(data);
        causalSoftmax<float><<<1, (sdaaStream_t) stream>>>(destination, teco_shape, teco_stride, ndim, mask);
        sdaaDeviceSynchronize();
        return STATUS_SUCCESS;
    }

    return STATUS_BAD_TENSOR_DTYPE;
}

infiniopStatus_t tecoDestroyCausalSoftmaxDescriptor(CausalSoftmaxTecoDescriptor_t desc) {
    // shape and stride were malloc'd in tecoCreateCausalSoftmaxDescriptor.
    free(desc->stride);
    free(desc->shape);
    delete desc;
    return STATUS_SUCCESS;
}
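A note on the first pass in causalSoftmax above: it is the standard online-softmax recurrence, where each chunk may raise the running max, and the partial exp-sum accumulated so far is then rescaled by expf(old_max - new_max). A minimal Python sketch of that accumulation (illustrative only; online_softmax_denominator is a hypothetical name, and chunk stands in for the 16-lane SIMD width):

    import math

    def online_softmax_denominator(values, chunk=16):
        # Streaming max/exp-sum over fixed-size chunks; when the running max
        # grows, the accumulated sum is rescaled by exp(old_max - new_max).
        running_max = values[0]
        running_sum = 0.0
        for start in range(0, len(values), chunk):
            block = values[start:start + chunk]
            new_max = max(running_max, max(block))
            running_sum *= math.exp(running_max - new_max)
            running_sum += sum(math.exp(v - new_max) for v in block)
            running_max = new_max
        return running_max, running_sum

The returned pair satisfies running_sum == sum(exp(v - running_max) for v in values), which is exactly the denominator the kernel inverts as sum_inv.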