#include "random_sample.cuh"
#include <cub/block/block_reduce.cuh>
#include <cub/cub.cuh>
+ template <class T, int BLOCK_DIM>
+ __global__ void argmaxKernel(T *val_out, int voc, uint64_t *result) {
+     // Each thread scans a strided slice of the vocabulary and keeps its local maximum.
+     float localM = -__FLT_MAX__;
+     uint64_t index = threadIdx.x;
+     for (int i = threadIdx.x; i < voc; i += BLOCK_DIM) {
+         if (localM < static_cast<float>(val_out[i])) {
+             localM = static_cast<float>(val_out[i]);
+             index = i;
+         }
+     }
+     // Block-wide tree reduction in shared memory to find the global maximum and its index.
+     __shared__ uint64_t globalInd[BLOCK_DIM];
+     __shared__ float globalM[BLOCK_DIM];
+     globalInd[threadIdx.x] = index;
+     globalM[threadIdx.x] = localM;
+     __syncthreads();
+     for (int strip = BLOCK_DIM / 2; strip > 0; strip /= 2) {
+         if (threadIdx.x < strip) {
+             if (globalM[threadIdx.x] < globalM[threadIdx.x + strip]) {
+                 globalM[threadIdx.x] = globalM[threadIdx.x + strip];
+                 globalInd[threadIdx.x] = globalInd[threadIdx.x + strip];
+             }
+         }
+         __syncthreads();
+     }
+     if (threadIdx.x == 0) {
+         result[0] = globalInd[0];
+     }
+ }
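For reference only (not part of this patch): the same block-wide argmax could also be expressed with the CUB primitives already included above, using cub::BlockReduce over cub::KeyValuePair with the cub::ArgMax reduction operator. The kernel name argmaxCub below is a placeholder; this is a sketch, not the repository's implementation.

template <class T, int BLOCK_DIM>
__global__ void argmaxCub(const T *val, int voc, uint64_t *result) {
    using KVP = cub::KeyValuePair<int, float>;
    using BlockReduce = cub::BlockReduce<KVP, BLOCK_DIM>;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Thread-local argmax over a strided slice, same access pattern as argmaxKernel above.
    KVP local(static_cast<int>(threadIdx.x), -__FLT_MAX__);
    for (int i = threadIdx.x; i < voc; i += BLOCK_DIM) {
        float v = static_cast<float>(val[i]);
        if (v > local.value) {
            local = KVP(i, v);
        }
    }
    // cub::ArgMax keeps the pair with the larger value; the reduced result is valid in thread 0.
    KVP best = BlockReduce(temp_storage).Reduce(local, cub::ArgMax());
    if (threadIdx.x == 0) {
        result[0] = static_cast<uint64_t>(best.key);
    }
}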
template <class T, int BLOCK_DIM>
__global__ void softmax(
        T *val_out,
@@ -132,25 +156,26 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace
                              void *stream) {
    int voc = desc->voc;
    // the code below performs the sort
-     char *origin = reinterpret_cast<char *>(workspace);
-     char *keyTmp = origin + voc * sizeof(half);
-     half *val_out = (half *) origin;

-     uint64_t *key_in = (uint64_t *) keyTmp;
-     uint64_t *key_out = key_in + voc;
+     if (topp > 0 && topk > 1) {
+         char *origin = reinterpret_cast<char *>(workspace);
+         char *keyTmp = origin + voc * sizeof(half);
+         half *val_out = (half *) origin;

-     index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc);
-     // compute the extra workspace below
+         uint64_t *key_in = (uint64_t *) keyTmp;
+         uint64_t *key_out = key_in + voc;

-     void *workspace_extra = reinterpret_cast<char *>(workspace) + desc->step;
-     uint64_t workspace_len = workspace_size - desc->step;
-     sort_pairs_descending<half, uint64_t>(
-         workspace_extra, workspace_len,
-         (half *) probs, val_out,
-         key_in, key_out,
-         voc, (cudaStream_t) stream);// writes the sorted values and their indices to val_out and key_out
-     // sorting is done; next apply the softmax transform
-     if (topp > 0 && topk > 1) {
+         index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc);
+         // compute the extra workspace below
+
+         void *workspace_extra = reinterpret_cast<char *>(workspace) + desc->step;
+         uint64_t workspace_len = workspace_size - desc->step;
+         sort_pairs_descending<half, uint64_t>(
+             workspace_extra, workspace_len,
+             (half *) probs, val_out,
+             key_in, key_out,
+             voc, (cudaStream_t) stream);// writes the sorted values and their indices to val_out and key_out
+         // sorting is done; next apply the softmax transform
        int BLOCK_DIM = 1024;
        int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM;
        softmax<half, 1024><<<num_blocks, BLOCK_DIM, 0, (cudaStream_t) stream>>>(val_out, topk,
@@ -169,8 +194,9 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace
                key_out);

    } else {
-         random_sample_kernel<<<1, 1, 0, (cudaStream_t) stream>>>((uint64_t *) result,
-                                                                  key_out);
+         int BLOCK_DIM = 1024;
+         int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM;
+         argmaxKernel<half, 1024><<<num_blocks, BLOCK_DIM, 0, (cudaStream_t) stream>>>((half *) probs, voc, (uint64_t *) result);
    }
}
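The index and sort_pairs_descending helpers used in the top-k/top-p branch are defined elsewhere in this file and do not appear in the diff. As a rough, assumed sketch of their shape (the real signatures and details may differ): index would fill key_in with the identity permutation, and sort_pairs_descending would wrap cub::DeviceRadixSort::SortPairsDescending, treating the extra workspace as CUB temporary storage.

// Assumed sketches only; not taken from this repository.
__global__ void index(uint64_t *key_in, int voc) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < voc) {
        key_in[i] = i;// identity permutation 0..voc-1, reordered later by the sort
    }
}

template <class KeyT, class ValueT>
void sort_pairs_descending(void *workspace, uint64_t workspace_len,
                           const KeyT *keys_in, KeyT *keys_out,
                           const ValueT *values_in, ValueT *values_out,
                           int n, cudaStream_t stream) {
    // CUB reads and updates the temporary-storage size through this in/out parameter;
    // here the pre-carved extra workspace is assumed to be large enough.
    size_t temp_storage_bytes = workspace_len;
    cub::DeviceRadixSort::SortPairsDescending(
        workspace, temp_storage_bytes,
        keys_in, keys_out,
        values_in, values_out,
        n, 0, sizeof(KeyT) * 8, stream);
}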