From 7c719aa0e6edadc91358252befb984b3694390af Mon Sep 17 00:00:00 2001 From: voidwarlock <1707297086@qq.com> Date: Fri, 14 Mar 2025 17:31:59 +0800 Subject: [PATCH] =?UTF-8?q?[=E8=AE=AD=E7=BB=83=E8=90=A5]=E6=96=B0=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E7=AE=97=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/infini_operators.h | 8 + include/ops/clip/clip.h | 28 ++ include/ops/gather/gather.h | 28 ++ include/ops/gather_elements/gather_elements.h | 28 ++ include/ops/reduceMax/reduceMax.h | 25 ++ include/ops/reduceMean/reduceMean.h | 25 ++ include/ops/reduceMin/reduceMin.h | 25 ++ include/ops/reduceSum/reduceSum.h | 25 ++ include/ops/where/where.h | 29 +++ new_operators.md | 240 ++++++++++++++++++ operatorspy/tests/clip.py | 159 ++++++++++++ operatorspy/tests/gather.py | 164 ++++++++++++ operatorspy/tests/gather_elements.py | 157 ++++++++++++ operatorspy/tests/reduce_max.py | 191 ++++++++++++++ operatorspy/tests/reduce_mean.py | 189 ++++++++++++++ operatorspy/tests/reduce_min.py | 193 ++++++++++++++ operatorspy/tests/reduce_sum.py | 189 ++++++++++++++ operatorspy/tests/where.py | 161 ++++++++++++ src/ops/clip/cpu/clip_cpu.cc | 83 ++++++ src/ops/clip/cpu/clip_cpu.h | 27 ++ src/ops/clip/cuda/clip.cc | 74 ++++++ src/ops/clip/cuda/clip.cu | 57 +++++ src/ops/clip/cuda/clip.cuh | 36 +++ src/ops/clip/operator.cc | 71 ++++++ src/ops/gather/cpu/gather_cpu.cc | 122 +++++++++ src/ops/gather/cpu/gather_cpu.h | 31 +++ src/ops/gather/cuda/gather.cc | 93 +++++++ src/ops/gather/cuda/gather.cu | 76 ++++++ src/ops/gather/cuda/gather.cuh | 40 +++ src/ops/gather/operator.cc | 67 +++++ .../cpu/gather_elements_cpu.cc | 113 +++++++++ .../gather_elements/cpu/gather_elements_cpu.h | 31 +++ .../gather_elements/cuda/gather_elements.cc | 86 +++++++ .../gather_elements/cuda/gather_elements.cu | 74 ++++++ .../gather_elements/cuda/gather_elements.cuh | 40 +++ src/ops/gather_elements/operator.cc | 67 +++++ src/ops/reduce/cpu/reduce_cpu.cc | 178 +++++++++++++ src/ops/reduce/cpu/reduce_cpu.h | 45 ++++ src/ops/reduce/cuda/reduce.cc | 92 +++++++ src/ops/reduce/cuda/reduce.cu | 169 ++++++++++++ src/ops/reduce/cuda/reduce.cuh | 47 ++++ src/ops/reduce/operator.cc | 89 +++++++ src/ops/reduce/reduce.h | 31 +++ src/ops/reduceMax/operator.cc | 50 ++++ src/ops/reduceMean/operator.cc | 51 ++++ src/ops/reduceMin/operator.cc | 51 ++++ src/ops/reduceSum/operator.cc | 51 ++++ src/ops/where/cpu/where_cpu.cc | 76 ++++++ src/ops/where/cpu/where_cpu.h | 28 ++ src/ops/where/cuda/where.cu | 58 +++++ src/ops/where/cuda/where.cuh | 37 +++ src/ops/where/cuda/where_cuda.cc | 52 ++++ src/ops/where/operator.cc | 68 +++++ 53 files changed, 4225 insertions(+) create mode 100644 include/ops/clip/clip.h create mode 100644 include/ops/gather/gather.h create mode 100644 include/ops/gather_elements/gather_elements.h create mode 100644 include/ops/reduceMax/reduceMax.h create mode 100644 include/ops/reduceMean/reduceMean.h create mode 100644 include/ops/reduceMin/reduceMin.h create mode 100644 include/ops/reduceSum/reduceSum.h create mode 100644 include/ops/where/where.h create mode 100644 new_operators.md create mode 100644 operatorspy/tests/clip.py create mode 100644 operatorspy/tests/gather.py create mode 100644 operatorspy/tests/gather_elements.py create mode 100644 operatorspy/tests/reduce_max.py create mode 100644 operatorspy/tests/reduce_mean.py create mode 100644 operatorspy/tests/reduce_min.py create mode 100644 operatorspy/tests/reduce_sum.py create mode 100644 operatorspy/tests/where.py create 
mode 100644 src/ops/clip/cpu/clip_cpu.cc create mode 100644 src/ops/clip/cpu/clip_cpu.h create mode 100644 src/ops/clip/cuda/clip.cc create mode 100644 src/ops/clip/cuda/clip.cu create mode 100644 src/ops/clip/cuda/clip.cuh create mode 100644 src/ops/clip/operator.cc create mode 100644 src/ops/gather/cpu/gather_cpu.cc create mode 100644 src/ops/gather/cpu/gather_cpu.h create mode 100644 src/ops/gather/cuda/gather.cc create mode 100644 src/ops/gather/cuda/gather.cu create mode 100644 src/ops/gather/cuda/gather.cuh create mode 100644 src/ops/gather/operator.cc create mode 100644 src/ops/gather_elements/cpu/gather_elements_cpu.cc create mode 100644 src/ops/gather_elements/cpu/gather_elements_cpu.h create mode 100644 src/ops/gather_elements/cuda/gather_elements.cc create mode 100644 src/ops/gather_elements/cuda/gather_elements.cu create mode 100644 src/ops/gather_elements/cuda/gather_elements.cuh create mode 100644 src/ops/gather_elements/operator.cc create mode 100644 src/ops/reduce/cpu/reduce_cpu.cc create mode 100644 src/ops/reduce/cpu/reduce_cpu.h create mode 100644 src/ops/reduce/cuda/reduce.cc create mode 100644 src/ops/reduce/cuda/reduce.cu create mode 100644 src/ops/reduce/cuda/reduce.cuh create mode 100644 src/ops/reduce/operator.cc create mode 100644 src/ops/reduce/reduce.h create mode 100644 src/ops/reduceMax/operator.cc create mode 100644 src/ops/reduceMean/operator.cc create mode 100644 src/ops/reduceMin/operator.cc create mode 100644 src/ops/reduceSum/operator.cc create mode 100644 src/ops/where/cpu/where_cpu.cc create mode 100644 src/ops/where/cpu/where_cpu.h create mode 100644 src/ops/where/cuda/where.cu create mode 100644 src/ops/where/cuda/where.cuh create mode 100644 src/ops/where/cuda/where_cuda.cc create mode 100644 src/ops/where/operator.cc diff --git a/include/infini_operators.h b/include/infini_operators.h index 9a5a2555..be84ee3f 100644 --- a/include/infini_operators.h +++ b/include/infini_operators.h @@ -5,15 +5,23 @@ #include "ops/causal_softmax/causal_softmax.h" #include "ops/global_avg_pool/global_avg_pool.h" #include "ops/expand/expand.h" +#include "ops/gather_elements/gather_elements.h" +#include "ops/gather/gather.h" #include "ops/gemm/gemm.h" +#include "ops/clip/clip.h" #include "ops/conv/conv.h" #include "ops/matmul/matmul.h" #include "ops/max_pool/max_pool.h" #include "ops/mlp/mlp.h" #include "ops/random_sample/random_sample.h" #include "ops/rearrange/rearrange.h" +#include "ops/reduceMax/reduceMax.h" +#include "ops/reduceMean/reduceMean.h" +#include "ops/reduceMin/reduceMin.h" +#include "ops/reduceSum/reduceSum.h" #include "ops/relu/relu.h" #include "ops/rms_norm/rms_norm.h" #include "ops/rotary_embedding/rotary_embedding.h" #include "ops/swiglu/swiglu.h" +#include "ops/where/where.h" #include "tensor/tensor_descriptor.h" diff --git a/include/ops/clip/clip.h b/include/ops/clip/clip.h new file mode 100644 index 00000000..82220bc3 --- /dev/null +++ b/include/ops/clip/clip.h @@ -0,0 +1,28 @@ +#ifndef CLIP_H +#define CLIP_H + +#include "../../export.h" +#include "../../operators.h" +#include + +typedef struct{ + Device device; +}ClipDescriptor; + +typedef ClipDescriptor *infiniopClipDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle, + infiniopClipDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + float max, + float min); + +__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +__C __export 
infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/ops/gather/gather.h b/include/ops/gather/gather.h new file mode 100644 index 00000000..2d598b03 --- /dev/null +++ b/include/ops/gather/gather.h @@ -0,0 +1,28 @@ +#ifndef GATHER_H +#define GATHER_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct{ + Device device; +}GatherDescriptor; + +typedef GatherDescriptor *infiniopGatherDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis); + +__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/ops/gather_elements/gather_elements.h b/include/ops/gather_elements/gather_elements.h new file mode 100644 index 00000000..7178e96b --- /dev/null +++ b/include/ops/gather_elements/gather_elements.h @@ -0,0 +1,28 @@ +#ifndef GATHER_ELEMENTS_H +#define GATHER_ELEMENTS_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct{ + Device device; +}GatherElementsDescriptor; + +typedef GatherElementsDescriptor *infiniopGatherElementsDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGatherElementsDescriptor(infiniopHandle_t handle, + infiniopGatherElementsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis); + +__C __export infiniopStatus_t infiniopGatherElements(infiniopGatherElementsDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyGatherElementsDescriptor(infiniopGatherElementsDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/ops/reduceMax/reduceMax.h b/include/ops/reduceMax/reduceMax.h new file mode 100644 index 00000000..e4f47c54 --- /dev/null +++ b/include/ops/reduceMax/reduceMax.h @@ -0,0 +1,25 @@ +#ifndef REDUCEMAX_H +#define REDUCEMAX_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMaxDescriptor { + Device device; +} ReduceMaxDescriptor; +typedef ReduceMaxDescriptor *infiniopReduceMaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMaxDescriptor(infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int* axis, + const int num_axis, + int const keepdims); + +__C __export infiniopStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMax(infiniopReduceMaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc); +#endif diff --git a/include/ops/reduceMean/reduceMean.h b/include/ops/reduceMean/reduceMean.h new file mode 100644 index 00000000..d8767213 --- /dev/null +++ b/include/ops/reduceMean/reduceMean.h @@ -0,0 +1,25 @@ +#ifndef REDUCEMEAN_H +#define REDUCEMEAN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef 
struct ReduceMeanDescriptor { + Device device; +} ReduceMeanDescriptor; +typedef ReduceMeanDescriptor *infiniopReduceMeanDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMeanDescriptor(infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int* axis, + const int num_axis, + int const keepdims); + +__C __export infiniopStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMean(infiniopReduceMeanDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc); +#endif diff --git a/include/ops/reduceMin/reduceMin.h b/include/ops/reduceMin/reduceMin.h new file mode 100644 index 00000000..43197434 --- /dev/null +++ b/include/ops/reduceMin/reduceMin.h @@ -0,0 +1,25 @@ +#ifndef REDUCEMIN_H +#define REDUCEMIN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMinDescriptor { + Device device; +} ReduceMinDescriptor; +typedef ReduceMinDescriptor *infiniopReduceMinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMinDescriptor(infiniopHandle_t handle, + infiniopReduceMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int* axis, + const int num_axis, + int const keepdims); + +__C __export infiniopStatus_t infiniopGetReduceMinWorkspaceSize(infiniopReduceMinDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMin(infiniopReduceMinDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMinDescriptor(infiniopReduceMinDescriptor_t desc); +#endif diff --git a/include/ops/reduceSum/reduceSum.h b/include/ops/reduceSum/reduceSum.h new file mode 100644 index 00000000..2cec204e --- /dev/null +++ b/include/ops/reduceSum/reduceSum.h @@ -0,0 +1,25 @@ +#ifndef REDUCESUM_H +#define REDUCESUM_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceSumDescriptor { + Device device; +} ReduceSumDescriptor; +typedef ReduceSumDescriptor *infiniopReduceSumDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceSumDescriptor(infiniopHandle_t handle, + infiniopReduceSumDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int* axis, + const int num_axis, + int const keepdims); + +__C __export infiniopStatus_t infiniopGetReduceSumWorkspaceSize(infiniopReduceSumDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceSum(infiniopReduceSumDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceSumDescriptor(infiniopReduceSumDescriptor_t desc); +#endif diff --git a/include/ops/where/where.h b/include/ops/where/where.h new file mode 100644 index 00000000..3a6e0382 --- /dev/null +++ b/include/ops/where/where.h @@ -0,0 +1,29 @@ +#ifndef WHERE_H +#define WHERE_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct WhereDescriptor{ + Device device; +} WhereDescriptor; + +typedef WhereDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + 
infiniopTensorDescriptor_t dst,
+                                                            infiniopTensorDescriptor_t x,
+                                                            infiniopTensorDescriptor_t y,
+                                                            infiniopTensorDescriptor_t condition);
+
+__C __export infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc,
+                                            void *dst,
+                                            void const *x,
+                                            void const *y,
+                                            void const *condition,
+                                            void *stream);
+
+__C __export infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc);
+
+#endif
diff --git a/new_operators.md b/new_operators.md
new file mode 100644
index 00000000..5081dd04
--- /dev/null
+++ b/new_operators.md
@@ -0,0 +1,240 @@
+## Clip
+
+### Basic properties
+
+| Description                    | Clip: constrain every element of the input tensor to a given range |
+| ------------------------------ | ------------------------------------------------------------------- |
+| Supports in-place computation  | Yes                                                                  |
+| Needs extra workspace          | No                                                                   |
+
+Formula:
+
+Y = min(max(X, min_val), max_val)
+
+### Interface definition
+
+#### Create the operator descriptor
+
+```C
+infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle,
+infiniopClipDescriptor_t *desc_ptr, infiniopTensorDescriptor_t dst,
+infiniopTensorDescriptor_t src, float max, float min);
+```
+
+**Parameters**
+
+| handle   | hardware handle                                              |
+| -------- | ------------------------------------------------------------ |
+| desc_ptr | address of the operator descriptor                            |
+| dst      | output tensor descriptor; same shape and dtype as src         |
+| src      | input tensor descriptor; any shape, dtype can be fp16         |
+| max      | float, upper bound of the output values                       |
+| min      | float, lower bound of the output values                       |
+
+**Return values**
+
+| STATUS_SUCCESS            | success                                                   |
+| ------------------------- | ---------------------------------------------------------- |
+| STATUS_BAD_PARAM          | parameter tensors are inconsistent                          |
+| STATUS_BAD_TENSOR_DTYPE   | input/output dtype is unsupported or the two do not match   |
+| STATUS_BAD_TENSOR_SHAPE   | input/output shapes do not meet the requirements            |
+| STATUS_BAD_TENSOR_STRIDES | tensor strides do not meet the requirements                 |
+
+#### Compute
+
+```C
+infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc,
+void *dst, void const *src, void *stream);
+```
+
+#### Destroy the operator descriptor
+
+```C
+infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);
+```
+
+
+
+## Where
+
+### Basic properties
+
+| Description                    | Conditional select: build the output by picking each element from one of two input tensors according to a condition tensor |
+| ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------- |
+| Supports in-place computation  | Yes                                                                                                                            |
+| Needs extra workspace          | No                                                                                                                             |
+
+Formula:
+
+output[i] = (condition[i] ? X[i] : Y[i])
+
+### Interface definition
+
+#### Create the operator descriptor
+
+```C
+infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle,
+infiniopWhereDescriptor_t *desc_ptr, infiniopTensorDescriptor_t dst,
+infiniopTensorDescriptor_t x, infiniopTensorDescriptor_t y,
+infiniopTensorDescriptor_t condition);
+```
+
+**Parameters**
+
+| handle    | hardware handle                                                                 |
+| --------- | -------------------------------------------------------------------------------- |
+| desc_ptr  | address of the operator descriptor                                                |
+| dst       | output tensor descriptor; same shape as x, y and condition; same dtype as x and y |
+| x         | input tensor descriptor; any shape, dtype can be fp16                             |
+| y         | input tensor descriptor; same shape and dtype as x                                |
+| condition | condition tensor descriptor; same shape as x and y, dtype uint8_t                 |
+
+**Return values**
+
+| STATUS_SUCCESS            | success                                                                                        |
+| ------------------------- | ----------------------------------------------------------------------------------------------- |
+| STATUS_BAD_PARAM          | parameter tensors are inconsistent                                                               |
+| STATUS_BAD_TENSOR_DTYPE   | (1) input/output dtype is unsupported or mismatched (2) condition dtype is unsupported           |
+| STATUS_BAD_TENSOR_SHAPE   | (1) input/output shapes do not meet the requirements (2) condition shape does not match them     |
+| STATUS_BAD_TENSOR_STRIDES | tensor strides do not meet the requirements                                                      |
+
+#### Compute
+
+```C
+infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc,
+void *dst, void const *x, void const *y, void const *condition, void *stream);
+```
+
+#### Destroy the operator descriptor
+
+```C
+infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc);
+```
+
+
+## Gather
+
+### Basic properties
+
+| Description                    | Index gather: collect slices of the input data along a given axis according to an index tensor and assemble them into a new output tensor |
+| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| Supports in-place computation  | No                                                                                                                                            |
+| Needs extra workspace          | No                                                                                                                                            |
+
+Formula:
+
+along the given axis, output = data[index]
+
+### Interface definition
+
+#### Create the operator descriptor
+
+```C
+infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t handle, infiniopGatherDescriptor_t *desc_ptr, infiniopTensorDescriptor_t dst,
+infiniopTensorDescriptor_t data, infiniopTensorDescriptor_t indices,
+int axis);
+```
+
+**Parameters**
+
+| handle   | hardware handle                                                                                                 |
+| -------- | ----------------------------------------------------------------------------------------------------------------- |
+| desc_ptr | address of the operator descriptor                                                                                 |
+| dst      | output tensor descriptor; shape is data_shape[:axis] + indices_shape + data_shape[axis+1:]; same dtype as data     |
+| data     | input tensor descriptor; any shape, dtype can be fp16                                                              |
+| indices  | index tensor descriptor; any shape, dtype int32 or int64                                                           |
+| axis     | int, the axis to gather along; must not exceed the rank of data                                                    |
+
+**Return values**
+
+| STATUS_SUCCESS            | success                                                                                   |
+| ------------------------- | -------------------------------------------------------------------------------------------- |
+| STATUS_BAD_PARAM          | parameter tensors are inconsistent, e.g. (1) an invalid axis value                            |
+| STATUS_BAD_TENSOR_DTYPE   | (1) input/output dtype is unsupported or mismatched (2) index dtype is unsupported            |
+| STATUS_BAD_TENSOR_SHAPE   | (1) input/output shapes do not meet the requirements (2) index shape does not meet them       |
+| STATUS_BAD_TENSOR_STRIDES | tensor strides do not meet the requirements                                                   |
+
+#### Compute
+
+```C
+infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc,
+void *dst, void const *data, void const *indices, void *stream);
+```
+
+#### Destroy the operator descriptor
+
+```C
+infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc);
+```
+
+
+## Reduce
+
+### Basic properties
+
+| Description                    | Reduction: apply a binary operation (max/min, sum, mean) along one or more given axes, reducing the input tensor to an output tensor of smaller shape |
+| ------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Supports in-place computation  | No                                                                                                                                                         |
+| Needs extra workspace          | Yes                                                                                                                                                        |
+
+Formula:
+
+compute (Max, Min, Mean, Sum) along the given axes
+
+### Interface definition
+
+#### Create the operator descriptor
+
+```C
+infiniopStatus_t infiniopCreateReduce(Max,Min,Mean,Sum)Descriptor(infiniopHandle_t handle,
+infiniopReduce(Max,Min,Mean,Sum)Descriptor_t *desc_ptr, infiniopTensorDescriptor_t dst,
+infiniopTensorDescriptor_t src, int* axis, const int num_axis, int const keepdims);
+```
+
+**Parameters**
+
+| handle   | hardware handle                                                                                                                                                                                                     |
+| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| desc_ptr | address of the operator descriptor                                                                                                                                                                                      |
+| dst      | output tensor descriptor; the shape depends on keepdims: if dimensions are kept, rank(dst) = rank(src) and each reduced axis has size 1; otherwise rank(dst) = rank(src) - num_axis, i.e. the reduced dimensions are removed; same dtype as src |
+| src      | input tensor descriptor; any shape, dtype can be fp16                                                                                                                                                                   |
+| axis     | int array listing the axes to reduce; its length must not exceed the rank of src; several axes can be reduced at once                                                                                                   |
+| num_axis | length of the axis array; must not exceed the rank of src; type int                                                                                                                                                     |
+| keepdims | whether to keep the reduced dimensions; they are kept when keepdims is a positive integer; type int                                                                                                                     |
+
+**Return values**
+
+| STATUS_SUCCESS            | success                                                                                               |
+| ------------------------- | ------------------------------------------------------------------------------------------------------ |
+| STATUS_BAD_PARAM          | parameter tensors are inconsistent, e.g. (1) an invalid axis value (2) an unsupported reduction mode    |
+| STATUS_BAD_TENSOR_DTYPE   | input/output dtype is unsupported or the two do not match                                               |
+| STATUS_BAD_TENSOR_SHAPE   | input/output shapes do not meet the requirements                                                        |
+| STATUS_BAD_TENSOR_STRIDES | tensor strides do not meet the requirements                                                             |
+
+#### Query the workspace size
+
+```C
+infiniopStatus_t infiniopGetReduce(Max,Min,Mean,Sum)WorkspaceSize(infiniopReduce(Max,Min,Mean,Sum)Descriptor_t desc, uint64_t *size);
+```
+
+#### Compute
+
+```C
+infiniopStatus_t infiniopReduce(Max,Min,Mean,Sum)(infiniopReduce(Max,Min,Mean,Sum)Descriptor_t desc,
+void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream);
+```
+
+#### Destroy the operator descriptor
+
+```C
+infiniopStatus_t infiniopDestroyReduce(Max,Min,Mean,Sum)Descriptor(infiniopReduce(Max,Min,Mean,Sum)Descriptor_t desc);
+```
\ No newline at end of file
diff --git a/operatorspy/tests/clip.py b/operatorspy/tests/clip.py
new file mode 100644
index 00000000..4a74b3f2
--- /dev/null
+++ b/operatorspy/tests/clip.py
@@ -0,0 +1,159 @@
+from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
+import ctypes
+import sys
+import os
+import time
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
+from operatorspy import (
+    open_lib,
+    to_tensor,
+    DeviceEnum,
+    infiniopHandle_t,
+    infiniopTensorDescriptor_t,
+    create_handle,
+    destroy_handle,
+    check_error,
+)
+
+from operatorspy.tests.test_utils import get_args
+from enum import Enum, auto
+import torch
+
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+class ClipDescriptor(Structure):
+    _fields_ = [("device", c_int32)]
+
+
+infiniopClipDescriptor_t = POINTER(ClipDescriptor)
+
+def clip(x, min_val = -torch.inf, max_val = torch.inf):
+    return torch.clip(x, min_val, max_val)
+
+def test(
+    lib,
+    handle,
+    torch_device,
+    tensor_shape,
+    max_val,
+    min_val,
+    tensor_dtype=torch.float16,
+    inplace=Inplace.OUT_OF_PLACE,
+):
+    print(
+        f"Testing Clip on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
+    )
+
+    x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1
+    y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else x
+
+
+    for i in range(NUM_PRERUN if PROFILE else 1):
+        ans = clip(x,min_val,max_val)
+    if PROFILE:
+        start_time = time.time()
+        for i in range(NUM_ITERATIONS):
+            _ = clip(x,min_val,max_val)
+        elapsed = (time.time() - start_time) / NUM_ITERATIONS
+        print(f"pytorch time: {elapsed :6f}")
+
+    x_tensor = to_tensor(x, lib)
+    y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor
+    descriptor = infiniopClipDescriptor_t()
+
+    check_error(
+        lib.infiniopCreateClipDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y_tensor.descriptor,
+            x_tensor.descriptor,
+            max_val,
+            min_val
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    x_tensor.descriptor.contents.invalidate()
+    y_tensor.descriptor.contents.invalidate()
+
+    for i in range(NUM_PRERUN if PROFILE else 1):
+        check_error(lib.infiniopClip(descriptor, y_tensor.data, x_tensor.data, None))
+    if
PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopClip(descriptor, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans, atol=1e-0, rtol=1e-0) + check_error(lib.infiniopDestroyClipDescriptor(descriptor)) +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, max_val, min_val, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, max_val, min_val, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, max_val, min_val, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, max_val, min_val, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, max_val, min_val, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, max_val, min_val, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((), 0.5, -0.5, Inplace.OUT_OF_PLACE), + ((), 0.5, -0.5, Inplace.INPLACE_X), + ((1, 3), 0.5, -0.5, Inplace.OUT_OF_PLACE), + ((3, 3), 0.5, -0.5, Inplace.OUT_OF_PLACE), + ((3, 3, 13, 9, 17), 0.5, -0.5, Inplace.INPLACE_X), + ((32, 20, 512), 0.5, -0.5, Inplace.INPLACE_X), + ((2, 3, 4, 5), 0.5, -0.5, Inplace.INPLACE_X), + ((33, 333, 333), 0.5, -0.5, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), 0.5, -0.5, Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateClipDescriptor.restype = c_int32 + lib.infiniopCreateClipDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopClipDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + c_float, + ] + lib.infiniopClip.restype = c_int32 + lib.infiniopClip.argtypes = [ + infiniopClipDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyClipDescriptor.restype = c_int32 + lib.infiniopDestroyClipDescriptor.argtypes = [ + infiniopClipDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/gather.py b/operatorspy/tests/gather.py new file mode 100644 index 00000000..e757fbd2 --- /dev/null +++ b/operatorspy/tests/gather.py @@ -0,0 +1,164 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +class GatherDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopGatherDescriptor_t = POINTER(GatherDescriptor) + +def gather(x, indices, axis = 0): + idx = [slice(None)] * x.ndim + idx[axis] = indices + return x[tuple(idx)] + + +def test( + lib, + handle, + torch_device, + tensor_shape, + indices_shape, + axis, + 
tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Gather on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} axis:{axis} inplace: {inplace.name}" + ) + data = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + indices = torch.randint(0, tensor_shape[axis], indices_shape, dtype=torch.int32).to(torch_device) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = gather(data, indices, axis) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = gather(data, indices, axis) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + dst = torch.empty(ans.shape, dtype=tensor_dtype).to(torch_device) + data_tensor = to_tensor(data, lib) + dst_tensor = to_tensor(dst, lib) + indices_tensor = to_tensor(indices, lib) + descriptor = infiniopGatherDescriptor_t() + + check_error( + lib.infiniopCreateGatherDescriptor( + handle, + ctypes.byref(descriptor), + dst_tensor.descriptor, + data_tensor.descriptor, + indices_tensor.descriptor, + axis, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + dst_tensor.descriptor.contents.invalidate() + data_tensor.descriptor.contents.invalidate() + indices_tensor.descriptor.contents.invalidate() + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopGather(descriptor, dst_tensor.data, data_tensor.data, indices_tensor.data, None)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGather(descriptor, dst_tensor.data, data_tensor.data, indices_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(dst, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyGatherDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, dst_shape, axis, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, dst_shape, axis, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, dst_shape, axis, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, dst_shape, axis, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, dst_shape, axis, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, dst_shape, axis, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((2, 3, 2), (2, 2), 1, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (6, 7, 8), 2, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5, 6), (7, 8, 9, 10), 3, Inplace.OUT_OF_PLACE), + ((5, 10), (2,), 0, Inplace.OUT_OF_PLACE), + ((3, 2), (2, 2), 0, Inplace.OUT_OF_PLACE), + ((3, 2), (3, 2), 1, Inplace.OUT_OF_PLACE), + ((33, 333, 333), (4, 5), 1, Inplace.OUT_OF_PLACE), + ((3, 2, 2), (3, 2), 2, Inplace.OUT_OF_PLACE), + ((2, 3, 4), (3, ), 1, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), (112, ), 2, Inplace.OUT_OF_PLACE), + + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateGatherDescriptor.restype = c_int32 + lib.infiniopCreateGatherDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopGatherDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, 
+ infiniopTensorDescriptor_t, + c_int32, + ] + lib.infiniopGather.restype = c_int32 + lib.infiniopGather.argtypes = [ + infiniopGatherDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyGatherDescriptor.restype = c_int32 + lib.infiniopDestroyGatherDescriptor.argtypes = [ + infiniopGatherDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/gather_elements.py b/operatorspy/tests/gather_elements.py new file mode 100644 index 00000000..815e5d04 --- /dev/null +++ b/operatorspy/tests/gather_elements.py @@ -0,0 +1,157 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +class GatherElementsDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopGatherElementsDescriptor_t = POINTER(GatherElementsDescriptor) + +def gather(x, indices, axis = 0): + return torch.gather(x, axis, indices) + +def test( + lib, + handle, + torch_device, + tensor_shape, + indices_shape, + axis, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Gather on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} axis:{axis} inplace: {inplace.name}" + ) + data = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + dst = torch.empty(indices_shape, dtype=tensor_dtype).to(torch_device) + indices = torch.randint(0, tensor_shape[axis], indices_shape, dtype=torch.int64).to(torch_device) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = gather(data, indices, axis) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = gather(data, indices, axis) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + data_tensor = to_tensor(data, lib) + dst_tensor = to_tensor(dst, lib) + indices_tensor = to_tensor(indices, lib) + descriptor = infiniopGatherElementsDescriptor_t() + + check_error( + lib.infiniopCreateGatherElementsDescriptor( + handle, + ctypes.byref(descriptor), + dst_tensor.descriptor, + data_tensor.descriptor, + indices_tensor.descriptor, + axis, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + dst_tensor.descriptor.contents.invalidate() + data_tensor.descriptor.contents.invalidate() + indices_tensor.descriptor.contents.invalidate() + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopGatherElements(descriptor, dst_tensor.data, data_tensor.data, indices_tensor.data, None)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGatherElements(descriptor, dst_tensor.data, data_tensor.data, indices_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(dst, ans, atol=0, rtol=0) + 
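+    # GatherElements uses the same element-wise indexing rule as torch.gather: for a
+    # rank-3 input with axis == 1, out[i][j][k] = data[i][indices[i][j][k]][k], and the
+    # output has the same shape as `indices`. The reference `ans` above is computed with
+    # torch.gather, so the library result is expected to match it bit-exactly (atol=0, rtol=0).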
check_error(lib.infiniopDestroyGatherElementsDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, dst_shape, axis, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, dst_shape, axis, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, dst_shape, axis, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, dst_shape, axis, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, dst_shape, axis, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, dst_shape, axis, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((3, 2), (2, 2), 0, Inplace.OUT_OF_PLACE), + ((3, 2), (3, 2), 1, Inplace.OUT_OF_PLACE), + ((33, 333, 333), (33, 200, 333), 1, Inplace.OUT_OF_PLACE), + ((3, 2, 2), (3, 2, 1), -1, Inplace.OUT_OF_PLACE), + ((2, 3, 4), (2, 2, 4), 1, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), (32, 256, 112, 112), 2, Inplace.OUT_OF_PLACE), + + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateGatherElementsDescriptor.restype = c_int32 + lib.infiniopCreateGatherElementsDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopGatherElementsDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int32, + ] + lib.infiniopGatherElements.restype = c_int32 + lib.infiniopGatherElements.argtypes = [ + infiniopGatherElementsDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyGatherElementsDescriptor.restype = c_int32 + lib.infiniopDestroyGatherElementsDescriptor.argtypes = [ + infiniopGatherElementsDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/reduce_max.py b/operatorspy/tests/reduce_max.py new file mode 100644 index 00000000..56efb401 --- /dev/null +++ b/operatorspy/tests/reduce_max.py @@ -0,0 +1,191 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +class ReduceMaxDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopReduceMaxDescriptor_t = POINTER(ReduceMaxDescriptor) + +def reduceMax(x, axis, keepdims = True): + for i in range(len(axis)): + if keepdims: + out = torch.max(x, dim=axis[i], keepdim=keepdims) + else: + out = torch.max(x, dim=axis[i] - i, keepdim=keepdims) + x = out.values + return out + +def test( + lib, + handle, + torch_device, + tensor_shape, + axis, + keepdims, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing ReduceMax on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} 
axis:{axis} keepdim:{keepdims} inplace: {inplace.name}" + ) + x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + + out_shape = list(tensor_shape) + for i in axis: + if(keepdims==1): + out_shape[i] = 1 + else: + out_shape = out_shape[:i] + out_shape[i+1:] + if(len(axis) == len(tensor_shape)): + if(keepdims == 0): + out_shape = [1] + + y = torch.full(out_shape, -torch.inf, dtype=tensor_dtype).to(torch_device) + + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduceMax(x, axis, bool(keepdims)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduceMax(x, axis, bool(keepdims)) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReduceMaxDescriptor_t() + + check_error( + lib.infiniopCreateReduceMaxDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + (ctypes.c_int * len(axis))(*axis), + len(axis), + keepdims, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMaxWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopReduceMax(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None)) + + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMax(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans.values, atol=0, rtol=0) + check_error(lib.infiniopDestroyReduceMaxDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((2, 3), [1], 1, Inplace.OUT_OF_PLACE), + ((2, 3), [0, 1], 0, Inplace.OUT_OF_PLACE), + ((4, 7), [0, 1], 1, Inplace.OUT_OF_PLACE), + ((300, 1024), [1], 1, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), [0, 1,2,3], 1, Inplace.OUT_OF_PLACE), + ((6, 6, 200), [-1], 1, Inplace.OUT_OF_PLACE), + ((6, 2, 7), [0, 1], 1, Inplace.OUT_OF_PLACE), + ((256, 400), [1], 0, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), [1], 1, Inplace.OUT_OF_PLACE), + ((32, 288, 112, 112), [2], 0, Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = 
open_lib() + lib.infiniopCreateReduceMaxDescriptor.restype = c_int32 + lib.infiniopCreateReduceMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMaxDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ctypes.POINTER(ctypes.c_int), + c_int32, + c_int32, + ] + lib.infiniopGetReduceMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMaxWorkspaceSize.argtypes = [ + infiniopReduceMaxDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceMax.restype = c_int32 + lib.infiniopReduceMax.argtypes = [ + infiniopReduceMaxDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMaxDescriptor.argtypes = [ + infiniopReduceMaxDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/reduce_mean.py b/operatorspy/tests/reduce_mean.py new file mode 100644 index 00000000..6dc34f57 --- /dev/null +++ b/operatorspy/tests/reduce_mean.py @@ -0,0 +1,189 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +class ReduceMeanDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopReduceMeanDescriptor_t = POINTER(ReduceMeanDescriptor) + +def reduceMean(x, axis = 0, keepdims = True): + return torch.mean(x, dim=axis, keepdim=keepdims) + +def test( + lib, + handle, + torch_device, + tensor_shape, + axis, + keepdims, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing ReduceMean on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} axis:{axis} keepdim:{keepdims} inplace: {inplace.name}" + ) + x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + + out_shape = list(tensor_shape) + for i in axis: + if(keepdims==1): + out_shape[i] = 1 + else: + out_shape = out_shape[:i] + out_shape[i+1:] + if(len(axis) == len(tensor_shape)): + if(keepdims == 0): + out_shape = [1] + + y = torch.zeros(out_shape, dtype=tensor_dtype).to(torch_device) + + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduceMean(x, axis, bool(keepdims)) + + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduceMean(x, axis, bool(keepdims)) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReduceMeanDescriptor_t() + + check_error( + lib.infiniopCreateReduceMeanDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + (ctypes.c_int * len(axis))(*axis), + len(axis), + keepdims, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) 
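+    # Reduce operators need scratch memory: query the required byte count from the
+    # library first, then allocate a buffer of that size on the same device (a plain
+    # uint8 torch tensor is used as raw storage) and pass its pointer together with
+    # the size to every infiniopReduceMean call below.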
+ check_error( + lib.infiniopGetReduceMeanWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopReduceMean(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None)) + + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMean(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + if tensor_dtype == torch.float16: + assert torch.allclose(y, ans, atol=1e-3, rtol=1e-7) + else: + assert torch.allclose(y, ans, atol=1e-6, rtol=1e-6) + check_error(lib.infiniopDestroyReduceMeanDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((2, 3), [1], 1, Inplace.OUT_OF_PLACE), + ((2, 3), [0, 1], 0, Inplace.OUT_OF_PLACE), + ((4, 7), [0], 1, Inplace.OUT_OF_PLACE), + ((300, 1024), [-1], 1, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), [1, 2], 1, Inplace.OUT_OF_PLACE), + ((6, 6, 200), [0, 1], 1, Inplace.OUT_OF_PLACE), + ((6, 2, 7), [0, 2], 1, Inplace.OUT_OF_PLACE), + ((266, 400), [0], 0, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), [3], 1, Inplace.OUT_OF_PLACE), + ((32, 288, 112, 112), [3], 0, Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 + lib.infiniopCreateReduceMeanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMeanDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ctypes.POINTER(ctypes.c_int), + c_int32, + c_int32, + ] + lib.infiniopGetReduceMeanWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMeanWorkspaceSize.argtypes = [ + infiniopReduceMeanDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceMean.restype = c_int32 + lib.infiniopReduceMean.argtypes = [ + infiniopReduceMeanDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMeanDescriptor.argtypes = [ + infiniopReduceMeanDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/reduce_min.py b/operatorspy/tests/reduce_min.py new file mode 100644 index 00000000..c77334f6 --- /dev/null +++ b/operatorspy/tests/reduce_min.py @@ -0,0 +1,193 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, 
c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +class ReduceMinDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopReduceMinDescriptor_t = POINTER(ReduceMinDescriptor) + +def reduceMin(x, axis, keepdims = True): + for i in range(len(axis)): + if keepdims: + out = torch.min(x, dim=axis[i], keepdim=keepdims) + else: + out = torch.min(x, dim=axis[i] - i, keepdim=keepdims) + x = out.values + return out + +def test( + lib, + handle, + torch_device, + tensor_shape, + axis, + keepdims, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing ReduceMin on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} axis:{axis} keepdim:{keepdims} inplace: {inplace.name}" + ) + x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + + out_shape = list(tensor_shape) + for i in axis: + if(keepdims==1): + out_shape[i] = 1 + else: + out_shape = out_shape[:i] + out_shape[i+1:] + if(len(axis) == len(tensor_shape)): + if(keepdims == 0): + out_shape = [1] + + + y = torch.full(out_shape, torch.inf, dtype=tensor_dtype).to(torch_device) + + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduceMin(x, axis, bool(keepdims)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduceMin(x, axis, bool(keepdims)) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReduceMinDescriptor_t() + + check_error( + lib.infiniopCreateReduceMinDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + (ctypes.c_int * len(axis))(*axis), + len(axis), + keepdims, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMinWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopReduceMin(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None)) + + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMin(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans.values, atol=0, rtol=0) + check_error(lib.infiniopDestroyReduceMinDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, axis, 
keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((2, 3), [1], 1, Inplace.OUT_OF_PLACE), + ((2, 3), [0], 0, Inplace.OUT_OF_PLACE), + ((4, 7), [0, 1], 1, Inplace.OUT_OF_PLACE), + ((300, 1024), [1], 1, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), [2], 1, Inplace.OUT_OF_PLACE), + ((6, 6, 200), [1], 1, Inplace.OUT_OF_PLACE), + ((4, 9), [1], 1, Inplace.OUT_OF_PLACE), + ((6, 2, 7), [0], 1, Inplace.OUT_OF_PLACE), + ((266, 400), [0], 0, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), [2], 1, Inplace.OUT_OF_PLACE), + ((32, 288, 112, 112), [1], 0, Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMinDescriptor.restype = c_int32 + lib.infiniopCreateReduceMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMinDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ctypes.POINTER(ctypes.c_int), + c_int32, + c_int32, + ] + lib.infiniopGetReduceMinWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMinWorkspaceSize.argtypes = [ + infiniopReduceMinDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceMin.restype = c_int32 + lib.infiniopReduceMin.argtypes = [ + infiniopReduceMinDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReduceMinDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMinDescriptor.argtypes = [ + infiniopReduceMinDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/reduce_sum.py b/operatorspy/tests/reduce_sum.py new file mode 100644 index 00000000..b4d90e68 --- /dev/null +++ b/operatorspy/tests/reduce_sum.py @@ -0,0 +1,189 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +class ReduceSumDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopReduceSumDescriptor_t = POINTER(ReduceSumDescriptor) + +def reduceSum(x, axis = 0, keepdims = True): + return torch.sum(x, dim=axis, keepdim=keepdims) + +def test( + lib, + handle, + torch_device, + tensor_shape, + axis, + keepdims, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing ReduceSum on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} axis:{axis} keepdim:{keepdims} inplace: {inplace.name}" + ) + x = torch.rand(tensor_shape, 
dtype=tensor_dtype).to(torch_device) * 2 - 1 + + out_shape = list(tensor_shape) + for i in axis: + if(keepdims==1): + out_shape[i] = 1 + else: + out_shape = out_shape[:i] + out_shape[i+1:] + if(len(axis) == len(tensor_shape)): + if(keepdims == 0): + out_shape = [1] + + y = torch.zeros(out_shape, dtype=tensor_dtype).to(torch_device) + + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduceSum(x, axis, bool(keepdims)) + + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduceSum(x, axis, bool(keepdims)) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReduceSumDescriptor_t() + + check_error( + lib.infiniopCreateReduceSumDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + (ctypes.c_int * len(axis))(*axis), + len(axis), + keepdims, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceSumWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopReduceSum(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None)) + + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceSum(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + if tensor_dtype == torch.float16: + assert torch.allclose(y, ans, atol=1e-3, rtol=1e-7) + else: + assert torch.allclose(y, ans, atol=1e-6, rtol=1e-6) + check_error(lib.infiniopDestroyReduceSumDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((2, 3), [1], 1, Inplace.OUT_OF_PLACE), + ((2, 3), [0, 1], 0, Inplace.OUT_OF_PLACE), + ((4, 7), [0], 1, Inplace.OUT_OF_PLACE), + ((300, 1024), [0], 1, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), [-2, -1], 1, Inplace.OUT_OF_PLACE), + ((6, 6, 200), [0, 1], 1, Inplace.OUT_OF_PLACE), + ((6, 2, 7), [0, 2], 1, Inplace.OUT_OF_PLACE), + ((266, 400), [0], 0, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), [3], 1, Inplace.OUT_OF_PLACE), + ((32, 288, 112, 112), [3], 0, Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + 
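+    # Declare restype/argtypes so ctypes marshals the handle, the descriptors, the int*
+    # axis array, num_axis and keepdims correctly. The trailing `stream` argument of
+    # infiniopReduceSum is passed as None (a null pointer) at the call sites above;
+    # arguments beyond the declared argtypes fall back to ctypes' default conversion.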
lib.infiniopCreateReduceSumDescriptor.restype = c_int32 + lib.infiniopCreateReduceSumDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceSumDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ctypes.POINTER(ctypes.c_int), + c_int32, + c_int32, + ] + lib.infiniopGetReduceSumWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceSumWorkspaceSize.argtypes = [ + infiniopReduceSumDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceSum.restype = c_int32 + lib.infiniopReduceSum.argtypes = [ + infiniopReduceSumDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReduceSumDescriptor.restype = c_int32 + lib.infiniopDestroyReduceSumDescriptor.argtypes = [ + infiniopReduceSumDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/where.py b/operatorspy/tests/where.py new file mode 100644 index 00000000..70314e95 --- /dev/null +++ b/operatorspy/tests/where.py @@ -0,0 +1,161 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_uint8, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +class WhereDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopWhereDescriptor_t = POINTER(WhereDescriptor) + +def where(condition, x, y): + return torch.where(condition, x, y) + +def test( + lib, + handle, + torch_device, + tensor_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Where on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + + x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) + condition = torch.randint(0, 2, tensor_shape, dtype=torch.uint8).to(torch_device) + dst = torch.empty_like(x)if inplace == Inplace.OUT_OF_PLACE else x + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = where(condition, y, x) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = where(condition, y, x) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + dst_tensor = to_tensor(dst, lib)if inplace == Inplace.OUT_OF_PLACE else x_tensor + y_tensor = to_tensor(y, lib) + condition_tensor = to_tensor(condition, lib) + descriptor = infiniopWhereDescriptor_t() + + check_error( + lib.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + dst_tensor.descriptor, + x_tensor.descriptor, + y_tensor.descriptor, + condition_tensor.descriptor, + ) + ) + + dst_tensor.descriptor.contents.invalidate() + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + condition_tensor.descriptor.contents.invalidate() + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopWhere(descriptor, dst_tensor.data, x_tensor.data, y_tensor.data, 
condition_tensor.data, None)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopWhere(descriptor, dst_tensor.data, x_tensor.data, y_tensor.data, condition_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + assert torch.allclose(dst, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyWhereDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((), Inplace.OUT_OF_PLACE), + ((), Inplace.INPLACE_X), + ((1, 3), Inplace.OUT_OF_PLACE), + ((3, 3), Inplace.OUT_OF_PLACE), + ((3, 3, 13, 9, 17), Inplace.INPLACE_X), + ((32, 20, 512), Inplace.INPLACE_X), + ((33, 333, 333), Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopWhereDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopWhereDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopWhereDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/src/ops/clip/cpu/clip_cpu.cc b/src/ops/clip/cpu/clip_cpu.cc new file mode 100644 index 00000000..d23cacf4 --- /dev/null +++ b/src/ops/clip/cpu/clip_cpu.cc @@ -0,0 +1,83 @@ +#include "clip_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" +#include + +infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t, + ClipCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + float max, + float min) { + uint64_t ndim = src->ndim; + if (ndim != dst->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (src->shape[i] != dst->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(src) || !is_contiguous(dst)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (src->dt != F16 && src->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (src->dt != dst->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + DT dtype = src->dt; + uint64_t data_size = std::accumulate(src->shape, src->shape + src->ndim, 1ULL, std::multiplies()); + *desc_ptr = new ClipCpuDescriptor{ + DevCpu, + dtype, + data_size, + max, + min, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc) { + 
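// The CPU descriptor owns no device-side resources; freeing the struct is sufficient. +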
delete desc; + return STATUS_SUCCESS; +} + +template + +void clip_cpu(ClipCpuDescriptor_t desc, void *dst, void const *src) +{ + auto dstPtr = reinterpret_cast(dst); + auto srcPtr = reinterpret_cast(src); + if constexpr (std::is_same::value) + { + for(uint64_t i = 0; i < desc->data_size; i++) { + float val = f16_to_f32(srcPtr[i]); + float clipped_val = std::min(std::max(val, desc->min), desc->max); + dstPtr[i] = f32_to_f16(clipped_val); + } + } + else{ + for(uint64_t i = 0; i < desc->data_size; i++){ + dstPtr[i] = std::min(std::max(srcPtr[i], desc->min), desc->max); + } + } + +} +infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc, + void *dst, void const *src, + void *stream) { + if(desc->dtype == F32) + { + clip_cpu(desc, dst, src); + } + else if(desc->dtype == F16) + { + clip_cpu(desc, dst, src); + } + return STATUS_SUCCESS; +} + diff --git a/src/ops/clip/cpu/clip_cpu.h b/src/ops/clip/cpu/clip_cpu.h new file mode 100644 index 00000000..584fb6de --- /dev/null +++ b/src/ops/clip/cpu/clip_cpu.h @@ -0,0 +1,27 @@ +#ifndef __CPU_CLIP_H__ +#define __CPU_CLIP_H__ +#include "operators.h" + +struct ClipCpuDescriptor { + Device device; + DT dtype; + uint64_t data_size; + float max; + float min; +}; + +typedef ClipCpuDescriptor *ClipCpuDescriptor_t; + +infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t, + ClipCpuDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + float max, + float min); + +infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc, + void *dst, void const *src, + void *stream); + +infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc); +#endif \ No newline at end of file diff --git a/src/ops/clip/cuda/clip.cc b/src/ops/clip/cuda/clip.cc new file mode 100644 index 00000000..2d55cb69 --- /dev/null +++ b/src/ops/clip/cuda/clip.cc @@ -0,0 +1,74 @@ +#include "clip.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t handle, + ClipCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + float max, + float min) { + uint64_t ndim = src->ndim; + if (ndim != dst->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (src->shape[i] != dst->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(src) || !is_contiguous(dst)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (src->dt != F16 && src->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (src->dt != dst->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + DT dtype = src->dt; + uint64_t data_size = std::accumulate(src->shape, src->shape + src->ndim, 1ULL, std::multiplies()); + void *max_ptr, *min_ptr; + if(dtype == F32) + { + checkCudaErrorWithCode(cudaMalloc(&max_ptr, sizeof(float)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&min_ptr, sizeof(float)), STATUS_MEMORY_NOT_ALLOCATED); + float* max_p = static_cast(max_ptr); + float* min_p = static_cast(min_ptr); + checkCudaErrorWithCode(cudaMemcpy(max_p, &max, sizeof(float), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(min_p, &min, sizeof(float), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + } + else if(dtype == F16) + { + half max_h = __float2half(max); + half min_h = __float2half(min); + checkCudaErrorWithCode(cudaMalloc(&max_ptr, sizeof(half)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&min_ptr, sizeof(half)), STATUS_MEMORY_NOT_ALLOCATED); + half* max_p = 
static_cast(max_ptr); + half* min_p = static_cast(min_ptr); + checkCudaErrorWithCode(cudaMemcpy(max_p, &max_h, sizeof(half), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(min_p, &min_h, sizeof(half), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + } + + + *desc_ptr = new ClipCudaDescriptor{ + DevNvGpu, + dtype, + handle->device_id, + ndim, + data_size, + static_cast(handle->prop.maxGridSize[0]), + max_ptr, + min_ptr, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyClipDescriptor(ClipCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree(desc->max_ptr), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree(desc->min_ptr), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/clip/cuda/clip.cu b/src/ops/clip/cuda/clip.cu new file mode 100644 index 00000000..f2de2f1a --- /dev/null +++ b/src/ops/clip/cuda/clip.cu @@ -0,0 +1,57 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "clip.cuh" + +template +__global__ void clip( + Tdata *y, + const Tdata *x, + const Tdata *max_val, + const Tdata *min_val, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if constexpr (std::is_same::value) { + y[idx] = __hlt(x[idx], *min_val) ? *min_val : __hgt(x[idx], *max_val) ? *max_val : x[idx]; + } else { + y[idx] = x[idx] < *min_val ? *min_val : x[idx] > *max_val ? *max_val : x[idx]; + } +} + +template +infiniopStatus_t clip_nv_gpu(ClipCudaDescriptor_t desc, void *y, void const *x, void *stream) { + if (desc->data_size == 0) { + return STATUS_SUCCESS; + } + dim3 blockDims = dim3(std::min(static_cast(256), desc->data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + const Tdata *max_val = reinterpret_cast(desc->max_ptr); + const Tdata *min_val = reinterpret_cast(desc->min_ptr); + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->data_size; i += step) { + clip<<>>( + y_, x_, max_val, min_val, i); + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc, + void *dst, void const *src, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return clip_nv_gpu(desc, dst, src, stream); + } + if (desc->dtype == F32) { + return clip_nv_gpu(desc, dst, src, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/clip/cuda/clip.cuh b/src/ops/clip/cuda/clip.cuh new file mode 100644 index 00000000..795b410c --- /dev/null +++ b/src/ops/clip/cuda/clip.cuh @@ -0,0 +1,36 @@ +#ifndef __CUDA_CLIP_H__ +#define __CUDA_CLIP_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct ClipCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t data_size; + uint64_t max_grid_size; + void *max_ptr; + void *min_ptr; +}; + +typedef struct ClipCudaDescriptor *ClipCudaDescriptor_t; + +infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t, + ClipCudaDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + float max, + float min); + +infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc, + void *dst, void const *src, + 
void *stream); + +infiniopStatus_t cudaDestroyClipDescriptor(ClipCudaDescriptor_t desc); + +#endif diff --git a/src/ops/clip/operator.cc b/src/ops/clip/operator.cc new file mode 100644 index 00000000..9e40d198 --- /dev/null +++ b/src/ops/clip/operator.cc @@ -0,0 +1,71 @@ +#include "../utils.h" +#include "operators.h" + +#include "ops/clip/clip.h" + +#ifdef ENABLE_CPU +#include "cpu/clip_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/clip.cuh" +#endif + +__C infiniopStatus_t infiniopCreateClipDescriptor( + infiniopHandle_t handle, + infiniopClipDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + float max, + float min){ + switch (handle->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateClipDescriptor(handle, (ClipCpuDescriptor_t *)desc_ptr, dst, src, max, min); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaCreateClipDescriptor((CudaHandle_t) handle, (ClipCudaDescriptor_t *) desc_ptr, dst, src, max, min); + +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, + void *dst, + void const *src, + void *stream){ + switch (desc->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuClip((ClipCpuDescriptor_t)desc, dst, src, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaClip((ClipCudaDescriptor_t) desc, dst, src, stream); + + +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc){ + switch (desc->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyClipDescriptor((ClipCpuDescriptor_t)desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaDestroyClipDescriptor((ClipCudaDescriptor_t) desc); + + +#endif + } + return STATUS_BAD_DEVICE; +} \ No newline at end of file diff --git a/src/ops/gather/cpu/gather_cpu.cc b/src/ops/gather/cpu/gather_cpu.cc new file mode 100644 index 00000000..ead75320 --- /dev/null +++ b/src/ops/gather/cpu/gather_cpu.cc @@ -0,0 +1,122 @@ +#include "gather_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + + +infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t, + GatherCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis) +{ + uint64_t data_ndim = data->ndim; + uint64_t indices_ndim = indices->ndim; + if(axis<0){ + axis += data_ndim; + } + if(axis >= data_ndim){ + return STATUS_BAD_PARAM; + } + if(data_ndim + indices_ndim - 1 != dst->ndim){ + return STATUS_BAD_TENSOR_SHAPE; + } + int j=0; + for (int i=0; indim; i++){ + if(ishape[i] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + } + else if(i == axis){ + for(j=0; jshape[j] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + i++; + } + } + else{ + if (data->shape[i - j] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + } + } + + if (data->dt != F16 && data->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(data)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (!is_contiguous(indices)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (indices->dt != I8 && indices->dt != I16 && indices->dt != I32 && indices->dt != I64) { + return STATUS_BAD_TENSOR_DTYPE; + } + DT dtype = data->dt; + DT indice_type = indices->dt; + uint64_t dst_size = std::accumulate(dst->shape, dst->shape + dst->ndim, 1ULL, std::multiplies()); + uint64_t stride_axis = (data->strides)[axis]; + 
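// Cache the gather axis stride/extent and the total index count; cpuGather uses them to map output offsets back into data. +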
uint64_t shape_axis = (data->shape)[axis]; + uint64_t num_indices = std::accumulate(indices->shape, indices->shape + indices->ndim, 1ULL, std::multiplies()); + *desc_ptr = new GatherCpuDescriptor{ + DevCpu, + dtype, + indice_type, + dst_size, + stride_axis, + shape_axis, + num_indices, + }; + return STATUS_SUCCESS; +} + + + +infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc){ + delete desc; + return STATUS_SUCCESS; +} + +template +void gather_cpu(GatherCpuDescriptor_t desc, + void *dst, + void const *data, + void const *indices){ + auto dstPtr = reinterpret_cast(dst); + auto dataPtr = reinterpret_cast(data); + auto indicePtr = reinterpret_cast(indices); + + for(uint64_t i = 0; i < desc->dst_size; i++) + { + uint64_t index = indicePtr[i / desc->stride_axis % desc->num_indices]; + int64_t linearId = desc->shape_axis * desc->stride_axis * (i / (desc->num_indices * desc->stride_axis)) + index * desc->stride_axis + i % desc->stride_axis; + dstPtr[i] = dataPtr[linearId]; + } +} + +infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream){ + if(desc->dtype == F32 && desc->indice_type == I32) + { + gather_cpu(desc, dst, data, indices); + } + else if(desc->dtype == F16 && desc->indice_type == I32) + { + gather_cpu(desc, dst, data, indices); + } + else if(desc->dtype == F32 && desc->indice_type == I64) + { + gather_cpu(desc, dst, data, indices); + } + else if(desc->dtype == F16 && desc->indice_type == I64) + { + gather_cpu(desc, dst, data, indices); + } + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/gather/cpu/gather_cpu.h b/src/ops/gather/cpu/gather_cpu.h new file mode 100644 index 00000000..c8a403e9 --- /dev/null +++ b/src/ops/gather/cpu/gather_cpu.h @@ -0,0 +1,31 @@ +#ifndef __CPU_GATHER_H__ +#define __CPU_GATHER_H__ +#include "operators.h" + +struct GatherCpuDescriptor { + Device device; + DT dtype; + DT indice_type; + uint64_t dst_size; + uint64_t stride_axis; + uint64_t shape_axis; + uint64_t num_indices; +}; + +typedef GatherCpuDescriptor *GatherCpuDescriptor_t; + +infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t, + GatherCpuDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int const axis); + +infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream); + +infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc); +#endif \ No newline at end of file diff --git a/src/ops/gather/cuda/gather.cc b/src/ops/gather/cuda/gather.cc new file mode 100644 index 00000000..f6e6841a --- /dev/null +++ b/src/ops/gather/cuda/gather.cc @@ -0,0 +1,93 @@ +#include "gather.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateGatherDescriptor(CudaHandle_t handle, + GatherCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis) { + uint64_t data_ndim = data->ndim; + uint64_t indices_ndim = indices->ndim; + if(axis<0){ + axis += data_ndim; + } + if(axis >= data_ndim){ + return STATUS_BAD_PARAM; + } + + int j=0; + for (int i=0; indim; i++){ + if(ishape[i] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + } + else if(i == axis){ + for(j=0; jshape[j] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + i++; + } + } + else{ + if (data->shape[i - j] != dst->shape[i]){ + return 
STATUS_BAD_TENSOR_SHAPE; + } + } + } + if (data->dt != F16 && data->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(data)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (!is_contiguous(indices)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (indices->dt != I8 && indices->dt != I16 && indices->dt != I32 && indices->dt != I64) { + return STATUS_BAD_TENSOR_DTYPE; + } + DT dtype = data->dt; + DT indice_type = indices->dt; + uint64_t dst_size = std::accumulate(dst->shape, dst->shape + dst->ndim, 1ULL, std::multiplies()); + + uint64_t stride_axis = (data->strides)[axis]; + uint64_t shape_axis = (data->shape)[axis]; + uint64_t num_indices = std::accumulate(indices->shape, indices->shape + indices->ndim, 1ULL, std::multiplies()); + + int64_t *stride_axis_ptr, *shape_axis_ptr; + uint64_t *num_indices_ptr; + checkCudaErrorWithCode(cudaMalloc(&stride_axis_ptr, sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&shape_axis_ptr, sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&num_indices_ptr, sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(stride_axis_ptr, &stride_axis, sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(shape_axis_ptr, &shape_axis, sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(num_indices_ptr, &num_indices, sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + + + *desc_ptr = new GatherCudaDescriptor{ + DevNvGpu, + dtype, + indice_type, + handle->device_id, + data_ndim, + dst_size, + stride_axis_ptr, + shape_axis_ptr, + num_indices_ptr, + static_cast(handle->prop.maxGridSize[0]), + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyGatherDescriptor(GatherCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree(desc->stride_axis_ptr), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree(desc->shape_axis_ptr), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree(desc->num_indices_ptr), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/gather/cuda/gather.cu b/src/ops/gather/cuda/gather.cu new file mode 100644 index 00000000..6726aef0 --- /dev/null +++ b/src/ops/gather/cuda/gather.cu @@ -0,0 +1,76 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "gather.cuh" + +template +__global__ void gather( + Tdata *dst, + const Tdata *data, + const Idata *indices, + const int64_t *stride_axis, + const int64_t *shape_axis, + const uint64_t *num_indices, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + Idata index = indices[idx / *stride_axis % *num_indices]; + + int64_t linearId = *shape_axis * *stride_axis * (idx / (*num_indices * *stride_axis)) + index * *stride_axis + idx % *stride_axis; + + dst[idx] = data[linearId]; + + +} + +template +infiniopStatus_t gather_nv_gpu(GatherCudaDescriptor_t desc, void *dst, void const *data, void const *indices, void *stream) { + if (desc->dst_size == 0) { + return STATUS_SUCCESS; + } + dim3 blockDims = dim3(std::min(static_cast(256), desc->dst_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->dst_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + const auto dst_ = reinterpret_cast(dst); + const auto data_ = reinterpret_cast(data); + const auto indices_ = 
reinterpret_cast(indices); + + const int64_t *stride_axis = reinterpret_cast(desc->stride_axis_ptr); + const int64_t *shape_axis = reinterpret_cast(desc->shape_axis_ptr); + const uint64_t *num_indices = reinterpret_cast(desc->num_indices_ptr); + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->dst_size; i += step) { + gather<<>>( + dst_, data_, indices_, stride_axis, shape_axis, num_indices, i); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGather(GatherCudaDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream){ + checkCudaError(cudaSetDevice(desc->device_id)); + if(desc->dtype == F32 && desc->indice_type == I32) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + else if(desc->dtype == F16 && desc->indice_type == I32) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + else if(desc->dtype == F32 && desc->indice_type == I64) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + else if(desc->dtype == F16 && desc->indice_type == I64) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/gather/cuda/gather.cuh b/src/ops/gather/cuda/gather.cuh new file mode 100644 index 00000000..f082d86a --- /dev/null +++ b/src/ops/gather/cuda/gather.cuh @@ -0,0 +1,40 @@ +#ifndef __CUDA_GATHER_H__ +#define __CUDA_GATHER_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct GatherCudaDescriptor { + Device device; + DT dtype; + DT indice_type; + int device_id; + uint64_t ndim; + uint64_t dst_size; + int64_t* stride_axis_ptr; + int64_t* shape_axis_ptr; + uint64_t* num_indices_ptr; + uint64_t max_grid_size; +}; + +typedef struct GatherCudaDescriptor *GatherCudaDescriptor_t; + +infiniopStatus_t cudaCreateGatherDescriptor(CudaHandle_t, + GatherCudaDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis); + +infiniopStatus_t cudaGather(GatherCudaDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream); + +infiniopStatus_t cudaDestroyGatherDescriptor(GatherCudaDescriptor_t desc); + +#endif diff --git a/src/ops/gather/operator.cc b/src/ops/gather/operator.cc new file mode 100644 index 00000000..40fdc52d --- /dev/null +++ b/src/ops/gather/operator.cc @@ -0,0 +1,67 @@ +#include "../utils.h" +#include "operators.h" + +#include "ops/gather/gather.h" + +#ifdef ENABLE_CPU +#include "cpu/gather_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/gather.cuh" +#endif + +__C infiniopStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis){ + switch (handle->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateGatherDescriptor(handle, (GatherCpuDescriptor_t *)desc_ptr, dst, data, indices, axis); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaCreateGatherDescriptor((CudaHandle_t)handle, (GatherCudaDescriptor_t *)desc_ptr, dst, data, indices, axis); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream){ + 
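// Dispatch on the device recorded in the descriptor at creation time. +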
switch (desc->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGather((GatherCpuDescriptor_t)desc, dst, data, indices, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaGather((GatherCudaDescriptor_t)desc, dst, data, indices, stream); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc){ + switch (desc->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyGatherDescriptor((GatherCpuDescriptor_t)desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaDestroyGatherDescriptor((GatherCudaDescriptor_t)desc); +#endif + } + return STATUS_BAD_DEVICE; +} \ No newline at end of file diff --git a/src/ops/gather_elements/cpu/gather_elements_cpu.cc b/src/ops/gather_elements/cpu/gather_elements_cpu.cc new file mode 100644 index 00000000..2dd7d683 --- /dev/null +++ b/src/ops/gather_elements/cpu/gather_elements_cpu.cc @@ -0,0 +1,113 @@ +#include "gather_elements_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + + +infiniopStatus_t cpuCreateGatherElementsDescriptor(infiniopHandle_t, + GatherElementsCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis) +{ + uint64_t data_ndim = data->ndim; + uint64_t indices_ndim = indices->ndim; + if (indices_ndim > data_ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + if(axis<0){ + axis = axis + dst->ndim; + } + + for(size_t i = 0; i < indices_ndim; i++){ + if(indices->shape[i] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + if(i == axis){ + if (data->shape[i] < indices->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + else{ + if (data->shape[i] != indices->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + } + if (data->dt != F16 && data->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(data)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (!is_contiguous(indices)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (indices->dt != I8 && indices->dt != I16 && indices->dt != I32 && indices->dt != I64) { + return STATUS_BAD_TENSOR_DTYPE; + } + DT dtype = data->dt; + DT indice_type = indices->dt; + uint64_t indices_size = std::accumulate(indices->shape, indices->shape + indices->ndim, 1ULL, std::multiplies()); + uint64_t stride_axis = (data->strides)[axis]; + uint64_t shape_axis = (data->shape)[axis]; + uint64_t num_indices = indices->shape[axis]; + *desc_ptr = new GatherElementsCpuDescriptor{ + DevCpu, + dtype, + indice_type, + indices_size, + stride_axis, + shape_axis, + num_indices, + }; + return STATUS_SUCCESS; +} + + + +infiniopStatus_t cpuDestroyGatherElementsDescriptor(GatherElementsCpuDescriptor_t desc){ + delete desc; + return STATUS_SUCCESS; +} + +template +void gather_cpu(GatherElementsCpuDescriptor_t desc, + void *dst, + void const *data, + void const *indices){ + auto dstPtr = reinterpret_cast(dst); + auto dataPtr = reinterpret_cast(data); + auto indicePtr = reinterpret_cast(indices); + for(uint64_t i = 0; i < desc->indices_size; i++) + { + uint64_t index = indicePtr[i]; + int64_t linearId = desc->shape_axis * desc->stride_axis * (i / (desc->num_indices * desc->stride_axis)) + index * desc->stride_axis + i % desc->stride_axis; + dstPtr[i] = dataPtr[linearId]; + } +} + +infiniopStatus_t cpuGatherElements(GatherElementsCpuDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream){ + if(desc->dtype == F32 && desc->indice_type == I32) + { + 
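// Each branch instantiates gather_cpu for the matching (data dtype, index dtype) combination. +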
gather_cpu(desc, dst, data, indices); + } + else if(desc->dtype == F16 && desc->indice_type == I32) + { + gather_cpu(desc, dst, data, indices); + } + else if(desc->dtype == F32 && desc->indice_type == I64) + { + gather_cpu(desc, dst, data, indices); + } + else if(desc->dtype == F16 && desc->indice_type == I64) + { + gather_cpu(desc, dst, data, indices); + } + + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/gather_elements/cpu/gather_elements_cpu.h b/src/ops/gather_elements/cpu/gather_elements_cpu.h new file mode 100644 index 00000000..aebbdf49 --- /dev/null +++ b/src/ops/gather_elements/cpu/gather_elements_cpu.h @@ -0,0 +1,31 @@ +#ifndef __CPU_GATHER_ELEMENTS_H__ +#define __CPU_GATHER_ELEMENTS_H__ +#include "operators.h" + +struct GatherElementsCpuDescriptor { + Device device; + DT dtype; + DT indice_type; + uint64_t indices_size; + uint64_t stride_axis; + uint64_t shape_axis; + uint64_t num_indices; +}; + +typedef GatherElementsCpuDescriptor *GatherElementsCpuDescriptor_t; + +infiniopStatus_t cpuCreateGatherElementsDescriptor(infiniopHandle_t, + GatherElementsCpuDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int const axis); + +infiniopStatus_t cpuGatherElements(GatherElementsCpuDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream); + +infiniopStatus_t cpuDestroyGatherElementsDescriptor(GatherElementsCpuDescriptor_t desc); +#endif \ No newline at end of file diff --git a/src/ops/gather_elements/cuda/gather_elements.cc b/src/ops/gather_elements/cuda/gather_elements.cc new file mode 100644 index 00000000..0604f436 --- /dev/null +++ b/src/ops/gather_elements/cuda/gather_elements.cc @@ -0,0 +1,86 @@ +#include "gather_elements.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateGatherElementsDescriptor(CudaHandle_t handle, + GatherElementsCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis) { + uint64_t data_ndim = data->ndim; + uint64_t indices_ndim = indices->ndim; + if (indices_ndim > data_ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + if(axis<0){ + axis = axis + dst->ndim; + } + + for(size_t i = 0; i < indices_ndim; i++){ + if(indices->shape[i] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + if(i == axis){ + if (data->shape[i] < indices->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + else{ + if (data->shape[i] != indices->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + } + if (data->dt != F16 && data->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(data)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (!is_contiguous(indices)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (indices->dt != I8 && indices->dt != I16 && indices->dt != I32 && indices->dt != I64) { + return STATUS_BAD_TENSOR_DTYPE; + } + DT dtype = data->dt; + DT indice_type = indices->dt; + uint64_t indices_size = std::accumulate(indices->shape, indices->shape + indices->ndim, 1ULL, std::multiplies()); + + uint64_t stride_axis = (data->strides)[axis]; + uint64_t shape_axis = (data->shape)[axis]; + uint64_t num_indices = indices->shape[axis]; + + int64_t *stride_axis_ptr, *shape_axis_ptr, *num_indices_ptr; + checkCudaErrorWithCode(cudaMalloc(&stride_axis_ptr, sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&shape_axis_ptr, sizeof(int64_t)), 
STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&num_indices_ptr, sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(stride_axis_ptr, &stride_axis, sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(shape_axis_ptr, &shape_axis, sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(num_indices_ptr, &num_indices, sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + + + *desc_ptr = new GatherElementsCudaDescriptor{ + DevNvGpu, + dtype, + indice_type, + handle->device_id, + data_ndim, + indices_size, + stride_axis_ptr, + shape_axis_ptr, + num_indices_ptr, + static_cast(handle->prop.maxGridSize[0]), + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyGatherElementsDescriptor(GatherElementsCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree(desc->stride_axis_ptr), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree(desc->shape_axis_ptr), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree(desc->num_indices_ptr), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/gather_elements/cuda/gather_elements.cu b/src/ops/gather_elements/cuda/gather_elements.cu new file mode 100644 index 00000000..b445c40b --- /dev/null +++ b/src/ops/gather_elements/cuda/gather_elements.cu @@ -0,0 +1,74 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "gather_elements.cuh" + +template +__global__ void gather( + Tdata *dst, + const Tdata *data, + const Idata *indices, + const int64_t *stride_axis, + const int64_t *shape_axis, + const int64_t *num_indices, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + Idata index = indices[idx]; + int64_t linearId = *shape_axis * *stride_axis * (idx / (*num_indices * *stride_axis)) + index * *stride_axis + idx % *stride_axis; + dst[idx] = data[linearId]; + +} + +template +infiniopStatus_t gather_nv_gpu(GatherElementsCudaDescriptor_t desc, void *dst, void const *data, void const *indices, void *stream) { + if (desc->indices_size == 0) { + return STATUS_SUCCESS; + } + dim3 blockDims = dim3(std::min(static_cast(256), desc->indices_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->indices_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + const auto dst_ = reinterpret_cast(dst); + const auto data_ = reinterpret_cast(data); + const auto indices_ = reinterpret_cast(indices); + + const int64_t *stride_axis = reinterpret_cast(desc->stride_axis_ptr); + const int64_t *shape_axis = reinterpret_cast(desc->shape_axis_ptr); + const int64_t *num_indices = reinterpret_cast(desc->num_indices_ptr); + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->indices_size; i += step) { + gather<<>>( + dst_, data_, indices_, stride_axis, shape_axis, num_indices, i); + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGatherElements(GatherElementsCudaDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream){ + checkCudaError(cudaSetDevice(desc->device_id)); + if(desc->dtype == F32 && desc->indice_type == I32) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + else if(desc->dtype == F16 && desc->indice_type == I32) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + else if(desc->dtype 
== F32 && desc->indice_type == I64) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + else if(desc->dtype == F16 && desc->indice_type == I64) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/gather_elements/cuda/gather_elements.cuh b/src/ops/gather_elements/cuda/gather_elements.cuh new file mode 100644 index 00000000..ac75763b --- /dev/null +++ b/src/ops/gather_elements/cuda/gather_elements.cuh @@ -0,0 +1,40 @@ +#ifndef __CUDA_GATHER_H__ +#define __CUDA_GATHER_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct GatherElementsCudaDescriptor { + Device device; + DT dtype; + DT indice_type; + int device_id; + uint64_t ndim; + uint64_t indices_size; + int64_t* stride_axis_ptr; + int64_t* shape_axis_ptr; + int64_t* num_indices_ptr; + uint64_t max_grid_size; +}; + +typedef struct GatherElementsCudaDescriptor *GatherElementsCudaDescriptor_t; + +infiniopStatus_t cudaCreateGatherElementsDescriptor(CudaHandle_t, + GatherElementsCudaDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis); + +infiniopStatus_t cudaGatherElements(GatherElementsCudaDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream); + +infiniopStatus_t cudaDestroyGatherElementsDescriptor(GatherElementsCudaDescriptor_t desc); + +#endif diff --git a/src/ops/gather_elements/operator.cc b/src/ops/gather_elements/operator.cc new file mode 100644 index 00000000..980c39b0 --- /dev/null +++ b/src/ops/gather_elements/operator.cc @@ -0,0 +1,67 @@ +#include "../utils.h" +#include "operators.h" + +#include "ops/gather_elements/gather_elements.h" + +#ifdef ENABLE_CPU +#include "cpu/gather_elements_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/gather_elements.cuh" +#endif + +__C infiniopStatus_t infiniopCreateGatherElementsDescriptor( + infiniopHandle_t handle, + infiniopGatherElementsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis){ + switch (handle->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateGatherElementsDescriptor(handle, (GatherElementsCpuDescriptor_t *)desc_ptr, dst, data, indices, axis); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaCreateGatherElementsDescriptor((CudaHandle_t)handle, (GatherElementsCudaDescriptor_t *)desc_ptr, dst, data, indices, axis); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGatherElements(infiniopGatherElementsDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream){ + switch (desc->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGatherElements((GatherElementsCpuDescriptor_t)desc, dst, data, indices, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaGatherElements((GatherElementsCudaDescriptor_t)desc, dst, data, indices, stream); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyGatherElementsDescriptor(infiniopGatherElementsDescriptor_t desc){ + switch (desc->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyGatherElementsDescriptor((GatherElementsCpuDescriptor_t)desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return 
cudaDestroyGatherElementsDescriptor((GatherElementsCudaDescriptor_t)desc); +#endif + } + return STATUS_BAD_DEVICE; +} \ No newline at end of file diff --git a/src/ops/reduce/cpu/reduce_cpu.cc b/src/ops/reduce/cpu/reduce_cpu.cc new file mode 100644 index 00000000..b41f557d --- /dev/null +++ b/src/ops/reduce/cpu/reduce_cpu.cc @@ -0,0 +1,178 @@ +#include "reduce_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, + ReduceCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int *axis, + const int num_axis, + int keep_dims, + int reduce_type){ + uint64_t ndim = src->ndim; + int n = num_axis; + + for (int i=0; i= ndim){ + return STATUS_BAD_PARAM; + } + if(keep_dims == 1){ + if(dst->shape[axis[i]] != 1){ + return STATUS_BAD_TENSOR_SHAPE; + } + } + } + + if (!is_contiguous(src) || !is_contiguous(dst)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (src->dt != dst->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (src->dt != F16 && src->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (reduce_type > 3) { + return STATUS_BAD_PARAM; + } + DT dtype = src->dt; + uint64_t dst_size = std::accumulate(dst->shape, dst->shape + dst->ndim, 1ULL, std::multiplies()); + uint64_t data_size = std::accumulate(src->shape, src->shape + ndim, 1ULL, std::multiplies()); + uint64_t *shape_axis = new uint64_t[n]; + uint64_t *stride_axis = new uint64_t[n]; + uint64_t add_dim = 1; + for(int i = 0; ishape[axis[i]]; + add_dim *= shape_axis[i]; + stride_axis[i] = src->strides[axis[i]]; + } + + *desc_ptr = new ReduceCpuDescriptor{ + DevCpu, + dtype, + dst_size, + data_size, + stride_axis, + shape_axis, + add_dim, + n, + reduce_type, + }; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetReduceWorkspaceSize(ReduceCpuDescriptor_t desc, uint64_t *size) { + + *size += desc->dst_size * sizeof(float); + + return STATUS_SUCCESS; +} + + +infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc) { + delete[] desc->shape_axis; + delete[] desc->stride_axis; + delete desc; + return STATUS_SUCCESS; +} + +template +void reduce(ReduceCpuDescriptor_t desc, float *workspacePtr, Tdata *dstPtr, Tdata const *srcPtr, uint64_t i, uint64_t index){ + switch (desc->reduce_mode) + { + case 0: + if constexpr (std::is_same::value) { + workspacePtr[index] = std::fmax(workspacePtr[index], f16_to_f32(srcPtr[i])); + dstPtr[index] = f32_to_f16(workspacePtr[index]); + } else { + dstPtr[index] = std::max(dstPtr[index], srcPtr[i]); + } + break; + case 1: + if constexpr (std::is_same::value) { + workspacePtr[index] = std::fmin(workspacePtr[index], f16_to_f32(srcPtr[i])); + dstPtr[index] = f32_to_f16(workspacePtr[index]); + } else { + dstPtr[index] = std::min(dstPtr[index], srcPtr[i]); + } + break; + default: + if constexpr (std::is_same::value) { + workspacePtr[index] += f16_to_f32(srcPtr[i]); + } + else{ + dstPtr[index] += srcPtr[i]; + } + break; + } +} + +template +void reduce_cpu(ReduceCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src){ + auto dstPtr = reinterpret_cast(dst); + auto srcPtr = reinterpret_cast(src); + auto workspacePtr = reinterpret_cast(workspace); + + if constexpr (std::is_same::value){ + switch (desc->reduce_mode) + { + case 0: + std::fill(workspacePtr, workspacePtr + desc->dst_size, -std::numeric_limits::infinity()); + break; + case 1: + std::fill(workspacePtr, workspacePtr + desc->dst_size, std::numeric_limits::infinity()); + break; + default: + std::fill(workspacePtr, workspacePtr + 
desc->dst_size, 0); + break; + } + } + for(uint64_t i = 0; i < desc->data_size; i++){ + uint64_t index = i; +#pragma unroll + for(int j = 0; j < desc->axis_num; j++){ + index = index / (desc->stride_axis[j] * desc->shape_axis[j]) * desc->stride_axis[j] + index % desc->stride_axis[j]; + } + reduce(desc, workspacePtr, dstPtr, srcPtr, i, index); + } + if(desc->reduce_mode == 2){ + for(int k = 0; k < desc->dst_size; k++){ + if constexpr (std::is_same::value) { + dstPtr[k] = f32_to_f16(workspacePtr[k] / desc->add_dim); + } + else{ + dstPtr[k] /= desc->add_dim; + } + } + } + else if(desc->reduce_mode == 3){ + if constexpr (std::is_same::value){ + for(int k = 0; k < desc->dst_size; k++){ + dstPtr[k] = f32_to_f16(workspacePtr[k]); + } + } + } + +} + +infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *dst, + void const *src, + void *stream){ + + if(desc->dtype == F32) + { + reduce_cpu(desc, workspace, workspace_size, dst, src); + } + else if(desc->dtype == F16) + { + reduce_cpu(desc, workspace, workspace_size, dst, src); + } + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/reduce/cpu/reduce_cpu.h b/src/ops/reduce/cpu/reduce_cpu.h new file mode 100644 index 00000000..1d2bfd85 --- /dev/null +++ b/src/ops/reduce/cpu/reduce_cpu.h @@ -0,0 +1,45 @@ +#ifndef __CPU_REDUCE_H__ +#define __CPU_REDUCE_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "operators.h" +#include +#include +#include +#include + +struct ReduceCpuDescriptor { + Device device; + DT dtype; + uint64_t dst_size; + uint64_t data_size; + uint64_t* stride_axis; + uint64_t* shape_axis; + uint64_t add_dim; + int axis_num; + int reduce_mode; +}; + +typedef struct ReduceCpuDescriptor *ReduceCpuDescriptor_t; + +infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, + ReduceCpuDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int *axis, + const int num_axis, + int const keepdims, + int reduce_type); + +infiniopStatus_t cpuGetReduceWorkspaceSize(ReduceCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc); + +#endif diff --git a/src/ops/reduce/cuda/reduce.cc b/src/ops/reduce/cuda/reduce.cc new file mode 100644 index 00000000..c2bcb71f --- /dev/null +++ b/src/ops/reduce/cuda/reduce.cc @@ -0,0 +1,92 @@ +#include "reduce.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle, + ReduceCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int *axis, + int num_axis, + int keep_dims, + int reduce_type) { + uint64_t ndim = src->ndim; + int n = num_axis; + + for (int i=0; i= ndim){ + return STATUS_BAD_PARAM; + } + if(keep_dims == 1){ + if(dst->shape[axis[i]] != 1){ + return STATUS_BAD_TENSOR_SHAPE; + } + } + } + if (!is_contiguous(src) || !is_contiguous(dst)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (src->dt != dst->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (src->dt != F16 && src->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (reduce_type > 3) { + return STATUS_BAD_PARAM; + } + DT dtype = src->dt; + uint64_t dst_size = std::accumulate(dst->shape, dst->shape + dst->ndim, 1ULL, std::multiplies()); + uint64_t data_size = std::accumulate(src->shape, 
src->shape + ndim, 1ULL, std::multiplies()); + + int64_t *shape_axis = new int64_t[n]; + int64_t *stride_axis = new int64_t[n]; + uint64_t add_dim = 1; + for(int i = 0; ishape[axis[i]]; + add_dim *= shape_axis[i]; + stride_axis[i] = src->strides[axis[i]]; + } + + int64_t *stride_axis_ptr, *shape_axis_ptr; + checkCudaErrorWithCode(cudaMalloc(&stride_axis_ptr, n * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&shape_axis_ptr, n * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(stride_axis_ptr, stride_axis, n * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(shape_axis_ptr, shape_axis, n * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + + delete[] shape_axis; + delete[] stride_axis; + *desc_ptr = new ReduceCudaDescriptor{ + DevNvGpu, + dtype, + handle->device_id, + ndim, + dst_size, + data_size, + add_dim, + n, + stride_axis_ptr, + shape_axis_ptr, + reduce_type, + static_cast(handle->prop.maxGridSize[0]), + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetReduceWorkspaceSize(ReduceCudaDescriptor_t desc, uint64_t *size){ + *size = 0; + return STATUS_SUCCESS; + +} + +infiniopStatus_t cudaDestroyReduceDescriptor(ReduceCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree(desc->stride_axis_ptr), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree(desc->shape_axis_ptr), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/reduce/cuda/reduce.cu b/src/ops/reduce/cuda/reduce.cu new file mode 100644 index 00000000..306f8628 --- /dev/null +++ b/src/ops/reduce/cuda/reduce.cu @@ -0,0 +1,169 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "reduce.cuh" +#include + +template +__global__ void reduce( + Tdata *dst, + Tdata const*src, + const int64_t *stride_axis, + const int64_t *shape_axis, + int axis_num, + uint64_t add_dim, + uint64_t offset, + int reduce_type) { + int64_t threadId = threadIdx.y * blockDim.x + threadIdx.x; + int64_t idx = offset + threadId + blockIdx.x * add_dim; + + extern __shared__ unsigned char sharedMemory[]; + + Tdata* sharedData = reinterpret_cast(sharedMemory); + + + int64_t shift_idx = idx; + #pragma unroll + for (int i=0; i 0) { + if (threadId < s && (threadId + s) < pre_s) { + switch(reduce_type) { + case 0: + if constexpr (std::is_same::value) { + sharedData[threadId] = __hmax(sharedData[threadId], sharedData[threadId + s]); + } else { + sharedData[threadId] = fmaxf(sharedData[threadId], sharedData[threadId + s]); + } + break; + case 1: + if constexpr (std::is_same::value) { + sharedData[threadId] = __hmin(sharedData[threadId], sharedData[threadId + s]); + } else { + sharedData[threadId] = fminf(sharedData[threadId], sharedData[threadId + s]); + } + break; + default: + if constexpr (std::is_same::value) { + sharedData[threadId] = __hadd(sharedData[threadId], sharedData[threadId + s]); + } else { + sharedData[threadId] += sharedData[threadId + s]; + } + break; + } + } + __syncthreads(); + if (pre_s % 2 != 0 && threadId == 0) { + switch(reduce_type) { + case 0: + if constexpr (std::is_same::value) { + sharedData[0] = __hmax(sharedData[0], sharedData[pre_s - 1]); + } else { + sharedData[0] = fmaxf(sharedData[0], sharedData[pre_s - 1]); + } + break; + case 1: + if constexpr (std::is_same::value) { + sharedData[0] = __hmin(sharedData[0], sharedData[pre_s - 1]); + } else { + sharedData[0] = 
fminf(sharedData[0], sharedData[pre_s - 1]); + } + break; + default: + if constexpr (std::is_same::value) { + sharedData[0] = __hadd(sharedData[0], sharedData[pre_s-1]); + } else { + sharedData[0] += sharedData[pre_s-1]; + } + break; + } + } + __syncthreads(); + + pre_s = s; + s = pre_s / 2; + } + + if(threadId == 0) + { + int64_t index = shift_idx; + #pragma unroll + for(int j = 0; j < axis_num; j++){ + index = index / (stride_axis[j] * shape_axis[j]) * stride_axis[j] + index % stride_axis[j]; + } + dst[index] = sharedData[threadId]; + if(reduce_type == 2) + { + float shape = static_cast(add_dim); + if constexpr (std::is_same::value) { + dst[index] = __float2half(__half2float(dst[index]) / (shape)); + //dst[index] /= __float2half(shape); + } else { + dst[index] /= shape; + } + } + } + +} + +template +infiniopStatus_t reduce_nv_gpu(ReduceCudaDescriptor_t desc, void *dst, void const *src, void *stream) { + if (desc->data_size == 0) { + return STATUS_SUCCESS; + } + const int64_t *stride_axis = reinterpret_cast(desc->stride_axis_ptr); + const int64_t *shape_axis = reinterpret_cast(desc->shape_axis_ptr); + + dim3 blockDims, gridDims; + size_t sharedMemSize; + uint64_t step; + if (desc->add_dim > 1024){ + blockDims = dim3(1024, ROUND_UP_DIV(desc->add_dim, 1024)); + sharedMemSize = desc->add_dim * sizeof(Tdata); + step = blockDims.x * blockDims.y; + }else{ + blockDims = dim3(desc->add_dim); + sharedMemSize = blockDims.x * sizeof(Tdata); + step = blockDims.x; + } + gridDims = dim3(std::min(ROUND_UP_DIV(desc->data_size, desc->add_dim), desc->max_grid_size)); + + step *= gridDims.x; + + + const auto dst_ = reinterpret_cast(dst); + const auto src_ = reinterpret_cast(src); + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->data_size; i += step) { + reduce<<>>( + dst_, src_, stride_axis, shape_axis, desc->axis_num, desc->add_dim, i, desc->reduce_mode); + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaReduce(ReduceCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *dst, + void const *src, + void *stream){ + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return reduce_nv_gpu(desc, dst, src, stream); + } + if (desc->dtype == F32) { + return reduce_nv_gpu(desc, dst, src, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/reduce/cuda/reduce.cuh b/src/ops/reduce/cuda/reduce.cuh new file mode 100644 index 00000000..55973044 --- /dev/null +++ b/src/ops/reduce/cuda/reduce.cuh @@ -0,0 +1,47 @@ +#ifndef __CUDA_REDUCE_H__ +#define __CUDA_REDUCE_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct ReduceCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t dst_size; + uint64_t data_size; + uint64_t add_dim; + int axis_num; + int64_t* stride_axis_ptr; + int64_t* shape_axis_ptr; + int reduce_mode; + uint64_t max_grid_size; +}; + +typedef struct ReduceCudaDescriptor *ReduceCudaDescriptor_t; + +infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t, + ReduceCudaDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int *axis, + int num_axis, + int keepdims, + int reduce_type); + +infiniopStatus_t cudaGetReduceWorkspaceSize(ReduceCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaReduce(ReduceCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void 
*dst, + void const *src, + void *stream); + +infiniopStatus_t cudaDestroyReduceDescriptor(ReduceCudaDescriptor_t desc); + +#endif diff --git a/src/ops/reduce/operator.cc b/src/ops/reduce/operator.cc new file mode 100644 index 00000000..2ca06e28 --- /dev/null +++ b/src/ops/reduce/operator.cc @@ -0,0 +1,89 @@ +#include "../utils.h" +#include "operators.h" +#include "reduce.h" + +#ifdef ENABLE_CPU +#include "cpu/reduce_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/common_cuda.h" +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/reduce.cuh" +#endif +#ifdef ENABLE_CAMBRICON_MLU +// TODO +#endif + +__C infiniopStatus_t infiniopCreateReduceDescriptor( + infiniopHandle_t handle, + infiniopReduceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int *axis, + const int num_axis, + int const keepdims, + int reduce_type) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateReduceDescriptor(handle, (ReduceCpuDescriptor_t *)desc_ptr, dst, src, axis, num_axis, keepdims, reduce_type); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateReduceDescriptor((CudaHandle_t) handle, (ReduceCudaDescriptor_t *) desc_ptr, dst, src, axis, num_axis, keepdims, reduce_type); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *dst, + void const *src, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuReduce((ReduceCpuDescriptor_t)desc, workspace, workspace_size, dst, src, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaReduce((ReduceCudaDescriptor_t)desc, workspace, workspace_size, dst, src, stream); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGetReduceWorkspaceSize(infiniopReduceDescriptor_t desc, uint64_t *size) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGetReduceWorkspaceSize((ReduceCpuDescriptor_t) desc, size); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetReduceWorkspaceSize((ReduceCudaDescriptor_t) desc, size); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyReduceDescriptor((ReduceCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyReduceDescriptor((ReduceCudaDescriptor_t)desc); + } + +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/reduce/reduce.h b/src/ops/reduce/reduce.h new file mode 100644 index 00000000..e66b57a9 --- /dev/null +++ b/src/ops/reduce/reduce.h @@ -0,0 +1,31 @@ +#ifndef REDUCE_H +#define REDUCE_H + +#include "export.h" +#include "operators.h" + +typedef struct ReduceDescriptor { + Device device; +} ReduceDescriptor; +typedef ReduceDescriptor *infiniopReduceDescriptor_t; + +__C infiniopStatus_t infiniopCreateReduceDescriptor(infiniopHandle_t handle, + infiniopReduceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int *axis, + const int num_axis, + int const keepdims, + int reduce_type); + +__C infiniopStatus_t infiniopGetReduceWorkspaceSize(infiniopReduceDescriptor_t desc, uint64_t *size); + +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *dst, + void const*src, + void *stream); 
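+// workspace must be at least the size reported by infiniopGetReduceWorkspaceSize for this descriptor.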
diff --git a/src/ops/reduce/reduce.h b/src/ops/reduce/reduce.h
new file mode 100644
index 00000000..e66b57a9
--- /dev/null
+++ b/src/ops/reduce/reduce.h
@@ -0,0 +1,31 @@
+#ifndef REDUCE_H
+#define REDUCE_H
+
+#include "export.h"
+#include "operators.h"
+
+typedef struct ReduceDescriptor {
+    Device device;
+} ReduceDescriptor;
+typedef ReduceDescriptor *infiniopReduceDescriptor_t;
+
+__C infiniopStatus_t infiniopCreateReduceDescriptor(infiniopHandle_t handle,
+                                                    infiniopReduceDescriptor_t *desc_ptr,
+                                                    infiniopTensorDescriptor_t dst,
+                                                    infiniopTensorDescriptor_t src,
+                                                    int *axis,
+                                                    const int num_axis,
+                                                    int const keepdims,
+                                                    int reduce_type);
+
+__C infiniopStatus_t infiniopGetReduceWorkspaceSize(infiniopReduceDescriptor_t desc, uint64_t *size);
+
+__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc,
+                                    void *workspace,
+                                    uint64_t workspace_size,
+                                    void *dst,
+                                    void const *src,
+                                    void *stream);
+
+__C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc);
+#endif
diff --git a/src/ops/reduceMax/operator.cc b/src/ops/reduceMax/operator.cc
new file mode 100644
index 00000000..53503eee
--- /dev/null
+++ b/src/ops/reduceMax/operator.cc
@@ -0,0 +1,50 @@
+#include "../reduce/reduce.h"
+#include "../utils.h"
+#include "ops/reduceMax/reduceMax.h"
+
+struct _ReduceMaxDescriptor {
+    Device device;
+    infiniopReduceDescriptor_t reduce_desc;
+    uint64_t workspace_size;
+};
+
+typedef struct _ReduceMaxDescriptor *_ReduceMaxDescriptor_t;
+__C __export infiniopStatus_t infiniopCreateReduceMaxDescriptor(infiniopHandle_t handle,
+                                                                infiniopReduceMaxDescriptor_t *desc_ptr,
+                                                                infiniopTensorDescriptor_t dst,
+                                                                infiniopTensorDescriptor_t src,
+                                                                int *axis,
+                                                                const int num_axis,
+                                                                int const keepdims) {
+    infiniopReduceDescriptor_t reduce_desc;
+    CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, dst, src, axis, num_axis, keepdims, 0), STATUS_SUCCESS);
+    uint64_t workspace_size = 0;
+    CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS);
+
+    *(_ReduceMaxDescriptor_t *) desc_ptr = new _ReduceMaxDescriptor{
+        handle->device,
+        reduce_desc,
+        workspace_size,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, uint64_t *size) {
+    *size = ((_ReduceMaxDescriptor_t) desc)->workspace_size;
+    return STATUS_SUCCESS;
+}
+__C __export infiniopStatus_t infiniopReduceMax(infiniopReduceMaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream) {
+    auto _desc = (_ReduceMaxDescriptor_t) desc;
+    if (workspace_size < _desc->workspace_size) {
+        return STATUS_MEMORY_NOT_ALLOCATED;
+    }
+    CHECK_STATUS(infiniopReduce(_desc->reduce_desc, workspace, workspace_size, dst, src, stream), STATUS_SUCCESS);
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) {
+    CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMaxDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS);
+    delete desc;
+    return STATUS_SUCCESS;
+}
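A minimal call sequence for the ReduceMax wrapper above, shown only as an illustration; `handle`, `dst_desc`, `src_desc` and the data buffers are assumed to already exist and are not defined by this patch:

// Illustrative sketch (CPU case); names other than the infiniop* API are assumptions.
infiniopReduceMaxDescriptor_t desc;
int axes[] = {1};
CHECK_STATUS(infiniopCreateReduceMaxDescriptor(handle, &desc, dst_desc, src_desc, axes, 1, /*keepdims=*/1), STATUS_SUCCESS);
uint64_t workspace_size = 0;
CHECK_STATUS(infiniopGetReduceMaxWorkspaceSize(desc, &workspace_size), STATUS_SUCCESS);
void *workspace = malloc(workspace_size);   // use a device allocation when the handle is a GPU handle
CHECK_STATUS(infiniopReduceMax(desc, workspace, workspace_size, dst_data, src_data, /*stream=*/nullptr), STATUS_SUCCESS);
free(workspace);
CHECK_STATUS(infiniopDestroyReduceMaxDescriptor(desc), STATUS_SUCCESS);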
diff --git a/src/ops/reduceMean/operator.cc b/src/ops/reduceMean/operator.cc
new file mode 100644
index 00000000..245ff710
--- /dev/null
+++ b/src/ops/reduceMean/operator.cc
@@ -0,0 +1,51 @@
+#include "../reduce/reduce.h"
+#include "../utils.h"
+#include "ops/reduceMean/reduceMean.h"
+
+struct _ReduceMeanDescriptor {
+    Device device;
+    infiniopReduceDescriptor_t reduce_desc;
+    uint64_t workspace_size;
+};
+
+typedef struct _ReduceMeanDescriptor *_ReduceMeanDescriptor_t;
+__C __export infiniopStatus_t infiniopCreateReduceMeanDescriptor(infiniopHandle_t handle,
+                                                                 infiniopReduceMeanDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t dst,
+                                                                 infiniopTensorDescriptor_t src,
+                                                                 int *axis,
+                                                                 const int num_axis,
+                                                                 int const keepdims) {
+    infiniopReduceDescriptor_t reduce_desc;
+    CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, dst, src, axis, num_axis, keepdims, 2), STATUS_SUCCESS);
+    uint64_t workspace_size = 0;
+    CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS);
+
+    *(_ReduceMeanDescriptor_t *) desc_ptr = new _ReduceMeanDescriptor{
+        handle->device,
+        reduce_desc,
+        workspace_size,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, uint64_t *size) {
+    *size = ((_ReduceMeanDescriptor_t) desc)->workspace_size;
+    return STATUS_SUCCESS;
+}
+__C __export infiniopStatus_t infiniopReduceMean(infiniopReduceMeanDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream) {
+    auto _desc = (_ReduceMeanDescriptor_t) desc;
+    if (workspace_size < _desc->workspace_size) {
+        return STATUS_MEMORY_NOT_ALLOCATED;
+    }
+    CHECK_STATUS(infiniopReduce(_desc->reduce_desc, workspace, workspace_size, dst, src, stream), STATUS_SUCCESS);
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) {
+    CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMeanDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS);
+    delete desc;
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/reduceMin/operator.cc b/src/ops/reduceMin/operator.cc
new file mode 100644
index 00000000..7d029f81
--- /dev/null
+++ b/src/ops/reduceMin/operator.cc
@@ -0,0 +1,51 @@
+#include "../reduce/reduce.h"
+#include "../utils.h"
+#include "ops/reduceMin/reduceMin.h"
+
+struct _ReduceMinDescriptor {
+    Device device;
+    infiniopReduceDescriptor_t reduce_desc;
+    uint64_t workspace_size;
+};
+
+typedef struct _ReduceMinDescriptor *_ReduceMinDescriptor_t;
+__C __export infiniopStatus_t infiniopCreateReduceMinDescriptor(infiniopHandle_t handle,
+                                                                infiniopReduceMinDescriptor_t *desc_ptr,
+                                                                infiniopTensorDescriptor_t dst,
+                                                                infiniopTensorDescriptor_t src,
+                                                                int *axis,
+                                                                const int num_axis,
+                                                                int const keepdims) {
+    infiniopReduceDescriptor_t reduce_desc;
+    CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, dst, src, axis, num_axis, keepdims, 1), STATUS_SUCCESS);
+    uint64_t workspace_size = 0;
+    CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS);
+
+    *(_ReduceMinDescriptor_t *) desc_ptr = new _ReduceMinDescriptor{
+        handle->device,
+        reduce_desc,
+        workspace_size,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopGetReduceMinWorkspaceSize(infiniopReduceMinDescriptor_t desc, uint64_t *size) {
+    *size = ((_ReduceMinDescriptor_t) desc)->workspace_size;
+    return STATUS_SUCCESS;
+}
+__C __export infiniopStatus_t infiniopReduceMin(infiniopReduceMinDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream) {
+    auto _desc = (_ReduceMinDescriptor_t) desc;
+    if (workspace_size < _desc->workspace_size) {
+        return STATUS_MEMORY_NOT_ALLOCATED;
+    }
+    CHECK_STATUS(infiniopReduce(_desc->reduce_desc, workspace, workspace_size, dst, src, stream), STATUS_SUCCESS);
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopDestroyReduceMinDescriptor(infiniopReduceMinDescriptor_t desc) {
+    CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMinDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS);
+    delete desc;
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/reduceSum/operator.cc b/src/ops/reduceSum/operator.cc
new file mode 100644
index 00000000..ff4012fa
--- /dev/null
+++ b/src/ops/reduceSum/operator.cc
@@ -0,0 +1,51 @@
+#include "../reduce/reduce.h"
+#include "../utils.h"
+#include "ops/reduceSum/reduceSum.h"
+
+struct _ReduceSumDescriptor {
+    Device device;
+    infiniopReduceDescriptor_t reduce_desc;
+    uint64_t workspace_size;
+};
+
+typedef struct _ReduceSumDescriptor *_ReduceSumDescriptor_t;
+__C __export infiniopStatus_t infiniopCreateReduceSumDescriptor(infiniopHandle_t handle,
+                                                                infiniopReduceSumDescriptor_t *desc_ptr,
+                                                                infiniopTensorDescriptor_t dst,
+                                                                infiniopTensorDescriptor_t src,
+                                                                int *axis,
+                                                                const int num_axis,
+                                                                int const keepdims) {
+    infiniopReduceDescriptor_t reduce_desc;
+    CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, dst, src, axis, num_axis, keepdims, 3), STATUS_SUCCESS);
+    uint64_t workspace_size = 0;
+    CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS);
+
+    *(_ReduceSumDescriptor_t *) desc_ptr = new _ReduceSumDescriptor{
+        handle->device,
+        reduce_desc,
+        workspace_size,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopGetReduceSumWorkspaceSize(infiniopReduceSumDescriptor_t desc, uint64_t *size) {
+    *size = ((_ReduceSumDescriptor_t) desc)->workspace_size;
+    return STATUS_SUCCESS;
+}
+__C __export infiniopStatus_t infiniopReduceSum(infiniopReduceSumDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream) {
+    auto _desc = (_ReduceSumDescriptor_t) desc;
+    if (workspace_size < _desc->workspace_size) {
+        return STATUS_MEMORY_NOT_ALLOCATED;
+    }
+    CHECK_STATUS(infiniopReduce(_desc->reduce_desc, workspace, workspace_size, dst, src, stream), STATUS_SUCCESS);
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopDestroyReduceSumDescriptor(infiniopReduceSumDescriptor_t desc) {
+    CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceSumDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS);
+    delete desc;
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/where/cpu/where_cpu.cc b/src/ops/where/cpu/where_cpu.cc
new file mode 100644
index 00000000..a0fe432d
--- /dev/null
+++ b/src/ops/where/cpu/where_cpu.cc
@@ -0,0 +1,76 @@
+#include "where_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include "../../utils.h"
+#include <numeric>
+
+infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t handle,
+                                          WhereCpuDescriptor_t *desc_ptr,
+                                          infiniopTensorDescriptor_t dst,
+                                          infiniopTensorDescriptor_t x,
+                                          infiniopTensorDescriptor_t y,
+                                          infiniopTensorDescriptor_t condition) {
+    uint64_t ndim = condition->ndim;
+    if (ndim != x->ndim || ndim != y->ndim || ndim != dst->ndim) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    for (size_t i = 0; i < ndim; ++i) {
+        if (condition->shape[i] != x->shape[i] || condition->shape[i] != y->shape[i] || condition->shape[i] != dst->shape[i]) {
+            return STATUS_BAD_TENSOR_SHAPE;
+        }
+    }
+    if (!is_contiguous(condition) || !is_contiguous(x) || !is_contiguous(y) || !is_contiguous(dst)) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+    if (x->dt != y->dt || x->dt != dst->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (x->dt != F16 && x->dt != F32 && x->dt != U16) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (condition->dt != U8) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    DT dtype = x->dt;
+    uint64_t data_size = std::accumulate(condition->shape, condition->shape + ndim, 1ULL, std::multiplies<uint64_t>());
+    *desc_ptr = new WhereCpuDescriptor{
+        DevCpu,
+        dtype,
+        data_size,
+    };
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc) {
+    delete desc;
+    return STATUS_SUCCESS;
+}
+
+template<typename Tdata>
+void where_cpu(WhereCpuDescriptor_t desc, void *dst, void const *x, void const *y, void const *condition) {
+    auto dstPtr = reinterpret_cast<Tdata *>(dst);
+    auto x_Ptr = reinterpret_cast<Tdata const *>(x);
+    auto y_Ptr = reinterpret_cast<Tdata const *>(y);
+    auto c_Ptr = reinterpret_cast<uint8_t const *>(condition);
+    for (uint64_t i = 0; i < desc->data_size; i++) {
+        // Non-zero condition selects the element from y, zero selects it from x.
+        dstPtr[i] = (c_Ptr[i]) ? y_Ptr[i] : x_Ptr[i];
+    }
+}
+
+infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc,
+                          void *dst,
+                          void const *x,
+                          void const *y,
+                          void const *condition,
+                          void *stream) {
+    if (desc->dtype == F32) {
+        where_cpu<float>(desc, dst, x, y, condition);
+        return STATUS_SUCCESS;
+    }
+    if (desc->dtype == F16 || desc->dtype == U16) {
+        // F16 and U16 are both 2-byte element types; where only copies elements,
+        // so a bitwise copy through uint16_t covers both.
+        where_cpu<uint16_t>(desc, dst, x, y, condition);
+        return STATUS_SUCCESS;
+    }
+    return STATUS_BAD_TENSOR_DTYPE;
+}
+
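For reference, the CPU kernel above picks the element from y wherever condition is non-zero and from x wherever it is zero. A standalone host-side illustration of that rule (not part of the patch):

#include <cstdint>
#include <cstdio>

int main() {
    float x[4] = {1.f, 2.f, 3.f, 4.f};
    float y[4] = {10.f, 20.f, 30.f, 40.f};
    uint8_t condition[4] = {1, 0, 1, 0};
    float dst[4];
    for (int i = 0; i < 4; ++i) {
        dst[i] = condition[i] ? y[i] : x[i];   // same element-wise rule as where_cpu above
    }
    // dst is now {10, 2, 30, 4}
    for (int i = 0; i < 4; ++i) printf("%g ", dst[i]);
    return 0;
}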
diff --git a/src/ops/where/cpu/where_cpu.h b/src/ops/where/cpu/where_cpu.h
new file mode 100644
index 00000000..0d185b4a
--- /dev/null
+++ b/src/ops/where/cpu/where_cpu.h
@@ -0,0 +1,28 @@
+#ifndef __CPU_WHERE_H__
+#define __CPU_WHERE_H__
+#include "operators.h"
+
+struct WhereCpuDescriptor {
+    Device device;
+    DT dtype;
+    uint64_t data_size;
+};
+
+typedef WhereCpuDescriptor *WhereCpuDescriptor_t;
+
+infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t,
+                                          WhereCpuDescriptor_t *,
+                                          infiniopTensorDescriptor_t dst,
+                                          infiniopTensorDescriptor_t x,
+                                          infiniopTensorDescriptor_t y,
+                                          infiniopTensorDescriptor_t condition);
+
+infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc,
+                          void *dst,
+                          void const *x,
+                          void const *y,
+                          void const *condition,
+                          void *stream);
+
+infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc);
+#endif
\ No newline at end of file
diff --git a/src/ops/where/cuda/where.cu b/src/ops/where/cuda/where.cu
new file mode 100644
index 00000000..382af1fa
--- /dev/null
+++ b/src/ops/where/cuda/where.cu
@@ -0,0 +1,58 @@
+#include "../../../devices/cuda/common_cuda.h"
+#include "../../utils.h"
+#include "where.cuh"
+
+template<typename Tdata>
+__global__ void where(
+    Tdata *dst,
+    const Tdata *x,
+    const Tdata *y,
+    const uint8_t *condition,
+    uint64_t data_size,
+    uint64_t offset) {
+    uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset;
+    // Guard against the rounded-up launch overshooting the tensor.
+    // Same convention as the CPU path: non-zero condition selects y, zero selects x.
+    if (idx < data_size) {
+        dst[idx] = condition[idx] ? y[idx] : x[idx];
+    }
+}
+
+template<typename Tdata>
+infiniopStatus_t where_nv_gpu(WhereCudaDescriptor_t desc, void *dst, void const *x, void const *y, void const *condition, void *stream) {
+    if (desc->data_size == 0) {
+        return STATUS_SUCCESS;
+    }
+    dim3 blockDims = dim3(std::min(static_cast<uint64_t>(256), desc->data_size));
+    dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->data_size, blockDims.x), desc->max_grid_size));
+    uint64_t step = gridDims.x * blockDims.x;
+
+    const auto dst_ = reinterpret_cast<Tdata *>(dst);
+    const auto x_ = reinterpret_cast<const Tdata *>(x);
+    const auto y_ = reinterpret_cast<const Tdata *>(y);
+    const auto condition_ = reinterpret_cast<const uint8_t *>(condition);
+
+    cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    for (uint64_t i = 0; i < desc->data_size; i += step) {
+        where<Tdata><<<gridDims, blockDims, 0, cuda_stream>>>(dst_, x_, y_, condition_, desc->data_size, i);
+    }
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cudaWhere(WhereCudaDescriptor_t desc,
+                           void *dst,
+                           void const *x,
+                           void const *y,
+                           void const *condition,
+                           void *stream) {
+    checkCudaError(cudaSetDevice(desc->device_id));
+    if (desc->dtype == F16) {
+        return where_nv_gpu<half>(desc, dst, x, y, condition, stream);
+    }
+    if (desc->dtype == F32) {
+        return where_nv_gpu<float>(desc, dst, x, y, condition, stream);
+    }
+    return STATUS_BAD_TENSOR_DTYPE;
+}
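The host loop above issues one kernel launch per `step` elements. A commonly used alternative is a single launch with a grid-stride loop inside the kernel; a minimal sketch, assuming the same descriptor fields and launch dimensions (not part of this patch):

template<typename Tdata>
__global__ void where_grid_stride(Tdata *dst, const Tdata *x, const Tdata *y,
                                  const uint8_t *condition, uint64_t data_size) {
    uint64_t stride = static_cast<uint64_t>(gridDim.x) * blockDim.x;
    for (uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < data_size; idx += stride) {
        dst[idx] = condition[idx] ? y[idx] : x[idx];
    }
}

// One launch then covers the whole tensor regardless of data_size:
// where_grid_stride<Tdata><<<gridDims, blockDims, 0, cuda_stream>>>(dst_, x_, y_, condition_, desc->data_size);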
diff --git a/src/ops/where/cuda/where.cuh b/src/ops/where/cuda/where.cuh
new file mode 100644
index 00000000..9a96fe54
--- /dev/null
+++ b/src/ops/where/cuda/where.cuh
@@ -0,0 +1,37 @@
+#ifndef __CUDA_WHERE_H__
+#define __CUDA_WHERE_H__
+
+#include "../../../devices/cuda/common_cuda.h"
+#include "../../../devices/cuda/cuda_handle.h"
+#include "operators.h"
+#include <cuda_fp16.h>
+#include <numeric>
+
+struct WhereCudaDescriptor {
+    Device device;
+    DT dtype;
+    int device_id;
+    uint64_t ndim;
+    uint64_t data_size;
+    uint64_t max_grid_size;
+};
+
+typedef struct WhereCudaDescriptor *WhereCudaDescriptor_t;
+
+infiniopStatus_t cudaCreateWhereDescriptor(CudaHandle_t,
+                                           WhereCudaDescriptor_t *,
+                                           infiniopTensorDescriptor_t dst,
+                                           infiniopTensorDescriptor_t x,
+                                           infiniopTensorDescriptor_t y,
+                                           infiniopTensorDescriptor_t condition);
+
+infiniopStatus_t cudaWhere(WhereCudaDescriptor_t desc,
+                           void *dst,
+                           void const *x,
+                           void const *y,
+                           void const *condition,
+                           void *stream);
+
+infiniopStatus_t cudaDestroyWhereDescriptor(WhereCudaDescriptor_t desc);
+
+#endif
diff --git a/src/ops/where/cuda/where_cuda.cc b/src/ops/where/cuda/where_cuda.cc
new file mode 100644
index 00000000..1ab20ed3
--- /dev/null
+++ b/src/ops/where/cuda/where_cuda.cc
@@ -0,0 +1,52 @@
+#include "where.cuh"
+#include "../../../devices/cuda/common_cuda.h"
+#include "../../utils.h"
+
+infiniopStatus_t cudaCreateWhereDescriptor(CudaHandle_t handle,
+                                           WhereCudaDescriptor_t *desc_ptr,
+                                           infiniopTensorDescriptor_t dst,
+                                           infiniopTensorDescriptor_t x,
+                                           infiniopTensorDescriptor_t y,
+                                           infiniopTensorDescriptor_t condition) {
+    uint64_t ndim = condition->ndim;
+    if (ndim != x->ndim || ndim != y->ndim || ndim != dst->ndim) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    for (size_t i = 0; i < ndim; ++i) {
+        if (condition->shape[i] != x->shape[i] || condition->shape[i] != y->shape[i] || condition->shape[i] != dst->shape[i]) {
+            return STATUS_BAD_TENSOR_SHAPE;
+        }
+    }
+    if (!is_contiguous(condition) || !is_contiguous(x) || !is_contiguous(y) || !is_contiguous(dst)) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+    if (x->dt != y->dt || x->dt != dst->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (x->dt != F16 && x->dt != F32 && x->dt != U16) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (condition->dt != U8) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    DT dtype = x->dt;
+    uint64_t data_size = std::accumulate(condition->shape, condition->shape + condition->ndim, 1ULL, std::multiplies<uint64_t>());
+
+    *desc_ptr = new WhereCudaDescriptor{
+        DevNvGpu,
+        dtype,
+        handle->device_id,
+        ndim,
+        data_size,
+        static_cast<uint64_t>(handle->prop.maxGridSize[0]),
+    };
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cudaDestroyWhereDescriptor(WhereCudaDescriptor_t desc) {
+    delete desc;
+    return STATUS_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/ops/where/operator.cc b/src/ops/where/operator.cc
new file mode 100644
index 00000000..9187c643
--- /dev/null
+++ b/src/ops/where/operator.cc
@@ -0,0 +1,68 @@
+#include "../utils.h"
+#include "operators.h"
+
+#include "ops/where/where.h"
+
+#ifdef ENABLE_CPU
+#include "cpu/where_cpu.h"
+#endif
+#ifdef ENABLE_NV_GPU
+#include "../../devices/cuda/cuda_handle.h"
+#include "cuda/where.cuh"
+#endif
+
+__C infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle,
+                                                   infiniopWhereDescriptor_t *desc_ptr,
+                                                   infiniopTensorDescriptor_t dst,
+                                                   infiniopTensorDescriptor_t x,
+                                                   infiniopTensorDescriptor_t y,
+                                                   infiniopTensorDescriptor_t condition) {
+    switch (handle->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuCreateWhereDescriptor(handle, (WhereCpuDescriptor_t *) desc_ptr, dst, x, y, condition);
+#endif
+#ifdef ENABLE_NV_GPU
+        case DevNvGpu:
+            return cudaCreateWhereDescriptor((CudaHandle_t) handle, (WhereCudaDescriptor_t *) desc_ptr, dst, x, y, condition);
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
+
+__C infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc,
+                                   void *dst,
+                                   void const *x,
+                                   void const *y,
+                                   void const *condition,
+                                   void *stream) {
+    switch (desc->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuWhere((WhereCpuDescriptor_t) desc, dst, x, y, condition, stream);
+#endif
+#ifdef ENABLE_NV_GPU
+        case DevNvGpu:
+            return cudaWhere((WhereCudaDescriptor_t) desc, dst, x, y, condition, stream);
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
+
+__C infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) {
+    switch (desc->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuDestroyWhereDescriptor((WhereCpuDescriptor_t) desc);
+#endif
+#ifdef ENABLE_NV_GPU
+        case DevNvGpu:
+            return cudaDestroyWhereDescriptor((WhereCudaDescriptor_t) desc);
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
\ No newline at end of file
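For completeness, a minimal call sequence for the new Where operator, shown only as an illustration; `handle`, the four tensor descriptors and the data buffers are assumed to already exist and are not defined by this patch:

// Illustrative sketch; names other than the infiniop* API are assumptions.
infiniopWhereDescriptor_t where_desc;
CHECK_STATUS(infiniopCreateWhereDescriptor(handle, &where_desc, dst_desc, x_desc, y_desc, cond_desc), STATUS_SUCCESS);
// condition is a uint8_t tensor; non-zero entries select from y, zero entries from x.
CHECK_STATUS(infiniopWhere(where_desc, dst_data, x_data, y_data, cond_data, /*stream=*/nullptr), STATUS_SUCCESS);
CHECK_STATUS(infiniopDestroyWhereDescriptor(where_desc), STATUS_SUCCESS);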