From 7c719aa0e6edadc91358252befb984b3694390af Mon Sep 17 00:00:00 2001 From: voidwarlock <1707297086@qq.com> Date: Fri, 14 Mar 2025 17:31:59 +0800 Subject: [PATCH] =?UTF-8?q?[=E8=AE=AD=E7=BB=83=E8=90=A5]=E6=96=B0=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E7=AE=97=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/infini_operators.h | 8 + include/ops/clip/clip.h | 28 ++ include/ops/gather/gather.h | 28 ++ include/ops/gather_elements/gather_elements.h | 28 ++ include/ops/reduceMax/reduceMax.h | 25 ++ include/ops/reduceMean/reduceMean.h | 25 ++ include/ops/reduceMin/reduceMin.h | 25 ++ include/ops/reduceSum/reduceSum.h | 25 ++ include/ops/where/where.h | 29 +++ new_operators.md | 240 ++++++++++++++++++ operatorspy/tests/clip.py | 159 ++++++++++++ operatorspy/tests/gather.py | 164 ++++++++++++ operatorspy/tests/gather_elements.py | 157 ++++++++++++ operatorspy/tests/reduce_max.py | 191 ++++++++++++++ operatorspy/tests/reduce_mean.py | 189 ++++++++++++++ operatorspy/tests/reduce_min.py | 193 ++++++++++++++ operatorspy/tests/reduce_sum.py | 189 ++++++++++++++ operatorspy/tests/where.py | 161 ++++++++++++ src/ops/clip/cpu/clip_cpu.cc | 83 ++++++ src/ops/clip/cpu/clip_cpu.h | 27 ++ src/ops/clip/cuda/clip.cc | 74 ++++++ src/ops/clip/cuda/clip.cu | 57 +++++ src/ops/clip/cuda/clip.cuh | 36 +++ src/ops/clip/operator.cc | 71 ++++++ src/ops/gather/cpu/gather_cpu.cc | 122 +++++++++ src/ops/gather/cpu/gather_cpu.h | 31 +++ src/ops/gather/cuda/gather.cc | 93 +++++++ src/ops/gather/cuda/gather.cu | 76 ++++++ src/ops/gather/cuda/gather.cuh | 40 +++ src/ops/gather/operator.cc | 67 +++++ .../cpu/gather_elements_cpu.cc | 113 +++++++++ .../gather_elements/cpu/gather_elements_cpu.h | 31 +++ .../gather_elements/cuda/gather_elements.cc | 86 +++++++ .../gather_elements/cuda/gather_elements.cu | 74 ++++++ .../gather_elements/cuda/gather_elements.cuh | 40 +++ src/ops/gather_elements/operator.cc | 67 +++++ src/ops/reduce/cpu/reduce_cpu.cc | 178 +++++++++++++ src/ops/reduce/cpu/reduce_cpu.h | 45 ++++ src/ops/reduce/cuda/reduce.cc | 92 +++++++ src/ops/reduce/cuda/reduce.cu | 169 ++++++++++++ src/ops/reduce/cuda/reduce.cuh | 47 ++++ src/ops/reduce/operator.cc | 89 +++++++ src/ops/reduce/reduce.h | 31 +++ src/ops/reduceMax/operator.cc | 50 ++++ src/ops/reduceMean/operator.cc | 51 ++++ src/ops/reduceMin/operator.cc | 51 ++++ src/ops/reduceSum/operator.cc | 51 ++++ src/ops/where/cpu/where_cpu.cc | 76 ++++++ src/ops/where/cpu/where_cpu.h | 28 ++ src/ops/where/cuda/where.cu | 58 +++++ src/ops/where/cuda/where.cuh | 37 +++ src/ops/where/cuda/where_cuda.cc | 52 ++++ src/ops/where/operator.cc | 68 +++++ 53 files changed, 4225 insertions(+) create mode 100644 include/ops/clip/clip.h create mode 100644 include/ops/gather/gather.h create mode 100644 include/ops/gather_elements/gather_elements.h create mode 100644 include/ops/reduceMax/reduceMax.h create mode 100644 include/ops/reduceMean/reduceMean.h create mode 100644 include/ops/reduceMin/reduceMin.h create mode 100644 include/ops/reduceSum/reduceSum.h create mode 100644 include/ops/where/where.h create mode 100644 new_operators.md create mode 100644 operatorspy/tests/clip.py create mode 100644 operatorspy/tests/gather.py create mode 100644 operatorspy/tests/gather_elements.py create mode 100644 operatorspy/tests/reduce_max.py create mode 100644 operatorspy/tests/reduce_mean.py create mode 100644 operatorspy/tests/reduce_min.py create mode 100644 operatorspy/tests/reduce_sum.py create mode 100644 operatorspy/tests/where.py create 
mode 100644 src/ops/clip/cpu/clip_cpu.cc create mode 100644 src/ops/clip/cpu/clip_cpu.h create mode 100644 src/ops/clip/cuda/clip.cc create mode 100644 src/ops/clip/cuda/clip.cu create mode 100644 src/ops/clip/cuda/clip.cuh create mode 100644 src/ops/clip/operator.cc create mode 100644 src/ops/gather/cpu/gather_cpu.cc create mode 100644 src/ops/gather/cpu/gather_cpu.h create mode 100644 src/ops/gather/cuda/gather.cc create mode 100644 src/ops/gather/cuda/gather.cu create mode 100644 src/ops/gather/cuda/gather.cuh create mode 100644 src/ops/gather/operator.cc create mode 100644 src/ops/gather_elements/cpu/gather_elements_cpu.cc create mode 100644 src/ops/gather_elements/cpu/gather_elements_cpu.h create mode 100644 src/ops/gather_elements/cuda/gather_elements.cc create mode 100644 src/ops/gather_elements/cuda/gather_elements.cu create mode 100644 src/ops/gather_elements/cuda/gather_elements.cuh create mode 100644 src/ops/gather_elements/operator.cc create mode 100644 src/ops/reduce/cpu/reduce_cpu.cc create mode 100644 src/ops/reduce/cpu/reduce_cpu.h create mode 100644 src/ops/reduce/cuda/reduce.cc create mode 100644 src/ops/reduce/cuda/reduce.cu create mode 100644 src/ops/reduce/cuda/reduce.cuh create mode 100644 src/ops/reduce/operator.cc create mode 100644 src/ops/reduce/reduce.h create mode 100644 src/ops/reduceMax/operator.cc create mode 100644 src/ops/reduceMean/operator.cc create mode 100644 src/ops/reduceMin/operator.cc create mode 100644 src/ops/reduceSum/operator.cc create mode 100644 src/ops/where/cpu/where_cpu.cc create mode 100644 src/ops/where/cpu/where_cpu.h create mode 100644 src/ops/where/cuda/where.cu create mode 100644 src/ops/where/cuda/where.cuh create mode 100644 src/ops/where/cuda/where_cuda.cc create mode 100644 src/ops/where/operator.cc diff --git a/include/infini_operators.h b/include/infini_operators.h index 9a5a2555..be84ee3f 100644 --- a/include/infini_operators.h +++ b/include/infini_operators.h @@ -5,15 +5,23 @@ #include "ops/causal_softmax/causal_softmax.h" #include "ops/global_avg_pool/global_avg_pool.h" #include "ops/expand/expand.h" +#include "ops/gather_elements/gather_elements.h" +#include "ops/gather/gather.h" #include "ops/gemm/gemm.h" +#include "ops/clip/clip.h" #include "ops/conv/conv.h" #include "ops/matmul/matmul.h" #include "ops/max_pool/max_pool.h" #include "ops/mlp/mlp.h" #include "ops/random_sample/random_sample.h" #include "ops/rearrange/rearrange.h" +#include "ops/reduceMax/reduceMax.h" +#include "ops/reduceMean/reduceMean.h" +#include "ops/reduceMin/reduceMin.h" +#include "ops/reduceSum/reduceSum.h" #include "ops/relu/relu.h" #include "ops/rms_norm/rms_norm.h" #include "ops/rotary_embedding/rotary_embedding.h" #include "ops/swiglu/swiglu.h" +#include "ops/where/where.h" #include "tensor/tensor_descriptor.h" diff --git a/include/ops/clip/clip.h b/include/ops/clip/clip.h new file mode 100644 index 00000000..82220bc3 --- /dev/null +++ b/include/ops/clip/clip.h @@ -0,0 +1,28 @@ +#ifndef CLIP_H +#define CLIP_H + +#include "../../export.h" +#include "../../operators.h" +#include + +typedef struct{ + Device device; +}ClipDescriptor; + +typedef ClipDescriptor *infiniopClipDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle, + infiniopClipDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + float max, + float min); + +__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, + void *dst, + void const *src, + void *stream); + +__C __export 
infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/ops/gather/gather.h b/include/ops/gather/gather.h new file mode 100644 index 00000000..2d598b03 --- /dev/null +++ b/include/ops/gather/gather.h @@ -0,0 +1,28 @@ +#ifndef GATHER_H +#define GATHER_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct{ + Device device; +}GatherDescriptor; + +typedef GatherDescriptor *infiniopGatherDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis); + +__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/ops/gather_elements/gather_elements.h b/include/ops/gather_elements/gather_elements.h new file mode 100644 index 00000000..7178e96b --- /dev/null +++ b/include/ops/gather_elements/gather_elements.h @@ -0,0 +1,28 @@ +#ifndef GATHER_ELEMENTS_H +#define GATHER_ELEMENTS_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct{ + Device device; +}GatherElementsDescriptor; + +typedef GatherElementsDescriptor *infiniopGatherElementsDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGatherElementsDescriptor(infiniopHandle_t handle, + infiniopGatherElementsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis); + +__C __export infiniopStatus_t infiniopGatherElements(infiniopGatherElementsDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyGatherElementsDescriptor(infiniopGatherElementsDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/ops/reduceMax/reduceMax.h b/include/ops/reduceMax/reduceMax.h new file mode 100644 index 00000000..e4f47c54 --- /dev/null +++ b/include/ops/reduceMax/reduceMax.h @@ -0,0 +1,25 @@ +#ifndef REDUCEMAX_H +#define REDUCEMAX_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMaxDescriptor { + Device device; +} ReduceMaxDescriptor; +typedef ReduceMaxDescriptor *infiniopReduceMaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMaxDescriptor(infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int* axis, + const int num_axis, + int const keepdims); + +__C __export infiniopStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMax(infiniopReduceMaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc); +#endif diff --git a/include/ops/reduceMean/reduceMean.h b/include/ops/reduceMean/reduceMean.h new file mode 100644 index 00000000..d8767213 --- /dev/null +++ b/include/ops/reduceMean/reduceMean.h @@ -0,0 +1,25 @@ +#ifndef REDUCEMEAN_H +#define REDUCEMEAN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef 
struct ReduceMeanDescriptor { + Device device; +} ReduceMeanDescriptor; +typedef ReduceMeanDescriptor *infiniopReduceMeanDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMeanDescriptor(infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int* axis, + const int num_axis, + int const keepdims); + +__C __export infiniopStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMean(infiniopReduceMeanDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc); +#endif diff --git a/include/ops/reduceMin/reduceMin.h b/include/ops/reduceMin/reduceMin.h new file mode 100644 index 00000000..43197434 --- /dev/null +++ b/include/ops/reduceMin/reduceMin.h @@ -0,0 +1,25 @@ +#ifndef REDUCEMIN_H +#define REDUCEMIN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMinDescriptor { + Device device; +} ReduceMinDescriptor; +typedef ReduceMinDescriptor *infiniopReduceMinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMinDescriptor(infiniopHandle_t handle, + infiniopReduceMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int* axis, + const int num_axis, + int const keepdims); + +__C __export infiniopStatus_t infiniopGetReduceMinWorkspaceSize(infiniopReduceMinDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMin(infiniopReduceMinDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMinDescriptor(infiniopReduceMinDescriptor_t desc); +#endif diff --git a/include/ops/reduceSum/reduceSum.h b/include/ops/reduceSum/reduceSum.h new file mode 100644 index 00000000..2cec204e --- /dev/null +++ b/include/ops/reduceSum/reduceSum.h @@ -0,0 +1,25 @@ +#ifndef REDUCESUM_H +#define REDUCESUM_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceSumDescriptor { + Device device; +} ReduceSumDescriptor; +typedef ReduceSumDescriptor *infiniopReduceSumDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceSumDescriptor(infiniopHandle_t handle, + infiniopReduceSumDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int* axis, + const int num_axis, + int const keepdims); + +__C __export infiniopStatus_t infiniopGetReduceSumWorkspaceSize(infiniopReduceSumDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceSum(infiniopReduceSumDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceSumDescriptor(infiniopReduceSumDescriptor_t desc); +#endif diff --git a/include/ops/where/where.h b/include/ops/where/where.h new file mode 100644 index 00000000..3a6e0382 --- /dev/null +++ b/include/ops/where/where.h @@ -0,0 +1,29 @@ +#ifndef WHERE_H +#define WHERE_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct WhereDescriptor{ + Device device; +} WhereDescriptor; + +typedef WhereDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + 
infiniopTensorDescriptor_t dst,
+                                                            infiniopTensorDescriptor_t x,
+                                                            infiniopTensorDescriptor_t y,
+                                                            infiniopTensorDescriptor_t condition);
+
+__C __export infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc,
+                                            void *dst,
+                                            void const *x,
+                                            void const *y,
+                                            void const *condition,
+                                            void *stream);
+
+__C __export infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc);
+
+#endif
diff --git a/new_operators.md b/new_operators.md
new file mode 100644
index 00000000..5081dd04
--- /dev/null
+++ b/new_operators.md
@@ -0,0 +1,240 @@
+## Clip
+
+### Basic properties
+
+| Description                    | Clip: constrain every element of the input tensor to a given range |
+| ------------------------------ | ------------------------------------------------------------------- |
+| Supports in-place computation  | Yes                                                                  |
+| Needs extra workspace          | No                                                                   |
+
+Formula:
+
+Y = min(max(X, min_val), max_val)
+
+### Interface definition
+
+#### Create the operator descriptor
+
+```C
+infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle,
+infiniopClipDescriptor_t *desc_ptr, infiniopTensorDescriptor_t dst,
+infiniopTensorDescriptor_t src, float max, float min);
+```
+
+**Parameters**
+
+| handle   | hardware handle                                              |
+| -------- | ------------------------------------------------------------ |
+| desc_ptr | address of the operator descriptor                            |
+| dst      | output tensor descriptor; same shape and dtype as src         |
+| src      | input tensor descriptor; any shape, dtype can be fp16         |
+| max      | float, upper bound of the output values                       |
+| min      | float, lower bound of the output values                       |
+
+**Return values**
+
+| STATUS_SUCCESS            | success                                                   |
+| ------------------------- | ---------------------------------------------------------- |
+| STATUS_BAD_PARAM          | parameter tensors are inconsistent                          |
+| STATUS_BAD_TENSOR_DTYPE   | input/output dtype is unsupported or the two do not match   |
+| STATUS_BAD_TENSOR_SHAPE   | input/output shapes do not meet the requirements            |
+| STATUS_BAD_TENSOR_STRIDES | tensor strides do not meet the requirements                 |
+
+#### Compute
+
+```C
+infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc,
+void *dst, void const *src, void *stream);
+```
+
+#### Destroy the operator descriptor
+
+```C
+infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);
+```
+
+
+
+## Where
+
+### Basic properties
+
+| Description                    | Conditional select: build the output by picking each element from one of two input tensors according to a condition tensor |
+| ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------- |
+| Supports in-place computation  | Yes                                                                                                                            |
+| Needs extra workspace          | No                                                                                                                             |
+
+Formula:
+
+output[i] = (condition[i] ? X[i] : Y[i])
+
+### Interface definition
+
+#### Create the operator descriptor
+
+```C
+infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle,
+infiniopWhereDescriptor_t *desc_ptr, infiniopTensorDescriptor_t dst,
+infiniopTensorDescriptor_t x, infiniopTensorDescriptor_t y,
+infiniopTensorDescriptor_t condition);
+```
+
+**Parameters**
+
+| handle    | hardware handle                                                                 |
+| --------- | -------------------------------------------------------------------------------- |
+| desc_ptr  | address of the operator descriptor                                                |
+| dst       | output tensor descriptor; same shape as x, y and condition; same dtype as x and y |
+| x         | input tensor descriptor; any shape, dtype can be fp16                             |
+| y         | input tensor descriptor; same shape and dtype as x                                |
+| condition | condition tensor descriptor; same shape as x and y, dtype uint8_t                 |
+
+**Return values**
+
+| STATUS_SUCCESS            | success                                                                                        |
+| ------------------------- | ----------------------------------------------------------------------------------------------- |
+| STATUS_BAD_PARAM          | parameter tensors are inconsistent                                                               |
+| STATUS_BAD_TENSOR_DTYPE   | (1) input/output dtype is unsupported or mismatched (2) condition dtype is unsupported           |
+| STATUS_BAD_TENSOR_SHAPE   | (1) input/output shapes do not meet the requirements (2) condition shape does not match them     |
+| STATUS_BAD_TENSOR_STRIDES | tensor strides do not meet the requirements                                                      |
+
+#### Compute
+
+```C
+infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc,
+void *dst, void const *x, void const *y, void const *condition, void *stream);
+```
+
+#### Destroy the operator descriptor
+
+```C
+infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc);
+```
+
+
+## Gather
+
+### Basic properties
+
+| Description                    | Index gather: collect slices of the input data along a given axis according to an index tensor and assemble them into a new output tensor |
+| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| Supports in-place computation  | No                                                                                                                                            |
+| Needs extra workspace          | No                                                                                                                                            |
+
+Formula:
+
+along the given axis, output = data[index]
+
+### Interface definition
+
+#### Create the operator descriptor
+
+```C
+infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t handle, infiniopGatherDescriptor_t *desc_ptr, infiniopTensorDescriptor_t dst,
+infiniopTensorDescriptor_t data, infiniopTensorDescriptor_t indices,
+int axis);
+```
+
+**Parameters**
+
+| handle   | hardware handle                                                                                                 |
+| -------- | ----------------------------------------------------------------------------------------------------------------- |
+| desc_ptr | address of the operator descriptor                                                                                 |
+| dst      | output tensor descriptor; shape is data_shape[:axis] + indices_shape + data_shape[axis+1:]; same dtype as data     |
+| data     | input tensor descriptor; any shape, dtype can be fp16                                                              |
+| indices  | index tensor descriptor; any shape, dtype int32 or int64                                                           |
+| axis     | int, the axis to gather along; must not exceed the rank of data                                                    |
+
+**Return values**
+
+| STATUS_SUCCESS            | success                                                                                   |
+| ------------------------- | -------------------------------------------------------------------------------------------- |
+| STATUS_BAD_PARAM          | parameter tensors are inconsistent, e.g. (1) an invalid axis value                            |
+| STATUS_BAD_TENSOR_DTYPE   | (1) input/output dtype is unsupported or mismatched (2) index dtype is unsupported            |
+| STATUS_BAD_TENSOR_SHAPE   | (1) input/output shapes do not meet the requirements (2) index shape does not meet them       |
+| STATUS_BAD_TENSOR_STRIDES | tensor strides do not meet the requirements                                                   |
+
+#### Compute
+
+```C
+infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc,
+void *dst, void const *data, void const *indices, void *stream);
+```
+
+#### Destroy the operator descriptor
+
+```C
+infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc);
+```
+
+
+## Reduce
+
+### Basic properties
+
+| Description                    | Reduction: apply a binary operation (max/min, sum, mean) along one or more given axes, reducing the input tensor to an output tensor of smaller shape |
+| ------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Supports in-place computation  | No                                                                                                                                                         |
+| Needs extra workspace          | Yes                                                                                                                                                        |
+
+Formula:
+
+compute (Max, Min, Mean, Sum) along the given axes
+
+### Interface definition
+
+#### Create the operator descriptor
+
+```C
+infiniopStatus_t infiniopCreateReduce(Max,Min,Mean,Sum)Descriptor(infiniopHandle_t handle,
+infiniopReduce(Max,Min,Mean,Sum)Descriptor_t *desc_ptr, infiniopTensorDescriptor_t dst,
+infiniopTensorDescriptor_t src, int* axis, const int num_axis, int const keepdims);
+```
+
+**Parameters**
+
+| handle   | hardware handle                                                                                                                                                                                                     |
+| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| desc_ptr | address of the operator descriptor                                                                                                                                                                                      |
+| dst      | output tensor descriptor; the shape depends on keepdims: if dimensions are kept, rank(dst) = rank(src) and each reduced axis has size 1; otherwise rank(dst) = rank(src) - num_axis, i.e. the reduced dimensions are removed; same dtype as src |
+| src      | input tensor descriptor; any shape, dtype can be fp16                                                                                                                                                                   |
+| axis     | int array listing the axes to reduce; its length must not exceed the rank of src; several axes can be reduced at once                                                                                                   |
+| num_axis | length of the axis array; must not exceed the rank of src; type int                                                                                                                                                     |
+| keepdims | whether to keep the reduced dimensions; they are kept when keepdims is a positive integer; type int                                                                                                                     |
+
+**Return values**
+
+| STATUS_SUCCESS            | success                                                                                               |
+| ------------------------- | ------------------------------------------------------------------------------------------------------ |
+| STATUS_BAD_PARAM          | parameter tensors are inconsistent, e.g. (1) an invalid axis value (2) an unsupported reduction mode    |
+| STATUS_BAD_TENSOR_DTYPE   | input/output dtype is unsupported or the two do not match                                               |
+| STATUS_BAD_TENSOR_SHAPE   | input/output shapes do not meet the requirements                                                        |
+| STATUS_BAD_TENSOR_STRIDES | tensor strides do not meet the requirements                                                             |
+
+#### Query the workspace size
+
+```C
+infiniopStatus_t infiniopGetReduce(Max,Min,Mean,Sum)WorkspaceSize(infiniopReduce(Max,Min,Mean,Sum)Descriptor_t desc, uint64_t *size);
+```
+
+#### Compute
+
+```C
+infiniopStatus_t infiniopReduce(Max,Min,Mean,Sum)(infiniopReduce(Max,Min,Mean,Sum)Descriptor_t desc,
+void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream);
+```
+
+#### Destroy the operator descriptor
+
+```C
+infiniopStatus_t infiniopDestroyReduce(Max,Min,Mean,Sum)Descriptor(infiniopReduce(Max,Min,Mean,Sum)Descriptor_t desc);
+```
\ No newline at end of file
diff --git a/operatorspy/tests/clip.py b/operatorspy/tests/clip.py
new file mode 100644
index 00000000..4a74b3f2
--- /dev/null
+++ b/operatorspy/tests/clip.py
@@ -0,0 +1,159 @@
+from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
+import ctypes
+import sys
+import os
+import time
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
+from operatorspy import (
+    open_lib,
+    to_tensor,
+    DeviceEnum,
+    infiniopHandle_t,
+    infiniopTensorDescriptor_t,
+    create_handle,
+    destroy_handle,
+    check_error,
+)
+
+from operatorspy.tests.test_utils import get_args
+from enum import Enum, auto
+import torch
+
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+class ClipDescriptor(Structure):
+    _fields_ = [("device", c_int32)]
+
+
+infiniopClipDescriptor_t = POINTER(ClipDescriptor)
+
+def clip(x, min_val = -torch.inf, max_val = torch.inf):
+    return torch.clip(x, min_val, max_val)
+
+def test(
+    lib,
+    handle,
+    torch_device,
+    tensor_shape,
+    max_val,
+    min_val,
+    tensor_dtype=torch.float16,
+    inplace=Inplace.OUT_OF_PLACE,
+):
+    print(
+        f"Testing Clip on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
+    )
+
+    x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1
+    y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else x
+
+
+    for i in range(NUM_PRERUN if PROFILE else 1):
+        ans = clip(x,min_val,max_val)
+    if PROFILE:
+        start_time = time.time()
+        for i in range(NUM_ITERATIONS):
+            _ = clip(x,min_val,max_val)
+        elapsed = (time.time() - start_time) / NUM_ITERATIONS
+        print(f"pytorch time: {elapsed :6f}")
+
+    x_tensor = to_tensor(x, lib)
+    y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor
+    descriptor = infiniopClipDescriptor_t()
+
+    check_error(
+        lib.infiniopCreateClipDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y_tensor.descriptor,
+            x_tensor.descriptor,
+            max_val,
+            min_val
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    x_tensor.descriptor.contents.invalidate()
+    y_tensor.descriptor.contents.invalidate()
+
+    for i in range(NUM_PRERUN if PROFILE else 1):
+        check_error(lib.infiniopClip(descriptor, y_tensor.data, x_tensor.data, None))
+    if
PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopClip(descriptor, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans, atol=1e-0, rtol=1e-0) + check_error(lib.infiniopDestroyClipDescriptor(descriptor)) +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, max_val, min_val, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, max_val, min_val, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, max_val, min_val, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, max_val, min_val, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, max_val, min_val, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, max_val, min_val, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((), 0.5, -0.5, Inplace.OUT_OF_PLACE), + ((), 0.5, -0.5, Inplace.INPLACE_X), + ((1, 3), 0.5, -0.5, Inplace.OUT_OF_PLACE), + ((3, 3), 0.5, -0.5, Inplace.OUT_OF_PLACE), + ((3, 3, 13, 9, 17), 0.5, -0.5, Inplace.INPLACE_X), + ((32, 20, 512), 0.5, -0.5, Inplace.INPLACE_X), + ((2, 3, 4, 5), 0.5, -0.5, Inplace.INPLACE_X), + ((33, 333, 333), 0.5, -0.5, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), 0.5, -0.5, Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateClipDescriptor.restype = c_int32 + lib.infiniopCreateClipDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopClipDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + c_float, + ] + lib.infiniopClip.restype = c_int32 + lib.infiniopClip.argtypes = [ + infiniopClipDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyClipDescriptor.restype = c_int32 + lib.infiniopDestroyClipDescriptor.argtypes = [ + infiniopClipDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/gather.py b/operatorspy/tests/gather.py new file mode 100644 index 00000000..e757fbd2 --- /dev/null +++ b/operatorspy/tests/gather.py @@ -0,0 +1,164 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +class GatherDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopGatherDescriptor_t = POINTER(GatherDescriptor) + +def gather(x, indices, axis = 0): + idx = [slice(None)] * x.ndim + idx[axis] = indices + return x[tuple(idx)] + + +def test( + lib, + handle, + torch_device, + tensor_shape, + indices_shape, + axis, + 
tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Gather on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} axis:{axis} inplace: {inplace.name}" + ) + data = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + indices = torch.randint(0, tensor_shape[axis], indices_shape, dtype=torch.int32).to(torch_device) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = gather(data, indices, axis) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = gather(data, indices, axis) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + dst = torch.empty(ans.shape, dtype=tensor_dtype).to(torch_device) + data_tensor = to_tensor(data, lib) + dst_tensor = to_tensor(dst, lib) + indices_tensor = to_tensor(indices, lib) + descriptor = infiniopGatherDescriptor_t() + + check_error( + lib.infiniopCreateGatherDescriptor( + handle, + ctypes.byref(descriptor), + dst_tensor.descriptor, + data_tensor.descriptor, + indices_tensor.descriptor, + axis, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + dst_tensor.descriptor.contents.invalidate() + data_tensor.descriptor.contents.invalidate() + indices_tensor.descriptor.contents.invalidate() + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopGather(descriptor, dst_tensor.data, data_tensor.data, indices_tensor.data, None)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGather(descriptor, dst_tensor.data, data_tensor.data, indices_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(dst, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyGatherDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, dst_shape, axis, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, dst_shape, axis, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, dst_shape, axis, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, dst_shape, axis, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, dst_shape, axis, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, dst_shape, axis, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((2, 3, 2), (2, 2), 1, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (6, 7, 8), 2, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5, 6), (7, 8, 9, 10), 3, Inplace.OUT_OF_PLACE), + ((5, 10), (2,), 0, Inplace.OUT_OF_PLACE), + ((3, 2), (2, 2), 0, Inplace.OUT_OF_PLACE), + ((3, 2), (3, 2), 1, Inplace.OUT_OF_PLACE), + ((33, 333, 333), (4, 5), 1, Inplace.OUT_OF_PLACE), + ((3, 2, 2), (3, 2), 2, Inplace.OUT_OF_PLACE), + ((2, 3, 4), (3, ), 1, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), (112, ), 2, Inplace.OUT_OF_PLACE), + + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateGatherDescriptor.restype = c_int32 + lib.infiniopCreateGatherDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopGatherDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, 
+ infiniopTensorDescriptor_t, + c_int32, + ] + lib.infiniopGather.restype = c_int32 + lib.infiniopGather.argtypes = [ + infiniopGatherDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyGatherDescriptor.restype = c_int32 + lib.infiniopDestroyGatherDescriptor.argtypes = [ + infiniopGatherDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/gather_elements.py b/operatorspy/tests/gather_elements.py new file mode 100644 index 00000000..815e5d04 --- /dev/null +++ b/operatorspy/tests/gather_elements.py @@ -0,0 +1,157 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +class GatherElementsDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopGatherElementsDescriptor_t = POINTER(GatherElementsDescriptor) + +def gather(x, indices, axis = 0): + return torch.gather(x, axis, indices) + +def test( + lib, + handle, + torch_device, + tensor_shape, + indices_shape, + axis, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Gather on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} axis:{axis} inplace: {inplace.name}" + ) + data = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + dst = torch.empty(indices_shape, dtype=tensor_dtype).to(torch_device) + indices = torch.randint(0, tensor_shape[axis], indices_shape, dtype=torch.int64).to(torch_device) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = gather(data, indices, axis) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = gather(data, indices, axis) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + data_tensor = to_tensor(data, lib) + dst_tensor = to_tensor(dst, lib) + indices_tensor = to_tensor(indices, lib) + descriptor = infiniopGatherElementsDescriptor_t() + + check_error( + lib.infiniopCreateGatherElementsDescriptor( + handle, + ctypes.byref(descriptor), + dst_tensor.descriptor, + data_tensor.descriptor, + indices_tensor.descriptor, + axis, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + dst_tensor.descriptor.contents.invalidate() + data_tensor.descriptor.contents.invalidate() + indices_tensor.descriptor.contents.invalidate() + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopGatherElements(descriptor, dst_tensor.data, data_tensor.data, indices_tensor.data, None)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGatherElements(descriptor, dst_tensor.data, data_tensor.data, indices_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(dst, ans, atol=0, rtol=0) + 
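+    # GatherElements uses the same element-wise indexing rule as torch.gather: for a
+    # rank-3 input with axis == 1, out[i][j][k] = data[i][indices[i][j][k]][k], and the
+    # output has the same shape as `indices`. The reference `ans` above is computed with
+    # torch.gather, so the library result is expected to match it bit-exactly (atol=0, rtol=0).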
check_error(lib.infiniopDestroyGatherElementsDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, dst_shape, axis, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, dst_shape, axis, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, dst_shape, axis, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, dst_shape, axis, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, dst_shape, axis, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, dst_shape, axis, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((3, 2), (2, 2), 0, Inplace.OUT_OF_PLACE), + ((3, 2), (3, 2), 1, Inplace.OUT_OF_PLACE), + ((33, 333, 333), (33, 200, 333), 1, Inplace.OUT_OF_PLACE), + ((3, 2, 2), (3, 2, 1), -1, Inplace.OUT_OF_PLACE), + ((2, 3, 4), (2, 2, 4), 1, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), (32, 256, 112, 112), 2, Inplace.OUT_OF_PLACE), + + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateGatherElementsDescriptor.restype = c_int32 + lib.infiniopCreateGatherElementsDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopGatherElementsDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int32, + ] + lib.infiniopGatherElements.restype = c_int32 + lib.infiniopGatherElements.argtypes = [ + infiniopGatherElementsDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyGatherElementsDescriptor.restype = c_int32 + lib.infiniopDestroyGatherElementsDescriptor.argtypes = [ + infiniopGatherElementsDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/reduce_max.py b/operatorspy/tests/reduce_max.py new file mode 100644 index 00000000..56efb401 --- /dev/null +++ b/operatorspy/tests/reduce_max.py @@ -0,0 +1,191 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +class ReduceMaxDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopReduceMaxDescriptor_t = POINTER(ReduceMaxDescriptor) + +def reduceMax(x, axis, keepdims = True): + for i in range(len(axis)): + if keepdims: + out = torch.max(x, dim=axis[i], keepdim=keepdims) + else: + out = torch.max(x, dim=axis[i] - i, keepdim=keepdims) + x = out.values + return out + +def test( + lib, + handle, + torch_device, + tensor_shape, + axis, + keepdims, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing ReduceMax on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} 
axis:{axis} keepdim:{keepdims} inplace: {inplace.name}" + ) + x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + + out_shape = list(tensor_shape) + for i in axis: + if(keepdims==1): + out_shape[i] = 1 + else: + out_shape = out_shape[:i] + out_shape[i+1:] + if(len(axis) == len(tensor_shape)): + if(keepdims == 0): + out_shape = [1] + + y = torch.full(out_shape, -torch.inf, dtype=tensor_dtype).to(torch_device) + + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduceMax(x, axis, bool(keepdims)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduceMax(x, axis, bool(keepdims)) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReduceMaxDescriptor_t() + + check_error( + lib.infiniopCreateReduceMaxDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + (ctypes.c_int * len(axis))(*axis), + len(axis), + keepdims, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMaxWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopReduceMax(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None)) + + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMax(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans.values, atol=0, rtol=0) + check_error(lib.infiniopDestroyReduceMaxDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((2, 3), [1], 1, Inplace.OUT_OF_PLACE), + ((2, 3), [0, 1], 0, Inplace.OUT_OF_PLACE), + ((4, 7), [0, 1], 1, Inplace.OUT_OF_PLACE), + ((300, 1024), [1], 1, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), [0, 1,2,3], 1, Inplace.OUT_OF_PLACE), + ((6, 6, 200), [-1], 1, Inplace.OUT_OF_PLACE), + ((6, 2, 7), [0, 1], 1, Inplace.OUT_OF_PLACE), + ((256, 400), [1], 0, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), [1], 1, Inplace.OUT_OF_PLACE), + ((32, 288, 112, 112), [2], 0, Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = 
open_lib() + lib.infiniopCreateReduceMaxDescriptor.restype = c_int32 + lib.infiniopCreateReduceMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMaxDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ctypes.POINTER(ctypes.c_int), + c_int32, + c_int32, + ] + lib.infiniopGetReduceMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMaxWorkspaceSize.argtypes = [ + infiniopReduceMaxDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceMax.restype = c_int32 + lib.infiniopReduceMax.argtypes = [ + infiniopReduceMaxDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMaxDescriptor.argtypes = [ + infiniopReduceMaxDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/reduce_mean.py b/operatorspy/tests/reduce_mean.py new file mode 100644 index 00000000..6dc34f57 --- /dev/null +++ b/operatorspy/tests/reduce_mean.py @@ -0,0 +1,189 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +class ReduceMeanDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopReduceMeanDescriptor_t = POINTER(ReduceMeanDescriptor) + +def reduceMean(x, axis = 0, keepdims = True): + return torch.mean(x, dim=axis, keepdim=keepdims) + +def test( + lib, + handle, + torch_device, + tensor_shape, + axis, + keepdims, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing ReduceMean on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} axis:{axis} keepdim:{keepdims} inplace: {inplace.name}" + ) + x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + + out_shape = list(tensor_shape) + for i in axis: + if(keepdims==1): + out_shape[i] = 1 + else: + out_shape = out_shape[:i] + out_shape[i+1:] + if(len(axis) == len(tensor_shape)): + if(keepdims == 0): + out_shape = [1] + + y = torch.zeros(out_shape, dtype=tensor_dtype).to(torch_device) + + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduceMean(x, axis, bool(keepdims)) + + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduceMean(x, axis, bool(keepdims)) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReduceMeanDescriptor_t() + + check_error( + lib.infiniopCreateReduceMeanDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + (ctypes.c_int * len(axis))(*axis), + len(axis), + keepdims, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) 
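+    # Reduce operators need scratch memory: query the required byte count from the
+    # library first, then allocate a buffer of that size on the same device (a plain
+    # uint8 torch tensor is used as raw storage) and pass its pointer together with
+    # the size to every infiniopReduceMean call below.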
+ check_error( + lib.infiniopGetReduceMeanWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopReduceMean(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None)) + + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMean(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + if tensor_dtype == torch.float16: + assert torch.allclose(y, ans, atol=1e-3, rtol=1e-7) + else: + assert torch.allclose(y, ans, atol=1e-6, rtol=1e-6) + check_error(lib.infiniopDestroyReduceMeanDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((2, 3), [1], 1, Inplace.OUT_OF_PLACE), + ((2, 3), [0, 1], 0, Inplace.OUT_OF_PLACE), + ((4, 7), [0], 1, Inplace.OUT_OF_PLACE), + ((300, 1024), [-1], 1, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), [1, 2], 1, Inplace.OUT_OF_PLACE), + ((6, 6, 200), [0, 1], 1, Inplace.OUT_OF_PLACE), + ((6, 2, 7), [0, 2], 1, Inplace.OUT_OF_PLACE), + ((266, 400), [0], 0, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), [3], 1, Inplace.OUT_OF_PLACE), + ((32, 288, 112, 112), [3], 0, Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 + lib.infiniopCreateReduceMeanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMeanDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ctypes.POINTER(ctypes.c_int), + c_int32, + c_int32, + ] + lib.infiniopGetReduceMeanWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMeanWorkspaceSize.argtypes = [ + infiniopReduceMeanDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceMean.restype = c_int32 + lib.infiniopReduceMean.argtypes = [ + infiniopReduceMeanDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMeanDescriptor.argtypes = [ + infiniopReduceMeanDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/reduce_min.py b/operatorspy/tests/reduce_min.py new file mode 100644 index 00000000..c77334f6 --- /dev/null +++ b/operatorspy/tests/reduce_min.py @@ -0,0 +1,193 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, 
c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +class ReduceMinDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopReduceMinDescriptor_t = POINTER(ReduceMinDescriptor) + +def reduceMin(x, axis, keepdims = True): + for i in range(len(axis)): + if keepdims: + out = torch.min(x, dim=axis[i], keepdim=keepdims) + else: + out = torch.min(x, dim=axis[i] - i, keepdim=keepdims) + x = out.values + return out + +def test( + lib, + handle, + torch_device, + tensor_shape, + axis, + keepdims, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing ReduceMin on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} axis:{axis} keepdim:{keepdims} inplace: {inplace.name}" + ) + x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + + out_shape = list(tensor_shape) + for i in axis: + if(keepdims==1): + out_shape[i] = 1 + else: + out_shape = out_shape[:i] + out_shape[i+1:] + if(len(axis) == len(tensor_shape)): + if(keepdims == 0): + out_shape = [1] + + + y = torch.full(out_shape, torch.inf, dtype=tensor_dtype).to(torch_device) + + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduceMin(x, axis, bool(keepdims)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduceMin(x, axis, bool(keepdims)) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReduceMinDescriptor_t() + + check_error( + lib.infiniopCreateReduceMinDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + (ctypes.c_int * len(axis))(*axis), + len(axis), + keepdims, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMinWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopReduceMin(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None)) + + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMin(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(y, ans.values, atol=0, rtol=0) + check_error(lib.infiniopDestroyReduceMinDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, axis, 
keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((2, 3), [1], 1, Inplace.OUT_OF_PLACE), + ((2, 3), [0], 0, Inplace.OUT_OF_PLACE), + ((4, 7), [0, 1], 1, Inplace.OUT_OF_PLACE), + ((300, 1024), [1], 1, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), [2], 1, Inplace.OUT_OF_PLACE), + ((6, 6, 200), [1], 1, Inplace.OUT_OF_PLACE), + ((4, 9), [1], 1, Inplace.OUT_OF_PLACE), + ((6, 2, 7), [0], 1, Inplace.OUT_OF_PLACE), + ((266, 400), [0], 0, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), [2], 1, Inplace.OUT_OF_PLACE), + ((32, 288, 112, 112), [1], 0, Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMinDescriptor.restype = c_int32 + lib.infiniopCreateReduceMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMinDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ctypes.POINTER(ctypes.c_int), + c_int32, + c_int32, + ] + lib.infiniopGetReduceMinWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMinWorkspaceSize.argtypes = [ + infiniopReduceMinDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceMin.restype = c_int32 + lib.infiniopReduceMin.argtypes = [ + infiniopReduceMinDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReduceMinDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMinDescriptor.argtypes = [ + infiniopReduceMinDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/reduce_sum.py b/operatorspy/tests/reduce_sum.py new file mode 100644 index 00000000..b4d90e68 --- /dev/null +++ b/operatorspy/tests/reduce_sum.py @@ -0,0 +1,189 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +class ReduceSumDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopReduceSumDescriptor_t = POINTER(ReduceSumDescriptor) + +def reduceSum(x, axis = 0, keepdims = True): + return torch.sum(x, dim=axis, keepdim=keepdims) + +def test( + lib, + handle, + torch_device, + tensor_shape, + axis, + keepdims, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing ReduceSum on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} axis:{axis} keepdim:{keepdims} inplace: {inplace.name}" + ) + x = torch.rand(tensor_shape, 
dtype=tensor_dtype).to(torch_device) * 2 - 1 + + out_shape = list(tensor_shape) + for i in axis: + if(keepdims==1): + out_shape[i] = 1 + else: + out_shape = out_shape[:i] + out_shape[i+1:] + if(len(axis) == len(tensor_shape)): + if(keepdims == 0): + out_shape = [1] + + y = torch.zeros(out_shape, dtype=tensor_dtype).to(torch_device) + + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduceSum(x, axis, bool(keepdims)) + + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduceSum(x, axis, bool(keepdims)) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReduceSumDescriptor_t() + + check_error( + lib.infiniopCreateReduceSumDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + (ctypes.c_int * len(axis))(*axis), + len(axis), + keepdims, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceSumWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopReduceSum(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None)) + + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceSum(descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + if tensor_dtype == torch.float16: + assert torch.allclose(y, ans, atol=1e-3, rtol=1e-7) + else: + assert torch.allclose(y, ans, atol=1e-6, rtol=1e-6) + check_error(lib.infiniopDestroyReduceSumDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, axis, keepdims, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, axis, keepdims, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((2, 3), [1], 1, Inplace.OUT_OF_PLACE), + ((2, 3), [0, 1], 0, Inplace.OUT_OF_PLACE), + ((4, 7), [0], 1, Inplace.OUT_OF_PLACE), + ((300, 1024), [0], 1, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), [-2, -1], 1, Inplace.OUT_OF_PLACE), + ((6, 6, 200), [0, 1], 1, Inplace.OUT_OF_PLACE), + ((6, 2, 7), [0, 2], 1, Inplace.OUT_OF_PLACE), + ((266, 400), [0], 0, Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), [3], 1, Inplace.OUT_OF_PLACE), + ((32, 288, 112, 112), [3], 0, Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + 
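+    # Declare restype/argtypes so ctypes marshals the handle, the descriptors, the int*
+    # axis array, num_axis and keepdims correctly. The trailing `stream` argument of
+    # infiniopReduceSum is passed as None (a null pointer) at the call sites above;
+    # arguments beyond the declared argtypes fall back to ctypes' default conversion.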
lib.infiniopCreateReduceSumDescriptor.restype = c_int32 + lib.infiniopCreateReduceSumDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceSumDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ctypes.POINTER(ctypes.c_int), + c_int32, + c_int32, + ] + lib.infiniopGetReduceSumWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceSumWorkspaceSize.argtypes = [ + infiniopReduceSumDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceSum.restype = c_int32 + lib.infiniopReduceSum.argtypes = [ + infiniopReduceSumDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReduceSumDescriptor.restype = c_int32 + lib.infiniopDestroyReduceSumDescriptor.argtypes = [ + infiniopReduceSumDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/where.py b/operatorspy/tests/where.py new file mode 100644 index 00000000..70314e95 --- /dev/null +++ b/operatorspy/tests/where.py @@ -0,0 +1,161 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_uint8, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +class WhereDescriptor(Structure): + _fields_ = [("device", c_int32)] + +infiniopWhereDescriptor_t = POINTER(WhereDescriptor) + +def where(condition, x, y): + return torch.where(condition, x, y) + +def test( + lib, + handle, + torch_device, + tensor_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Where on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + + x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1 + y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) + condition = torch.randint(0, 2, tensor_shape, dtype=torch.uint8).to(torch_device) + dst = torch.empty_like(x)if inplace == Inplace.OUT_OF_PLACE else x + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = where(condition, y, x) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = where(condition, y, x) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + x_tensor = to_tensor(x, lib) + dst_tensor = to_tensor(dst, lib)if inplace == Inplace.OUT_OF_PLACE else x_tensor + y_tensor = to_tensor(y, lib) + condition_tensor = to_tensor(condition, lib) + descriptor = infiniopWhereDescriptor_t() + + check_error( + lib.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + dst_tensor.descriptor, + x_tensor.descriptor, + y_tensor.descriptor, + condition_tensor.descriptor, + ) + ) + + dst_tensor.descriptor.contents.invalidate() + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + condition_tensor.descriptor.contents.invalidate() + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error(lib.infiniopWhere(descriptor, dst_tensor.data, x_tensor.data, y_tensor.data, 
condition_tensor.data, None)) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopWhere(descriptor, dst_tensor.data, x_tensor.data, y_tensor.data, condition_tensor.data, None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + assert torch.allclose(dst, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyWhereDescriptor(descriptor)) + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for tensor_shape, inplace in test_cases: + test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for tensor_shape, inplace in test_cases: + test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace) + test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace) + destroy_handle(lib, handle) + +if __name__ == "__main__": + test_cases = [ + # tensor_shape, inplace + ((), Inplace.OUT_OF_PLACE), + ((), Inplace.INPLACE_X), + ((1, 3), Inplace.OUT_OF_PLACE), + ((3, 3), Inplace.OUT_OF_PLACE), + ((3, 3, 13, 9, 17), Inplace.INPLACE_X), + ((32, 20, 512), Inplace.INPLACE_X), + ((33, 333, 333), Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopWhereDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopWhereDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopWhereDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + + if args.cuda: + test_cuda(lib, test_cases) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/src/ops/clip/cpu/clip_cpu.cc b/src/ops/clip/cpu/clip_cpu.cc new file mode 100644 index 00000000..d23cacf4 --- /dev/null +++ b/src/ops/clip/cpu/clip_cpu.cc @@ -0,0 +1,83 @@ +#include "clip_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" +#include + +infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t, + ClipCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + float max, + float min) { + uint64_t ndim = src->ndim; + if (ndim != dst->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (src->shape[i] != dst->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(src) || !is_contiguous(dst)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (src->dt != F16 && src->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (src->dt != dst->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + DT dtype = src->dt; + uint64_t data_size = std::accumulate(src->shape, src->shape + src->ndim, 1ULL, std::multiplies()); + *desc_ptr = new ClipCpuDescriptor{ + DevCpu, + dtype, + data_size, + max, + min, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc) { + 
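// The CPU descriptor owns no device-side resources; freeing the struct is sufficient. +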
delete desc; + return STATUS_SUCCESS; +} + +template + +void clip_cpu(ClipCpuDescriptor_t desc, void *dst, void const *src) +{ + auto dstPtr = reinterpret_cast(dst); + auto srcPtr = reinterpret_cast(src); + if constexpr (std::is_same::value) + { + for(uint64_t i = 0; i < desc->data_size; i++) { + float val = f16_to_f32(srcPtr[i]); + float clipped_val = std::min(std::max(val, desc->min), desc->max); + dstPtr[i] = f32_to_f16(clipped_val); + } + } + else{ + for(uint64_t i = 0; i < desc->data_size; i++){ + dstPtr[i] = std::min(std::max(srcPtr[i], desc->min), desc->max); + } + } + +} +infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc, + void *dst, void const *src, + void *stream) { + if(desc->dtype == F32) + { + clip_cpu(desc, dst, src); + } + else if(desc->dtype == F16) + { + clip_cpu(desc, dst, src); + } + return STATUS_SUCCESS; +} + diff --git a/src/ops/clip/cpu/clip_cpu.h b/src/ops/clip/cpu/clip_cpu.h new file mode 100644 index 00000000..584fb6de --- /dev/null +++ b/src/ops/clip/cpu/clip_cpu.h @@ -0,0 +1,27 @@ +#ifndef __CPU_CLIP_H__ +#define __CPU_CLIP_H__ +#include "operators.h" + +struct ClipCpuDescriptor { + Device device; + DT dtype; + uint64_t data_size; + float max; + float min; +}; + +typedef ClipCpuDescriptor *ClipCpuDescriptor_t; + +infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t, + ClipCpuDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + float max, + float min); + +infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc, + void *dst, void const *src, + void *stream); + +infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc); +#endif \ No newline at end of file diff --git a/src/ops/clip/cuda/clip.cc b/src/ops/clip/cuda/clip.cc new file mode 100644 index 00000000..2d55cb69 --- /dev/null +++ b/src/ops/clip/cuda/clip.cc @@ -0,0 +1,74 @@ +#include "clip.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t handle, + ClipCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + float max, + float min) { + uint64_t ndim = src->ndim; + if (ndim != dst->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < ndim; ++i) { + if (src->shape[i] != dst->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + if (!is_contiguous(src) || !is_contiguous(dst)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (src->dt != F16 && src->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (src->dt != dst->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + DT dtype = src->dt; + uint64_t data_size = std::accumulate(src->shape, src->shape + src->ndim, 1ULL, std::multiplies()); + void *max_ptr, *min_ptr; + if(dtype == F32) + { + checkCudaErrorWithCode(cudaMalloc(&max_ptr, sizeof(float)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&min_ptr, sizeof(float)), STATUS_MEMORY_NOT_ALLOCATED); + float* max_p = static_cast(max_ptr); + float* min_p = static_cast(min_ptr); + checkCudaErrorWithCode(cudaMemcpy(max_p, &max, sizeof(float), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(min_p, &min, sizeof(float), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + } + else if(dtype == F16) + { + half max_h = __float2half(max); + half min_h = __float2half(min); + checkCudaErrorWithCode(cudaMalloc(&max_ptr, sizeof(half)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&min_ptr, sizeof(half)), STATUS_MEMORY_NOT_ALLOCATED); + half* max_p = 
static_cast(max_ptr); + half* min_p = static_cast(min_ptr); + checkCudaErrorWithCode(cudaMemcpy(max_p, &max_h, sizeof(half), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(min_p, &min_h, sizeof(half), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + } + + + *desc_ptr = new ClipCudaDescriptor{ + DevNvGpu, + dtype, + handle->device_id, + ndim, + data_size, + static_cast(handle->prop.maxGridSize[0]), + max_ptr, + min_ptr, + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyClipDescriptor(ClipCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree(desc->max_ptr), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree(desc->min_ptr), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/clip/cuda/clip.cu b/src/ops/clip/cuda/clip.cu new file mode 100644 index 00000000..f2de2f1a --- /dev/null +++ b/src/ops/clip/cuda/clip.cu @@ -0,0 +1,57 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "clip.cuh" + +template +__global__ void clip( + Tdata *y, + const Tdata *x, + const Tdata *max_val, + const Tdata *min_val, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + if constexpr (std::is_same::value) { + y[idx] = __hlt(x[idx], *min_val) ? *min_val : __hgt(x[idx], *max_val) ? *max_val : x[idx]; + } else { + y[idx] = x[idx] < *min_val ? *min_val : x[idx] > *max_val ? *max_val : x[idx]; + } +} + +template +infiniopStatus_t clip_nv_gpu(ClipCudaDescriptor_t desc, void *y, void const *x, void *stream) { + if (desc->data_size == 0) { + return STATUS_SUCCESS; + } + dim3 blockDims = dim3(std::min(static_cast(256), desc->data_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->data_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + const auto x_ = reinterpret_cast(x); + const auto y_ = reinterpret_cast(y); + const Tdata *max_val = reinterpret_cast(desc->max_ptr); + const Tdata *min_val = reinterpret_cast(desc->min_ptr); + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->data_size; i += step) { + clip<<>>( + y_, x_, max_val, min_val, i); + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc, + void *dst, void const *src, + void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return clip_nv_gpu(desc, dst, src, stream); + } + if (desc->dtype == F32) { + return clip_nv_gpu(desc, dst, src, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/clip/cuda/clip.cuh b/src/ops/clip/cuda/clip.cuh new file mode 100644 index 00000000..795b410c --- /dev/null +++ b/src/ops/clip/cuda/clip.cuh @@ -0,0 +1,36 @@ +#ifndef __CUDA_CLIP_H__ +#define __CUDA_CLIP_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct ClipCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t data_size; + uint64_t max_grid_size; + void *max_ptr; + void *min_ptr; +}; + +typedef struct ClipCudaDescriptor *ClipCudaDescriptor_t; + +infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t, + ClipCudaDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + float max, + float min); + +infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc, + void *dst, void const *src, + 
void *stream); + +infiniopStatus_t cudaDestroyClipDescriptor(ClipCudaDescriptor_t desc); + +#endif diff --git a/src/ops/clip/operator.cc b/src/ops/clip/operator.cc new file mode 100644 index 00000000..9e40d198 --- /dev/null +++ b/src/ops/clip/operator.cc @@ -0,0 +1,71 @@ +#include "../utils.h" +#include "operators.h" + +#include "ops/clip/clip.h" + +#ifdef ENABLE_CPU +#include "cpu/clip_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/clip.cuh" +#endif + +__C infiniopStatus_t infiniopCreateClipDescriptor( + infiniopHandle_t handle, + infiniopClipDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + float max, + float min){ + switch (handle->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateClipDescriptor(handle, (ClipCpuDescriptor_t *)desc_ptr, dst, src, max, min); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaCreateClipDescriptor((CudaHandle_t) handle, (ClipCudaDescriptor_t *) desc_ptr, dst, src, max, min); + +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, + void *dst, + void const *src, + void *stream){ + switch (desc->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuClip((ClipCpuDescriptor_t)desc, dst, src, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaClip((ClipCudaDescriptor_t) desc, dst, src, stream); + + +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc){ + switch (desc->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyClipDescriptor((ClipCpuDescriptor_t)desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaDestroyClipDescriptor((ClipCudaDescriptor_t) desc); + + +#endif + } + return STATUS_BAD_DEVICE; +} \ No newline at end of file diff --git a/src/ops/gather/cpu/gather_cpu.cc b/src/ops/gather/cpu/gather_cpu.cc new file mode 100644 index 00000000..ead75320 --- /dev/null +++ b/src/ops/gather/cpu/gather_cpu.cc @@ -0,0 +1,122 @@ +#include "gather_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + + +infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t, + GatherCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis) +{ + uint64_t data_ndim = data->ndim; + uint64_t indices_ndim = indices->ndim; + if(axis<0){ + axis += data_ndim; + } + if(axis >= data_ndim){ + return STATUS_BAD_PARAM; + } + if(data_ndim + indices_ndim - 1 != dst->ndim){ + return STATUS_BAD_TENSOR_SHAPE; + } + int j=0; + for (int i=0; indim; i++){ + if(ishape[i] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + } + else if(i == axis){ + for(j=0; jshape[j] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + i++; + } + } + else{ + if (data->shape[i - j] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + } + } + + if (data->dt != F16 && data->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(data)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (!is_contiguous(indices)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (indices->dt != I8 && indices->dt != I16 && indices->dt != I32 && indices->dt != I64) { + return STATUS_BAD_TENSOR_DTYPE; + } + DT dtype = data->dt; + DT indice_type = indices->dt; + uint64_t dst_size = std::accumulate(dst->shape, dst->shape + dst->ndim, 1ULL, std::multiplies()); + uint64_t stride_axis = (data->strides)[axis]; + 
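// Cache the gather axis stride/extent and the total index count; cpuGather uses them to map output offsets back into data. +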
uint64_t shape_axis = (data->shape)[axis]; + uint64_t num_indices = std::accumulate(indices->shape, indices->shape + indices->ndim, 1ULL, std::multiplies()); + *desc_ptr = new GatherCpuDescriptor{ + DevCpu, + dtype, + indice_type, + dst_size, + stride_axis, + shape_axis, + num_indices, + }; + return STATUS_SUCCESS; +} + + + +infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc){ + delete desc; + return STATUS_SUCCESS; +} + +template +void gather_cpu(GatherCpuDescriptor_t desc, + void *dst, + void const *data, + void const *indices){ + auto dstPtr = reinterpret_cast(dst); + auto dataPtr = reinterpret_cast(data); + auto indicePtr = reinterpret_cast(indices); + + for(uint64_t i = 0; i < desc->dst_size; i++) + { + uint64_t index = indicePtr[i / desc->stride_axis % desc->num_indices]; + int64_t linearId = desc->shape_axis * desc->stride_axis * (i / (desc->num_indices * desc->stride_axis)) + index * desc->stride_axis + i % desc->stride_axis; + dstPtr[i] = dataPtr[linearId]; + } +} + +infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream){ + if(desc->dtype == F32 && desc->indice_type == I32) + { + gather_cpu(desc, dst, data, indices); + } + else if(desc->dtype == F16 && desc->indice_type == I32) + { + gather_cpu(desc, dst, data, indices); + } + else if(desc->dtype == F32 && desc->indice_type == I64) + { + gather_cpu(desc, dst, data, indices); + } + else if(desc->dtype == F16 && desc->indice_type == I64) + { + gather_cpu(desc, dst, data, indices); + } + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/gather/cpu/gather_cpu.h b/src/ops/gather/cpu/gather_cpu.h new file mode 100644 index 00000000..c8a403e9 --- /dev/null +++ b/src/ops/gather/cpu/gather_cpu.h @@ -0,0 +1,31 @@ +#ifndef __CPU_GATHER_H__ +#define __CPU_GATHER_H__ +#include "operators.h" + +struct GatherCpuDescriptor { + Device device; + DT dtype; + DT indice_type; + uint64_t dst_size; + uint64_t stride_axis; + uint64_t shape_axis; + uint64_t num_indices; +}; + +typedef GatherCpuDescriptor *GatherCpuDescriptor_t; + +infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t, + GatherCpuDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int const axis); + +infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream); + +infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc); +#endif \ No newline at end of file diff --git a/src/ops/gather/cuda/gather.cc b/src/ops/gather/cuda/gather.cc new file mode 100644 index 00000000..f6e6841a --- /dev/null +++ b/src/ops/gather/cuda/gather.cc @@ -0,0 +1,93 @@ +#include "gather.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateGatherDescriptor(CudaHandle_t handle, + GatherCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis) { + uint64_t data_ndim = data->ndim; + uint64_t indices_ndim = indices->ndim; + if(axis<0){ + axis += data_ndim; + } + if(axis >= data_ndim){ + return STATUS_BAD_PARAM; + } + + int j=0; + for (int i=0; indim; i++){ + if(ishape[i] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + } + else if(i == axis){ + for(j=0; jshape[j] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + i++; + } + } + else{ + if (data->shape[i - j] != dst->shape[i]){ + return 
STATUS_BAD_TENSOR_SHAPE; + } + } + } + if (data->dt != F16 && data->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(data)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (!is_contiguous(indices)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (indices->dt != I8 && indices->dt != I16 && indices->dt != I32 && indices->dt != I64) { + return STATUS_BAD_TENSOR_DTYPE; + } + DT dtype = data->dt; + DT indice_type = indices->dt; + uint64_t dst_size = std::accumulate(dst->shape, dst->shape + dst->ndim, 1ULL, std::multiplies()); + + uint64_t stride_axis = (data->strides)[axis]; + uint64_t shape_axis = (data->shape)[axis]; + uint64_t num_indices = std::accumulate(indices->shape, indices->shape + indices->ndim, 1ULL, std::multiplies()); + + int64_t *stride_axis_ptr, *shape_axis_ptr; + uint64_t *num_indices_ptr; + checkCudaErrorWithCode(cudaMalloc(&stride_axis_ptr, sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&shape_axis_ptr, sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&num_indices_ptr, sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(stride_axis_ptr, &stride_axis, sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(shape_axis_ptr, &shape_axis, sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(num_indices_ptr, &num_indices, sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + + + *desc_ptr = new GatherCudaDescriptor{ + DevNvGpu, + dtype, + indice_type, + handle->device_id, + data_ndim, + dst_size, + stride_axis_ptr, + shape_axis_ptr, + num_indices_ptr, + static_cast(handle->prop.maxGridSize[0]), + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyGatherDescriptor(GatherCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree(desc->stride_axis_ptr), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree(desc->shape_axis_ptr), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree(desc->num_indices_ptr), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/gather/cuda/gather.cu b/src/ops/gather/cuda/gather.cu new file mode 100644 index 00000000..6726aef0 --- /dev/null +++ b/src/ops/gather/cuda/gather.cu @@ -0,0 +1,76 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "gather.cuh" + +template +__global__ void gather( + Tdata *dst, + const Tdata *data, + const Idata *indices, + const int64_t *stride_axis, + const int64_t *shape_axis, + const uint64_t *num_indices, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + Idata index = indices[idx / *stride_axis % *num_indices]; + + int64_t linearId = *shape_axis * *stride_axis * (idx / (*num_indices * *stride_axis)) + index * *stride_axis + idx % *stride_axis; + + dst[idx] = data[linearId]; + + +} + +template +infiniopStatus_t gather_nv_gpu(GatherCudaDescriptor_t desc, void *dst, void const *data, void const *indices, void *stream) { + if (desc->dst_size == 0) { + return STATUS_SUCCESS; + } + dim3 blockDims = dim3(std::min(static_cast(256), desc->dst_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->dst_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + const auto dst_ = reinterpret_cast(dst); + const auto data_ = reinterpret_cast(data); + const auto indices_ = 
reinterpret_cast(indices); + + const int64_t *stride_axis = reinterpret_cast(desc->stride_axis_ptr); + const int64_t *shape_axis = reinterpret_cast(desc->shape_axis_ptr); + const uint64_t *num_indices = reinterpret_cast(desc->num_indices_ptr); + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->dst_size; i += step) { + gather<<>>( + dst_, data_, indices_, stride_axis, shape_axis, num_indices, i); + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGather(GatherCudaDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream){ + checkCudaError(cudaSetDevice(desc->device_id)); + if(desc->dtype == F32 && desc->indice_type == I32) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + else if(desc->dtype == F16 && desc->indice_type == I32) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + else if(desc->dtype == F32 && desc->indice_type == I64) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + else if(desc->dtype == F16 && desc->indice_type == I64) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/gather/cuda/gather.cuh b/src/ops/gather/cuda/gather.cuh new file mode 100644 index 00000000..f082d86a --- /dev/null +++ b/src/ops/gather/cuda/gather.cuh @@ -0,0 +1,40 @@ +#ifndef __CUDA_GATHER_H__ +#define __CUDA_GATHER_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct GatherCudaDescriptor { + Device device; + DT dtype; + DT indice_type; + int device_id; + uint64_t ndim; + uint64_t dst_size; + int64_t* stride_axis_ptr; + int64_t* shape_axis_ptr; + uint64_t* num_indices_ptr; + uint64_t max_grid_size; +}; + +typedef struct GatherCudaDescriptor *GatherCudaDescriptor_t; + +infiniopStatus_t cudaCreateGatherDescriptor(CudaHandle_t, + GatherCudaDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis); + +infiniopStatus_t cudaGather(GatherCudaDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream); + +infiniopStatus_t cudaDestroyGatherDescriptor(GatherCudaDescriptor_t desc); + +#endif diff --git a/src/ops/gather/operator.cc b/src/ops/gather/operator.cc new file mode 100644 index 00000000..40fdc52d --- /dev/null +++ b/src/ops/gather/operator.cc @@ -0,0 +1,67 @@ +#include "../utils.h" +#include "operators.h" + +#include "ops/gather/gather.h" + +#ifdef ENABLE_CPU +#include "cpu/gather_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/gather.cuh" +#endif + +__C infiniopStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis){ + switch (handle->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateGatherDescriptor(handle, (GatherCpuDescriptor_t *)desc_ptr, dst, data, indices, axis); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaCreateGatherDescriptor((CudaHandle_t)handle, (GatherCudaDescriptor_t *)desc_ptr, dst, data, indices, axis); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream){ + 
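// Dispatch on the device recorded in the descriptor at creation time. +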
switch (desc->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGather((GatherCpuDescriptor_t)desc, dst, data, indices, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaGather((GatherCudaDescriptor_t)desc, dst, data, indices, stream); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc){ + switch (desc->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyGatherDescriptor((GatherCpuDescriptor_t)desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaDestroyGatherDescriptor((GatherCudaDescriptor_t)desc); +#endif + } + return STATUS_BAD_DEVICE; +} \ No newline at end of file diff --git a/src/ops/gather_elements/cpu/gather_elements_cpu.cc b/src/ops/gather_elements/cpu/gather_elements_cpu.cc new file mode 100644 index 00000000..2dd7d683 --- /dev/null +++ b/src/ops/gather_elements/cpu/gather_elements_cpu.cc @@ -0,0 +1,113 @@ +#include "gather_elements_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + + +infiniopStatus_t cpuCreateGatherElementsDescriptor(infiniopHandle_t, + GatherElementsCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis) +{ + uint64_t data_ndim = data->ndim; + uint64_t indices_ndim = indices->ndim; + if (indices_ndim > data_ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + if(axis<0){ + axis = axis + dst->ndim; + } + + for(size_t i = 0; i < indices_ndim; i++){ + if(indices->shape[i] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + if(i == axis){ + if (data->shape[i] < indices->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + else{ + if (data->shape[i] != indices->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + } + if (data->dt != F16 && data->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(data)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (!is_contiguous(indices)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (indices->dt != I8 && indices->dt != I16 && indices->dt != I32 && indices->dt != I64) { + return STATUS_BAD_TENSOR_DTYPE; + } + DT dtype = data->dt; + DT indice_type = indices->dt; + uint64_t indices_size = std::accumulate(indices->shape, indices->shape + indices->ndim, 1ULL, std::multiplies()); + uint64_t stride_axis = (data->strides)[axis]; + uint64_t shape_axis = (data->shape)[axis]; + uint64_t num_indices = indices->shape[axis]; + *desc_ptr = new GatherElementsCpuDescriptor{ + DevCpu, + dtype, + indice_type, + indices_size, + stride_axis, + shape_axis, + num_indices, + }; + return STATUS_SUCCESS; +} + + + +infiniopStatus_t cpuDestroyGatherElementsDescriptor(GatherElementsCpuDescriptor_t desc){ + delete desc; + return STATUS_SUCCESS; +} + +template +void gather_cpu(GatherElementsCpuDescriptor_t desc, + void *dst, + void const *data, + void const *indices){ + auto dstPtr = reinterpret_cast(dst); + auto dataPtr = reinterpret_cast(data); + auto indicePtr = reinterpret_cast(indices); + for(uint64_t i = 0; i < desc->indices_size; i++) + { + uint64_t index = indicePtr[i]; + int64_t linearId = desc->shape_axis * desc->stride_axis * (i / (desc->num_indices * desc->stride_axis)) + index * desc->stride_axis + i % desc->stride_axis; + dstPtr[i] = dataPtr[linearId]; + } +} + +infiniopStatus_t cpuGatherElements(GatherElementsCpuDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream){ + if(desc->dtype == F32 && desc->indice_type == I32) + { + 
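// Each branch instantiates gather_cpu for the matching (data dtype, index dtype) combination. +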
gather_cpu(desc, dst, data, indices); + } + else if(desc->dtype == F16 && desc->indice_type == I32) + { + gather_cpu(desc, dst, data, indices); + } + else if(desc->dtype == F32 && desc->indice_type == I64) + { + gather_cpu(desc, dst, data, indices); + } + else if(desc->dtype == F16 && desc->indice_type == I64) + { + gather_cpu(desc, dst, data, indices); + } + + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/gather_elements/cpu/gather_elements_cpu.h b/src/ops/gather_elements/cpu/gather_elements_cpu.h new file mode 100644 index 00000000..aebbdf49 --- /dev/null +++ b/src/ops/gather_elements/cpu/gather_elements_cpu.h @@ -0,0 +1,31 @@ +#ifndef __CPU_GATHER_ELEMENTS_H__ +#define __CPU_GATHER_ELEMENTS_H__ +#include "operators.h" + +struct GatherElementsCpuDescriptor { + Device device; + DT dtype; + DT indice_type; + uint64_t indices_size; + uint64_t stride_axis; + uint64_t shape_axis; + uint64_t num_indices; +}; + +typedef GatherElementsCpuDescriptor *GatherElementsCpuDescriptor_t; + +infiniopStatus_t cpuCreateGatherElementsDescriptor(infiniopHandle_t, + GatherElementsCpuDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int const axis); + +infiniopStatus_t cpuGatherElements(GatherElementsCpuDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream); + +infiniopStatus_t cpuDestroyGatherElementsDescriptor(GatherElementsCpuDescriptor_t desc); +#endif \ No newline at end of file diff --git a/src/ops/gather_elements/cuda/gather_elements.cc b/src/ops/gather_elements/cuda/gather_elements.cc new file mode 100644 index 00000000..0604f436 --- /dev/null +++ b/src/ops/gather_elements/cuda/gather_elements.cc @@ -0,0 +1,86 @@ +#include "gather_elements.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateGatherElementsDescriptor(CudaHandle_t handle, + GatherElementsCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis) { + uint64_t data_ndim = data->ndim; + uint64_t indices_ndim = indices->ndim; + if (indices_ndim > data_ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + if(axis<0){ + axis = axis + dst->ndim; + } + + for(size_t i = 0; i < indices_ndim; i++){ + if(indices->shape[i] != dst->shape[i]){ + return STATUS_BAD_TENSOR_SHAPE; + } + if(i == axis){ + if (data->shape[i] < indices->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + else{ + if (data->shape[i] != indices->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + } + if (data->dt != F16 && data->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (!is_contiguous(data)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (!is_contiguous(indices)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (indices->dt != I8 && indices->dt != I16 && indices->dt != I32 && indices->dt != I64) { + return STATUS_BAD_TENSOR_DTYPE; + } + DT dtype = data->dt; + DT indice_type = indices->dt; + uint64_t indices_size = std::accumulate(indices->shape, indices->shape + indices->ndim, 1ULL, std::multiplies()); + + uint64_t stride_axis = (data->strides)[axis]; + uint64_t shape_axis = (data->shape)[axis]; + uint64_t num_indices = indices->shape[axis]; + + int64_t *stride_axis_ptr, *shape_axis_ptr, *num_indices_ptr; + checkCudaErrorWithCode(cudaMalloc(&stride_axis_ptr, sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&shape_axis_ptr, sizeof(int64_t)), 
STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&num_indices_ptr, sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(stride_axis_ptr, &stride_axis, sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(shape_axis_ptr, &shape_axis, sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(num_indices_ptr, &num_indices, sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + + + *desc_ptr = new GatherElementsCudaDescriptor{ + DevNvGpu, + dtype, + indice_type, + handle->device_id, + data_ndim, + indices_size, + stride_axis_ptr, + shape_axis_ptr, + num_indices_ptr, + static_cast(handle->prop.maxGridSize[0]), + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyGatherElementsDescriptor(GatherElementsCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree(desc->stride_axis_ptr), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree(desc->shape_axis_ptr), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree(desc->num_indices_ptr), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/gather_elements/cuda/gather_elements.cu b/src/ops/gather_elements/cuda/gather_elements.cu new file mode 100644 index 00000000..b445c40b --- /dev/null +++ b/src/ops/gather_elements/cuda/gather_elements.cu @@ -0,0 +1,74 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "gather_elements.cuh" + +template +__global__ void gather( + Tdata *dst, + const Tdata *data, + const Idata *indices, + const int64_t *stride_axis, + const int64_t *shape_axis, + const int64_t *num_indices, + uint64_t offset) { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + + Idata index = indices[idx]; + int64_t linearId = *shape_axis * *stride_axis * (idx / (*num_indices * *stride_axis)) + index * *stride_axis + idx % *stride_axis; + dst[idx] = data[linearId]; + +} + +template +infiniopStatus_t gather_nv_gpu(GatherElementsCudaDescriptor_t desc, void *dst, void const *data, void const *indices, void *stream) { + if (desc->indices_size == 0) { + return STATUS_SUCCESS; + } + dim3 blockDims = dim3(std::min(static_cast(256), desc->indices_size)); + dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->indices_size, blockDims.x), desc->max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + + const auto dst_ = reinterpret_cast(dst); + const auto data_ = reinterpret_cast(data); + const auto indices_ = reinterpret_cast(indices); + + const int64_t *stride_axis = reinterpret_cast(desc->stride_axis_ptr); + const int64_t *shape_axis = reinterpret_cast(desc->shape_axis_ptr); + const int64_t *num_indices = reinterpret_cast(desc->num_indices_ptr); + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->indices_size; i += step) { + gather<<>>( + dst_, data_, indices_, stride_axis, shape_axis, num_indices, i); + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGatherElements(GatherElementsCudaDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream){ + checkCudaError(cudaSetDevice(desc->device_id)); + if(desc->dtype == F32 && desc->indice_type == I32) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + else if(desc->dtype == F16 && desc->indice_type == I32) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + else if(desc->dtype 
== F32 && desc->indice_type == I64) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + else if(desc->dtype == F16 && desc->indice_type == I64) + { + return gather_nv_gpu(desc, dst, data, indices, stream); + } + + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/gather_elements/cuda/gather_elements.cuh b/src/ops/gather_elements/cuda/gather_elements.cuh new file mode 100644 index 00000000..ac75763b --- /dev/null +++ b/src/ops/gather_elements/cuda/gather_elements.cuh @@ -0,0 +1,40 @@ +#ifndef __CUDA_GATHER_H__ +#define __CUDA_GATHER_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct GatherElementsCudaDescriptor { + Device device; + DT dtype; + DT indice_type; + int device_id; + uint64_t ndim; + uint64_t indices_size; + int64_t* stride_axis_ptr; + int64_t* shape_axis_ptr; + int64_t* num_indices_ptr; + uint64_t max_grid_size; +}; + +typedef struct GatherElementsCudaDescriptor *GatherElementsCudaDescriptor_t; + +infiniopStatus_t cudaCreateGatherElementsDescriptor(CudaHandle_t, + GatherElementsCudaDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis); + +infiniopStatus_t cudaGatherElements(GatherElementsCudaDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream); + +infiniopStatus_t cudaDestroyGatherElementsDescriptor(GatherElementsCudaDescriptor_t desc); + +#endif diff --git a/src/ops/gather_elements/operator.cc b/src/ops/gather_elements/operator.cc new file mode 100644 index 00000000..980c39b0 --- /dev/null +++ b/src/ops/gather_elements/operator.cc @@ -0,0 +1,67 @@ +#include "../utils.h" +#include "operators.h" + +#include "ops/gather_elements/gather_elements.h" + +#ifdef ENABLE_CPU +#include "cpu/gather_elements_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/gather_elements.cuh" +#endif + +__C infiniopStatus_t infiniopCreateGatherElementsDescriptor( + infiniopHandle_t handle, + infiniopGatherElementsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int axis){ + switch (handle->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateGatherElementsDescriptor(handle, (GatherElementsCpuDescriptor_t *)desc_ptr, dst, data, indices, axis); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaCreateGatherElementsDescriptor((CudaHandle_t)handle, (GatherElementsCudaDescriptor_t *)desc_ptr, dst, data, indices, axis); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGatherElements(infiniopGatherElementsDescriptor_t desc, + void *dst, + void const *data, + void const *indices, + void *stream){ + switch (desc->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGatherElements((GatherElementsCpuDescriptor_t)desc, dst, data, indices, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaGatherElements((GatherElementsCudaDescriptor_t)desc, dst, data, indices, stream); +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyGatherElementsDescriptor(infiniopGatherElementsDescriptor_t desc){ + switch (desc->device) + { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyGatherElementsDescriptor((GatherElementsCpuDescriptor_t)desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return 
cudaDestroyGatherElementsDescriptor((GatherElementsCudaDescriptor_t)desc); +#endif + } + return STATUS_BAD_DEVICE; +} \ No newline at end of file diff --git a/src/ops/reduce/cpu/reduce_cpu.cc b/src/ops/reduce/cpu/reduce_cpu.cc new file mode 100644 index 00000000..b41f557d --- /dev/null +++ b/src/ops/reduce/cpu/reduce_cpu.cc @@ -0,0 +1,178 @@ +#include "reduce_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, + ReduceCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int *axis, + const int num_axis, + int keep_dims, + int reduce_type){ + uint64_t ndim = src->ndim; + int n = num_axis; + + for (int i=0; i= ndim){ + return STATUS_BAD_PARAM; + } + if(keep_dims == 1){ + if(dst->shape[axis[i]] != 1){ + return STATUS_BAD_TENSOR_SHAPE; + } + } + } + + if (!is_contiguous(src) || !is_contiguous(dst)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (src->dt != dst->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (src->dt != F16 && src->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (reduce_type > 3) { + return STATUS_BAD_PARAM; + } + DT dtype = src->dt; + uint64_t dst_size = std::accumulate(dst->shape, dst->shape + dst->ndim, 1ULL, std::multiplies()); + uint64_t data_size = std::accumulate(src->shape, src->shape + ndim, 1ULL, std::multiplies()); + uint64_t *shape_axis = new uint64_t[n]; + uint64_t *stride_axis = new uint64_t[n]; + uint64_t add_dim = 1; + for(int i = 0; ishape[axis[i]]; + add_dim *= shape_axis[i]; + stride_axis[i] = src->strides[axis[i]]; + } + + *desc_ptr = new ReduceCpuDescriptor{ + DevCpu, + dtype, + dst_size, + data_size, + stride_axis, + shape_axis, + add_dim, + n, + reduce_type, + }; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGetReduceWorkspaceSize(ReduceCpuDescriptor_t desc, uint64_t *size) { + + *size += desc->dst_size * sizeof(float); + + return STATUS_SUCCESS; +} + + +infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc) { + delete[] desc->shape_axis; + delete[] desc->stride_axis; + delete desc; + return STATUS_SUCCESS; +} + +template +void reduce(ReduceCpuDescriptor_t desc, float *workspacePtr, Tdata *dstPtr, Tdata const *srcPtr, uint64_t i, uint64_t index){ + switch (desc->reduce_mode) + { + case 0: + if constexpr (std::is_same::value) { + workspacePtr[index] = std::fmax(workspacePtr[index], f16_to_f32(srcPtr[i])); + dstPtr[index] = f32_to_f16(workspacePtr[index]); + } else { + dstPtr[index] = std::max(dstPtr[index], srcPtr[i]); + } + break; + case 1: + if constexpr (std::is_same::value) { + workspacePtr[index] = std::fmin(workspacePtr[index], f16_to_f32(srcPtr[i])); + dstPtr[index] = f32_to_f16(workspacePtr[index]); + } else { + dstPtr[index] = std::min(dstPtr[index], srcPtr[i]); + } + break; + default: + if constexpr (std::is_same::value) { + workspacePtr[index] += f16_to_f32(srcPtr[i]); + } + else{ + dstPtr[index] += srcPtr[i]; + } + break; + } +} + +template +void reduce_cpu(ReduceCpuDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src){ + auto dstPtr = reinterpret_cast(dst); + auto srcPtr = reinterpret_cast(src); + auto workspacePtr = reinterpret_cast(workspace); + + if constexpr (std::is_same::value){ + switch (desc->reduce_mode) + { + case 0: + std::fill(workspacePtr, workspacePtr + desc->dst_size, -std::numeric_limits::infinity()); + break; + case 1: + std::fill(workspacePtr, workspacePtr + desc->dst_size, std::numeric_limits::infinity()); + break; + default: + std::fill(workspacePtr, workspacePtr + 
desc->dst_size, 0); + break; + } + } + for(uint64_t i = 0; i < desc->data_size; i++){ + uint64_t index = i; +#pragma unroll + for(int j = 0; j < desc->axis_num; j++){ + index = index / (desc->stride_axis[j] * desc->shape_axis[j]) * desc->stride_axis[j] + index % desc->stride_axis[j]; + } + reduce(desc, workspacePtr, dstPtr, srcPtr, i, index); + } + if(desc->reduce_mode == 2){ + for(int k = 0; k < desc->dst_size; k++){ + if constexpr (std::is_same::value) { + dstPtr[k] = f32_to_f16(workspacePtr[k] / desc->add_dim); + } + else{ + dstPtr[k] /= desc->add_dim; + } + } + } + else if(desc->reduce_mode == 3){ + if constexpr (std::is_same::value){ + for(int k = 0; k < desc->dst_size; k++){ + dstPtr[k] = f32_to_f16(workspacePtr[k]); + } + } + } + +} + +infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *dst, + void const *src, + void *stream){ + + if(desc->dtype == F32) + { + reduce_cpu(desc, workspace, workspace_size, dst, src); + } + else if(desc->dtype == F16) + { + reduce_cpu(desc, workspace, workspace_size, dst, src); + } + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/reduce/cpu/reduce_cpu.h b/src/ops/reduce/cpu/reduce_cpu.h new file mode 100644 index 00000000..1d2bfd85 --- /dev/null +++ b/src/ops/reduce/cpu/reduce_cpu.h @@ -0,0 +1,45 @@ +#ifndef __CPU_REDUCE_H__ +#define __CPU_REDUCE_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "operators.h" +#include +#include +#include +#include + +struct ReduceCpuDescriptor { + Device device; + DT dtype; + uint64_t dst_size; + uint64_t data_size; + uint64_t* stride_axis; + uint64_t* shape_axis; + uint64_t add_dim; + int axis_num; + int reduce_mode; +}; + +typedef struct ReduceCpuDescriptor *ReduceCpuDescriptor_t; + +infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, + ReduceCpuDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int *axis, + const int num_axis, + int const keepdims, + int reduce_type); + +infiniopStatus_t cpuGetReduceWorkspaceSize(ReduceCpuDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *dst, + void const *src, + void *stream); + +infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc); + +#endif diff --git a/src/ops/reduce/cuda/reduce.cc b/src/ops/reduce/cuda/reduce.cc new file mode 100644 index 00000000..c2bcb71f --- /dev/null +++ b/src/ops/reduce/cuda/reduce.cc @@ -0,0 +1,92 @@ +#include "reduce.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle, + ReduceCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int *axis, + int num_axis, + int keep_dims, + int reduce_type) { + uint64_t ndim = src->ndim; + int n = num_axis; + + for (int i=0; i= ndim){ + return STATUS_BAD_PARAM; + } + if(keep_dims == 1){ + if(dst->shape[axis[i]] != 1){ + return STATUS_BAD_TENSOR_SHAPE; + } + } + } + if (!is_contiguous(src) || !is_contiguous(dst)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (src->dt != dst->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (src->dt != F16 && src->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (reduce_type > 3) { + return STATUS_BAD_PARAM; + } + DT dtype = src->dt; + uint64_t dst_size = std::accumulate(dst->shape, dst->shape + dst->ndim, 1ULL, std::multiplies()); + uint64_t data_size = std::accumulate(src->shape, 
src->shape + ndim, 1ULL, std::multiplies()); + + int64_t *shape_axis = new int64_t[n]; + int64_t *stride_axis = new int64_t[n]; + uint64_t add_dim = 1; + for(int i = 0; ishape[axis[i]]; + add_dim *= shape_axis[i]; + stride_axis[i] = src->strides[axis[i]]; + } + + int64_t *stride_axis_ptr, *shape_axis_ptr; + checkCudaErrorWithCode(cudaMalloc(&stride_axis_ptr, n * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc(&shape_axis_ptr, n * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(stride_axis_ptr, stride_axis, n * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMemcpy(shape_axis_ptr, shape_axis, n * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_MEMORY_NOT_ALLOCATED); + + delete[] shape_axis; + delete[] stride_axis; + *desc_ptr = new ReduceCudaDescriptor{ + DevNvGpu, + dtype, + handle->device_id, + ndim, + dst_size, + data_size, + add_dim, + n, + stride_axis_ptr, + shape_axis_ptr, + reduce_type, + static_cast(handle->prop.maxGridSize[0]), + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetReduceWorkspaceSize(ReduceCudaDescriptor_t desc, uint64_t *size){ + *size = 0; + return STATUS_SUCCESS; + +} + +infiniopStatus_t cudaDestroyReduceDescriptor(ReduceCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree(desc->stride_axis_ptr), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree(desc->shape_axis_ptr), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/reduce/cuda/reduce.cu b/src/ops/reduce/cuda/reduce.cu new file mode 100644 index 00000000..306f8628 --- /dev/null +++ b/src/ops/reduce/cuda/reduce.cu @@ -0,0 +1,169 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "reduce.cuh" +#include + +template +__global__ void reduce( + Tdata *dst, + Tdata const*src, + const int64_t *stride_axis, + const int64_t *shape_axis, + int axis_num, + uint64_t add_dim, + uint64_t offset, + int reduce_type) { + int64_t threadId = threadIdx.y * blockDim.x + threadIdx.x; + int64_t idx = offset + threadId + blockIdx.x * add_dim; + + extern __shared__ unsigned char sharedMemory[]; + + Tdata* sharedData = reinterpret_cast(sharedMemory); + + + int64_t shift_idx = idx; + #pragma unroll + for (int i=0; i 0) { + if (threadId < s && (threadId + s) < pre_s) { + switch(reduce_type) { + case 0: + if constexpr (std::is_same::value) { + sharedData[threadId] = __hmax(sharedData[threadId], sharedData[threadId + s]); + } else { + sharedData[threadId] = fmaxf(sharedData[threadId], sharedData[threadId + s]); + } + break; + case 1: + if constexpr (std::is_same::value) { + sharedData[threadId] = __hmin(sharedData[threadId], sharedData[threadId + s]); + } else { + sharedData[threadId] = fminf(sharedData[threadId], sharedData[threadId + s]); + } + break; + default: + if constexpr (std::is_same::value) { + sharedData[threadId] = __hadd(sharedData[threadId], sharedData[threadId + s]); + } else { + sharedData[threadId] += sharedData[threadId + s]; + } + break; + } + } + __syncthreads(); + if (pre_s % 2 != 0 && threadId == 0) { + switch(reduce_type) { + case 0: + if constexpr (std::is_same::value) { + sharedData[0] = __hmax(sharedData[0], sharedData[pre_s - 1]); + } else { + sharedData[0] = fmaxf(sharedData[0], sharedData[pre_s - 1]); + } + break; + case 1: + if constexpr (std::is_same::value) { + sharedData[0] = __hmin(sharedData[0], sharedData[pre_s - 1]); + } else { + sharedData[0] = 
fminf(sharedData[0], sharedData[pre_s - 1]); + } + break; + default: + if constexpr (std::is_same::value) { + sharedData[0] = __hadd(sharedData[0], sharedData[pre_s-1]); + } else { + sharedData[0] += sharedData[pre_s-1]; + } + break; + } + } + __syncthreads(); + + pre_s = s; + s = pre_s / 2; + } + + if(threadId == 0) + { + int64_t index = shift_idx; + #pragma unroll + for(int j = 0; j < axis_num; j++){ + index = index / (stride_axis[j] * shape_axis[j]) * stride_axis[j] + index % stride_axis[j]; + } + dst[index] = sharedData[threadId]; + if(reduce_type == 2) + { + float shape = static_cast(add_dim); + if constexpr (std::is_same::value) { + dst[index] = __float2half(__half2float(dst[index]) / (shape)); + //dst[index] /= __float2half(shape); + } else { + dst[index] /= shape; + } + } + } + +} + +template +infiniopStatus_t reduce_nv_gpu(ReduceCudaDescriptor_t desc, void *dst, void const *src, void *stream) { + if (desc->data_size == 0) { + return STATUS_SUCCESS; + } + const int64_t *stride_axis = reinterpret_cast(desc->stride_axis_ptr); + const int64_t *shape_axis = reinterpret_cast(desc->shape_axis_ptr); + + dim3 blockDims, gridDims; + size_t sharedMemSize; + uint64_t step; + if (desc->add_dim > 1024){ + blockDims = dim3(1024, ROUND_UP_DIV(desc->add_dim, 1024)); + sharedMemSize = desc->add_dim * sizeof(Tdata); + step = blockDims.x * blockDims.y; + }else{ + blockDims = dim3(desc->add_dim); + sharedMemSize = blockDims.x * sizeof(Tdata); + step = blockDims.x; + } + gridDims = dim3(std::min(ROUND_UP_DIV(desc->data_size, desc->add_dim), desc->max_grid_size)); + + step *= gridDims.x; + + + const auto dst_ = reinterpret_cast(dst); + const auto src_ = reinterpret_cast(src); + + cudaStream_t cuda_stream = reinterpret_cast(stream); + +#pragma unroll + for (uint64_t i = 0; i < desc->data_size; i += step) { + reduce<<>>( + dst_, src_, stride_axis, shape_axis, desc->axis_num, desc->add_dim, i, desc->reduce_mode); + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaReduce(ReduceCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *dst, + void const *src, + void *stream){ + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return reduce_nv_gpu(desc, dst, src, stream); + } + if (desc->dtype == F32) { + return reduce_nv_gpu(desc, dst, src, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/reduce/cuda/reduce.cuh b/src/ops/reduce/cuda/reduce.cuh new file mode 100644 index 00000000..55973044 --- /dev/null +++ b/src/ops/reduce/cuda/reduce.cuh @@ -0,0 +1,47 @@ +#ifndef __CUDA_REDUCE_H__ +#define __CUDA_REDUCE_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct ReduceCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t ndim; + uint64_t dst_size; + uint64_t data_size; + uint64_t add_dim; + int axis_num; + int64_t* stride_axis_ptr; + int64_t* shape_axis_ptr; + int reduce_mode; + uint64_t max_grid_size; +}; + +typedef struct ReduceCudaDescriptor *ReduceCudaDescriptor_t; + +infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t, + ReduceCudaDescriptor_t *, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int *axis, + int num_axis, + int keepdims, + int reduce_type); + +infiniopStatus_t cudaGetReduceWorkspaceSize(ReduceCudaDescriptor_t desc, uint64_t *size); + +infiniopStatus_t cudaReduce(ReduceCudaDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void 
*dst, + void const *src, + void *stream); + +infiniopStatus_t cudaDestroyReduceDescriptor(ReduceCudaDescriptor_t desc); + +#endif diff --git a/src/ops/reduce/operator.cc b/src/ops/reduce/operator.cc new file mode 100644 index 00000000..2ca06e28 --- /dev/null +++ b/src/ops/reduce/operator.cc @@ -0,0 +1,89 @@ +#include "../utils.h" +#include "operators.h" +#include "reduce.h" + +#ifdef ENABLE_CPU +#include "cpu/reduce_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/common_cuda.h" +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/reduce.cuh" +#endif +#ifdef ENABLE_CAMBRICON_MLU +// TODO +#endif + +__C infiniopStatus_t infiniopCreateReduceDescriptor( + infiniopHandle_t handle, + infiniopReduceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int *axis, + const int num_axis, + int const keepdims, + int reduce_type) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateReduceDescriptor(handle, (ReduceCpuDescriptor_t *)desc_ptr, dst, src, axis, num_axis, keepdims, reduce_type); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateReduceDescriptor((CudaHandle_t) handle, (ReduceCudaDescriptor_t *) desc_ptr, dst, src, axis, num_axis, keepdims, reduce_type); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *dst, + void const *src, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuReduce((ReduceCpuDescriptor_t)desc, workspace, workspace_size, dst, src, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaReduce((ReduceCudaDescriptor_t)desc, workspace, workspace_size, dst, src, stream); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGetReduceWorkspaceSize(infiniopReduceDescriptor_t desc, uint64_t *size) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGetReduceWorkspaceSize((ReduceCpuDescriptor_t) desc, size); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetReduceWorkspaceSize((ReduceCudaDescriptor_t) desc, size); + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyReduceDescriptor((ReduceCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyReduceDescriptor((ReduceCudaDescriptor_t)desc); + } + +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/reduce/reduce.h b/src/ops/reduce/reduce.h new file mode 100644 index 00000000..e66b57a9 --- /dev/null +++ b/src/ops/reduce/reduce.h @@ -0,0 +1,31 @@ +#ifndef REDUCE_H +#define REDUCE_H + +#include "export.h" +#include "operators.h" + +typedef struct ReduceDescriptor { + Device device; +} ReduceDescriptor; +typedef ReduceDescriptor *infiniopReduceDescriptor_t; + +__C infiniopStatus_t infiniopCreateReduceDescriptor(infiniopHandle_t handle, + infiniopReduceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t dst, + infiniopTensorDescriptor_t src, + int *axis, + const int num_axis, + int const keepdims, + int reduce_type); + +__C infiniopStatus_t infiniopGetReduceWorkspaceSize(infiniopReduceDescriptor_t desc, uint64_t *size); + +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, + void *workspace, + uint64_t workspace_size, + void *dst, + void const*src, + void *stream); 
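+// workspace must be at least the size reported by infiniopGetReduceWorkspaceSize for this descriptor.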
diff --git a/src/ops/reduce/reduce.h b/src/ops/reduce/reduce.h
new file mode 100644
index 00000000..e66b57a9
--- /dev/null
+++ b/src/ops/reduce/reduce.h
@@ -0,0 +1,31 @@
+#ifndef REDUCE_H
+#define REDUCE_H
+
+#include "export.h"
+#include "operators.h"
+
+typedef struct ReduceDescriptor {
+    Device device;
+} ReduceDescriptor;
+typedef ReduceDescriptor *infiniopReduceDescriptor_t;
+
+__C infiniopStatus_t infiniopCreateReduceDescriptor(infiniopHandle_t handle,
+                                                    infiniopReduceDescriptor_t *desc_ptr,
+                                                    infiniopTensorDescriptor_t dst,
+                                                    infiniopTensorDescriptor_t src,
+                                                    int *axis,
+                                                    const int num_axis,
+                                                    int const keepdims,
+                                                    int reduce_type);
+
+__C infiniopStatus_t infiniopGetReduceWorkspaceSize(infiniopReduceDescriptor_t desc, uint64_t *size);
+
+__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc,
+                                    void *workspace,
+                                    uint64_t workspace_size,
+                                    void *dst,
+                                    void const *src,
+                                    void *stream);
+
+__C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc);
+#endif
diff --git a/src/ops/reduceMax/operator.cc b/src/ops/reduceMax/operator.cc
new file mode 100644
index 00000000..53503eee
--- /dev/null
+++ b/src/ops/reduceMax/operator.cc
@@ -0,0 +1,50 @@
+#include "../reduce/reduce.h"
+#include "../utils.h"
+#include "ops/reduceMax/reduceMax.h"
+
+struct _ReduceMaxDescriptor {
+    Device device;
+    infiniopReduceDescriptor_t reduce_desc;
+    uint64_t workspace_size;
+};
+
+typedef struct _ReduceMaxDescriptor *_ReduceMaxDescriptor_t;
+__C __export infiniopStatus_t infiniopCreateReduceMaxDescriptor(infiniopHandle_t handle,
+                                                                infiniopReduceMaxDescriptor_t *desc_ptr,
+                                                                infiniopTensorDescriptor_t dst,
+                                                                infiniopTensorDescriptor_t src,
+                                                                int *axis,
+                                                                const int num_axis,
+                                                                int const keepdims) {
+    infiniopReduceDescriptor_t reduce_desc;
+    CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, dst, src, axis, num_axis, keepdims, 0), STATUS_SUCCESS);
+    uint64_t workspace_size = 0;
+    CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS);
+
+    *(_ReduceMaxDescriptor_t *) desc_ptr = new _ReduceMaxDescriptor{
+        handle->device,
+        reduce_desc,
+        workspace_size,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, uint64_t *size) {
+    *size = ((_ReduceMaxDescriptor_t) desc)->workspace_size;
+    return STATUS_SUCCESS;
+}
+__C __export infiniopStatus_t infiniopReduceMax(infiniopReduceMaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream) {
+    auto _desc = (_ReduceMaxDescriptor_t) desc;
+    if (workspace_size < _desc->workspace_size) {
+        return STATUS_MEMORY_NOT_ALLOCATED;
+    }
+    CHECK_STATUS(infiniopReduce(_desc->reduce_desc, workspace, workspace_size, dst, src, stream), STATUS_SUCCESS);
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) {
+    CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMaxDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS);
+    delete desc;
+    return STATUS_SUCCESS;
+}
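A minimal call sequence for the ReduceMax wrapper above, shown only as an illustration; `handle`, `dst_desc`, `src_desc` and the data buffers are assumed to already exist and are not defined by this patch:

// Illustrative sketch (CPU case); names other than the infiniop* API are assumptions.
infiniopReduceMaxDescriptor_t desc;
int axes[] = {1};
CHECK_STATUS(infiniopCreateReduceMaxDescriptor(handle, &desc, dst_desc, src_desc, axes, 1, /*keepdims=*/1), STATUS_SUCCESS);
uint64_t workspace_size = 0;
CHECK_STATUS(infiniopGetReduceMaxWorkspaceSize(desc, &workspace_size), STATUS_SUCCESS);
void *workspace = malloc(workspace_size);   // use a device allocation when the handle is a GPU handle
CHECK_STATUS(infiniopReduceMax(desc, workspace, workspace_size, dst_data, src_data, /*stream=*/nullptr), STATUS_SUCCESS);
free(workspace);
CHECK_STATUS(infiniopDestroyReduceMaxDescriptor(desc), STATUS_SUCCESS);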
diff --git a/src/ops/reduceMean/operator.cc b/src/ops/reduceMean/operator.cc
new file mode 100644
index 00000000..245ff710
--- /dev/null
+++ b/src/ops/reduceMean/operator.cc
@@ -0,0 +1,51 @@
+#include "../reduce/reduce.h"
+#include "../utils.h"
+#include "ops/reduceMean/reduceMean.h"
+
+struct _ReduceMeanDescriptor {
+    Device device;
+    infiniopReduceDescriptor_t reduce_desc;
+    uint64_t workspace_size;
+};
+
+typedef struct _ReduceMeanDescriptor *_ReduceMeanDescriptor_t;
+__C __export infiniopStatus_t infiniopCreateReduceMeanDescriptor(infiniopHandle_t handle,
+                                                                 infiniopReduceMeanDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t dst,
+                                                                 infiniopTensorDescriptor_t src,
+                                                                 int *axis,
+                                                                 const int num_axis,
+                                                                 int const keepdims) {
+    infiniopReduceDescriptor_t reduce_desc;
+    CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, dst, src, axis, num_axis, keepdims, 2), STATUS_SUCCESS);
+    uint64_t workspace_size = 0;
+    CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS);
+
+    *(_ReduceMeanDescriptor_t *) desc_ptr = new _ReduceMeanDescriptor{
+        handle->device,
+        reduce_desc,
+        workspace_size,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, uint64_t *size) {
+    *size = ((_ReduceMeanDescriptor_t) desc)->workspace_size;
+    return STATUS_SUCCESS;
+}
+__C __export infiniopStatus_t infiniopReduceMean(infiniopReduceMeanDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream) {
+    auto _desc = (_ReduceMeanDescriptor_t) desc;
+    if (workspace_size < _desc->workspace_size) {
+        return STATUS_MEMORY_NOT_ALLOCATED;
+    }
+    CHECK_STATUS(infiniopReduce(_desc->reduce_desc, workspace, workspace_size, dst, src, stream), STATUS_SUCCESS);
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) {
+    CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMeanDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS);
+    delete desc;
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/reduceMin/operator.cc b/src/ops/reduceMin/operator.cc
new file mode 100644
index 00000000..7d029f81
--- /dev/null
+++ b/src/ops/reduceMin/operator.cc
@@ -0,0 +1,51 @@
+#include "../reduce/reduce.h"
+#include "../utils.h"
+#include "ops/reduceMin/reduceMin.h"
+
+struct _ReduceMinDescriptor {
+    Device device;
+    infiniopReduceDescriptor_t reduce_desc;
+    uint64_t workspace_size;
+};
+
+typedef struct _ReduceMinDescriptor *_ReduceMinDescriptor_t;
+__C __export infiniopStatus_t infiniopCreateReduceMinDescriptor(infiniopHandle_t handle,
+                                                                infiniopReduceMinDescriptor_t *desc_ptr,
+                                                                infiniopTensorDescriptor_t dst,
+                                                                infiniopTensorDescriptor_t src,
+                                                                int *axis,
+                                                                const int num_axis,
+                                                                int const keepdims) {
+    infiniopReduceDescriptor_t reduce_desc;
+    CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, dst, src, axis, num_axis, keepdims, 1), STATUS_SUCCESS);
+    uint64_t workspace_size = 0;
+    CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS);
+
+    *(_ReduceMinDescriptor_t *) desc_ptr = new _ReduceMinDescriptor{
+        handle->device,
+        reduce_desc,
+        workspace_size,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopGetReduceMinWorkspaceSize(infiniopReduceMinDescriptor_t desc, uint64_t *size) {
+    *size = ((_ReduceMinDescriptor_t) desc)->workspace_size;
+    return STATUS_SUCCESS;
+}
+__C __export infiniopStatus_t infiniopReduceMin(infiniopReduceMinDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream) {
+    auto _desc = (_ReduceMinDescriptor_t) desc;
+    if (workspace_size < _desc->workspace_size) {
+        return STATUS_MEMORY_NOT_ALLOCATED;
+    }
+    CHECK_STATUS(infiniopReduce(_desc->reduce_desc, workspace, workspace_size, dst, src, stream), STATUS_SUCCESS);
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopDestroyReduceMinDescriptor(infiniopReduceMinDescriptor_t desc) {
+    CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMinDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS);
+    delete desc;
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/reduceSum/operator.cc b/src/ops/reduceSum/operator.cc
new file mode 100644
index 00000000..ff4012fa
--- /dev/null
+++ b/src/ops/reduceSum/operator.cc
@@ -0,0 +1,51 @@
+#include "../reduce/reduce.h"
+#include "../utils.h"
+#include "ops/reduceSum/reduceSum.h"
+
+struct _ReduceSumDescriptor {
+    Device device;
+    infiniopReduceDescriptor_t reduce_desc;
+    uint64_t workspace_size;
+};
+
+typedef struct _ReduceSumDescriptor *_ReduceSumDescriptor_t;
+__C __export infiniopStatus_t infiniopCreateReduceSumDescriptor(infiniopHandle_t handle,
+                                                                infiniopReduceSumDescriptor_t *desc_ptr,
+                                                                infiniopTensorDescriptor_t dst,
+                                                                infiniopTensorDescriptor_t src,
+                                                                int *axis,
+                                                                const int num_axis,
+                                                                int const keepdims) {
+    infiniopReduceDescriptor_t reduce_desc;
+    CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, dst, src, axis, num_axis, keepdims, 3), STATUS_SUCCESS);
+    uint64_t workspace_size = 0;
+    CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS);
+
+    *(_ReduceSumDescriptor_t *) desc_ptr = new _ReduceSumDescriptor{
+        handle->device,
+        reduce_desc,
+        workspace_size,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopGetReduceSumWorkspaceSize(infiniopReduceSumDescriptor_t desc, uint64_t *size) {
+    *size = ((_ReduceSumDescriptor_t) desc)->workspace_size;
+    return STATUS_SUCCESS;
+}
+__C __export infiniopStatus_t infiniopReduceSum(infiniopReduceSumDescriptor_t desc, void *workspace, uint64_t workspace_size, void *dst, void const *src, void *stream) {
+    auto _desc = (_ReduceSumDescriptor_t) desc;
+    if (workspace_size < _desc->workspace_size) {
+        return STATUS_MEMORY_NOT_ALLOCATED;
+    }
+    CHECK_STATUS(infiniopReduce(_desc->reduce_desc, workspace, workspace_size, dst, src, stream), STATUS_SUCCESS);
+    return STATUS_SUCCESS;
+}
+
+__C __export infiniopStatus_t infiniopDestroyReduceSumDescriptor(infiniopReduceSumDescriptor_t desc) {
+    CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceSumDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS);
+    delete desc;
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/where/cpu/where_cpu.cc b/src/ops/where/cpu/where_cpu.cc
new file mode 100644
index 00000000..a0fe432d
--- /dev/null
+++ b/src/ops/where/cpu/where_cpu.cc
@@ -0,0 +1,76 @@
+#include "where_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include "../../utils.h"
+#include <numeric>
+
+infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t handle,
+                                          WhereCpuDescriptor_t *desc_ptr,
+                                          infiniopTensorDescriptor_t dst,
+                                          infiniopTensorDescriptor_t x,
+                                          infiniopTensorDescriptor_t y,
+                                          infiniopTensorDescriptor_t condition) {
+    uint64_t ndim = condition->ndim;
+    if (ndim != x->ndim || ndim != y->ndim || ndim != dst->ndim) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    for (size_t i = 0; i < ndim; ++i) {
+        if (condition->shape[i] != x->shape[i] || condition->shape[i] != y->shape[i] || condition->shape[i] != dst->shape[i]) {
+            return STATUS_BAD_TENSOR_SHAPE;
+        }
+    }
+    if (!is_contiguous(condition) || !is_contiguous(x) || !is_contiguous(y) || !is_contiguous(dst)) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+    if (x->dt != y->dt || x->dt != dst->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (x->dt != F16 && x->dt != F32 && x->dt != U16) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (condition->dt != U8) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    DT dtype = x->dt;
+    uint64_t data_size = std::accumulate(condition->shape, condition->shape + ndim, 1ULL, std::multiplies<uint64_t>());
+    *desc_ptr = new WhereCpuDescriptor{
+        DevCpu,
+        dtype,
+        data_size,
+    };
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc) {
+    delete desc;
+    return STATUS_SUCCESS;
+}
+
+template<typename Tdata>
+void where_cpu(WhereCpuDescriptor_t desc, void *dst, void const *x, void const *y, void const *condition) {
+    auto dstPtr = reinterpret_cast<Tdata *>(dst);
+    auto x_Ptr = reinterpret_cast<Tdata const *>(x);
+    auto y_Ptr = reinterpret_cast<Tdata const *>(y);
+    auto c_Ptr = reinterpret_cast<uint8_t const *>(condition);
+    for (uint64_t i = 0; i < desc->data_size; i++) {
+        // Non-zero condition selects the element from y, zero selects it from x.
+        dstPtr[i] = (c_Ptr[i]) ? y_Ptr[i] : x_Ptr[i];
+    }
+}
+
+infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc,
+                          void *dst,
+                          void const *x,
+                          void const *y,
+                          void const *condition,
+                          void *stream) {
+    if (desc->dtype == F32) {
+        where_cpu<float>(desc, dst, x, y, condition);
+        return STATUS_SUCCESS;
+    }
+    if (desc->dtype == F16 || desc->dtype == U16) {
+        // F16 and U16 are both 2-byte element types; where only copies elements,
+        // so a bitwise copy through uint16_t covers both.
+        where_cpu<uint16_t>(desc, dst, x, y, condition);
+        return STATUS_SUCCESS;
+    }
+    return STATUS_BAD_TENSOR_DTYPE;
+}
+
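For reference, the CPU kernel above picks the element from y wherever condition is non-zero and from x wherever it is zero. A standalone host-side illustration of that rule (not part of the patch):

#include <cstdint>
#include <cstdio>

int main() {
    float x[4] = {1.f, 2.f, 3.f, 4.f};
    float y[4] = {10.f, 20.f, 30.f, 40.f};
    uint8_t condition[4] = {1, 0, 1, 0};
    float dst[4];
    for (int i = 0; i < 4; ++i) {
        dst[i] = condition[i] ? y[i] : x[i];   // same element-wise rule as where_cpu above
    }
    // dst is now {10, 2, 30, 4}
    for (int i = 0; i < 4; ++i) printf("%g ", dst[i]);
    return 0;
}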
diff --git a/src/ops/where/cpu/where_cpu.h b/src/ops/where/cpu/where_cpu.h
new file mode 100644
index 00000000..0d185b4a
--- /dev/null
+++ b/src/ops/where/cpu/where_cpu.h
@@ -0,0 +1,28 @@
+#ifndef __CPU_WHERE_H__
+#define __CPU_WHERE_H__
+#include "operators.h"
+
+struct WhereCpuDescriptor {
+    Device device;
+    DT dtype;
+    uint64_t data_size;
+};
+
+typedef WhereCpuDescriptor *WhereCpuDescriptor_t;
+
+infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t,
+                                          WhereCpuDescriptor_t *,
+                                          infiniopTensorDescriptor_t dst,
+                                          infiniopTensorDescriptor_t x,
+                                          infiniopTensorDescriptor_t y,
+                                          infiniopTensorDescriptor_t condition);
+
+infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc,
+                          void *dst,
+                          void const *x,
+                          void const *y,
+                          void const *condition,
+                          void *stream);
+
+infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc);
+#endif
\ No newline at end of file
diff --git a/src/ops/where/cuda/where.cu b/src/ops/where/cuda/where.cu
new file mode 100644
index 00000000..382af1fa
--- /dev/null
+++ b/src/ops/where/cuda/where.cu
@@ -0,0 +1,58 @@
+#include "../../../devices/cuda/common_cuda.h"
+#include "../../utils.h"
+#include "where.cuh"
+
+template<typename Tdata>
+__global__ void where(
+    Tdata *dst,
+    const Tdata *x,
+    const Tdata *y,
+    const uint8_t *condition,
+    uint64_t data_size,
+    uint64_t offset) {
+    uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset;
+    // Guard against the rounded-up launch overshooting the tensor.
+    // Same convention as the CPU path: non-zero condition selects y, zero selects x.
+    if (idx < data_size) {
+        dst[idx] = condition[idx] ? y[idx] : x[idx];
+    }
+}
+
+template<typename Tdata>
+infiniopStatus_t where_nv_gpu(WhereCudaDescriptor_t desc, void *dst, void const *x, void const *y, void const *condition, void *stream) {
+    if (desc->data_size == 0) {
+        return STATUS_SUCCESS;
+    }
+    dim3 blockDims = dim3(std::min(static_cast<uint64_t>(256), desc->data_size));
+    dim3 gridDims = dim3(std::min(ROUND_UP_DIV(desc->data_size, blockDims.x), desc->max_grid_size));
+    uint64_t step = gridDims.x * blockDims.x;
+
+    const auto dst_ = reinterpret_cast<Tdata *>(dst);
+    const auto x_ = reinterpret_cast<const Tdata *>(x);
+    const auto y_ = reinterpret_cast<const Tdata *>(y);
+    const auto condition_ = reinterpret_cast<const uint8_t *>(condition);
+
+    cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    for (uint64_t i = 0; i < desc->data_size; i += step) {
+        where<Tdata><<<gridDims, blockDims, 0, cuda_stream>>>(dst_, x_, y_, condition_, desc->data_size, i);
+    }
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cudaWhere(WhereCudaDescriptor_t desc,
+                           void *dst,
+                           void const *x,
+                           void const *y,
+                           void const *condition,
+                           void *stream) {
+    checkCudaError(cudaSetDevice(desc->device_id));
+    if (desc->dtype == F16) {
+        return where_nv_gpu<half>(desc, dst, x, y, condition, stream);
+    }
+    if (desc->dtype == F32) {
+        return where_nv_gpu<float>(desc, dst, x, y, condition, stream);
+    }
+    return STATUS_BAD_TENSOR_DTYPE;
+}
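The host loop above issues one kernel launch per `step` elements. A commonly used alternative is a single launch with a grid-stride loop inside the kernel; a minimal sketch, assuming the same descriptor fields and launch dimensions (not part of this patch):

template<typename Tdata>
__global__ void where_grid_stride(Tdata *dst, const Tdata *x, const Tdata *y,
                                  const uint8_t *condition, uint64_t data_size) {
    uint64_t stride = static_cast<uint64_t>(gridDim.x) * blockDim.x;
    for (uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < data_size; idx += stride) {
        dst[idx] = condition[idx] ? y[idx] : x[idx];
    }
}

// One launch then covers the whole tensor regardless of data_size:
// where_grid_stride<Tdata><<<gridDims, blockDims, 0, cuda_stream>>>(dst_, x_, y_, condition_, desc->data_size);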
diff --git a/src/ops/where/cuda/where.cuh b/src/ops/where/cuda/where.cuh
new file mode 100644
index 00000000..9a96fe54
--- /dev/null
+++ b/src/ops/where/cuda/where.cuh
@@ -0,0 +1,37 @@
+#ifndef __CUDA_WHERE_H__
+#define __CUDA_WHERE_H__
+
+#include "../../../devices/cuda/common_cuda.h"
+#include "../../../devices/cuda/cuda_handle.h"
+#include "operators.h"
+#include <cuda_fp16.h>
+#include <numeric>
+
+struct WhereCudaDescriptor {
+    Device device;
+    DT dtype;
+    int device_id;
+    uint64_t ndim;
+    uint64_t data_size;
+    uint64_t max_grid_size;
+};
+
+typedef struct WhereCudaDescriptor *WhereCudaDescriptor_t;
+
+infiniopStatus_t cudaCreateWhereDescriptor(CudaHandle_t,
+                                           WhereCudaDescriptor_t *,
+                                           infiniopTensorDescriptor_t dst,
+                                           infiniopTensorDescriptor_t x,
+                                           infiniopTensorDescriptor_t y,
+                                           infiniopTensorDescriptor_t condition);
+
+infiniopStatus_t cudaWhere(WhereCudaDescriptor_t desc,
+                           void *dst,
+                           void const *x,
+                           void const *y,
+                           void const *condition,
+                           void *stream);
+
+infiniopStatus_t cudaDestroyWhereDescriptor(WhereCudaDescriptor_t desc);
+
+#endif
diff --git a/src/ops/where/cuda/where_cuda.cc b/src/ops/where/cuda/where_cuda.cc
new file mode 100644
index 00000000..1ab20ed3
--- /dev/null
+++ b/src/ops/where/cuda/where_cuda.cc
@@ -0,0 +1,52 @@
+#include "where.cuh"
+#include "../../../devices/cuda/common_cuda.h"
+#include "../../utils.h"
+
+infiniopStatus_t cudaCreateWhereDescriptor(CudaHandle_t handle,
+                                           WhereCudaDescriptor_t *desc_ptr,
+                                           infiniopTensorDescriptor_t dst,
+                                           infiniopTensorDescriptor_t x,
+                                           infiniopTensorDescriptor_t y,
+                                           infiniopTensorDescriptor_t condition) {
+    uint64_t ndim = condition->ndim;
+    if (ndim != x->ndim || ndim != y->ndim || ndim != dst->ndim) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    for (size_t i = 0; i < ndim; ++i) {
+        if (condition->shape[i] != x->shape[i] || condition->shape[i] != y->shape[i] || condition->shape[i] != dst->shape[i]) {
+            return STATUS_BAD_TENSOR_SHAPE;
+        }
+    }
+    if (!is_contiguous(condition) || !is_contiguous(x) || !is_contiguous(y) || !is_contiguous(dst)) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+    if (x->dt != y->dt || x->dt != dst->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (x->dt != F16 && x->dt != F32 && x->dt != U16) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (condition->dt != U8) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    DT dtype = x->dt;
+    uint64_t data_size = std::accumulate(condition->shape, condition->shape + condition->ndim, 1ULL, std::multiplies<uint64_t>());
+
+    *desc_ptr = new WhereCudaDescriptor{
+        DevNvGpu,
+        dtype,
+        handle->device_id,
+        ndim,
+        data_size,
+        static_cast<uint64_t>(handle->prop.maxGridSize[0]),
+    };
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cudaDestroyWhereDescriptor(WhereCudaDescriptor_t desc) {
+    delete desc;
+    return STATUS_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/ops/where/operator.cc b/src/ops/where/operator.cc
new file mode 100644
index 00000000..9187c643
--- /dev/null
+++ b/src/ops/where/operator.cc
@@ -0,0 +1,68 @@
+#include "../utils.h"
+#include "operators.h"
+
+#include "ops/where/where.h"
+
+#ifdef ENABLE_CPU
+#include "cpu/where_cpu.h"
+#endif
+#ifdef ENABLE_NV_GPU
+#include "../../devices/cuda/cuda_handle.h"
+#include "cuda/where.cuh"
+#endif
+
+__C infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle,
+                                                   infiniopWhereDescriptor_t *desc_ptr,
+                                                   infiniopTensorDescriptor_t dst,
+                                                   infiniopTensorDescriptor_t x,
+                                                   infiniopTensorDescriptor_t y,
+                                                   infiniopTensorDescriptor_t condition) {
+    switch (handle->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuCreateWhereDescriptor(handle, (WhereCpuDescriptor_t *) desc_ptr, dst, x, y, condition);
+#endif
+#ifdef ENABLE_NV_GPU
+        case DevNvGpu:
+            return cudaCreateWhereDescriptor((CudaHandle_t) handle, (WhereCudaDescriptor_t *) desc_ptr, dst, x, y, condition);
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
+
+__C infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc,
+                                   void *dst,
+                                   void const *x,
+                                   void const *y,
+                                   void const *condition,
+                                   void *stream) {
+    switch (desc->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuWhere((WhereCpuDescriptor_t) desc, dst, x, y, condition, stream);
+#endif
+#ifdef ENABLE_NV_GPU
+        case DevNvGpu:
+            return cudaWhere((WhereCudaDescriptor_t) desc, dst, x, y, condition, stream);
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
+
+__C infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) {
+    switch (desc->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuDestroyWhereDescriptor((WhereCpuDescriptor_t) desc);
+#endif
+#ifdef ENABLE_NV_GPU
+        case DevNvGpu:
+            return cudaDestroyWhereDescriptor((WhereCudaDescriptor_t) desc);
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
\ No newline at end of file
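For completeness, a minimal call sequence for the new Where operator, shown only as an illustration; `handle`, the four tensor descriptors and the data buffers are assumed to already exist and are not defined by this patch:

// Illustrative sketch; names other than the infiniop* API are assumptions.
infiniopWhereDescriptor_t where_desc;
CHECK_STATUS(infiniopCreateWhereDescriptor(handle, &where_desc, dst_desc, x_desc, y_desc, cond_desc), STATUS_SUCCESS);
// condition is a uint8_t tensor; non-zero entries select from y, zero entries from x.
CHECK_STATUS(infiniopWhere(where_desc, dst_data, x_data, y_data, cond_data, /*stream=*/nullptr), STATUS_SUCCESS);
CHECK_STATUS(infiniopDestroyWhereDescriptor(where_desc), STATUS_SUCCESS);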