InfiniTensor
diff --git a/‎.github/workflows/main.yaml
Lines changed: 7 additions & 6 deletions b/‎.github/workflows/main.yaml
Lines changed: 7 additions & 6 deletions
diff --git a/‎include/infini_operators.h
Lines changed: 3 additions & 2 deletions b/‎include/infini_operators.h
Lines changed: 3 additions & 2 deletions
diff --git a/‎include/ops/conv/conv.h
Lines changed: 2 additions & 1 deletion b/‎include/ops/conv/conv.h
Lines changed: 2 additions & 1 deletion
diff --git a/‎include/ops/conv_act/conv_act.h
Lines changed: 52 additions & 0 deletions b/‎include/ops/conv_act/conv_act.h
Lines changed: 52 additions & 0 deletions
diff --git a/‎include/ops/conv_bias_act/conv_bias_act.h
Lines changed: 0 additions & 33 deletions b/‎include/ops/conv_bias_act/conv_bias_act.h
Lines changed: 0 additions & 33 deletions
diff --git a/‎operatorspy/tests/conv.py
Lines changed: 54 additions & 34 deletions b/‎operatorspy/tests/conv.py
Lines changed: 54 additions & 34 deletions
@@ -33,15 +33,15 @@ jobs:
     - name: configure xmake
       run: xmake f --cpu=true -cv
 
-    - name: Build with XMake
-      run: xmake
-
-    - name: Find and Set INFINI_ROOT
-      id: set_infini_root
+    - name: Set INFINI_ROOT
       run: |
-        export INFINI_ROOT=$GITHUB_WORKSPACE
+        export INFINI_ROOT=$GITHUB_WORKSPACE/.infini
+        mkdir -p $INFINI_ROOT
         echo "INFINI_ROOT=$INFINI_ROOT" >> $GITHUB_ENV
 
+    - name: Build with XMake
+      run: xmake build && xmake install
+
     - name: Run Python Tests
       run: |
         GREEN='\033[0;32m'
@@ -88,3 +88,4 @@ jobs:
         fi
       env:
         INFINI_ROOT: ${{ env.INFINI_ROOT }}
+        
@@ -3,10 +3,11 @@
 #include "ops/attention/attention.h"
 #include "ops/avg_pool/avg_pool.h"
 #include "ops/causal_softmax/causal_softmax.h"
-#include "ops/global_avg_pool/global_avg_pool.h"
+#include "ops/conv/conv.h"
+#include "ops/conv_act/conv_act.h"
 #include "ops/expand/expand.h"
 #include "ops/gemm/gemm.h"
-#include "ops/conv/conv.h"
+#include "ops/global_avg_pool/global_avg_pool.h"
 #include "ops/matmul/matmul.h"
 #include "ops/max_pool/max_pool.h"
 #include "ops/mlp/mlp.h"
 
@@ -15,14 +15,15 @@ __C __export infiniopStatus_t infiniopCreateConvDescriptor(infiniopHandle_t hand
                                                            infiniopTensorDescriptor_t y,
                                                            infiniopTensorDescriptor_t x,
                                                            infiniopTensorDescriptor_t w,
+                                                           infiniopTensorDescriptor_t b,
                                                            uint64_t const *pads,
                                                            int64_t const *strides,
                                                            uint64_t const *dilations,
                                                            uint64_t n);
 
 __C __export infiniopStatus_t infiniopGetConvWorkspaceSize(infiniopConvDescriptor_t desc, uint64_t *size);
 
-__C __export infiniopStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void const *w, void *stream);
+__C __export infiniopStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void const *w, void const *b, void *stream);// b: const input added by this change; the Python conv test passes None for it when add_bias is false
 
 __C __export infiniopStatus_t infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc);
 
 
@@ -0,0 +1,52 @@
+#ifndef CONV_ACT_H
+#define CONV_ACT_H
+
+#include "../../export.h"
+#include "../../operators.h"
+#include <cstddef>
+
+/**
+ * @brief Specifies the type of activation function; the struct only scopes the Mode enum and the derived count.
+ */
+struct ActivationMode {
+
+    enum Mode {// unscoped enum without initializers: enumerators take consecutive values starting at 0
+        // activation functions
+        IDENTITY,
+        RELU,
+        SIGMOID,
+
+        // Count: sentinel, not an activation; as the last enumerator its value equals the number of modes above
+        // NOTE: new activation functions should be added before "Count"
+        Count,
+    };
+    constexpr static size_t numOfActivationFunctions = Mode::Count;// number of activation modes, derived from the sentinel
+};
+
+typedef struct ConvActDescriptor {// descriptor type for the ConvAct operator; only the device field is declared in this header
+    Device device;// `Device` is declared in a project header not shown here; presumably identifies the backend - TODO confirm
+} ConvActDescriptor;
+
+typedef ConvActDescriptor *infiniopConvActDescriptor_t;// pointer alias used as the descriptor handle by the infiniop*ConvAct* functions
+
+__C __export infiniopStatus_t infiniopCreateConvActDescriptor(infiniopHandle_t handle,
+                                                              infiniopConvActDescriptor_t *desc_ptr,// presumably an out-parameter receiving the new descriptor - TODO confirm
+                                                              infiniopTensorDescriptor_t y,// by the naming in the conv test: y output - TODO confirm for ConvAct
+                                                              infiniopTensorDescriptor_t x,// presumably the input tensor descriptor - TODO confirm
+                                                              infiniopTensorDescriptor_t w,// presumably the weight tensor descriptor - TODO confirm
+                                                              infiniopTensorDescriptor_t b,// presumably the bias tensor descriptor; whether null is accepted is not visible here
+                                                              uint64_t const *pads,// unsigned array; presumably one entry per spatial dimension - TODO confirm
+                                                              int64_t const *strides,// note: signed, unlike pads and dilations
+                                                              uint64_t const *dilations,
+                                                              uint64_t n,// presumably the length of pads/strides/dilations - TODO confirm
+                                                              ActivationMode::Mode activation_mode,// one of the ActivationMode::Mode enumerators
+                                                              double clip_coef = 0.0);// defaults to 0.0 (default arguments are C++-only); meaning not visible here - TODO confirm
+
+__C __export infiniopStatus_t infiniopGetConvActWorkspaceSize(infiniopConvActDescriptor_t desc, uint64_t *size);// presumably stores the required workspace size in *size - TODO confirm
+
+__C __export infiniopStatus_t infiniopConvAct(infiniopConvActDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void const *w, void const *b, void *stream);// y is the only non-const data pointer; x, w, b are const inputs; stream is an opaque pointer
+
+__C __export infiniopStatus_t infiniopDestroyConvActDescriptor(infiniopConvActDescriptor_t desc);// presumably releases a descriptor made by infiniopCreateConvActDescriptor - TODO confirm
+
+
+#endif
@@ -38,7 +38,7 @@ class ConvDescriptor(Structure):
 infiniopConvDescriptor_t = POINTER(ConvDescriptor)
 
 
-def conv(x, w, stride, padding, dilation):
+def conv(x, w, b, stride, padding, dilation):
     ndim = len(x.shape) - 2
     conv_func_map = {
         1: F.conv1d,
@@ -54,10 +54,10 @@ def conv(x, w, stride, padding, dilation):
     conv_func = conv_func_map[ndim]
 
     if PROFILE:
-        ans = conv_func(x, w, stride=stride, padding=padding, dilation=dilation)
+        ans = conv_func(x, w, b, stride=stride, padding=padding, dilation=dilation)
         torch.cuda.synchronize()
         return ans
-    return conv_func(x, w, stride=stride, padding=padding, dilation=dilation)
+    return conv_func(x, w, b, stride=stride, padding=padding, dilation=dilation)
 
 
 # infer the shape of the output given the inputs for a N-ary convolution
@@ -98,31 +98,34 @@ def test(
     pads,
     strides,
     dilations,
-    tensor_stride=None,
+    add_bias,
     tensor_dtype=torch.float16,
 ):
     assert len(pads) == len(strides) == len(dilations)
     print(
-        f"Testing Conv on {torch_device} with x_shape: {x_shape}, w_shape: {w_shape}, b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, x_stride: {tensor_stride} dtype:{tensor_dtype}"
+        f"Testing Conv on {torch_device} with x_shape: {x_shape}, w_shape: {w_shape}, add_bias: {add_bias}, "
+        f"b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, dtype:{tensor_dtype}"
     )
     x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
     w = torch.rand(w_shape, dtype=tensor_dtype).to(torch_device)
+    b = torch.round((torch.rand(w_shape[0], dtype=tensor_dtype).to(torch_device) * 2 - 1) * 1000) / 1000 if add_bias else None
     y = torch.zeros(
         inferShape(x.shape, w.shape, pads, strides, dilations), dtype=tensor_dtype
     ).to(torch_device)
 
     for i in range(NUM_PRERUN if PROFILE else 1):
-        ans = conv(x, w, strides, pads, dilations)
+        ans = conv(x, w, b, strides, pads, dilations)
     if PROFILE:
         start_time = time.time()
         for i in range(NUM_ITERATIONS):
-            _ = conv(x, w, strides, pads, dilations)
+            _ = conv(x, w, b, strides, pads, dilations)
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"pytorch time: {elapsed :6f}")
 
 
     x_tensor = to_tensor(x, lib)
     w_tensor = to_tensor(w, lib)
+    b_tensor = to_tensor(b, lib) if b is not None else None
     y_tensor = to_tensor(y, lib)
     descriptor = infiniopConvDescriptor_t()
 
@@ -133,6 +136,7 @@ def test(
             y_tensor.descriptor,
             x_tensor.descriptor,
             w_tensor.descriptor,
+            b_tensor.descriptor if b_tensor else None,
             tuple_to_void_p(pads),
             tuple_to_void_p(strides),
             tuple_to_void_p(dilations),
@@ -147,27 +151,33 @@ def test(
     workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
 
     for i in range(NUM_PRERUN if PROFILE else 1):
-        lib.infiniopConv(
-            descriptor,
-            workspace_ptr,
-            workspaceSize,
-            y_tensor.data,
-            x_tensor.data,
-            w_tensor.data,
-            None,
-        )
-    if PROFILE:
-        start_time = time.time()
-        for i in range(NUM_ITERATIONS):
+        check_error(
             lib.infiniopConv(
                 descriptor,
                 workspace_ptr,
                 workspaceSize,
                 y_tensor.data,
                 x_tensor.data,
                 w_tensor.data,
+                b_tensor.data if b_tensor else None,
                 None,
             )
+        )
+    if PROFILE:
+        start_time = time.time()
+        for i in range(NUM_ITERATIONS):
+            check_error(
+                lib.infiniopConv(
+                    descriptor,
+                    workspace_ptr,
+                    workspaceSize,
+                    y_tensor.data,
+                    x_tensor.data,
+                    w_tensor.data,
+                    b_tensor.data if b_tensor else None,
+                    None,
+                )
+            )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
 
@@ -181,18 +191,18 @@ def test(
 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
-        test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
-        test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
+    for x_shape, w_shape, pads, strides, dilations, add_bias in test_cases:  # each case is a 6-tuple; the last field is now the add_bias flag
+        test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, add_bias, tensor_dtype=torch.float16)  # every case runs once with float16 ...
+        test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, add_bias, tensor_dtype=torch.float32)  # ... and once with float32
     destroy_handle(lib, handle)
 
 
 def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
-    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
-        test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
-        test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
+    for x_shape, w_shape, pads, strides, dilations, add_bias in test_cases:  # each case is a 6-tuple; the last field is now the add_bias flag
+        test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, add_bias, tensor_dtype=torch.float16)  # every case runs once with float16 ...
+        test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, add_bias, tensor_dtype=torch.float32)  # ... and once with float32
     destroy_handle(lib, handle)
 
 
@@ -201,54 +211,62 @@ def test_bang(lib, test_cases):
 
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
-    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
-        test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
-        test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
+    for x_shape, w_shape, pads, strides, dilations, add_bias in test_cases:
+        test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, add_bias, tensor_dtype=torch.float16)
+        test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, add_bias, tensor_dtype=torch.float32)
     destroy_handle(lib, handle)
 
 
 if __name__ == "__main__":
     test_cases = [
-        # x_shape, w_shape, pads, strides, dilations, x_strides
+        # x_shape, w_shape, pads, strides, dilations, add_bias
         (
             (32, 3, 4),
             (32, 3, 5),
             (1,),
             (1,),
             (1,),
-            None,
+            False,
+        ),
+        (
+            (3, 7, 4),
+            (3, 7, 5),
+            (1,),
+            (1,),
+            (1,),
+            True,
         ),
         (
             (1, 3, 4, 4),
             (2, 3, 3, 3),
             (1, 1),
             (1, 2),
             (2, 1),
-            None,
+            True,
         ),
         (
             (32, 3, 128, 128),
             (64, 3, 5, 5),
             (2, 2),
             (2, 2),
             (1, 1),
-            None,
+            False,
         ),
         (
             (1, 1, 4, 4, 4),
             (1, 1, 5, 5, 5),
             (1, 1, 1),
             (1, 1, 1),
             (1, 1, 1),
-            None,
+            True,
         ),
         (
             (32, 3, 32, 32, 32),
             (64, 3, 5, 5, 5),
             (3, 2, 2),
             (4, 3, 3),
             (2, 2, 1),
-            None,
+            False,
         ),
     ]
     args = get_args()
@@ -260,6 +278,7 @@ def test_bang(lib, test_cases):
         infiniopTensorDescriptor_t,
         infiniopTensorDescriptor_t,
         infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
         c_void_p,
         c_void_p,
         c_void_p,
@@ -274,6 +293,7 @@ def test_bang(lib, test_cases):
         c_void_p,
         c_void_p,
         c_void_p,
+        c_void_p,
     ]
     lib.infiniopDestroyConvDescriptor.restype = c_int32
     lib.infiniopDestroyConvDescriptor.argtypes = [