
Commit d98e6a9

committed
Restore file permissions
1 parent 7a9f0a5 commit d98e6a9

39 files changed: +1898 −207 lines

include/ops/clip/clip.h

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ __C __export infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t hand
     infiniopTensorDescriptor_t y
 );
 
-__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void const *x, void *min, void *max, void *y, void *stream);
+__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, void *x, float *min, float *max, void *y, void *stream);
 
 __C __export infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);
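The clip bounds are now plain host-side float pointers instead of opaque tensor data. Below is a minimal sketch, not part of the commit, of how the new signature can be driven from Python with ctypes; the library filename and the descriptor/buffer handles are placeholder assumptions, and the real harness in operatorspy/tests/clip.py goes through its own helpers instead.

# Sketch (not from the commit): binding and calling the new infiniopClip signature.
# "./libinfiniop.so" and the desc/x_ptr/y_ptr handles are illustrative placeholders.
import ctypes
from ctypes import c_float, c_int32, c_void_p

lib = ctypes.CDLL("./libinfiniop.so")  # hypothetical library path

lib.infiniopClip.restype = c_int32
lib.infiniopClip.argtypes = [
    c_void_p,  # infiniopClipDescriptor_t desc (treated as an opaque pointer here)
    c_void_p,  # void *x
    c_void_p,  # float *min, or NULL for "no lower bound"
    c_void_p,  # float *max, or NULL for "no upper bound"
    c_void_p,  # void *y
    c_void_p,  # void *stream
]

def run_clip(desc, x_ptr, y_ptr, min_val=None, max_val=None, stream=None):
    # The bounds are host floats passed by reference, not device tensor pointers.
    min_arg = ctypes.byref(c_float(min_val)) if min_val is not None else None
    max_arg = ctypes.byref(c_float(max_val)) if max_val is not None else None
    status = lib.infiniopClip(desc, x_ptr, min_arg, max_arg, y_ptr, stream)
    assert status == 0, f"infiniopClip returned status {status}"

ctypes accepts a byref(c_float(...)) result wherever the declared argtype is c_void_p, which is what lets the test script below pass None to signal an absent bound without allocating a tensor for the scalar.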

include/ops/gather/gather.h

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ __C __export infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t ha
     int64_t axis
 );
 
-__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void const *x, void *indices, void *y, void *stream);
+__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void *x, void *indices, void *y, void *stream);
 
 __C __export infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc);

include/ops/reducemax/reducemax.h

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemaxDescriptor(infiniopHandle_t
     bool noop_with_empty_axes
 );
 
-__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
+__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
 
 __C __export infiniopStatus_t infiniopDestroyReducemaxDescriptor(infiniopReducemaxDescriptor_t desc);
 #endif

include/ops/reducemean/reducemean.h

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemeanDescriptor(infiniopHandle_
     bool noop_with_empty_axes
 );
 
-__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, void const *src, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
+__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
 
 __C __export infiniopStatus_t infiniopDestroyReducemeanDescriptor(infiniopReducemeanDescriptor_t desc);
 #endif

include/ops/reducemin/reducemin.h

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReduceminDescriptor(infiniopHandle_t
     bool noop_with_empty_axes
 );
 
-__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, void const *src, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
+__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
 
 __C __export infiniopStatus_t infiniopDestroyReduceminDescriptor(infiniopReduceminDescriptor_t desc);
 #endif
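With the const qualifiers gone, the three reduce entry points (infiniopReducemax, infiniopReducemean, infiniopReducemin) share one calling convention: destination, source, an optional dynamic-axes buffer, its element count, and a stream. Below is a minimal ctypes sketch of that shared shape, not part of the commit; the library path, the use of c_void_p for the descriptor, and the int64 element type for dynamic_axes are assumptions.

# Sketch (not from the commit): one binding pattern covering all three reduce ops.
# Library path and the int64 element type of dynamic_axes are assumptions.
import ctypes
from ctypes import c_int32, c_int64, c_uint64, c_void_p

lib = ctypes.CDLL("./libinfiniop.so")  # hypothetical library path

for name in ("infiniopReducemax", "infiniopReducemean", "infiniopReducemin"):
    fn = getattr(lib, name)
    fn.restype = c_int32
    fn.argtypes = [
        c_void_p,  # descriptor (opaque pointer)
        c_void_p,  # void *dst / *y
        c_void_p,  # void *src / *x, no longer const-qualified
        c_void_p,  # void *dynamic_axes
        c_uint64,  # uint64_t dynamic_axes_size
        c_void_p,  # void *stream
    ]

def run_reduce(name, desc, dst_ptr, src_ptr, dynamic_axes=None, stream=None):
    # The test scripts pass dynamic_axes=None and rely on the axes supplied at
    # descriptor creation; a non-empty list travels as a raw pointer plus a count.
    fn = getattr(lib, name)
    if dynamic_axes:
        axes = (c_int64 * len(dynamic_axes))(*dynamic_axes)  # assumed element type
        return fn(desc, dst_ptr, src_ptr, axes, c_uint64(len(dynamic_axes)), stream)
    return fn(desc, dst_ptr, src_ptr, None, c_uint64(0), stream)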

operatorspy/tests/clip.py

Lines changed: 48 additions & 28 deletions
@@ -1,4 +1,4 @@
-from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64, c_bool
+from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64, c_bool, c_float
 import ctypes
 import sys
 import os
@@ -21,7 +21,7 @@
 from typing import Tuple
 import numpy as np
 
-PROFILE = False
+PROFILE = True
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
@@ -46,29 +46,30 @@ def test(
     x_shape,
     min,
     max,
-    tensor_dtype=torch.float16
+    tensor_dtype=torch.float32
 ):
     print(
         f"Testing clip on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype} max:{max} min:{min}"
     )
-    x = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device)
-    output = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device)
+    x = torch.randn(x_shape, dtype=torch.float32, device=torch_device)
+
+    output = torch.randn(x_shape, dtype=torch.float32, device=torch_device)
     if min != None:
-        min = torch.tensor(min, dtype=torch.float32, device=torch_device)
+        min_t = torch.tensor(min, dtype=torch.float32, device=torch_device)
     else:
-        min = torch.tensor(float("-inf"), dtype=torch.float32, device=torch_device)
+        min_t = torch.tensor(float("-inf"), dtype=torch.float32, device=torch_device)
     if max != None:
-        max = torch.tensor(max, dtype=torch.float32, device=torch_device)
+        max_t = torch.tensor(max, dtype=torch.float32, device=torch_device)
     else:
-        max = torch.tensor(float("inf"), dtype=torch.float32, device=torch_device)
+        max_t = torch.tensor(float("inf"), dtype=torch.float32, device=torch_device)
     for i in range(NUM_PRERUN if PROFILE else 1):
        if min == None and max == None:
            break
-        ans = clip(x, min, max)
+        ans = clip(x, min_t, max_t)
     if PROFILE:
         start_time = time.time()
         for i in range(NUM_ITERATIONS):
-            _ = clip(x, min, max)
+            _ = clip(x, min_t, max_t)
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"pytorch time: {elapsed :10f}")
     x_tensor = to_tensor(x, lib)
@@ -82,15 +83,16 @@ def test(
             y_tensor.descriptor,
         )
     )
+    #Ss = [1024, 2048, 4096]
     x_tensor.descriptor.contents.invalidate()
     y_tensor.descriptor.contents.invalidate()
     for i in range(NUM_PRERUN if PROFILE else 1):
         check_error(
             lib.infiniopClip(
                 descriptor,
                 x_tensor.data,
-                min.data_ptr() if min != None else None,
-                max.data_ptr() if max != None else None,
+                ctypes.byref(c_float(min)) if min != None else None,
+                ctypes.byref(c_float(max)) if max != None else None,
                 y_tensor.data,
                 None,
             )
@@ -102,37 +104,50 @@ def test(
                 lib.infiniopClip(
                     descriptor,
                     x_tensor.data,
-                    min.data_ptr() if min != None else None,
-                    max.data_ptr() if max != None else None,
+                    ctypes.byref(c_float(min)) if min != None else None,
+                    ctypes.byref(c_float(max)) if max != None else None,
                     y_tensor.data,
                     None,
                 )
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"lib time: {elapsed :10f}")
-    print("x:", x)
-    print("custom op ans:", output)
-    print("ans:", ans) if max != None or min != None else print("ans:", x)
     assert torch.allclose(output, ans, atol=0, rtol=0) if max != None or min != None else torch.allclose(output, x, atol=0, rtol=0)
     check_error(lib.infiniopDestroyClipDescriptor(descriptor))
 
 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for x_shape, min, max in test_cases:
-        test(lib, handle, "cpu", x_shape, min, max, tensor_dtype=torch.float16)
-        print("\n")
-        #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32)
+    for x_shape, min, max, tensor_type in test_cases:
+        test(lib, handle, "cpu", x_shape, min, max, tensor_dtype=tensor_type)
+    destroy_handle(lib, handle)
+
+def test_cuda(lib, test_cases):
+    device = DeviceEnum.DEVICE_CUDA
+    handle = create_handle(lib, device)
+    for x_shape, min, max, tensor_type in test_cases:
+        test(lib, handle, "cuda", x_shape, min, max, tensor_dtype=tensor_type)
     destroy_handle(lib, handle)
 
 
 if __name__ == "__main__":
     test_cases = [
-        ((3, 4), -1, 1),
-        ((3, 4), None, 1),
-        ((3, 4), -1, None),
-        ((3, 4), None, None)
-        # stride =
+        ((3, 4), -1, 1, torch.float32),
+        ((3, 4), None, 1, torch.float32),
+        ((3, 4), -1, None, torch.float32),
+        ((3, 4), None, None, torch.float32),
+        ((16), -1, 1, torch.float32),
+        ((1024, 1024), -1, 1, torch.float32),
+        ((4096, 4096), -1, 1, torch.float32),
+
+        ((13), -1, 1, torch.float32),
+        ((3, 4), -1, 1, torch.float16),
+        ((3, 4), None, 1, torch.float16),
+        ((3, 4), -1, None, torch.float16),
+        ((3, 4), None, None, torch.float16),
+        ((16), -1, 1, torch.float16),
+        ((1024, 1024), -1, 1, torch.float16),
+        ((4096, 4096), -1, 1, torch.float16),
     ]
     args = get_args()
     lib = open_lib()
@@ -141,6 +156,7 @@ def test_cpu(lib, test_cases):
         infiniopHandle_t,
         POINTER(infiniopClipDescriptor_t),
         infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t
     ]
     lib.infiniopClip.restype = c_int32
     lib.infiniopClip.argtypes = [
@@ -149,8 +165,12 @@ def test_cpu(lib, test_cases):
         c_void_p,
        c_void_p,
        c_void_p,
+        c_void_p
     ]
     lib.infiniopDestroyClipDescriptor.restype = c_int32
     lib.infiniopDestroyClipDescriptor.argtypes = [infiniopClipDescriptor_t]
-    test_cpu(lib, test_cases)
+    if args.cuda:
+        test_cuda(lib, test_cases)
+    if args.cpu:
+        test_cpu(lib, test_cases)
     print("All tests passed!")

operatorspy/tests/gather.py

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ def test(
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"lib time: {elapsed :10f}")
     print(f"pytorch ans: {ans}")
-    print(f"lib ans: {dst_tensor.data}")
+    print(f"lib ans: {dst}")
     assert torch.allclose(dst, ans, atol=0, rtol=0)
     check_error(lib.infiniopDestroyGatherDescriptor(descriptor))

operatorspy/tests/reducemax.py

Lines changed: 43 additions & 25 deletions
@@ -21,9 +21,9 @@
 from typing import Tuple
 import numpy as np
 
-PROFILE = False
+PROFILE = True
 NUM_PRERUN = 1
-NUM_ITERATIONS = 1
+NUM_ITERATIONS = 50
 
 class ReducemaxDescriptor(Structure):
     _fields_ = [("device", c_int32)]
@@ -113,7 +113,6 @@ def test(
             c_bool(noop_with_empty_axes),
         )
     )
-    print(f"op desctiptor created")
     x_tensor.descriptor.contents.invalidate()
     y_tensor.descriptor.contents.invalidate()
     for i in range(NUM_PRERUN if PROFILE else 1):
@@ -142,41 +141,57 @@ def test(
            )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"lib time: {elapsed :10f}")
-    print(f"custom op output:{y}")
-    print(f"pytorch output:{ans}")
-    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
-
+    # print(f"input : {x}")
+    # print(f"custom op output:{y}")
+    # print(f"pytorch output:{ans}")
     check_error(lib.infiniopDestroyReducemaxDescriptor(descriptor))
+    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
 
 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes in test_cases:
-        print(dynamic_axes)
-        test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=torch.float16)
+    for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases:
+        test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype)
         print("\n")
         #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32)
     destroy_handle(lib, handle)
 
+def test_cuda(lib, test_cases):
+    device = DeviceEnum.DEVICE_CUDA
+    handle = create_handle(lib, device)
+    for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases:
+        test(lib, handle, "cuda", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype)
+        print("\n")
+    destroy_handle(lib, handle)
 
 if __name__ == "__main__":
     test_cases = [
         # dynamic calc test eg
-        ((2, 3, 4, 5), [0, 2], False, True, None),
-        ((2, 3, 4, 5), [0, 2], False, True, None),
-        #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes)
-        ((2, 10, 24, 10), [0, 2], False, True, None),
-        # stride =
-        ((2, 10, 24, 10), [0, 1], False, True, None),
-        ((2, 10, 24, 10), [2, 3], False , True, None),
-        ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None),
-        # validate attribute noop_with_empty_axes and keepdims
-        ((2, 10, 24, 10), None, True, True, None),
-        ((2, 10, 24, 10), None, True, False, None),
-        ((2, 10, 24, 10), None, False, True, None),
-        ((2, 10, 24, 10), None, False, False, None),
-        ((2, 3, 4), [0, 1], False, False, None),
+        # ((2, 3, 4, 5), [0, 2], False, True, None),
+        # ((2, 3, 4, 5), [0, 2], False, True, None),
+        # #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes)
+        # ((2, 10, 24, 10), [0, 2], False, True, None),
+        # # stride =
+        # ((2, 10, 24, 10), [0, 1], False, True, None),
+        # ((2, 10, 24, 10), [2, 3], False , True, None),
+        # ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None),
+        # # validate attribute noop_with_empty_axes and keepdims
+        # ((2, 10, 24, 10), None, True, True, None),
+        # ((2, 10, 24, 10), None, True, False, None),
+        # ((2, 10, 24, 10), None, False, True, None),
+        # ((2, 10, 24, 10), None, False, False, None),
+        # ((2, 3, 4), [0, 1], False, False, None),
         #((2, 10, 24, 10), [], True),
+        #((4,), [0], False, False, None, torch.float32),
+        ((1000, 300), [0, 1], False, False, None, torch.float16),
+        ((50, 3), [0, 1], False, False, None, torch.float16),
+        ((1000, 300), [0, 1], False, False, None, torch.float16),
+        ((2000, 200, 50), [0, 1], False, True, None, torch.float32),
+        ((1000, 200, 500), [0, 1], False, True, None, torch.float16),
+        ((1000, 200, 50), [0, 1], False, True, None, torch.float32),
+        ((20, 3, 4, 5), [0, 2], False, False, None, torch.float32),
+        ((20, 30, 40, 5), [0, 2, 3], False, False, None, torch.float32),
+        ((200, 3, 40, 5), [0, 3], False, False, None, torch.float32),
     ]
     args = get_args()
     lib = open_lib()
@@ -202,5 +217,8 @@ def test_cpu(lib, test_cases):
     ]
     lib.infiniopDestroyReducemaxDescriptor.restype = c_int32
     lib.infiniopDestroyReducemaxDescriptor.argtypes = [infiniopReducemaxDescriptor_t]
-    test_cpu(lib, test_cases)
+    if args.cpu:
+        test_cpu(lib, test_cases)
+    if args.cuda:
+        test_cuda(lib, test_cases)
     print("All tests passed!")
