Merge pull request #72 from InfiniTensor/add_mod_kernel

YdrMaster · web-flow · commit a41a09f3f1ec · 2024-01-11T19:43:52.000+08:00
添加 Mod cpu/cuda 算子
diff --git a/src/04kernel/include/kernel/collectors/simple_binary.h b/src/04kernel/include/kernel/collectors/simple_binary.h
@@ -14,6 +14,8 @@ namespace refactor::kernel {
         And,
         Or,
         Xor,
+        Mod,
+        Fmod,
     };
 
     std::string_view opName(SimpleBinaryType type);
diff --git a/src/04kernel/src/collectors/simple_binary.cc b/src/04kernel/src/collectors/simple_binary.cc
@@ -19,6 +19,8 @@ namespace refactor::kernel {
             CASE(And);
             CASE(Or);
             CASE(Xor);
+            CASE(Mod);
+            CASE(Fmod);
             default:
                 UNREACHABLE();
         }
diff --git a/src/04kernel/src/kernels/simple_binary/cpu_kernel.cc b/src/04kernel/src/kernels/simple_binary/cpu_kernel.cc
@@ -1,4 +1,5 @@
 ﻿#include "cpu_kernel.hh"
+#include <cmath>
 #include <execution>
 
 namespace refactor::kernel {
@@ -118,8 +119,38 @@ namespace refactor::kernel {
                         UNREACHABLE();
                 }
             }
-            default:
-                UNREACHABLE();
+            case Op::Mod: { // integer-only remainder via built-in `%` (result takes the dividend's sign). NOTE(review): ONNX Mod with fmod=0 specifies the divisor's sign — confirm the frontend mapping
+                switch (dataType.internal) {
+                    CASE_DT(a % b, U8);
+                    CASE_DT(a % b, I8);
+                    CASE_DT(a % b, U16);
+                    CASE_DT(a % b, I16);
+                    CASE_DT(a % b, I32);
+                    CASE_DT(a % b, I64);
+                    CASE_DT(a % b, U32);
+                    CASE_DT(a % b, U64);
+                    default: // floating-point types have no `%`; they are only handled by Op::Fmod
+                        UNREACHABLE();
+                }
+            }
+            case Op::Fmod: { // floats use std::fmod; unsigned ints use plain `%`; signed ints add `b` to a negative `%` result
+                switch (dataType.internal) {
+                    CASE_DT(std::fmod(a, b), F32);
+                    CASE_DT(a % b, U8);
+                    CASE_DT(a % b < 0 ? (a % b + b) : (a % b), I8); // NOTE(review): with b < 0 this can return a value below b (a = -3, b = -2 gives -3) — confirm divisors are always positive
+                    CASE_DT(a % b, U16);
+                    CASE_DT(a % b < 0 ? (a % b + b) : (a % b), I16);
+                    CASE_DT(a % b < 0 ? (a % b + b) : (a % b), I32);
+                    CASE_DT(a % b < 0 ? (a % b + b) : (a % b), I64);
+                    CASE_DT(std::fmod(a, b), F64);
+                    CASE_DT(a % b, U32);
+                    CASE_DT(a % b, U64);
+                    default: // data type not supported by Fmod
+                        UNREACHABLE();
+                }
+            }
+            default: // operator type not handled by this kernel; kept at the level of the outer switch
+                UNREACHABLE();
         }
     }
 
diff --git a/src/04kernel/src/kernels/simple_binary/cuda_kernel.cc b/src/04kernel/src/kernels/simple_binary/cuda_kernel.cc
@@ -135,12 +135,46 @@ extern "C" __global__ void kernel(
                     case DataType::F32:
                         return "powf(a, b)";
                     case DataType::FP16:
-                        return "__float2half(__powf(__half2float(a), __half2float(b)))";
+                        return "__float2half(powf(__half2float(a), __half2float(b)))";
                     case DataType::BF16:
                         return "__float2bfloat16(powf(__bfloat162float(a), __bfloat162float(b)))";
                     default:
                         return "pow(a, b)";
                 }
+            case SimpleBinaryType::Mod:
+                switch (dt) {
+                    case DataType::U8:
+                    case DataType::I8:
+                    case DataType::U16:
+                    case DataType::I16:
+                    case DataType::I32:
+                    case DataType::I64:
+                    case DataType::U32:
+                    case DataType::U64:
+                        return "a % b";
+                    default:
+                        UNREACHABLE();
+                }
+            case SimpleBinaryType::Fmod:
+                switch (dt) {
+                    case DataType::U8:
+                    case DataType::I8:
+                    case DataType::U16:
+                    case DataType::I16:
+                    case DataType::I32:
+                    case DataType::I64:
+                    case DataType::U32:
+                    case DataType::U64:
+                        return "a % b < 0 ? (a % b + b) : (a % b)";
+                    case DataType::F32:
+                        return "fmodf(a, b)";
+                    case DataType::FP16:
+                        return "__float2half(fmodf(__half2float(a), __half2float(b)))";
+                    case DataType::BF16:
+                        return "__float2bfloat16(fmodf(__bfloat162float(a), __bfloat162float(b)))";
+                    default:
+                        UNREACHABLE();
+                }
             default:
                 UNREACHABLE();
         }
diff --git a/src/04kernel/test/kernels/simple_binary/test_binary_cpu.cpp b/src/04kernel/test/kernels/simple_binary/test_binary_cpu.cpp
@@ -1,4 +1,5 @@
 #include "../src/kernels/simple_binary/cpu_kernel.hh"
+#include <cmath>
 #include <gtest/gtest.h>
 
 using namespace refactor;
@@ -27,11 +28,60 @@ void testBinaryCPU(SimpleBinaryType binaryOPT, std::function<float(float, float)
     }
 }
 
+void testModCPU(SimpleBinaryType binaryOPT, std::function<int(int, int)> operation) {
+    // Runs `binaryOPT` through the CPU kernel on two I32 tensors and checks every output element against `operation`
+    auto aTensor = Tensor::share(DataType::I32, Shape{10, 20, 30, 40}, LayoutType::NCHW);
+    auto bTensor = Tensor::share(DataType::I32, Shape{10, 20, 30, 40}, LayoutType::NCHW);
+    auto cTensor = Tensor::share(DataType::I32, Shape{10, 20, 30, 40}, LayoutType::NCHW);
+    auto cpuKernel = BinaryCpu::build(binaryOPT, *aTensor, *bTensor);
+    ASSERT_TRUE(cpuKernel);
+    auto res = runtime::Resources();
+    auto cpuRoutine = cpuKernel->lower(res).routine;
+    // Constant inputs with a negative dividend (-3) and a positive divisor (2), so sign handling of the remainder is exercised
+    std::vector<int> a(aTensor->elementsSize(), -3);
+    std::vector<int> b(bTensor->elementsSize(), 2);
+    std::vector<int> c(cTensor->elementsSize());
+    // Run the routine directly on the host buffers
+    void const *inputs[]{a.data(), b.data()};
+    void *outputs[]{c.data()};
+    cpuRoutine(res, nullptr, inputs, outputs);
+    // Results are integers: compare exactly instead of going through a float conversion
+    for (auto i : range0_(c.size())) {
+        EXPECT_EQ(c[i], operation(a[i], b[i]));
+    }
+}
+
+void testFmodWithI32CPU(SimpleBinaryType binaryOPT, std::function<int(int, int)> operation) {
+    // Runs `binaryOPT` through the CPU kernel on two I32 tensors and checks every output element against `operation`
+    auto aTensor = Tensor::share(DataType::I32, Shape{10, 20, 30, 40}, LayoutType::NCHW);
+    auto bTensor = Tensor::share(DataType::I32, Shape{10, 20, 30, 40}, LayoutType::NCHW);
+    auto cTensor = Tensor::share(DataType::I32, Shape{10, 20, 30, 40}, LayoutType::NCHW);
+    auto cpuKernel = BinaryCpu::build(binaryOPT, *aTensor, *bTensor);
+    ASSERT_TRUE(cpuKernel);
+    auto res = runtime::Resources();
+    auto cpuRoutine = cpuKernel->lower(res).routine;
+    // Constant inputs with a negative dividend (-3) and a positive divisor (2), so the signed-integer branch is exercised
+    std::vector<int> a(aTensor->elementsSize(), -3);
+    std::vector<int> b(bTensor->elementsSize(), 2);
+    std::vector<int> c(cTensor->elementsSize());
+    // Run the routine directly on the host buffers
+    void const *inputs[]{a.data(), b.data()};
+    void *outputs[]{c.data()};
+    cpuRoutine(res, nullptr, inputs, outputs);
+    // Results are integers: compare exactly instead of going through a float conversion
+    for (auto i : range0_(c.size())) {
+        EXPECT_EQ(c[i], operation(a[i], b[i]));
+    }
+}
+
 TEST(kernel, BinaryCpu) {
     testBinaryCPU(SimpleBinaryType::Add, [](float a, float b) { return a + b; });
     testBinaryCPU(SimpleBinaryType::Sub, [](float a, float b) { return a - b; });
     testBinaryCPU(SimpleBinaryType::Mul, [](float a, float b) { return a * b; });
     testBinaryCPU(SimpleBinaryType::Div, [](float a, float b) { return a / b; });
+    testModCPU(SimpleBinaryType::Mod, [](int a, int b) { return a % b; });
+    testFmodWithI32CPU(SimpleBinaryType::Fmod, [](int a, int b) { return a % b < 0 ? (a % b + b) : (a % b); });
+    testBinaryCPU(SimpleBinaryType::Fmod, [](float a, float b) { return std::fmod(a, b); });
 }
 
 TEST(kernel, BinaryCpuBroadcast) {
diff --git a/src/04kernel/test/kernels/simple_binary/test_binary_cuda.cpp b/src/04kernel/test/kernels/simple_binary/test_binary_cuda.cpp
@@ -9,12 +9,13 @@ using namespace refactor;
 using namespace kernel;
 using namespace hardware;
 
+template<decltype(DataType::internal) T>
 void testBinaryCuda(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape dimC) {
     // Create Tensor and build kernels
-    using T_ = primitive<DataType::I8>::type;
-    auto aTensor = Tensor::share(DataType::I8, dimA, LayoutType::NCHW);
-    auto bTensor = Tensor::share(DataType::I8, dimB, LayoutType::NCHW);
-    auto cTensor = Tensor::share(DataType::I8, dimC, LayoutType::NCHW);
+    using T_ = primitive<T>::type;
+    auto aTensor = Tensor::share(T, dimA, LayoutType::NCHW);
+    auto bTensor = Tensor::share(T, dimB, LayoutType::NCHW);
+    auto cTensor = Tensor::share(T, dimC, LayoutType::NCHW);
 
     auto cpuKernel = BinaryCpu::build(binaryOPT, *aTensor, *bTensor),
          cudaKernel = BinaryCuda::build(binaryOPT, *aTensor, *bTensor);
@@ -24,8 +25,8 @@ void testBinaryCuda(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape di
     auto cudaRoutine = cudaKernel->lower(res).routine;
 
     // Init inputs and outputs
-    std::vector<T_> a(aTensor->elementsSize(), 3.0f);
-    std::vector<T_> b(bTensor->elementsSize(), 2.0f);
+    std::vector<T_> a(aTensor->elementsSize(), 3);
+    std::vector<T_> b(bTensor->elementsSize(), 2);
     std::vector<T_> c(cTensor->elementsSize());
     auto &dev = *device::init(Device::Type::Nvidia, 0, "");
     auto aGPU = dev.malloc(aTensor->bytesSize()),
@@ -53,35 +54,56 @@ void testBinaryCuda(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape di
 }
 
 TEST(kernel, BinaryCudaAdd) {
-    testBinaryCuda(SimpleBinaryType::Add,
-                   Shape{2, 5, 10, 20, 3, 4},
-                   Shape{2, 5, 10, 20, 3, 4},
-                   Shape{2, 5, 10, 20, 3, 4});
+    testBinaryCuda<DataType::I8>(SimpleBinaryType::Add,
+                                 Shape{2, 5, 10, 20, 3, 4},
+                                 Shape{2, 5, 10, 20, 3, 4},
+                                 Shape{2, 5, 10, 20, 3, 4});
 }
 
 TEST(kernel, BinaryCudaMul) {
-    testBinaryCuda(SimpleBinaryType::Mul,
-                   Shape{2, 5, 10, 20, 3, 4},
-                   Shape{2, 5, 10, 20, 3, 4},
-                   Shape{2, 5, 10, 20, 3, 4});
+    testBinaryCuda<DataType::I8>(SimpleBinaryType::Mul,
+                                 Shape{2, 5, 10, 20, 3, 4},
+                                 Shape{2, 5, 10, 20, 3, 4},
+                                 Shape{2, 5, 10, 20, 3, 4});
 }
 
 TEST(kernel, BinaryCudaSub) {
-    testBinaryCuda(SimpleBinaryType::Sub,
-                   Shape{2, 5, 10, 20, 3, 4},
-                   Shape{2, 5, 10, 20, 3, 4},
-                   Shape{2, 5, 10, 20, 3, 4});
+    testBinaryCuda<DataType::I8>(SimpleBinaryType::Sub,
+                                 Shape{2, 5, 10, 20, 3, 4},
+                                 Shape{2, 5, 10, 20, 3, 4},
+                                 Shape{2, 5, 10, 20, 3, 4});
 }
 
 TEST(kernel, BinaryCudaDiv) {
-    testBinaryCuda(SimpleBinaryType::Div,
-                   Shape{2, 5, 10, 20, 3, 4},
-                   Shape{2, 5, 10, 20, 3, 4},
-                   Shape{2, 5, 10, 20, 3, 4});
+    testBinaryCuda<DataType::I8>(SimpleBinaryType::Div,
+                                 Shape{2, 5, 10, 20, 3, 4},
+                                 Shape{2, 5, 10, 20, 3, 4},
+                                 Shape{2, 5, 10, 20, 3, 4});
+}
+
+TEST(kernel, BinaryCudaMod) {
+    testBinaryCuda<DataType::I8>(SimpleBinaryType::Mod,
+                                 Shape{2, 5, 10, 20, 3, 4},
+                                 Shape{2, 5, 10, 20, 3, 4},
+                                 Shape{2, 5, 10, 20, 3, 4});
+}
+
+TEST(kernel, BinaryCudaFmodI8) {
+    testBinaryCuda<DataType::I8>(SimpleBinaryType::Fmod,
+                                 Shape{2, 5, 10, 20, 3, 4},
+                                 Shape{2, 5, 10, 20, 3, 4},
+                                 Shape{2, 5, 10, 20, 3, 4});
+}
+
+TEST(kernel, BinaryCudaFmodF32) {
+    testBinaryCuda<DataType::F32>(SimpleBinaryType::Fmod,
+                                  Shape{2, 5, 10, 20, 3, 4},
+                                  Shape{2, 5, 10, 20, 3, 4},
+                                  Shape{2, 5, 10, 20, 3, 4});
 }
 
 TEST(kernel, BinaryCudaBroadcast) {
-    testBinaryCuda(SimpleBinaryType::Add, Shape{1, 2, 3, 4, 5, 6}, Shape{}, Shape{1, 2, 3, 4, 5, 6});
+    testBinaryCuda<DataType::I8>(SimpleBinaryType::Add, Shape{1, 2, 3, 4, 5, 6}, Shape{}, Shape{1, 2, 3, 4, 5, 6});
 }
 
 #endif
diff --git a/src/05computation/src/operators/simple_binary.cc b/src/05computation/src/operators/simple_binary.cc
@@ -39,6 +39,14 @@ namespace refactor::computation {
                 static uint8_t ID = 8;
                 return reinterpret_cast<size_t>(&ID);
             }
+            case Ty::Mod: {
+                static uint8_t ID = 9;
+                return reinterpret_cast<size_t>(&ID);
+            }
+            case Ty::Fmod: {
+                static uint8_t ID = 10;
+                return reinterpret_cast<size_t>(&ID);
+            }            
             default:
                 UNREACHABLE();
         }
@@ -64,6 +72,10 @@ namespace refactor::computation {
                 return "Or";
             case Ty::Xor:
                 return "Xor";
+            case Ty::Mod:
+                return "Mod";
+            case Ty::Fmod:
+                return "Fmod";                
             default:
                 UNREACHABLE();
         }
diff --git a/src/07onnx/src/operators/simple_binary.cc b/src/07onnx/src/operators/simple_binary.cc
@@ -10,7 +10,7 @@ namespace refactor::onnx {
         : Operator(), type(type_) {}
 
     auto Op::build(ModelContext const &, std::string_view opType, Attributes attributes) -> OpBox {
-        ASSERT(attributes.empty(), "Simple binary operator should not have attributes");
+        auto fmod = defaultOr(attributes, "fmod", {0}).int_();
         // clang-format off
         auto type =
             opType == "onnx::Add" ? Ty::Add :
@@ -21,6 +21,7 @@ namespace refactor::onnx {
             opType == "onnx::And" ? Ty::And :
             opType == "onnx::Or"  ? Ty::Or  :
             opType == "onnx::Xor" ? Ty::Xor :
+            opType == "onnx::Mod" ? (fmod == 0 ? Ty::Mod : Ty::Fmod) :
             UNREACHABLEX(Ty, "Unsupported binary operator: {}", opType);
         // clang-format on
         return OpBox(std::make_unique<Op>(type));
@@ -48,6 +49,26 @@ namespace refactor::onnx {
                 static uint8_t ID = 5;
                 return reinterpret_cast<size_t>(&ID);
             }
+            case Ty::And: {
+                static uint8_t ID = 6;
+                return reinterpret_cast<size_t>(&ID);
+            }
+            case Ty::Or: {
+                static uint8_t ID = 7;
+                return reinterpret_cast<size_t>(&ID);
+            }
+            case Ty::Xor: {
+                static uint8_t ID = 8;
+                return reinterpret_cast<size_t>(&ID);
+            }
+            case Ty::Mod: {
+                static uint8_t ID = 9;
+                return reinterpret_cast<size_t>(&ID);
+            }
+            case Ty::Fmod: {
+                static uint8_t ID = 10;
+                return reinterpret_cast<size_t>(&ID);
+            }            
             default:
                 UNREACHABLE();
         }
@@ -65,6 +86,8 @@ namespace refactor::onnx {
             case Ty::And: return "onnx::And";
             case Ty::Or : return "onnx::Or" ;
             case Ty::Xor: return "onnx::Xor";
+            case Ty::Mod: return "onnx::Mod";
+            case Ty::Fmod: return "onnx::Mod";
             default: UNREACHABLE();
         }
         // clang-format on
@@ -162,6 +185,8 @@ namespace refactor::onnx {
             case Ty::And : type_ = Ty_::And; break;
             case Ty::Or  : type_ = Ty_::Or ; break;
             case Ty::Xor : type_ = Ty_::Xor; break;
+            case Ty::Mod : type_ = Ty_::Mod; break;
+            case Ty::Fmod : type_ = Ty_::Fmod; break;
             default: UNREACHABLE();
         }
         // clang-format on
diff --git a/src/07onnx/src/operators/simple_binary.hh b/src/07onnx/src/operators/simple_binary.hh
@@ -15,6 +15,8 @@ namespace refactor::onnx {
         And,
         Or,
         Xor,
+        Mod,
+        Fmod,
     };
 
     struct SimpleBinary final : public Operator {

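Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,8 @@ namespace refactor::kernel {`
`14`	`14`	`And,`
`15`	`15`	`Or,`
`16`	`16`	`Xor,`
	`17`	`+ Mod,`
	`18`	`+ Fmod,`
`17`	`19`	`};`
`19`	`21`	`std::string_view opName(SimpleBinaryType type);`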
Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,8 @@ namespace refactor::kernel {`
`19`	`19`	`CASE(And);`
`20`	`20`	`CASE(Or);`
`21`	`21`	`CASE(Xor);`
	`22`	`+ CASE(Mod);`
	`23`	`+ CASE(Fmod);`
`22`	`24`	`default:`
`23`	`25`	`UNREACHABLE();`
`24`	`26`	`}`