#include <type_traits>
#include <tuple>
+#include <iostream>
+#include <mutex>

#include <ATen/cuda/CUDAContext.h>
#include <ATen/core/Array.h>
#include <ATen/detail/FunctionTraits.h>
#include <ATen/native/TensorIterator.h>
+#include <ATen/native/cuda/jit_utils.h>
#include <c10/macros/Macros.h>
#include <c10/core/ScalarType.h>
#include <c10/util/TypeCast.h>
@@ -120,6 +123,139 @@ static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t
  }
}

+template <char const *name,
+          typename result_type,
+          typename compute_type,
+          typename array_t,
+          typename inp_calc_t,
+          typename out_calc_t,
+          typename loader_t,
+          typename storer_t>
+static inline void launch_jitted_unrolled_kernel(
+  DeviceIndex dev_idx, int64_t N, const std::string& f, array_t data,
+  inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s, bool contiguous) {
+
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
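+  // Each block processes block_work_size() elements, so round the grid size up to cover all N.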
+  const int64_t grid = (N + block_work_size() - 1) / block_work_size();
+
+  static std::mutex _jiterator_mutex;
+  static std::vector<at::cuda::jit::NvrtcFunction> fns(c10::cuda::device_count());
+
+  at::cuda::jit::NvrtcFunction* fn_ptr = &fns[dev_idx];
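+  // Double-checked locking: the kernel is compiled at most once per device and the
+  // resulting NvrtcFunction is cached, so later calls skip compilation entirely.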
+  if (!fn_ptr->function) {
+    const std::lock_guard<std::mutex> lock{_jiterator_mutex};
+    if (!fn_ptr->function) {
+      constexpr int nTensors = array_t::size();
+      constexpr bool dynamic_casting = !std::is_same<decltype(l),
+          memory::LoadWithoutCast>() || !std::is_same<decltype(s),
+          memory::StoreWithoutCast>();
+      std::string string_name{name};
+      std::string compute_type_str = at::cuda::jit::typeName<compute_type>();
+      std::string result_type_str = at::cuda::jit::typeName<result_type>();
+      auto code = at::cuda::jit::generate_code(nTensors, f, string_name,
+                                               compute_type_str, result_type_str,
+                                               contiguous, dynamic_casting);
+      *fn_ptr = at::cuda::jit::jit_pwise_function(code, name);
+    }
+  }
+
+  // packs args
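+  // One pointer per kernel parameter, in the order the generated kernel declares them.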
+  std::array<void*, 6> args = {
+    (void*)&N,
+    (void*)&data,
+    (void*)&ic,
+    (void*)&oc,
+    (void*)&l,
+    (void*)&s
+  };
+
+  at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, num_threads());
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template <
+  char const *name,
+  typename result_type,
+  typename compute_type,
+  int arity,
+  typename array_t>
+static inline void launch_jitted_vectorized_kernel(DeviceIndex dev_idx, int64_t N, const std::string& f, array_t data) {
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
+  const int64_t grid = (N + block_work_size() - 1) / block_work_size();
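+  // Query the widest vectorized load/store (1, 2, or 4 elements) usable for these data pointers.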
+  const int vec_size = memory::jitted_can_vectorize_up_to<result_type, compute_type, arity>(data);
+
+  // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements)
+  // fn_ptr is set to the appropriate function based on the vec size and GPU used
+  // TODO: Memory use can probably be optimized by re-using kernels across GPUs with
+  // the same compute capability
+  static std::mutex _jiterator_mutex;
+  static std::vector<at::cuda::jit::NvrtcFunction> fns4(c10::cuda::device_count());
+  static std::vector<at::cuda::jit::NvrtcFunction> fns2(c10::cuda::device_count());
+  static std::vector<at::cuda::jit::NvrtcFunction> fns1(c10::cuda::device_count());
+
+  at::cuda::jit::NvrtcFunction* fn_ptr;
+  if (vec_size == 4) {
+    fn_ptr = &fns4[dev_idx];
+  } else if (vec_size == 2) {
+    fn_ptr = &fns2[dev_idx];
+  } else if (vec_size == 1) {
+    fn_ptr = &fns1[dev_idx];
+  } else {
+    TORCH_INTERNAL_ASSERT(false, "unexpected vec_size for jitter vectorized kernel");
+  }
+
+  bool vectorized = vec_size > 1;
+
+  if (!fn_ptr->function) {
+    const std::lock_guard<std::mutex> lock{_jiterator_mutex};
+    if (!fn_ptr->function) {
+      constexpr int nTensors = array_t::size();
+      std::string string_name{name};
+      std::string compute_type_str = at::cuda::jit::typeName<compute_type>();
+      std::string result_type_str = at::cuda::jit::typeName<result_type>();
+      auto code = at::cuda::jit::generate_code(nTensors, f, string_name,
+                                               compute_type_str, result_type_str,
+                                               /*contiguous=*/true, /*dynamic_casting=*/false,
+                                               vectorized, vec_size);
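+      // Vectorized variants get a per-vec-size kernel name so the right function is looked up after compilation.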
+      std::string kernel_name = vectorized ? string_name + "_vectorized" + std::to_string(vec_size) : string_name;
+      *fn_ptr = at::cuda::jit::jit_pwise_function(code, kernel_name);
+    }
+  }
+
+  if (vectorized) {
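+    // Only N and data are meaningful here; the remaining slots are nullptr placeholders for unused parameters.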
+    std::array<void*, 6> args = {
+      (void*)&N,
+      (void*)&data,
+      nullptr,
+      nullptr,
+      nullptr,
+      nullptr
+    };
+
+    at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, num_threads());
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  } else {
+    auto ic = TrivialOffsetCalculator<arity>();
+    auto oc = TrivialOffsetCalculator<1>();
+    auto l = memory::LoadWithoutCast();
+    auto s = memory::StoreWithoutCast();
+
+    std::array<void*, 6> args = {
+      (void*)&N,
+      (void*)&data,
+      (void*)&ic,
+      (void*)&oc,
+      (void*)&l,
+      (void*)&s
+    };
+
+    at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, num_threads());
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+
+}
+
template <typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t, typename loader_t, typename storer_t>
static inline void launch_unrolled_kernel(int64_t N, const func_t& f, array_t data,
                                          inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s)
@@ -131,6 +267,79 @@ static inline void launch_unrolled_kernel(int64_t N, const func_t& f, array_t da
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

+template <char const *name, typename result_type, typename compute_type, int arity>
+void jitted_gpu_kernel_impl(TensorIteratorBase& iter, const std::string& f, const bool dynamic_casting) {
+  TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing());
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == arity);
+  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
+
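+  // Tensor 0 of the TensorIterator is the output; tensors 1..arity are the inputs.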
+  constexpr int ntensors = arity + 1;
+  at::detail::Array<char*, ntensors> data;
+  for (auto i = decltype(ntensors){0}; i < ntensors; ++i) {
+    data[i] = (char*)iter.data_ptr(i);
+  }
+
+  int64_t numel = iter.numel();
+  bool contiguous = iter.is_contiguous();
+
+  // Decides which of 4 kernel types to launch
+  // Variations are:
+  //   - Case 1: no dynamic casting and contiguous
+  //   - Case 2: no dynamic casting and noncontiguous
+  //   - Case 3: dynamic casting and contiguous
+  //   - Case 4: dynamic casting and noncontiguous
+  // These cases align with the non-jitted CUDALoops.cuh cases in gpu_kernel_impl
+
+  if (!dynamic_casting) {
+    if (contiguous) {
+      // Case 1: no dynamic casting and contiguous
+      launch_jitted_vectorized_kernel<name, result_type, compute_type, arity>(
+        iter.device().index(), numel, f, data);
+      return;
+    }
+
+    // Case 2: no dynamic casting and noncontiguous
+    auto input_offset_calculator = make_input_offset_calculator<arity>(iter);
+    auto output_offset_calculator = make_output_offset_calculator(iter);
+    auto loader = memory::LoadWithoutCast();
+    auto storer = memory::StoreWithoutCast();
+    launch_jitted_unrolled_kernel<name, result_type, compute_type>(
+      iter.device().index(), numel, f, data, input_offset_calculator,
+      output_offset_calculator, loader, storer, contiguous);
+    return;
+  }
+
+  // Cases 3 and 4 are handled below
+  // Both require construction of a storer (this asserts 1 output) and one or more loaders
+
+  // Creates store cast to output (the zeroth tensor in TensorIterator)
+  auto storer = memory::StoreWithCast(iter.dtype(0));
+
+  // Creates load casts from inputs (note offset indexing into the iterator's 1...n tensors)
+  at::detail::Array<ScalarType, arity> dtypes;
+  for (auto i = decltype(arity){0}; i < arity; ++i) {
+    dtypes[i] = iter.dtype(i + 1);
+  }
+  auto loader = memory::LoadWithCast<arity>(dtypes);
+
+  if (contiguous) {
+    // Case 3: dynamic casting and contiguous
+    auto input_offset_calculator = TrivialOffsetCalculator<arity>();
+    auto output_offset_calculator = TrivialOffsetCalculator<1>();
+    launch_jitted_unrolled_kernel<name, result_type, compute_type>(
+      iter.device().index(), numel, f, data, input_offset_calculator,
+      output_offset_calculator, loader, storer, contiguous);
+    return;
+  }
+
+  // Case 4: dynamic casting and noncontiguous
+  auto input_offset_calculator = make_input_offset_calculator<arity>(iter);
+  auto output_offset_calculator = make_output_offset_calculator(iter);
+  launch_jitted_unrolled_kernel<name, result_type, compute_type>(
+    iter.device().index(), numel, f, data, input_offset_calculator,
+    output_offset_calculator, loader, storer, contiguous);
+}
+
template <typename func_t>
void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
  using traits = function_traits<func_t>;