
Commit 95b1bc1

peterbell10 authored and facebook-github-bot committed
Migrate nonzero from TH to ATen (CPU) (pytorch#58811)
Summary: Closes pytorch gh-24745

The existing PR (pytorch gh-50655) has stalled because `TensorIterator` doesn't guarantee iteration order in the same way that `TH_TENSOR_APPLY` does. For contiguous test cases this isn't an issue, but it breaks down, for example, with channels-last format. I resolve this by adding a new `TensorIteratorConfig` parameter, `enforce_linear_iteration`, which disables dimension reordering. I've also added a test case for non-contiguous tensors to verify this works.

This PR also significantly improves performance by adding multithreading support to the algorithm. As part of this, I wrote a custom `count_nonzero` that gives per-thread counts, which is necessary to write the outputs in the right locations.

| Shape      |  Before | After (1 thread) | After (8 threads) |
|:----------:|--------:|-----------------:|------------------:|
| 256,128,32 | 2610 us |          2220 us |            496 us |
| 128,128,32 | 1250 us |           976 us |            175 us |
| 64,128,32  |  581 us |           486 us |             88 us |
| 32,128,32  |  292 us |           245 us |             80 us |
| 16,128,32  |  147 us |           120 us |             71 us |
| 8,128,32   |   75 us |            61 us |             61 us |
| 4,128,32   |   39 us |            32 us |             32 us |
| 2,128,32   |   20 us |            17 us |             17 us |
| 1,128,32   |   11 us |             9 us |              9 us |

Pull Request resolved: pytorch#58811

Reviewed By: anjali411

Differential Revision: D28700259

Pulled By: ngimel

fbshipit-source-id: 9b279ca7c36d8e348b7e5e4be0dd159e05aee159
1 parent 934f6dc commit 95b1bc1

19 files changed (+313, -327 lines)
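For context on the approach described in the summary, here is a minimal standalone sketch of the two-pass strategy: count nonzeros per chunk, take an exclusive prefix sum of the counts to get each thread's write offset, then let each thread fill a disjoint slice of the output. This is illustrative only (plain `std::thread` over a flat `float` array; `nonzero_indices` and all other names are invented here), not the ATen kernel added by this PR.

```cpp
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <thread>
#include <vector>

// Returns the indices of the nonzero elements of `data`, in order.
// Assumes num_threads >= 1.
std::vector<int64_t> nonzero_indices(const std::vector<float>& data, int num_threads) {
  const int64_t n = static_cast<int64_t>(data.size());
  const int64_t chunk = (n + num_threads - 1) / num_threads;

  // Pass 1: per-thread (per-chunk) nonzero counts.
  std::vector<int64_t> counts(num_threads, 0);
  auto count_fn = [&](int t) {
    const int64_t begin = t * chunk, end = std::min(n, begin + chunk);
    for (int64_t i = begin; i < end; ++i) {
      counts[t] += (data[i] != 0.0f);
    }
  };
  std::vector<std::thread> workers;
  for (int t = 0; t < num_threads; ++t) workers.emplace_back(count_fn, t);
  for (auto& w : workers) w.join();
  workers.clear();

  // Exclusive prefix sum: offsets[t] is where thread t starts writing.
  std::vector<int64_t> offsets(num_threads, 0);
  std::partial_sum(counts.begin(), counts.end() - 1, offsets.begin() + 1);

  // Pass 2: each thread writes its indices into its own slice of the output.
  std::vector<int64_t> out(offsets.back() + counts.back());
  auto write_fn = [&](int t) {
    const int64_t begin = t * chunk, end = std::min(n, begin + chunk);
    int64_t pos = offsets[t];
    for (int64_t i = begin; i < end; ++i) {
      if (data[i] != 0.0f) out[pos++] = i;
    }
  };
  for (int t = 0; t < num_threads; ++t) workers.emplace_back(write_fn, t);
  for (auto& w : workers) w.join();
  return out;
}
```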

BUILD.bazel

Lines changed: 0 additions & 2 deletions
@@ -329,9 +329,7 @@ filegroup(
         "aten/src/TH/THLapack.cpp",
         "aten/src/TH/THStorageFunctions.cpp",
         "aten/src/TH/THTensor.cpp",
-        "aten/src/TH/THTensorEvenMoreMath.cpp",
         "aten/src/TH/THTensorLapack.cpp",
-        "aten/src/TH/THTensorMath.cpp",
         "aten/src/TH/THTensorMoreMath.cpp",
     ],
 )

aten/src/ATen/LegacyTHFunctionsCPU.cpp

Lines changed: 0 additions & 153 deletions
@@ -35,159 +35,6 @@ namespace {
   }
 }

-Tensor & _th_nonzero_out(const Tensor & self, Tensor & result) {
-    // DeviceGuard omitted
-    auto dispatch_scalar_type = infer_scalar_type(self);
-
-    switch (dispatch_scalar_type) {
-        case ScalarType::Bool: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THBoolTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Byte: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THByteTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Char: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THCharTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Double: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THDoubleTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Float: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THFloatTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Int: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THIntTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Long: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THLongTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Short: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THShortTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Half: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THHalfTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::BFloat16: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THBFloat16Tensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::ComplexDouble: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THComplexDoubleTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::ComplexFloat: {
-            auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THComplexFloatTensor_nonzero(result_, self_);
-            break;
-        }
-        default:
-            AT_ERROR("_th_nonzero_out not supported on CPUType for ", dispatch_scalar_type);
-    }
-    return result;
-}
-Tensor _th_nonzero(const Tensor & self) {
-    // DeviceGuard omitted
-    auto dispatch_scalar_type = infer_scalar_type(self);
-    auto result_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(ScalarType::Long)).release();
-    auto result = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(result_));
-    switch (dispatch_scalar_type) {
-        case ScalarType::Bool: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type);
-            THBoolTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Byte: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type);
-            THByteTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Char: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type);
-            THCharTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Double: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type);
-            THDoubleTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Float: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type);
-            THFloatTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Int: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type);
-            THIntTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Long: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type);
-            THLongTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Short: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type);
-            THShortTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::Half: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type);
-            THHalfTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::BFloat16: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type);
-            THBFloat16Tensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::ComplexDouble: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type);
-            THComplexDoubleTensor_nonzero(result_, self_);
-            break;
-        }
-        case ScalarType::ComplexFloat: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_nonzero", false, DeviceType::CPU, dispatch_scalar_type);
-            THComplexFloatTensor_nonzero(result_, self_);
-            break;
-        }
-        default:
-            AT_ERROR("_th_nonzero not supported on CPUType for ", dispatch_scalar_type);
-    }
-    return result;
-}
 Scalar _th_std_var(const Tensor& self, int64_t correction, bool take_sqrt) {
     // DeviceGuard omitted
     auto dispatch_scalar_type = infer_scalar_type(self);

aten/src/ATen/LegacyTHFunctionsCPU.h

Lines changed: 0 additions & 2 deletions
@@ -20,8 +20,6 @@ namespace cpu {

 Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source);
 Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source);
-Tensor& _th_nonzero_out(const Tensor& self, Tensor& result);
-Tensor _th_nonzero(const Tensor & self);
 Scalar _th_std_var(const Tensor& self, int64_t correction, bool take_sqrt);
 Tensor & _th_renorm_out(const Tensor & self, const Scalar& p, int64_t dim, const Scalar& maxnorm, Tensor & result);
 Tensor _th_renorm(const Tensor & self, const Scalar& p, int64_t dim, const Scalar& maxnorm);

aten/src/ATen/ParallelOpenMP.h

Lines changed: 14 additions & 7 deletions
@@ -17,24 +17,30 @@ inline void parallel_for(
     const int64_t end,
     const int64_t grain_size,
     const F& f) {
-  TORCH_CHECK(grain_size >= 0);
-  at::internal::lazy_init_num_threads();
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grain_size >= 0);
   if (begin >= end) {
     return;
   }
-  if (end - begin == 1) {
+
+#ifdef _OPENMP
+  at::internal::lazy_init_num_threads();
+  const auto numiter = end - begin;
+  const bool use_parallel = (
+      numiter > grain_size && numiter > 1 &&
+      omp_get_max_threads() > 1 && !omp_in_parallel());
+  if (!use_parallel) {
     f(begin, end);
     return;
   }
-#ifdef _OPENMP
+
   std::atomic_flag err_flag = ATOMIC_FLAG_INIT;
   std::exception_ptr eptr;
   // Work around memory leak when using 1 thread in nested "omp parallel"
   // caused by some buggy OpenMP versions and the fact that omp_in_parallel()
   // returns false when omp_get_max_threads() == 1 inside nested "omp parallel"
   // See issue gh-32284

-#pragma omp parallel if (omp_get_max_threads() > 1 && !omp_in_parallel() && ((end - begin) > grain_size))
+#pragma omp parallel
   {
     // choose number of tasks based on grain size and number of threads
     // can't use num_threads clause due to bugs in GOMP's thread pool (See #32008)

@@ -76,15 +82,16 @@ inline scalar_t parallel_reduce(
   at::internal::lazy_init_num_threads();
   if (begin >= end) {
     return ident;
-  } else if (in_parallel_region() || get_num_threads() == 1) {
+  } else if ((end - begin) <= grain_size || in_parallel_region() ||
+      get_num_threads() == 1) {
     return f(begin, end, ident);
   } else {
     const int64_t num_results = divup((end - begin), grain_size);
     std::vector<scalar_t> results(num_results);
     scalar_t* results_data = results.data();
     std::atomic_flag err_flag = ATOMIC_FLAG_INIT;
     std::exception_ptr eptr;
-#pragma omp parallel for if ((end - begin) >= grain_size)
+#pragma omp parallel for
     for (int64_t id = 0; id < num_results; id++) {
       int64_t i = begin + id * grain_size;
       try {
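The revised `parallel_for` above moves the range/grain-size test out of the OpenMP `if` clause: ranges no larger than `grain_size` now take the plain serial `f(begin, end)` path before any OpenMP machinery is involved. A hedged usage sketch of the public API (`at::parallel_for` from `ATen/Parallel.h`; the `scale_inplace` helper and its assumptions about the tensor are illustrative):

```cpp
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <cstdint>

// Scales a contiguous float CPU tensor in place (illustrative helper).
void scale_inplace(at::Tensor& t, float alpha) {
  float* data = t.data_ptr<float>();
  const int64_t n = t.numel();
  // Ranges of at most `grain_size` elements run serially as f(0, n);
  // larger ranges are split across the OpenMP thread pool.
  at::parallel_for(0, n, /*grain_size=*/2048, [&](int64_t begin, int64_t end) {
    for (int64_t i = begin; i < end; ++i) {
      data[i] *= alpha;
    }
  });
}
```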

aten/src/ATen/TensorIterator.cpp

Lines changed: 21 additions & 0 deletions
@@ -129,6 +129,12 @@ void TensorIteratorBase::reorder_dimensions() {
   // initialize perm with n-1, n-2, ..., 1, 0
   std::iota(perm_.rbegin(), perm_.rend(), 0);

+  // Reordering dimensions changes iteration order
+  if (enforce_linear_iteration_) {
+    permute_dimensions(perm_);
+    return;
+  }
+
   // returns 1 if the dim0 should come after dim1, -1 if dim0 should come
   // before dim1, and 0 if the comparison is ambiguous.
   auto should_swap = [&](size_t dim0, size_t dim1) {

@@ -1213,6 +1219,20 @@ FastSetupType TensorIteratorBase::compute_fast_setup_type(const TensorIteratorConfig& config) {
     return FastSetupType::NONE;
   }

+  // For linear iteration, only contiguous tensors can be coalesced
+  // Fast setup of any other format requires changing iteration order
+  if (enforce_linear_iteration_) {
+    for (const auto& op : operands_) {
+      if (op.tensor->defined() && !op.will_resize) {
+        auto is_contiguous = op.tensor->is_contiguous(at::MemoryFormat::Contiguous);
+        if (!is_contiguous) {
+          return FastSetupType::NONE;
+        }
+      }
+    }
+    return FastSetupType::CONTIGUOUS;
+  }
+
   bool is_contiguous = true;
   bool is_channels_last = true;
   bool is_non_overlapping_and_dense = true;

@@ -1265,6 +1285,7 @@ TensorIteratorBase::TensorIteratorBase() = default;
 void TensorIteratorBase::build(TensorIteratorConfig& config) {
   // populate some persistent configuration fields
   is_reduction_ = config.is_reduction_;
+  enforce_linear_iteration_ = config.enforce_linear_iteration_;

   // fill in operands_ based on configuration
   populate_operands(config);
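For intuition on the early return added to `reorder_dimensions()`: with a channels-last tensor, the stride-sorted traversal that `TensorIterator` would normally pick differs from linear (row-major index) order, which is what an index-producing kernel like nonzero needs to rely on. A small illustrative snippet, not part of this change, that simply prints the strides involved:

```cpp
#include <ATen/ATen.h>
#include <iostream>

int main() {
  // NCHW sizes with a channels-last (NHWC) memory layout: C has the smallest
  // stride, so iterating in memory order would not match linear index order.
  auto t = at::empty({2, 3, 4, 5}, at::kFloat)
               .contiguous(at::MemoryFormat::ChannelsLast);
  std::cout << "sizes:   " << t.sizes() << "\n";    // [2, 3, 4, 5]
  std::cout << "strides: " << t.strides() << "\n";  // [60, 1, 15, 3]
  return 0;
}
```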

aten/src/ATen/TensorIterator.h

Lines changed: 16 additions & 0 deletions
@@ -426,6 +426,10 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
   /// been called? This is SOLELY used to check validity of perm_.
   bool has_coalesced_dimensions_ = false;

+  /// Whether iteration must be fixed. This disables dimension permuting and also
+  /// changes how for_each divides work among threads.
+  bool enforce_linear_iteration_ = false;
+
   /// The index offsets into the original tensors for each dimension.
   /// This is only non-zero when you narrow() a TensorIterator (e.g.,
   /// when you make sub-TensorIterators).

@@ -583,6 +587,17 @@ class TORCH_API TensorIteratorConfig final {
     return *this;
   }

+  // Sets the enforce_linear_iteration_ flag, which is false by default.
+  // If true, iteration goes in the same order as a C-contiguous tensor
+  // is laid out in memory, i.e. the last dimension iterates fastest.
+  //
+  // This iteration order can be less efficient and may even prevent vectorization.
+  // So only use if the correctness of your kernel depends on it.
+  TensorIteratorConfig& enforce_linear_iteration(const bool _enforce_linear_iteration = true) {
+    enforce_linear_iteration_ = _enforce_linear_iteration;
+    return *this;
+  }
+
   // Sets the promote_inputs_to_common_dtype_ flag, which is false by default
   // If true, the iterator's "common dtype" is always computed (see the
   // [Common Dtype Computation] note) and, on the CPU, temporary copies of

@@ -664,6 +679,7 @@ class TORCH_API TensorIteratorConfig final {
   bool check_all_same_dtype_ = true;
   bool check_all_same_device_ = true;
   bool enforce_safe_casting_to_output_ = false;
+  bool enforce_linear_iteration_ = false;
   bool promote_inputs_to_common_dtype_ = false;
   bool promote_integer_inputs_to_float_ = false;
   bool cast_common_dtype_to_outputs_ = false;
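The header above documents the new `enforce_linear_iteration()` knob on `TensorIteratorConfig`. A hedged sketch of how a kernel might request it when building an iterator (the helper name, tensors, and the particular combination of config calls are assumptions for illustration, not the exact setup this PR uses in the nonzero kernel):

```cpp
#include <ATen/ATen.h>
#include <ATen/TensorIterator.h>

// Builds an iterator that visits `self` in linear (row-major index) order,
// so positions derived from per-thread counts line up with element order.
// `out` is assumed to be preallocated with a suitable shape; names are illustrative.
at::TensorIterator make_linear_iter(const at::Tensor& out, const at::Tensor& self) {
  return at::TensorIteratorConfig()
      .check_all_same_dtype(false)  // e.g. a Long output over a Float input
      .add_output(out)
      .add_input(self)
      .enforce_linear_iteration()   // the flag added in this commit
      .build();
}
```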

aten/src/ATen/native/ReduceOps.cpp

Lines changed: 0 additions & 13 deletions
@@ -1746,19 +1746,6 @@ Tensor dist(const Tensor &self, const Tensor& other, const Scalar& p){
   return at::norm(self - other, p);
 }

-Tensor count_nonzero(const Tensor& self, IntArrayRef dims){
-  auto mask = (self != 0);
-  return mask.sum(dims);
-}
-
-Tensor count_nonzero(const Tensor& self, c10::optional<int64_t> dim){
-  if (dim){
-    auto wrap_dim = maybe_wrap_dim(dim.value(), self.dim());
-    return at::count_nonzero(self, IntArrayRef{wrap_dim});
-  }
-  return at::count_nonzero(self, IntArrayRef{});
-}
-
 bool cpu_equal(const Tensor& self, const Tensor& other) {
   if (!at::namedinference::are_names_equal(
       self.unsafeGetTensorImpl(), other.unsafeGetTensorImpl())) {
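These composite `count_nonzero` overloads are removed from ReduceOps.cpp; per the summary, this PR adds a custom per-thread count elsewhere (that file is not shown in this excerpt). For reference, a small hedged check that the public `at::count_nonzero` still agrees with the `(self != 0).sum()` formulation that used to live here:

```cpp
#include <ATen/ATen.h>
#include <iostream>

int main() {
  // The removed fallback computed count_nonzero(self, dims) as (self != 0).sum(dims).
  auto t = at::eye(3);                        // zeros everywhere except the diagonal
  std::cout << at::count_nonzero(t) << "\n";  // 3
  std::cout << (t != 0).sum() << "\n";        // 3, the old fallback spelled out
  return 0;
}
```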
