
Commit e26cb06

peterbell10 authored and pytorchmergebot committed
squeeze: allow squeezing multiple dimensions at once (pytorch#89017)
Ref pytorch#70924

This addresses part 1 of the issue, allowing `torch.squeeze` to be passed a tuple of dimensions, e.g.

```python
x.squeeze(0).squeeze(0)
```

can now be written

```python
x.squeeze((0, 1))
```

(assuming `x` has at least 2 dimensions)

Pull Request resolved: pytorch#89017
Approved by: https://github.com/albanD
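For illustration, a minimal usage sketch of the new overload (assuming a PyTorch build that includes this change; the tensor shapes below are made-up examples). Like the existing single-dimension overload, listed dimensions whose size is not 1 should simply be left unchanged:

```python
import torch

x = torch.zeros(1, 1, 3, 1)

# Old style: chain single-dimension squeezes.
y_old = x.squeeze(0).squeeze(0)   # shape (3, 1)

# New style: pass a tuple of dimensions in one call.
y_new = x.squeeze((0, 1))         # shape (3, 1)

assert y_old.shape == y_new.shape == (3, 1)

# Dimensions that do not have size 1 are left alone:
# dim 2 has size 3, so only dim 0 is removed here.
z = x.squeeze((0, 2))
assert z.shape == (1, 3, 1)
```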
1 parent 3120054 · commit e26cb06

20 files changed: +347 −132 lines changed

aten/src/ATen/FunctionalInverses.cpp

+21 −8
@@ -3,6 +3,7 @@
 
 #include <ATen/ATen.h>
 #include <ATen/ExpandUtils.h>
+#include <ATen/WrapDimUtilsMulti.h>
 
 #include <utility>
 namespace at {
@@ -42,18 +43,26 @@ Tensor unsqueeze_copy_to(const Tensor & self, c10::SymIntArrayRef sizes, bool re
   return result;
 }
 
-Tensor unsqueeze_copy_to(const Tensor & self, int64_t dim, c10::SymIntArrayRef sizes, bool reapply_views) {
-  dim = at::maybe_wrap_dim(dim, sizes.size());
+Tensor unsqueeze_copy_to(const Tensor & self, IntArrayRef dim, c10::SymIntArrayRef sizes, bool reapply_views) {
+  const auto ndim = sizes.size();
+  const auto mask = at::dim_list_to_bitset(dim, ndim);
   // in NumPy it's not an error to unsqueeze a scalar, but we still need to avoided
   // unsqueezing in the backward.
-  if (sizes.size() > 0 && sizes[dim] == 1) {
-    if (reapply_views) {
-      return at::unsqueeze(self, dim);
-    } else {
-      return at::unsqueeze_copy(self, dim);
+  if (ndim == 0) {
+    return self;
+  }
+
+  Tensor result = self;
+  for (const auto d : c10::irange(ndim)) {
+    if (mask.test(d) && sizes[d] == 1) {
+      if (reapply_views) {
+        result = at::unsqueeze(result, d);
+      } else {
+        result = at::unsqueeze_copy(result, d);
+      }
     }
   }
-  return self;
+  return result;
 }
 
 // Note [Functionalization Pass: View Inverses].
@@ -215,6 +224,10 @@ Tensor FunctionalInverses::squeeze_copy_dim_inverse(const Tensor& base, const Te
   return unsqueeze_copy_to(mutated_view, dim, base.sym_sizes(), reapply_views);
 }
 
+Tensor FunctionalInverses::squeeze_copy_dims_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, IntArrayRef dim) {
+  return unsqueeze_copy_to(mutated_view, dim, base.sym_sizes(), reapply_views);
+}
+
 Tensor FunctionalInverses::t_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) {
   if (reapply_views) {
     return at::t(mutated_view);

aten/src/ATen/LegacyBatchingRegistrations.cpp

+8
@@ -296,6 +296,13 @@ Tensor squeeze_dim_batching_rule(const Tensor& self, int64_t dim) {
   return self_physical.getPhysicalToLogicalMap().apply(result);
 }
 
+Tensor squeeze_dims_batching_rule(const Tensor& self, IntArrayRef dims) {
+  auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
+  auto dims_physical = self_physical.getPhysicalDims(dims);
+  auto result = self_physical.tensor().squeeze(dims_physical);
+  return self_physical.getPhysicalToLogicalMap().apply(result);
+}
+
 Tensor trace_batching_rule(const Tensor& self) {
   auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
   // Batched Diagonal View
@@ -1116,6 +1123,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
   m.impl("split_with_sizes", split_with_sizes_batching_rule);
   m.impl("squeeze", squeeze_batching_rule);
   m.impl("squeeze.dim", squeeze_dim_batching_rule);
+  m.impl("squeeze.dims", squeeze_dims_batching_rule);
   m.impl("t", native::t); // composite wrt autograd
   m.impl("trace", trace_batching_rule);
   m.impl("transpose.int", transpose_int_batching_rule);

aten/src/ATen/NamedTensorUtils.cpp

+14
@@ -241,6 +241,20 @@ std::vector<Dimname> compute_squeeze_outnames(const Tensor& tensor) {
   return outnames;
 }
 
+std::vector<Dimname> compute_squeeze_outnames(const Tensor& tensor, std::bitset<dim_bitset_size> dims) {
+  if (!tensor.has_names()) {
+    return {};
+  }
+  std::vector<Dimname> outnames;
+  auto tensor_names = tensor.names();
+  for (const auto d : c10::irange(tensor.dim())) {
+    if (!dims.test(d) || tensor.sym_sizes()[d] != 1) {
+      outnames.push_back(tensor_names[d]);
+    }
+  }
+  return outnames;
+}
+
 std::vector<Dimname> compute_diagonal_outnames(
     const Tensor& tensor,
     int64_t dim1,

aten/src/ATen/NamedTensorUtils.h

+4
@@ -1,6 +1,7 @@
 #pragma once
 #include <ATen/NamedTensor.h>
 #include <ATen/TensorNames.h>
+#include <ATen/WrapDimUtilsMulti.h>
 
 #include <ATen/core/DimVector.h>
 #include <ATen/core/Tensor.h>
@@ -144,6 +145,9 @@ TORCH_API std::vector<Dimname> compute_bmm_outnames(
     const Tensor& other);
 
 TORCH_API std::vector<Dimname> compute_squeeze_outnames(const Tensor& tensor);
+TORCH_API std::vector<Dimname> compute_squeeze_outnames(
+    const Tensor& tensor,
+    std::bitset<dim_bitset_size> dims);
 
 std::vector<Dimname> compute_diagonal_outnames(
     const Tensor& tensor,

aten/src/ATen/functorch/BatchRulesViews.cpp

+29 −21
@@ -239,34 +239,41 @@ std::tuple<Tensor, optional<int64_t>> squeeze_batch_rule(const Tensor& self, opt
   return std::make_tuple(result, c10::optional<int64_t>(new_batch_idx));
 }
 
-std::tuple<Tensor, optional<int64_t>> squeeze_dim_batch_rule(const Tensor& self, optional<int64_t> bdim, int64_t dim) {
+std::tuple<Tensor, optional<int64_t>> squeeze_dims_batch_rule(
+    const Tensor& self, optional<int64_t> bdim, IntArrayRef dims) {
   TORCH_INTERNAL_ASSERT(bdim.has_value());
   // Special case for scalar arrays to replicate PyTorch behavior.
-  if (self.dim() == 1) {
-    TORCH_CHECK(dim == 0, "Dimension is out of range (expected to be in range of [-1, 0], but got ", dim);
+  auto ndim = self.dim();
+  if (ndim == 1) {
+    TORCH_CHECK(
+        dims.size() == 0 || (dims.size() == 1 && dims[0] == 0),
+        "Dimension is out of range (expected to be in range of [-1, 0], but got ", dims);
     return std::make_tuple(self.alias(), bdim);
   }
 
-  // Calculate the proper offset if dim is negative.
-  auto actual_dim = dim;
-  if (dim < 0) {
-    actual_dim = self.dim() + dim - 1;
-  }
-  if (actual_dim < bdim) {
-    // Since dimension to be squeezed is before the batch dimension pass as-is.
-    auto original_size = self.dim();
-    auto result = self.squeeze(actual_dim);
-    auto updated_batch_idx = *bdim;
-    if (result.dim() != original_size) {
-      // A column before batch dimension has been dropped so adjust accordingly.
-      --updated_batch_idx;
+  // Adjust any dimensions higher than the batch dimension
+  DimVector adjusted_dims(dims.begin(), dims.end());
+  int64_t updated_batch_idx = *bdim;
+  for (auto &d : adjusted_dims) {
+    auto actual_dim = c10::maybe_wrap_dim(d, ndim - 1);
+    if (actual_dim < *bdim) {
+      d = actual_dim;
+      if (self.sym_size(actual_dim) == 1) {
+        // A column before batch dimension will be dropped so adjust accordingly.
+        --updated_batch_idx;
+      }
+    } else {
+      // Since dimension to be squeezed is after the batch dimension adjust by one to account
+      // for the original batch dimension. In this case batch dimension won't move.
+      d = actual_dim + 1;
     }
-    return std::make_tuple(result, optional<int64_t>(updated_batch_idx));
-  } else {
-    // Since dimension to be squeezed is after the batch dimension adjust by one to account
-    // for the original batch dimension. In this case batch dimension won't move.
-    return std::make_tuple(self.squeeze(actual_dim + 1), bdim);
   }
+  return std::make_tuple(self.squeeze(adjusted_dims), optional<int64_t>(updated_batch_idx));
+}
+
+std::tuple<Tensor, optional<int64_t>> squeeze_dim_batch_rule(
+    const Tensor& self, optional<int64_t> bdim, int64_t dim) {
+  return squeeze_dims_batch_rule(self, bdim, {dim});
 }
 
 std::tuple<std::vector<Tensor>, optional<int64_t>> chunk_batching_rule(const Tensor& self, optional<int64_t> self_bdim, int64_t chunks, int64_t dim) {
@@ -547,6 +554,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   VMAP_SUPPORT2(select, int, select_batching_rule);
   VMAP_SUPPORT(squeeze, squeeze_batch_rule);
   VMAP_SUPPORT2(squeeze, dim, squeeze_dim_batch_rule);
+  VMAP_SUPPORT2(squeeze, dims, squeeze_dims_batch_rule);
   VMAP_SUPPORT(_reshape_alias, _reshape_alias_batch_rule);
   VMAP_SUPPORT(roll, roll_batch_rule);
   VMAP_SUPPORT(permute, permute_batching_rule);

aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp

+29 −16
@@ -144,40 +144,52 @@ std::vector<Tensor> tensor_split_indices_batching_rule(const Tensor& self, IntAr
   return result;
 }
 
-Tensor& squeeze_dim__batching_rule(Tensor& self, int64_t dim) {
+Tensor& squeeze_dims__batching_rule(Tensor& self, IntArrayRef dims) {
   if (!participatesInCurrentLevel(self)) {
     c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
-    return self.squeeze_(dim);
+    return self.squeeze_(dims);
   }
   auto* batched = maybeGetBatchedImpl(self);
   const auto bdim = batched->bdim();
   auto logical_dim = self.dim();
 
-  // If logically a scalar tensor, then Tensor.squeeze_(dim) is a no-op
   if (logical_dim == 0) {
+    TORCH_CHECK(
+        dims.size() == 0 || (dims.size() == 1 && dims[0] == 0),
+        "Dimension is out of range (expected to be in range of [-1, 0], but got ", dims);
     return self;
   }
 
-  dim = maybe_wrap_dim(dim, logical_dim);
-  if (dim >= bdim) {
-    dim = dim + 1;
-    batched->value().squeeze_(dim);
-    batched->refreshTensorMetadata();
-    return self;
+  // Adjust any dimensions higher than the batch dimension
+  DimVector adjusted_dims(dims.begin(), dims.end());
+  int64_t updated_batch_idx = bdim;
+  for (auto &d : adjusted_dims) {
+    auto actual_dim = c10::maybe_wrap_dim(d, logical_dim);
+    if (actual_dim < bdim) {
+      d = actual_dim;
+      if (batched->value().sym_size(actual_dim) == 1) {
+        // A column before batch dimension will be dropped so adjust accordingly.
+        --updated_batch_idx;
+      }
+    } else {
+      // Since dimension to be squeezed is after the batch dimension adjust by one to account
+      // for the original batch dimension. In this case batch dimension won't move.
+      d = actual_dim + 1;
+    }
   }
 
-  // Tensor.squeeze_(0) is a no-op if dim 0 has a size other than 1
-  if (batched->value().size(dim) != 1) {
-    return self;
+  batched->value().squeeze_(adjusted_dims);
+  if (updated_batch_idx != bdim) {
+    batched->unsafe_set_bdim(updated_batch_idx);
   }
-
-  // dim < bdim, so we need to adjust bdim
-  batched->value().squeeze_(dim);
-  batched->unsafe_set_bdim(bdim - 1);
   batched->refreshTensorMetadata();
   return self;
 }
 
+Tensor& squeeze_dim__batching_rule(Tensor& self, int64_t dim) {
+  return squeeze_dims__batching_rule(self, {dim});
+}
+
 Tensor& squeeze__batching_rule(Tensor& self) {
   if (!participatesInCurrentLevel(self)) {
     c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
@@ -816,6 +828,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   // still legacy b/c needs special inplace rules
   m.impl("squeeze_", squeeze__batching_rule);
   m.impl("squeeze_.dim", squeeze_dim__batching_rule);
+  m.impl("squeeze_.dims", squeeze_dims__batching_rule);
   m.impl("unsqueeze_", unsqueeze__batching_rule);
   m.impl("transpose_", transpose__batching_rule);
 

aten/src/ATen/native/ReduceOps.cpp

+2 −13
@@ -90,6 +90,7 @@
 #include <ATen/ops/slice.h>
 #include <ATen/ops/special_logsumexp_native.h>
 #include <ATen/ops/sqrt.h>
+#include <ATen/ops/squeeze.h>
 #include <ATen/ops/stack.h>
 #include <ATen/ops/std.h>
 #include <ATen/ops/std_mean.h>
@@ -1381,23 +1382,11 @@ Tensor nanmean(
   return at::nansum(self, dim, keepdim, opt_dtype).div(factor);
 }
 
-static Tensor squeeze_multiple(const Tensor& self, IntArrayRef dims) {
-  int ndims = self.sizes().size();
-  auto dims_to_squeeze = at::dim_list_to_bitset(dims, ndims);
-  Tensor result = self;
-  for (int i = ndims - 1; i >= 0; --i) {
-    if (dims_to_squeeze[i]) {
-      result = result.squeeze(i);
-    }
-  }
-  return result;
-}
-
 static Tensor& logsumexp_out_impl(Tensor& result, const Tensor& self, IntArrayRef dims, bool keepdim) {
   // can't take max of empty tensor
   if (self.numel() != 0) {
     auto maxes = at::amax(self, dims, true);
-    auto maxes_squeezed = (keepdim ? maxes : squeeze_multiple(maxes, dims));
+    auto maxes_squeezed = (keepdim ? maxes : at::squeeze(maxes, dims));
     maxes_squeezed.masked_fill_(maxes_squeezed.abs() == INFINITY, 0);
     at::sum_out(result, (self - maxes).exp_(), dims, keepdim);
     result.log_().add_(maxes_squeezed);
