@@ -339,6 +339,7 @@ def target_device(self):
   @target_device.setter
   def target_device(self, device: str):
     self._target_device = device.lower()
+    self.default_device_or_sharding = jax.local_devices()[0]
 
   def manual_seed(self, key):
     self._prng_key = mutable_array(jax.random.key(key))
@@ -359,10 +360,10 @@ def get_as_jax_device(self, device: Any):
       return jax.devices("cpu")[0]
 
     if self.config.treat_cuda_as_jax_device and device.startswith("cuda"):
-      return jax.local_devices()[0]
+      return self.default_device_or_sharding
 
     if device.startswith("xla"):
-      return jax.local_devices()[0]
+      return self.default_device_or_sharding
 
     # TODO (wen): jax is NOT a device type,
     # once we can register more than one backend, revisit
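
A minimal sketch of the placement call this cached attribute feeds into, assuming only standard JAX behavior (jax.device_put accepts either a concrete Device or a Sharding, which is presumably why the attribute is named default_device_or_sharding); the names dev, x, a, b below are illustrative and not part of the patch:

    import jax
    import jax.numpy as jnp
    from jax.sharding import SingleDeviceSharding

    dev = jax.local_devices()[0]            # same device the setter caches
    x = jnp.ones((2, 2))
    a = jax.device_put(x, dev)                         # placement given a Device
    b = jax.device_put(x, SingleDeviceSharding(dev))   # placement given a Sharding
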
@@ -461,6 +462,7 @@ def _to_copy(self, the_tensor, new_dtype, new_device):
       return the_tensor
 
     jax_device = self.get_as_jax_device(new_device)
+
     if jax_device:
       arr = self.t2j_copy(the_tensor)
       arr = jax.device_put(arr, jax_device)
@@ -488,15 +490,16 @@ def _handle_tensor_constructor(self, func, args, kwargs):
       # let torch handle it
       with mode_utils.no_dispatch(), torch._C.DisableTorchFunction():
         return func(*args, **kwargs)
-    with jax.default_device(jax_device):
-      requires_grad = kwargs.get("requires_grad", False)
-      op = self._get_op_or_decomp(func)
-      res = op.func(*args, **kwargs)
-      if isinstance(res, jax.Array):
-        res = Tensor(res, self)
-      if requires_grad:
-        res.requires_grad = True
-      return res
+
+    requires_grad = kwargs.get("requires_grad", False)
+    op = self._get_op_or_decomp(func)
+    res = op.func(*args, **kwargs)
+    if isinstance(res, jax.Array):
+      res = jax.device_put(res, jax_device)
+      res = Tensor(res, self)
+    if requires_grad:
+      res.requires_grad = True
+    return res
 
   def _torch_Tensor_to(self, args, kwargs):
     the_tensor = args[0]
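
For context, a minimal sketch (not part of the patch) of how the removed jax.default_device context manager differs from the explicit jax.device_put used above, assuming a host that exposes a CPU device:

    import jax
    import jax.numpy as jnp

    cpu = jax.devices("cpu")[0]

    # Old style: arrays created inside the context default to the given device.
    with jax.default_device(cpu):
      a = jnp.zeros(3)
    # New style: create the array first, then move it explicitly.
    b = jax.device_put(jnp.zeros(3), cpu)

    assert a.devices() == b.devices() == {cpu}
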
@@ -593,6 +596,10 @@ def is_not_torchax_tensor(x):
 
     if self.config.debug_accuracy_for_each_op:
       debug_accuracy(func, old_args, old_kwargs, res)
+
+    for r in torch_pytree.tree_flatten(res)[0]:
+      if isinstance(r, Tensor) and r.dtype != super(torch.Tensor, r).dtype:
+        breakpoint()
     return res
 
   def enable_torch_modes(self):
@@ -642,6 +649,9 @@ def to_jax(x):
       if isinstance(
           x, torch.distributed._functional_collectives.AsyncCollectiveTensor):
         x = x.wait()
+      if self.config.allow_mixed_tensor_for_scalar_tensor and not isinstance(x, Tensor):
+        if x.squeeze().ndim == 0:
+          return x.item()
       assert isinstance(x, Tensor) or isinstance(x, View), (
           f"Expect a Tensor or a View but got {type(x)}; usually this means there is a mixed math between XLATensor and torch.Tensor"
       )
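
A small illustration (not part of the patch) of the scalar fallback added above: a plain torch.Tensor holding exactly one element is handed on as a Python scalar when the new allow_mixed_tensor_for_scalar_tensor flag is enabled:

    import torch

    x = torch.tensor([[3.0]])   # regular torch.Tensor, not a torchax Tensor
    if x.squeeze().ndim == 0:   # true for any tensor with exactly one element
      print(x.item())           # 3.0 -- passed on as a Python scalar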