
Commit b2ca2c8

janeyx99 authored and pytorchmergebot committed
[optim][adagrad] group tensors in foreach to maximize perf (pytorch#92362)
another one

Pull Request resolved: pytorch#92362
Approved by: https://github.com/albanD
1 parent 44132cc commit b2ca2c8
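
The point of the change is that torch._foreach_* kernels only fuse work across tensors that share a device (and, in practice, a dtype), so _multi_tensor_adagrad now buckets params, grads, state_sums, and state_steps by (device, dtype) and runs the fused update once per bucket instead of assuming one homogeneous list. The snippet below is only a sketch of that bucketing idea; group_by_device_and_dtype is a hypothetical stand-in written for illustration, not the private torch.utils._foreach_utils helper that the commit actually imports.

from collections import defaultdict
from typing import Dict, List, Tuple

import torch


def group_by_device_and_dtype(
    tensorlists: List[List[torch.Tensor]],
) -> Dict[Tuple[torch.device, torch.dtype], List[List[torch.Tensor]]]:
    # Bucket parallel tensor lists by the (device, dtype) of the leading list,
    # keeping the lists aligned index-for-index inside each bucket.
    grouped: Dict[Tuple[torch.device, torch.dtype], List[List[torch.Tensor]]] = defaultdict(
        lambda: [[] for _ in tensorlists]
    )
    for i, leader in enumerate(tensorlists[0]):
        key = (leader.device, leader.dtype)
        for j, tlist in enumerate(tensorlists):
            grouped[key][j].append(tlist[i])
    return dict(grouped)


# Two dtypes -> two buckets; each bucket gets one fused multi-tensor call,
# mirroring how the grouped loop in the diff issues one update per bucket.
params = [torch.zeros(3), torch.zeros(3, dtype=torch.float64), torch.zeros(2)]
grads = [torch.ones_like(p) for p in params]
for (device, dtype), (p_bucket, g_bucket) in group_by_device_and_dtype([params, grads]).items():
    torch._foreach_add_(p_bucket, g_bucket)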

File tree

1 file changed: +46 -43 lines changed


torch/optim/adagrad.py

Lines changed: 46 additions & 43 deletions
@@ -3,6 +3,7 @@
 
 from .optimizer import (Optimizer, _use_grad_for_differentiable, _get_value,
                         _differentiable_doc, _maximize_doc)
+from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
 from typing import List, Optional
 
 __all__ = ["Adagrad", "adagrad"]
@@ -322,48 +323,50 @@ def _multi_tensor_adagrad(
     if len(params) == 0:
         return
 
-    if maximize:
-        grads = torch._foreach_neg(grads)
+    grouped_tensorlists = _group_tensors_by_device_and_dtype([params, grads, state_sums, state_steps])
+    for device_params, device_grads, device_state_sums, device_state_steps in grouped_tensorlists.values():
+
+        if maximize:
+            device_grads = torch._foreach_neg(device_grads)
+
+        device_has_sparse_grad = any(grad.is_sparse for grad in device_grads)
+
+        if device_has_sparse_grad:
+            return _single_tensor_adagrad(
+                device_params,
+                device_grads,
+                device_state_sums,
+                device_state_steps,
+                lr=lr,
+                weight_decay=weight_decay,
+                lr_decay=lr_decay,
+                eps=eps,
+                has_sparse_grad=True,
+                maximize=False,
+                differentiable=differentiable,
+            )
 
-    if has_sparse_grad is None:
-        has_sparse_grad = any(grad.is_sparse for grad in grads)
+        # Update steps
+        torch._foreach_add_(device_state_steps, 1)
 
-    if has_sparse_grad:
-        return _single_tensor_adagrad(
-            params,
-            grads,
-            state_sums,
-            state_steps,
-            lr=lr,
-            weight_decay=weight_decay,
-            lr_decay=lr_decay,
-            eps=eps,
-            has_sparse_grad=has_sparse_grad,
-            maximize=False,
-            differentiable=differentiable,
-        )
-
-    # Update steps
-    torch._foreach_add_(state_steps, 1)
-
-    if weight_decay != 0:
-        torch._foreach_add_(grads, params, alpha=weight_decay)
-
-    minus_clr = [-lr / (1 + (step - 1) * lr_decay) for step in state_steps]
-
-    grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in grads]
-    state_sums = [
-        torch.view_as_real(x) if torch.is_complex(x) else x for x in state_sums
-    ]
-    torch._foreach_addcmul_(state_sums, grads, grads, value=1)
-    std = torch._foreach_add(torch._foreach_sqrt(state_sums), eps)
-    toAdd = torch._foreach_div(torch._foreach_mul(grads, minus_clr), std)
-    toAdd = [
-        torch.view_as_complex(x) if torch.is_complex(params[i]) else x
-        for i, x in enumerate(toAdd)
-    ]
-    torch._foreach_add_(params, toAdd)
-    state_sums = [
-        torch.view_as_complex(x) if torch.is_complex(params[i]) else x
-        for i, x in enumerate(state_sums)
-    ]
+        if weight_decay != 0:
+            torch._foreach_add_(device_grads, device_params, alpha=weight_decay)
+
+        minus_clr = [-lr / (1 + (step - 1) * lr_decay) for step in device_state_steps]
+
+        device_grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in device_grads]
+        device_state_sums = [
+            torch.view_as_real(x) if torch.is_complex(x) else x for x in device_state_sums
+        ]
+        torch._foreach_addcmul_(device_state_sums, device_grads, device_grads, value=1)
+        std = torch._foreach_add(torch._foreach_sqrt(device_state_sums), eps)
+        toAdd = torch._foreach_div(torch._foreach_mul(device_grads, minus_clr), std)
+        toAdd = [
+            torch.view_as_complex(x) if torch.is_complex(device_params[i]) else x
+            for i, x in enumerate(toAdd)
+        ]
+        torch._foreach_add_(device_params, toAdd)
+        device_state_sums = [
+            torch.view_as_complex(x) if torch.is_complex(device_params[i]) else x
+            for i, x in enumerate(device_state_sums)
+        ]
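
A quick way to exercise the grouped path after this change is to give the optimizer parameters whose (device, dtype) buckets differ. The usage sketch below is illustrative only (the shapes, learning rate, and explicit foreach flag are arbitrary choices, and it assumes a build that contains this commit):

import torch

# float32 and float64 parameters land in different (device, dtype) buckets,
# so the foreach path issues one fused update per bucket. Adding a CUDA
# parameter, if a GPU is available, would simply create another bucket.
params = [
    torch.nn.Parameter(torch.randn(4, 4)),
    torch.nn.Parameter(torch.randn(4, 4, dtype=torch.float64)),
]
opt = torch.optim.Adagrad(params, lr=0.1, foreach=True)

loss = (params[0] ** 2).sum() + (params[1] ** 2).sum()
loss.backward()
opt.step()  # grouped multi-tensor update, one bucket per (device, dtype)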
