TWA Fused Tasks (#3317)

jeffkbkim · facebook-github-bot · commit 527e8b8919b1 · 2025-08-27T22:44:45.000-07:00
Summary: Pull Request resolved: #3317. TensorWeightedAvgMetric (TWA) currently does not support FUSED_TASKS computation. With this patch, TWA supports FUSED_TASKS mode. Updated the existing unit tests and created new ones for FUSED mode. Reviewed By: iamzainhuda Differential Revision: D77958663 fbshipit-source-id: bea413046706c6f1a09f0bbfe1eda7281a481311
diff --git a/torchrec/metrics/rec_metric.py b/torchrec/metrics/rec_metric.py
@@ -623,6 +623,27 @@ def _update(
                     labels, torch.Tensor
                 )
 
+                # Metrics such as TensorWeightedAvgMetric will have tensors that we also need to stack.
+                # Stack in task order: (n_tasks, batch_size)
+                if "required_inputs" in kwargs:
+                    target_tensors: list[torch.Tensor] = []
+                    for task in self._tasks:
+                        if (
+                            task.tensor_name
+                            and task.tensor_name in kwargs["required_inputs"]
+                        ):
+                            target_tensors.append(
+                                kwargs["required_inputs"][task.tensor_name]
+                            )
+
+                    if target_tensors:
+                        stacked_tensor = torch.stack(target_tensors)
+
+                        # Reshape the stacked_tensor to size([len(self._tasks), self._batch_size])
+                        stacked_tensor = stacked_tensor.view(len(self._tasks), -1)
+                        assert isinstance(stacked_tensor, torch.Tensor)
+                        kwargs["required_inputs"]["target_tensor"] = stacked_tensor
+
                 predictions = (
                     # Reshape the predictions to size([len(self._tasks), self._batch_size])
                     predictions.view(len(self._tasks), -1)
diff --git a/torchrec/metrics/tensor_weighted_avg.py b/torchrec/metrics/tensor_weighted_avg.py
@@ -30,23 +30,29 @@ class TensorWeightedAvgMetricComputation(RecMetricComputation):
 
     It is a sibling to WeightedAvgMetricComputation, but it computes the weighted average of a tensor
     passed in as a required input instead of the predictions tensor.
+
+    FUSED_TASKS_COMPUTATION:
+        This class requires all target tensors from tasks to be stacked together in RecMetric._update().
+        During TensorWeightedAvgMetricComputation.update(), the weighted sum and weighted num samples are
+        computed per stacked tensor.
     """
 
     def __init__(
         self,
         *args: Any,
-        tensor_name: Optional[str] = None,
-        weighted: bool = True,
+        tasks: List[RecTaskInfo],
         description: Optional[str] = None,
         **kwargs: Any,
     ) -> None:
         super().__init__(*args, **kwargs)
-        if tensor_name is None:
-            raise RecMetricException(
-                f"TensorWeightedAvgMetricComputation expects tensor_name to not be None got {tensor_name}"
-            )
-        self.tensor_name: str = tensor_name
-        self.weighted: bool = weighted
+        self.tasks: List[RecTaskInfo] = tasks
+
+        for task in self.tasks:
+            if task.tensor_name is None:
+                raise RecMetricException(
+                    "TensorWeightedAvgMetricComputation expects all tasks to have tensor_name, but got None."
+                )
+
         self._add_state(
             "weighted_sum",
             torch.zeros(self._n_tasks, dtype=torch.double),
@@ -63,6 +69,13 @@ def __init__(
         )
         self._description = description
 
+        self.weighted_mask: torch.Tensor = torch.tensor(
+            [task.weighted for task in self.tasks]
+        ).unsqueeze(dim=-1)
+
+        if torch.cuda.is_available():
+            self.weighted_mask = self.weighted_mask.cuda()
+
     def update(
         self,
         *,
@@ -71,25 +84,54 @@ def update(
         weights: Optional[torch.Tensor],
         **kwargs: Dict[str, Any],
     ) -> None:
-        if (
-            "required_inputs" not in kwargs
-            or self.tensor_name not in kwargs["required_inputs"]
-        ):
+
+        target_tensor: torch.Tensor
+
+        if "required_inputs" not in kwargs:
             raise RecMetricException(
-                f"TensorWeightedAvgMetricComputation expects {self.tensor_name} in the required_inputs"
+                "TensorWeightedAvgMetricComputation expects 'required_inputs' to exist."
             )
+        else:
+            if len(self.tasks) > 1:
+                # In FUSED mode, RecMetric._update() stores the stacked tensor under "target_tensor".
+                # Note that it only does so when at least one task's tensor_name exists in kwargs["required_inputs"].
+                target_tensor = cast(
+                    torch.Tensor,
+                    kwargs["required_inputs"]["target_tensor"],
+                )
+            elif len(self.tasks) == 1:
+                # UNFUSED_TASKS_COMPUTATION
+                tensor_name = self.tasks[0].tensor_name
+                if tensor_name not in kwargs["required_inputs"]:
+                    raise RecMetricException(
+                        f"TensorWeightedAvgMetricComputation expects required_inputs to contain target tensor {self.tasks[0].tensor_name}"
+                    )
+                else:
+                    target_tensor = cast(
+                        torch.Tensor,
+                        kwargs["required_inputs"][tensor_name],
+                    )
+
         num_samples = labels.shape[0]
-        target_tensor = cast(torch.Tensor, kwargs["required_inputs"][self.tensor_name])
         weights = cast(torch.Tensor, weights)
+
+        # Vectorized computation using masks
+        weighted_values = torch.where(
+            self.weighted_mask, target_tensor * weights, target_tensor
+        )
+
+        weighted_counts = torch.where(
+            self.weighted_mask, weights, torch.ones_like(weights)
+        )
+
+        # Sum across batch dimension to Shape(n_tasks,)
+        weighted_sum = weighted_values.sum(dim=-1)
+        weighted_num_samples = weighted_counts.sum(dim=-1)
+
+        # Update states
         states = {
-            "weighted_sum": (
-                target_tensor * weights if self.weighted else target_tensor
-            ).sum(dim=-1),
-            "weighted_num_samples": (
-                weights.sum(dim=-1)
-                if self.weighted
-                else torch.ones(weights.shape).sum(dim=-1).to(device=weights.device)
-            ),
+            "weighted_sum": weighted_sum,
+            "weighted_num_samples": weighted_num_samples,
         }
         for state_name, state_value in states.items():
             state = getattr(self, state_name)
@@ -126,23 +168,40 @@ class TensorWeightedAvgMetric(RecMetric):
     def _get_task_kwargs(
         self, task_config: Union[RecTaskInfo, List[RecTaskInfo]]
     ) -> Dict[str, Any]:
-        if not isinstance(task_config, RecTaskInfo):
-            raise RecMetricException(
-                f"TensorWeightedAvgMetric expects task_config to be RecTaskInfo not {type(task_config)}. Check the FUSED_TASKS_COMPUTATION settings."
-            )
+        all_tasks = (
+            [task_config] if isinstance(task_config, RecTaskInfo) else task_config
+        )
         return {
-            "tensor_name": task_config.tensor_name,
-            "weighted": task_config.weighted,
+            "tasks": all_tasks,
         }
 
     def _get_task_required_inputs(
         self, task_config: Union[RecTaskInfo, List[RecTaskInfo]]
     ) -> Set[str]:
-        if not isinstance(task_config, RecTaskInfo):
-            raise RecMetricException(
-                f"TensorWeightedAvgMetric expects task_config to be RecTaskInfo not {type(task_config)}. Check the FUSED_TASKS_COMPUTATION settings."
-            )
-        required_inputs = set()
-        if task_config.tensor_name is not None:
-            required_inputs.add(task_config.tensor_name)
-        return required_inputs
+        """
+        Returns the required inputs for the task.
+
+        FUSED_TASKS_COMPUTATION:
+            - Given two tasks with the same tensor_name, assume the same tensor reference
+            - For a given tensor_name, assume all tasks have the same weighted flag
+        """
+        all_tasks = (
+            [task_config] if isinstance(task_config, RecTaskInfo) else task_config
+        )
+
+        required_inputs: dict[str, bool] = {}
+        for task in all_tasks:
+            if task.tensor_name is not None:
+                if (
+                    task.tensor_name in required_inputs
+                    and task.weighted is not required_inputs[task.tensor_name]
+                ):
+                    existing_weighted_flag = required_inputs[task.tensor_name]
+                    raise RecMetricException(
+                        f"This target tensor was already registered as weighted={existing_weighted_flag}. "
+                        f"This target tensor cannot be re-registered with weighted={task.weighted}"
+                    )
+                else:
+                    required_inputs[str(task.tensor_name)] = task.weighted
+
+        return set(required_inputs.keys())
diff --git a/torchrec/metrics/test_utils/__init__.py b/torchrec/metrics/test_utils/__init__.py
@@ -12,7 +12,7 @@
 import random
 import tempfile
 import uuid
-from typing import Any, Callable, Dict, List, Optional, Tuple, Type
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 from unittest.mock import Mock, patch
 
 import torch
@@ -45,7 +45,9 @@ def gen_test_batch(
     mask: Optional[torch.Tensor] = None,
     n_classes: Optional[int] = None,
     seed: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> Dict[str, torch.Tensor]:
+    device = torch.device(device or "cpu")
     if seed is not None:
         torch.manual_seed(seed)
     if label_value is not None:
@@ -65,14 +67,14 @@ def gen_test_batch(
     else:
         weight = torch.rand(batch_size, dtype=torch.double)
     test_batch = {
-        label_name: label,
-        prediction_name: prediction,
-        weight_name: weight,
-        tensor_name: torch.rand(batch_size, dtype=torch.double),
+        label_name: label.to(device),
+        prediction_name: prediction.to(device),
+        weight_name: weight.to(device),
+        tensor_name: torch.rand(batch_size, dtype=torch.double).to(device),
     }
     if mask_tensor_name is not None:
         if mask is None:
-            mask = torch.ones(batch_size, dtype=torch.double)
+            mask = torch.ones(batch_size, dtype=torch.double).to(device)
         test_batch[mask_tensor_name] = mask
 
     return test_batch
@@ -240,8 +242,10 @@ def rec_metric_value_test_helper(
     n_classes: Optional[int] = None,
     zero_weights: bool = False,
     zero_labels: bool = False,
+    device: Optional[Union[str, torch.device]] = None,
     **kwargs: Any,
 ) -> Tuple[Dict[str, torch.Tensor], Tuple[Dict[str, torch.Tensor], ...]]:
+    device = torch.device(device or "cpu")
     tasks = gen_test_tasks(task_names)
     model_outs = []
     for _ in range(nsteps):
@@ -263,6 +267,7 @@ def rec_metric_value_test_helper(
                 n_classes=n_classes,
                 weight_value=weight_value,
                 label_value=label_value,
+                device=device,
             )
             for task in tasks
         ]
@@ -293,7 +298,8 @@ def get_target_rec_metric_value(
             compute_on_all_ranks=compute_on_all_ranks,
             should_validate_update=should_validate_update,
             **kwargs,
-        )
+        ).to(device)
+
         for i in range(nsteps):
             # Get required_inputs_list from the target metric
             required_inputs_list = list(target_metric_obj.get_required_inputs())
@@ -381,6 +387,7 @@ def rec_metric_gpu_sync_test_launcher(
     entry_point: Callable[..., None],
     batch_size: int = BATCH_SIZE,
     batch_window_size: int = BATCH_WINDOW_SIZE,
+    device: Optional[Union[str, torch.device]] = None,
     **kwargs: Dict[str, Any],
 ) -> None:
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -402,6 +409,8 @@ def rec_metric_gpu_sync_test_launcher(
             batch_size,
             batch_window_size,
             kwargs.get("n_classes", None),
+            False,
+            torch.device(device or "cpu"),
         )
 
 
@@ -419,8 +428,10 @@ def sync_test_helper(
     batch_window_size: int = BATCH_WINDOW_SIZE,
     n_classes: Optional[int] = None,
     zero_weights: bool = False,
+    device: Optional[Union[str, torch.device]] = None,
     **kwargs: Dict[str, Any],
 ) -> None:
+    device = torch.device(device or "cpu")
     rank = int(os.environ["RANK"])
     world_size = int(os.environ["WORLD_SIZE"])
     dist.init_process_group(
@@ -444,7 +455,7 @@ def sync_test_helper(
         window_size=batch_window_size * world_size,
         # pyre-ignore[6]: Incompatible parameter type
         **kwargs,
-    )
+    ).to(device)
 
     weight_value: Optional[torch.Tensor] = None
 
@@ -458,6 +469,7 @@ def sync_test_helper(
             n_classes=n_classes,
             weight_value=weight_value,
             seed=42,  # we set seed because of how test metric places tensors on ranks
+            device=device,
         )
         for task in tasks
     ]
@@ -575,6 +587,7 @@ def rec_metric_value_test_launcher(
     n_classes: Optional[int] = None,
     zero_weights: bool = False,
     zero_labels: bool = False,
+    device: Optional[Union[str, torch.device]] = None,
     **kwargs: Any,
 ) -> None:
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -600,6 +613,7 @@ def rec_metric_value_test_launcher(
             n_classes=n_classes,
             zero_weights=zero_weights,
             zero_labels=zero_labels,
+            device=device,
             **kwargs,
         )
 
@@ -616,6 +630,7 @@ def rec_metric_value_test_launcher(
             n_classes,
             test_nsteps,
             zero_weights,
+            device,
         )
 
 
@@ -642,6 +657,7 @@ def metric_test_helper(
     n_classes: Optional[int] = None,
     nsteps: int = 1,
     zero_weights: bool = False,
+    device: Optional[Union[str, torch.device]] = None,
     is_time_dependent: bool = False,
     time_dependent_metric: Optional[Dict[Type[RecMetric], str]] = None,
     **kwargs: Any,
@@ -670,6 +686,7 @@ def metric_test_helper(
         is_time_dependent=is_time_dependent,
         time_dependent_metric=time_dependent_metric,
         zero_weights=zero_weights,
+        device=device,
         **kwargs,
     )
 
diff --git a/torchrec/metrics/tests/test_tensor_weighted_avg.py b/torchrec/metrics/tests/test_tensor_weighted_avg.py