
Commit df086d4

Browse files
prajjwal1 authored and facebook-github-bot committed
fix RecMetrics loading (make trained_batches a buffer) (#3534)
Summary: This diff addresses the following task: T209753398.

Currently `trained_batches` is not stored in the state_dict, requiring us to manually sync this variable upon checkpoint loading. We make this variable a buffer so that it is now captured in the model's state_dict.

Differential Revision: D86697665
1 parent 115aaa8 commit df086d4
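
The core of the change: a plain Python `int` attribute on an `nn.Module` is invisible to `state_dict()`, whereas a registered buffer is saved and restored with the checkpoint. A minimal standalone sketch of that behavior (the `TinyMetricModule` class below is a toy stand-in for illustration, not the actual TorchRec module):

    import torch
    from torch import nn


    class TinyMetricModule(nn.Module):
        """Toy stand-in for RecMetricModule; illustrates buffer vs. plain attribute."""

        def __init__(self) -> None:
            super().__init__()
            # Plain Python attribute: not included in state_dict(), lost on checkpoint load.
            self.plain_counter: int = 0
            # Registered buffer: saved and restored with the module's state_dict.
            self.register_buffer(
                "_trained_batches", torch.tensor([0], dtype=torch.int64), persistent=True
            )

        def update(self) -> None:
            self.plain_counter += 1
            self._trained_batches.add_(1)


    module = TinyMetricModule()
    for _ in range(3):
        module.update()

    print(sorted(module.state_dict().keys()))  # ['_trained_batches'] -- no plain_counter
    # Loading the state_dict into a fresh module restores the buffer value only.
    fresh = TinyMetricModule()
    fresh.load_state_dict(module.state_dict())
    print(fresh._trained_batches.item())  # 3
    print(fresh.plain_counter)            # 0 -- the plain int did not round-trip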

File tree

4 files changed: +24, −3 lines


torchrec/metrics/cpu_offloaded_metric_module.py

Lines changed: 3 additions & 0 deletions

@@ -474,6 +474,9 @@ def sync(self) -> None:
         )
         self.comms_module.load_pre_compute_states(aggregated_states)
 
+        # Sync _trained_batches to comms module
+        self.comms_module._trained_batches.copy_(self._trained_batches)
+
         logger.info("CPUOffloadedRecMetricModule synced.")
 
     @override
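
A note on the `copy_` call above: an in-place copy writes the value into the comms module's existing buffer rather than aliasing the source tensor, so later in-place updates to one module do not silently mutate the other. A hedged sketch of that behavior (bare `nn.Module` instances are used purely for illustration):

    import torch
    from torch import nn

    src = nn.Module()
    src.register_buffer("_trained_batches", torch.tensor([42], dtype=torch.int64))

    dst = nn.Module()
    dst.register_buffer("_trained_batches", torch.tensor([0], dtype=torch.int64))

    # In-place copy: the destination buffer keeps its own storage, only its value
    # changes, and dst.state_dict() immediately reflects the synced count.
    dst._trained_batches.copy_(src._trained_batches)
    assert dst.state_dict()["_trained_batches"].item() == 42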

torchrec/metrics/metric_module.py

Lines changed: 15 additions & 2 deletions

@@ -202,7 +202,11 @@ def __init__(
         self.rec_metrics = rec_metrics if rec_metrics else RecMetricList([])
         self.throughput_metric = throughput_metric
         self.state_metrics = state_metrics if state_metrics else {}
-        self.trained_batches: int = 0
+
+        self.register_buffer(
+            "_trained_batches", torch.tensor([0], dtype=torch.int64), persistent=True
+        )
+
         self.batch_size = batch_size
         self.world_size = world_size
         self.oom_count = 0
@@ -228,6 +232,15 @@ def __init__(
         )
         self.last_compute_time = -1.0
 
+    @property
+    def trained_batches(self) -> int:
+        # .trained_batches should return an int
+        return int(self._trained_batches.item())
+
+    @trained_batches.setter
+    def trained_batches(self, value: int) -> None:
+        self._trained_batches.fill_(int(value))
+
     def _update_rec_metrics(
         self, model_out: Dict[str, torch.Tensor], **kwargs: Any
     ) -> None:
@@ -260,7 +273,7 @@ def update(self, model_out: Dict[str, torch.Tensor], **kwargs: Any) -> None:
         self._update_rec_metrics(model_out, **kwargs)
         if self.throughput_metric:
             self.throughput_metric.update()
-        self.trained_batches += 1
+        self._trained_batches.add_(1)
 
     def _adjust_compute_interval(self) -> None:
         """

torchrec/metrics/tests/test_cpu_offloaded_metric_module.py

Lines changed: 1 addition & 0 deletions

@@ -341,6 +341,7 @@ def test_state_dict_save_load(self) -> None:
                 "rec_metrics.rec_metrics.0._metrics_computations.0.state_3": torch.tensor(
                     [6.0]
                 ),
+                "_trained_batches": torch.tensor([0], dtype=torch.int64),
             },
         )
torchrec/metrics/tests/test_metric_module.py

Lines changed: 5 additions & 1 deletion

@@ -248,7 +248,11 @@ def _run_trainer_checkpointing(rank: int, world_size: int, backend: str) -> None
     state_dict = metric_module.state_dict()
     keys = list(state_dict.keys())
     for k in state_dict.keys():
-        state_dict[k] = torch.tensor(value, dtype=torch.long).detach()
+        # _trained_batches is now a 1-D tensor, not a scalar
+        if k == "_trained_batches":
+            state_dict[k] = torch.tensor([value], dtype=torch.long).detach()
+        else:
+            state_dict[k] = torch.tensor(value, dtype=torch.long).detach()
     logging.info(f"Metrics state keys = {keys}")
     metric_module.load_state_dict(state_dict)
     tc = unittest.TestCase()
