Commit 705204f

allow disabling ft checkpoints
Summary: Allows disabling the storage of torchft-related checkpoints. Users then don't have to rely on any external storage, which reduces the setup time needed to get training up and running, and model checkpoints aren't strictly needed when torchft is in use. If checkpoint storage has issues, this flag also works as a killswitch that completely disables storage so it doesn't impact training.
1 parent 54d2a8b commit 705204f

File tree: 2 files changed, +33 −1 lines

torchtitan/components/checkpoint.py

Lines changed: 11 additions & 1 deletion

```diff
@@ -193,8 +193,18 @@ def __init__(
         self.load_only = checkpoint_config.load_only

         self.ft_manager = (
-            ft_manager.manager if ft_manager and ft_manager.enabled else None
+            ft_manager.manager
+            if ft_manager
+            and ft_manager.enabled
+            and checkpoint_config.enable_ft_dataloader_checkpoints
+            else None
         )
+
+        if ft_manager and ft_manager.enabled and not self.ft_manager:
+            logger.warn(
+                "Fault tolerance is enabled but enable_ft_dataloader_checkpoints is False. This means replicas can retrain over the same data multiple times, which can result in overfitting."
+            )
+
         if self.ft_manager:
             optimizers.init_cache_state_dict()
```
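The gating logic above can be sketched in isolation. This is a minimal, self-contained model of the condition added by the commit; `FTManager` and `resolve_ft_manager` here are hypothetical stand-ins, not torchtitan's real classes:

```python
from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class FTManager:
    """Hypothetical stand-in for torchtitan's fault-tolerance wrapper."""
    manager: Any = None
    enabled: bool = False


def resolve_ft_manager(
    ft_manager: Optional[FTManager],
    enable_ft_dataloader_checkpoints: bool,
) -> Any:
    """Return the torchft manager only when FT is enabled AND
    dataloader checkpointing is not disabled (the killswitch)."""
    if (
        ft_manager
        and ft_manager.enabled
        and enable_ft_dataloader_checkpoints
    ):
        return ft_manager.manager
    return None


ft = FTManager(manager="mgr", enabled=True)
# Killswitch engaged: FT stays on, but no checkpoint storage is used.
assert resolve_ft_manager(ft, enable_ft_dataloader_checkpoints=False) is None
# Normal operation: the manager is kept for checkpointing.
assert resolve_ft_manager(ft, enable_ft_dataloader_checkpoints=True) == "mgr"
```

Note that with the flag off, `self.ft_manager` is `None`, so later code paths such as `optimizers.init_cache_state_dict()` are skipped entirely.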

torchtitan/config/job_config.py

Lines changed: 22 additions & 0 deletions

```diff
@@ -422,6 +422,28 @@ class Checkpoint:
     enable: bool = False
     """Whether to enable checkpoint"""

+    enable_ft_dataloader_checkpoints: bool = True
+    """
+    Warning: disabling this can cause fault tolerant replicas to train
+    over the same data multiple times. Use it with caution, and only if
+    training over the same data is acceptable.
+
+    Used to enable checkpointing of the dataloader index for fault
+    tolerant training with torchft.
+
+    Fault tolerant training stores the data loader index in the
+    checkpoints, so that training can resume without going over the
+    same batch twice.
+
+    If enabled, data loader state is checkpointed. Otherwise, replicas
+    will train over the same data multiple times, which can result in
+    overfitting.
+
+    The failed replica will still recover other state, e.g. model
+    parameters, from other replicas.
+
+    Note: if regular checkpointing is enabled, the data loader state is
+    also checkpointed. But when not using fault tolerance, the entire
+    training starts from scratch.
+    """
+
     folder: str = "checkpoint"
     """
     The folder to store the checkpoints.
```
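Since the `Checkpoint` dataclass backs the job config, the new flag should be settable from a TOML job file. A sketch, assuming torchtitan's usual `[checkpoint]` config section (the section name here is an assumption inferred from the dataclass, not taken from this commit):

```toml
[checkpoint]
enable = true
# Hypothetical example: engage the killswitch so torchft does not
# store dataloader checkpoints and no external storage is required.
# Replicas may then retrain over the same data (risk of overfitting).
enable_ft_dataloader_checkpoints = false
```

Because the field defaults to `true`, existing configs that omit it keep the previous checkpointing behavior unchanged.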

0 commit comments