
Mapping new ladder to old ladder #146

Open

wants to merge 29 commits into main from akshitab/ladder_xC

Showing changes from 2 of 29 commits.

Commits
95c0c55
make duration multiplier configurable
AkshitaB Jan 24, 2025
28d5e21
update changelog
AkshitaB Jan 24, 2025
49db66d
add to __all__
AkshitaB Jan 24, 2025
ee2be4c
fix command
AkshitaB Jan 24, 2025
fcf102a
change data parallel type
AkshitaB Jan 24, 2025
70dc6da
hsdp
AkshitaB Jan 28, 2025
56fc563
add duration to name
AkshitaB Jan 28, 2025
285a5b9
fix bug in overriding
AkshitaB Jan 28, 2025
232f217
use actual num params
AkshitaB Jan 28, 2025
55c9abf
Merge branch 'main' into akshitab/ladder_xC
AkshitaB Jan 28, 2025
5ca1e7f
fix
AkshitaB Jan 28, 2025
e15448c
remove extra files
AkshitaB Jan 28, 2025
65fab16
add zloss
AkshitaB Jan 29, 2025
5ae6342
fix mock batch
AkshitaB Jan 29, 2025
3fa28a8
loss settings: fused=True, compile=False
AkshitaB Jan 29, 2025
de38c25
Merge branch 'main' into akshitab/ladder_xC
AkshitaB Jan 29, 2025
829f6fc
not fused
AkshitaB Jan 29, 2025
faf0de5
reduce microbatch size
AkshitaB Jan 29, 2025
896fa54
reduce mbz further
AkshitaB Jan 29, 2025
a10c5e2
reset mbz
AkshitaB Jan 29, 2025
4785aaf
fix model params
AkshitaB Feb 5, 2025
8d9f535
Port over instance filtering from OLMo codebase
epwalsh Feb 6, 2025
2a34982
changelog
epwalsh Feb 6, 2025
6650a52
record percentage masked
epwalsh Feb 6, 2025
77b192b
include count from rank 0 for comparison
epwalsh Feb 6, 2025
269a95f
add to configs
epwalsh Feb 6, 2025
bfa53da
Merge branch 'epwalsh/instance-filter' into akshitab/ladder_xC
AkshitaB Feb 6, 2025
b55f599
add instance filtering
AkshitaB Feb 6, 2025
a617ae2
use loss computation from old trainer, for debugging
AkshitaB Feb 7, 2025
CHANGELOG.md (1 change: 1 addition & 0 deletions)
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added `SkipStepAdamW` optimizer.
- The trainer can load model-only checkpoints now.
- Added the option to throttle checkpoint uploads to one rank from each node at a time.
- Added `RunDuration` in `model_ladder` to configure training durations in terms of Chinchilla multipliers.

### Changed

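For context on the `RunDuration` entry above: the ladder script (diffed below) now takes the duration as an extra positional argument between the model size and the cluster. A hypothetical invocation, where the script path is a placeholder and the cluster is taken from the usage example further down:

$ python ladder.py launch 1B 2xC ai2/pluto-cirrascale --launch.num_nodes=2

Here `2xC` requests twice the Chinchilla-optimal token budget for the 1B model.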
src/olmo_core/internal/model_ladder.py (20 changes: 12 additions & 8 deletions)
@@ -9,7 +9,7 @@
from olmo_core.data import NumpyDataLoaderConfig, NumpyDatasetConfig
from olmo_core.distributed.utils import get_local_rank
from olmo_core.launch.beaker import BeakerLaunchConfig
from olmo_core.model_ladder import ModelLadder, ModelSize
from olmo_core.model_ladder import ModelLadder, ModelSize, RunDuration
from olmo_core.nn.transformer import TransformerConfig
from olmo_core.optim import OptimConfig
from olmo_core.train import (
@@ -97,6 +97,7 @@ def build_config(
ladder: ModelLadder,
script: str,
size: ModelSize,
run_duration: RunDuration,
cmd: SubCmd,
cluster: str,
overrides: List[str],
@@ -118,7 +119,9 @@
optim = ladder.get_optim_config(size=size)
dataset = ladder.get_dataset_config()
data_loader = ladder.get_data_loader_config(size=size)
trainer = ladder.get_trainer_config(size=size, gpu_type=gpu_type, dp_world_size=dp_world_size)
trainer = ladder.get_trainer_config(
size=size, run_duration=run_duration, gpu_type=gpu_type, dp_world_size=dp_world_size
)

return LadderRunConfig(
launch=launch,
@@ -133,7 +136,7 @@

def main(ladder_builder: Callable[[str], ModelLadder]):
usage = f"""
[yellow]Usage:[/] [i blue]python[/] [i cyan]{sys.argv[0]}[/] [i b magenta]{'|'.join(SubCmd)}[/] [i b]SIZE CLUSTER[/] [i][OVERRIDES...][/]
[yellow]Usage:[/] [i blue]python[/] [i cyan]{sys.argv[0]}[/] [i b magenta]{'|'.join(SubCmd)}[/] [i b]SIZE RUN_DURATION CLUSTER[/] [i][OVERRIDES...][/]

[b]Subcommands[/]
[b magenta]launch:[/] Launch the script on Beaker with the [b magenta]train[/] subcommand.
@@ -142,16 +145,17 @@ def main(ladder_builder: Callable[[str], ModelLadder]):
[b magenta]dry_run:[/] Pretty print the config to run and exit.

[b]Examples[/]
$ [i]python {sys.argv[0]} {SubCmd.launch} 1B ai2/pluto-cirrascale --launch.num_nodes=2[/]
$ [i]python {sys.argv[0]} {SubCmd.launch} 1B 1xC ai2/pluto-cirrascale --launch.num_nodes=2[/]
""".strip()

try:
script, cmd, size, cluster, overrides = (
script, cmd, size, run_duration, cluster, overrides = (
Review comment (Member):
Is there a reason to make this a required parameter and not just part of the config, with a default like 2xC?

Reply (Contributor Author):
Well, we usually run them as a cell in the grid of {model_sizes} x {chinchilla multipliers}, so it's convenient.

sys.argv[0],
SubCmd(sys.argv[1]),
ModelSize(sys.argv[2]),
sys.argv[3],
sys.argv[4:],
RunDuration(sys.argv[3]),
sys.argv[4],
sys.argv[5:],
)
except (IndexError, ValueError):
import rich
@@ -166,7 +170,7 @@ def main(ladder_builder: Callable[[str], ModelLadder]):
ladder.merge(overrides, prefix="ladder")

# Build run config.
config = build_config(ladder, script, size, cmd, cluster, overrides)
config = build_config(ladder, script, size, run_duration, cmd, cluster, overrides)
config.ladder.validate()

# Run the cmd.
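To make the new calling convention concrete, here is a minimal, self-contained sketch of the argument parsing in this diff, using a hard-coded argument list. The script name is a placeholder, and the `SubCmd` import path is assumed (it is used but not defined in the hunks shown):

from olmo_core.internal.model_ladder import SubCmd  # assumed import path; SubCmd is used in the diff above
from olmo_core.model_ladder import ModelSize, RunDuration

# Stand-in for sys.argv under the new convention: SIZE RUN_DURATION CLUSTER [OVERRIDES...]
argv = ["ladder.py", "launch", "1B", "2xC", "ai2/pluto-cirrascale", "--launch.num_nodes=2"]

script, cmd, size, run_duration, cluster, overrides = (
    argv[0],
    SubCmd(argv[1]),
    ModelSize(argv[2]),
    RunDuration(argv[3]),  # new positional argument; an unrecognized value raises ValueError,
                           # which the surrounding try/except catches to print the usage string
    argv[4],               # the cluster shifts from argv[3] to argv[4]
    argv[5:],              # config overrides now start at argv[5]
)
assert run_duration is RunDuration.Cx2 and run_duration.multiplier == 2.0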
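On the review question above (making the duration a config default rather than a required CLI argument), a hypothetical sketch of that alternative, not what this PR does: the duration could live on the ladder config with a 2xC default and be changed through the existing override path. The `LadderRunSettings` name below is illustrative only.

from dataclasses import dataclass

from olmo_core.model_ladder import RunDuration

@dataclass
class LadderRunSettings:
    # A config-level default; in the real codebase this field would sit on ModelLadder
    # and be overridable via the `ladder.merge(overrides, prefix="ladder")` call above,
    # e.g. with something like --ladder.run_duration=5xC.
    run_duration: RunDuration = RunDuration.Cx2

print(LadderRunSettings().run_duration.multiplier)  # 2.0 unless overridden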
src/olmo_core/model_ladder.py (42 changes: 39 additions & 3 deletions)
@@ -89,6 +89,39 @@ def num_params(self) -> int:
raise NotImplementedError(self)


class RunDuration(StrEnum):
"""
An enumeration of the standard training durations for the ladder, in terms of Chinchilla multipliers.
"""

Cx0_5 = "0.5xC"
"""
Multiplier of 0.5.
"""

Cx1 = "1xC"
"""
Multiplier of 1.
"""
Cx2 = "2xC"
"""
Multiplier of 2.
"""
Cx5 = "5xC"
"""
Multiplier of 5.
"""

Cx10 = "10xC"
"""
Multiplier of 10.
"""

@property
def multiplier(self) -> float:
return float(self.split("xC")[0])


@beta_feature
@dataclass
class ModelLadder(Config, metaclass=ABCMeta):
@@ -236,18 +269,21 @@ def get_global_batch_size(self, *, size: ModelSize) -> int:

return self.sequence_length * global_batch_size

def get_duration(self, size: ModelSize) -> Duration:
def get_duration(
self, size: ModelSize, run_duration: RunDuration = RunDuration.Cx2
) -> Duration:
"""
Get the duration to train for, given the model size and run duration. Defaults to 2 x Chinchilla optimal.

:param size: The target model size.
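:param run_duration: The duration to train for, in terms of Chinchilla multipliers.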
"""
return Duration.tokens(2 * 20 * size.num_params)
return Duration.tokens(int(run_duration.multiplier * 20) * size.num_params)

def get_trainer_config(
self,
*,
size: ModelSize,
run_duration: RunDuration,
gpu_type: str,
dp_world_size: int,
) -> TrainerConfig:
@@ -315,7 +351,7 @@ def get_trainer_config(
metrics_collect_interval=10,
cancel_check_interval=1,
compile_loss=True,
max_duration=self.get_duration(size),
max_duration=self.get_duration(size, run_duration),
)
.with_callback(
"lr_scheduler", SchedulerCallback(scheduler=CosWithWarmup(warmup_steps=2000))
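A quick usage note for the new `RunDuration` enum above: members are constructed from their string values, and `multiplier` simply parses the number in front of `xC`. A small sanity check based only on the code in this diff:

from olmo_core.model_ladder import RunDuration

assert RunDuration("0.5xC") is RunDuration.Cx0_5
assert RunDuration("0.5xC").multiplier == 0.5   # float("0.5xC".split("xC")[0])
assert RunDuration("10xC").multiplier == 10.0
assert [d.multiplier for d in RunDuration] == [0.5, 1.0, 2.0, 5.0, 10.0]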
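Finally, a worked example of the updated `get_duration` arithmetic, which feeds `max_duration` in the trainer config above. The 1e9 parameter count is a nominal stand-in for the 1B size; the real value comes from `size.num_params`:

from olmo_core.model_ladder import RunDuration

nominal_num_params = 1_000_000_000  # stand-in for size.num_params of the 1B model

for run_duration in RunDuration:
    # Same formula as get_duration: multiplier * 20 tokens per parameter.
    tokens = int(run_duration.multiplier * 20) * nominal_num_params
    print(f"{run_duration.value}: {tokens:,} tokens")

# 0.5xC: 10,000,000,000 tokens
# 1xC: 20,000,000,000 tokens
# 2xC: 40,000,000,000 tokens   (the previous hard-coded default)
# 5xC: 100,000,000,000 tokens
# 10xC: 200,000,000,000 tokens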