Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
a760b5e
Handle quantized CUDA tensors in async checkpoint writer
sbak5 Mar 12, 2026
1587314
Lint applied
sbak5 Mar 13, 2026
a1aad02
Import resiliency-ext async checkpointing
sbak5 Mar 13, 2026
65bb819
Revert "Handle quantized CUDA tensors in async checkpoint writer"
dimapihtar Mar 17, 2026
814f2fb
keep both nvrx & mcore async save strategies
dimapihtar Mar 17, 2026
9fc5757
rename variable
dimapihtar Mar 17, 2026
c2559d9
refactor get_async_strategy
dimapihtar Mar 17, 2026
1b9ad87
pass async strategy to save/load strategy
dimapihtar Mar 18, 2026
55b6a00
Merge branch 'main' into sbak/ckpt_migrate
dimapihtar Mar 18, 2026
4fa312f
fix imports
dimapihtar Mar 18, 2026
cb04e5a
properly pass async_strategy to async_save
dimapihtar Mar 18, 2026
247027f
properly pass async_strategy
dimapihtar Mar 18, 2026
c3673fd
properly pass async_strategy load
dimapihtar Mar 18, 2026
cc6fd79
remove extra code
dimapihtar Mar 18, 2026
9c69546
fix code style
dimapihtar Mar 18, 2026
c5adc81
add deprecation warning
dimapihtar Mar 18, 2026
79ed36e
fix style
dimapihtar Mar 18, 2026
33e2872
fix code style
dimapihtar Mar 18, 2026
ac936ed
set mcore async-strategy for some func tests
dimapihtar Mar 18, 2026
1063aef
update nvrx version
dimapihtar Mar 18, 2026
3a7af3e
add unit tests
dimapihtar Mar 18, 2026
bfeb20a
move warning
dimapihtar Mar 18, 2026
6ba7a29
revert changes
dimapihtar Mar 18, 2026
bf7f792
fix bug
dimapihtar Mar 18, 2026
0f3779d
update unit tests
dimapihtar Mar 18, 2026
0b71e78
revert changes
dimapihtar Mar 18, 2026
755483c
Revert "revert changes"
dimapihtar Mar 18, 2026
70ce1f5
Revert "update nvrx version"
dimapihtar Mar 18, 2026
d18bec2
Merge branch 'main' into sbak/ckpt_migrate
dimapihtar Mar 18, 2026
c3dabfb
update nvrx version
dimapihtar Mar 18, 2026
51c5501
fix style
dimapihtar Mar 18, 2026
d2cfee3
fix unit tests
dimapihtar Mar 18, 2026
c5c86f5
fix unit tests
dimapihtar Mar 18, 2026
0f81f06
avoid async_strategy param at serialization.load()
dimapihtar Mar 19, 2026
5ceef34
fix unit test
dimapihtar Mar 19, 2026
57c422c
fix unit test
dimapihtar Mar 19, 2026
9f690f0
move warning
dimapihtar Mar 19, 2026
9a36493
fix unit test
dimapihtar Mar 19, 2026
48c0d95
fix unit test
dimapihtar Mar 19, 2026
67703f2
Revert "fix unit test"
dimapihtar Mar 19, 2026
c8ed36c
disable async_save
dimapihtar Mar 19, 2026
8030764
fix unit test
dimapihtar Mar 19, 2026
a0591c8
fix typo
dimapihtar Mar 19, 2026
64f4280
Merge branch 'main' into sbak/ckpt_migrate
dimapihtar Mar 19, 2026
1f4979e
disable async_save
dimapihtar Mar 19, 2026
ddc37b5
fix unit test
dimapihtar Mar 19, 2026
2c85bea
fix warning
dimapihtar Mar 19, 2026
ffe53e1
fix unit test
dimapihtar Mar 19, 2026
6833d87
Revert "update nvrx version"
dimapihtar Mar 20, 2026
e26b2d2
Merge branch 'main' into sbak/ckpt_migrate
dimapihtar Mar 20, 2026
2a92e71
update nvrx version
dimapihtar Mar 20, 2026
47e190d
update uv.lock
dimapihtar Mar 20, 2026
c56a3eb
Fix issue setting up `CachedFileSystemReader` and incorrect use of ck…
sbak5 Mar 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions megatron/core/dist_checkpointing/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@ def load(
)
merge(common_state_dict, sharded_objects)

loaded_state_dict = sharded_strategy.load(sharded_state_dict, checkpoint_dir)
async_strategy = getattr(common_state_dict.get("args"), "async_strategy", "nvrx")
loaded_state_dict = sharded_strategy.load(sharded_state_dict, checkpoint_dir, async_strategy)

merge(common_state_dict, loaded_state_dict)

Expand Down Expand Up @@ -322,6 +323,7 @@ def save(
Callable[[CommonStateDict], StateDict]
] = None,
content_metadata: Optional[dict] = None,
async_strategy: Optional[str] = "nvrx",
) -> Optional[AsyncRequest]:
"""Saving entrypoint.

Expand Down Expand Up @@ -434,7 +436,7 @@ def metadata_finalize_fn():
raise CheckpointingException(
f'Cannot apply async_save to non-async strategy {sharded_strategy}'
)
async_request = sharded_strategy.async_save(sharded_state_dict, checkpoint_dir)
async_request = sharded_strategy.async_save(sharded_state_dict, checkpoint_dir, async_strategy)
async_request.finalize_fns.append(metadata_finalize_fn)
return async_request

Expand Down
32 changes: 26 additions & 6 deletions megatron/core/dist_checkpointing/strategies/fully_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,22 @@ def __init__(

self.cached_distribution: Optional[ShardDistribution] = None

def async_save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path):
def async_save(
self,
sharded_state_dict: ShardedStateDict,
checkpoint_dir: Path,
async_strategy: str = "nvrx",
):
""" """
if not isinstance(self.base_strategy, AsyncSaveShardedStrategy):
raise CheckpointingException(
f'Cannot apply async_save to non-async base strategy {self.base_strategy}'
)
self.apply_saving_parallelization(sharded_state_dict)
return self.base_strategy.async_save(sharded_state_dict, checkpoint_dir)
return self.base_strategy.async_save(sharded_state_dict, checkpoint_dir, async_strategy)

def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path):
""" """
self.apply_saving_parallelization(sharded_state_dict)
return self.base_strategy.save(sharded_state_dict, checkpoint_dir)

Expand Down Expand Up @@ -135,6 +142,7 @@ def apply_saving_parallelization(self, sharded_state_dict: ShardedStateDict) ->

@property
def can_handle_sharded_objects(self):
""" """
return self.base_strategy.can_handle_sharded_objects


Expand Down Expand Up @@ -185,7 +193,12 @@ def __init__(
self.cached_global_metadata: Optional[Metadata] = None

@debug_time("FullyParallelLoadStrategyWrapper.load", logger)
def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> StateDict:
def load(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's remove any load related changes. We don't have anything yet for loading.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sbak5 load calls _get_filesystem_reader: https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/dist_checkpointing/strategies/torch.py#L807

which uses CachedMetadataFileSystemReader: https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/dist_checkpointing/strategies/torch.py#L766 so we need to import it properly in respect to async_strategy.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm but I guess we can take it from common state dict
let me see

self,
sharded_state_dict: ShardedStateDict,
checkpoint_dir: Path,
async_strategy: str = "nvrx",
) -> StateDict:
"""Distributes the load and calls underlying strategy only for parts of the state dict.

Steps:
Expand Down Expand Up @@ -218,7 +231,7 @@ def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> St
loaded_state_dict = {}

if get_pg_size(self.parallelization_group) <= 1:
return self.base_strategy.load(sharded_state_dict, checkpoint_dir)
return self.base_strategy.load(sharded_state_dict, checkpoint_dir, async_strategy)

# Step 1 and 2: exchange load metadata and distribute the load
with debug_time("self.apply_loading_parallelization", logger):
Expand All @@ -245,11 +258,13 @@ def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> St
), "sharded_state_dict is not empty after deferring tensors and objects"
with debug_time("base_load_ShardedObjects", logger):
# Load sharded objects first
loaded_objects = self.base_strategy.load(to_load_objects, checkpoint_dir)
loaded_objects = self.base_strategy.load(
to_load_objects, checkpoint_dir, async_strategy
)

with debug_time("base_load_ShardedTensors", logger):
# Load sharded tensors separately
loaded_tensors = self.base_strategy.load(to_load_shards, checkpoint_dir)
loaded_tensors = self.base_strategy.load(to_load_shards, checkpoint_dir, async_strategy)

with debug_time("self.exchange_loaded_tensors", logger):

Expand Down Expand Up @@ -390,18 +405,23 @@ def apply_loading_parallelization(

@property
def can_handle_sharded_objects(self):
""" """
return self.base_strategy.can_handle_sharded_objects

def load_tensors_metadata(self, checkpoint_dir: Path):
""" """
return self.base_strategy.load_tensors_metadata(checkpoint_dir)

def load_sharded_metadata(self, checkpoint_dir: Path):
""" """
return self.base_strategy.load_sharded_metadata(checkpoint_dir)

def check_backend_compatibility(self, loaded_version):
""" """
return self.base_strategy.check_backend_compatibility(loaded_version)

def check_version_compatibility(self, loaded_version):
""" """
return self.base_strategy.check_version_compatibility(loaded_version)


Expand Down
Loading
Loading