Skip to content

Commit

Permalink
fix a minor bug with async checkpointing where a checkpoint would get…
Browse files Browse the repository at this point in the history
… saved in both on_train_batch_end and on_validation_end within the same step (NVIDIA#9856) (NVIDIA#9867)

Signed-off-by: ashors1 <[email protected]>
Co-authored-by: Anna Shors <[email protected]>
  • Loading branch information
github-actions[bot] and ashors1 authored Jul 30, 2024
1 parent bd17e77 commit 86bfac2
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion nemo/lightning/pytorch/callbacks/model_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,8 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str)
self.set_checkpoint_unfinished_marker(filepath, barrier_after=True)
ema_callback = self._ema_callback(trainer)

self._last_global_step_saved = trainer.global_step

if ema_callback is not None:
if self.async_save:
raise ValueError('async_save with EMA not supported')
Expand Down Expand Up @@ -422,7 +424,6 @@ def _get_finalize_save_checkpoint_callback(

def _cb():
logging.debug(f'Finalize callback called for step {global_step}, filepath {filepath}')
self._last_global_step_saved = global_step
self._last_checkpoint_saved = filepath

from nemo.utils.get_rank import is_global_rank_zero
Expand Down

0 comments on commit 86bfac2

Please sign in to comment.