Clear cache after EMA swapping
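
Swapping the EMA weights onto the GPU for sampling and back off again leaves the freed blocks sitting in PyTorch's caching allocator as reserved memory, which can crowd out the next training-step allocation. This commit releases those blocks with torch.cuda.empty_cache() after each swap. A minimal sketch of the pattern (standard PyTorch APIs; ema, flux, and device follow the names in the diff below):

    ema.to(device)                # bring the EMA shadow weights onto the GPU
    # ... generate sample images from the EMA weights ...
    ema.to("cpu")                 # move the shadow copy back off the GPU
    flux.to(device)               # restore the live training model
    with torch.cuda.device(device):
        torch.cuda.empty_cache()  # return unused cached blocks to the driver
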
deepdelirious committed Dec 31, 2024
1 parent 11e642b commit 080ae73
Showing 2 changed files with 12 additions and 3 deletions.
11 changes: 8 additions & 3 deletions flux_train.py
@@ -284,7 +284,8 @@ def train(args):
         args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors
     )
 
-    ema = EMA(flux, beta = args.ema_beta, update_after_step=args.ema_update_after_step, update_every=args.ema_update_every, update_model_with_ema_every=args.ema_switch_every, allow_different_devices=True) if args.ema else None
+    if args.ema:
+        ema = EMA(flux, beta=args.ema_beta, update_after_step=args.ema_update_after_step, update_every=args.ema_update_every, update_model_with_ema_every=args.ema_switch_every, allow_different_devices=True)
 
     if args.gradient_checkpointing:
         flux.enable_gradient_checkpointing(cpu_offload=args.cpu_offload_checkpointing)
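
The EMA wrapper's keyword arguments match the ema-pytorch package; a minimal sketch of how such a wrapper is typically driven, assuming that package (model, loader, and optimizer are placeholders, not this trainer's variables):

    from ema_pytorch import EMA

    ema = EMA(
        model,                             # live training model
        beta=0.9999,                       # EMA decay rate (args.ema_beta)
        update_after_step=100,             # skip EMA updates during warmup
        update_every=10,                   # refresh the shadow weights every N updates
        update_model_with_ema_every=1000,  # periodically copy EMA weights back into the model (the "swapping" above)
        allow_different_devices=True,      # shadow copy may sit on CPU while the model trains on GPU
    )

    for batch in loader:
        loss = model(batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        ema.update()                       # advance the shadow weights
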
@@ -771,7 +772,7 @@ def grad_hook(parameter: torch.Tensor):
                     num_train_epochs,
                     global_step,
                     accelerator.unwrap_model(flux),
-                    ema
+                    ema if not args.no_ema_sampling else None
                 )
                 optimizer_train_fn()

@@ -815,7 +816,7 @@ def grad_hook(parameter: torch.Tensor):
                 num_train_epochs,
                 global_step,
                 accelerator.unwrap_model(flux),
-                ema
+                ema if not args.no_ema_sampling else None
             )
 
             flux_train_utils.sample_images(
@@ -929,6 +930,10 @@ def setup_parser() -> argparse.ArgumentParser:
         type=int,
         default=None
     )
+    parser.add_argument(
+        "--no_ema_sampling",
+        action="store_true"
+    )
     parser.add_argument(
         "--no_shuffle",
         action="store_true",
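
With the new flag, EMA weights are still maintained and periodically swapped into the model, but in-training sample images are generated from the live weights. A hypothetical invocation (the EMA flag names are inferred from the args.* references above; all other training flags omitted):

    python flux_train.py \
        --ema --ema_beta 0.9999 \
        --ema_update_after_step 100 --ema_update_every 10 --ema_switch_every 1000 \
        --no_ema_sampling
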
4 changes: 4 additions & 0 deletions library/flux_train_utils.py
@@ -127,6 +127,8 @@ def sample_images(
         )
         ema.to("cpu")
         flux.to(device)
+        with torch.cuda.device(device):
+            torch.cuda.empty_cache()
     else:
         # Create a list with N elements, where each element is a list of prompt_dicts and N is the number of available processes (devices).
         # prompt_dicts are assigned to lists in process order, to try to match image creation time to the enum order. This probably only works when steps and sampler are identical.
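
The empty_cache() call matters because moving the EMA copy off the GPU frees its tensors without returning the underlying blocks to the driver; the caching allocator keeps them reserved for reuse. A small illustration with standard PyTorch introspection (not code from this repository):

    import torch

    device = torch.device("cuda:0")
    print(torch.cuda.memory_allocated(device))  # bytes held by live tensors
    print(torch.cuda.memory_reserved(device))   # bytes held by the caching allocator (>= allocated)
    with torch.cuda.device(device):
        torch.cuda.empty_cache()                # release unused cached blocks
    print(torch.cuda.memory_reserved(device))   # reserved now shrinks toward allocated
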
@@ -173,6 +175,8 @@ def sample_images(
         )
         ema.to("cpu")
         flux.to(device)
+        with torch.cuda.device(device):
+            torch.cuda.empty_cache()
 
     torch.set_rng_state(rng_state)
     if cuda_rng_state is not None:
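
For context, sample_images brackets generation with an RNG save/restore so that sampling does not perturb training randomness. A minimal sketch of that standard pattern (standard PyTorch APIs; variable names follow the context lines above):

    rng_state = torch.get_rng_state()        # save the CPU RNG state
    cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None

    # ... run sampling, which consumes random numbers ...

    torch.set_rng_state(rng_state)           # restore the CPU RNG state
    if cuda_rng_state is not None:
        torch.cuda.set_rng_state(cuda_rng_state)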