From 2890b3338f18c972246b26487d0d4a18795248fd Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com>
Date: Wed, 10 Apr 2024 20:17:11 -0700
Subject: [PATCH] Akoumparouli/fix sd train (#8876)

* hardcode autocast

Signed-off-by: Alexandros Koumparoulis

* uncomment sd_train

Signed-off-by: Alexandros Koumparoulis

---------

Signed-off-by: Alexandros Koumparoulis
---
 Jenkinsfile                                        | 86 +++++++++----------
 .../stable_diffusion/sd_train.py                   |  3 +-
 2 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 431bc24907ed..6471fa3d011f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -208,48 +208,48 @@ pipeline {
         sh "rm -rf /home/TestData/multimodal/stable_diffusion_train"
       }
     }
-    //stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') {
-    //  when {
-    //    anyOf {
-    //      branch 'main'
-    //      changeRequest target: 'main'
-    //    }
-    //  }
-    //  failFast true
-    //  steps {
-    //    sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs"
-    //    sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \
-    //        trainer.precision=16 \
-    //        trainer.num_nodes=1 \
-    //        trainer.devices=1 \
-    //        ++exp_manager.max_time_per_run=00:00:03:00 \
-    //        exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \
-    //        trainer.max_steps=20 \
-    //        model.micro_batch_size=1 \
-    //        model.global_batch_size=1 \
-    //        model.data.synthetic_data=True \
-    //        model.first_stage_key=images_moments \
-    //        model.cond_stage_key=clip_encoded \
-    //        model.optim.name=megatron_fused_adam \
-    //        +model.optim.capturable=True \
-    //        exp_manager.ema.enable=False \
-    //        model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \
-    //        ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \
-    //        ++model.cond_stage_config.max_length=77 \
-    //        model.inductor=False \
-    //        ~model.cond_stage_config.restore_from_path \
-    //        ~model.cond_stage_config.freeze \
-    //        ~model.cond_stage_config.layer \
-    //        model.first_stage_config.from_pretrained=null \
-    //        model.ddp_overlap=False \
-    //        model.capture_cudagraph_iters=15 \
-    //        model.unet_config.use_flash_attention=False \
-    //        model.unet_config.attention_resolutions=[1] \
-    //        model.unet_config.channel_mult=[1] \
-    //        "
-    //    sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs"
-    //  }
-    //}
+    stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') {
+      when {
+        anyOf {
+          branch 'main'
+          changeRequest target: 'main'
+        }
+      }
+      failFast true
+      steps {
+        sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs"
+        sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \
+            trainer.precision=16 \
+            trainer.num_nodes=1 \
+            trainer.devices=1 \
+            ++exp_manager.max_time_per_run=00:00:03:00 \
+            exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \
+            trainer.max_steps=20 \
+            model.micro_batch_size=1 \
+            model.global_batch_size=1 \
+            model.data.synthetic_data=True \
+            model.first_stage_key=images_moments \
+            model.cond_stage_key=clip_encoded \
+            model.optim.name=megatron_fused_adam \
+            +model.optim.capturable=True \
+            exp_manager.ema.enable=False \
+            model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \
+            ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \
+            ++model.cond_stage_config.max_length=77 \
+            model.inductor=False \
+            ~model.cond_stage_config.restore_from_path \
+            ~model.cond_stage_config.freeze \
+            ~model.cond_stage_config.layer \
+            model.first_stage_config.from_pretrained=null \
+            model.ddp_overlap=False \
+            model.capture_cudagraph_iters=15 \
+            model.unet_config.use_flash_attention=False \
+            model.unet_config.attention_resolutions=[1] \
+            model.unet_config.channel_mult=[1] \
+            "
+        sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs"
+      }
+    }
     // stage('L2: Multimodal ControlNet Train') {
     //   when {
     //     anyOf {
@@ -5849,4 +5849,4 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
     cleanWs()
   }
 }
-}
\ No newline at end of file
+}
diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_train.py b/examples/multimodal/text_to_image/stable_diffusion/sd_train.py
index 968d9bec2884..b10eda550e9a 100644
--- a/examples/multimodal/text_to_image/stable_diffusion/sd_train.py
+++ b/examples/multimodal/text_to_image/stable_diffusion/sd_train.py
@@ -83,7 +83,8 @@ def main(cfg) -> None:
     else:
         autocast_enabled = True
         dgrad_dtype = torch.float16
-
+    # akoumparouli: temp fix.
+    autocast_enabled = True
     model = model.cuda()
     for _ in range(5):
         with torch.autocast(device_type="cuda", enabled=autocast_enabled, dtype=torch.float16):
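
Note on the sd_train.py hunk: the patch hardcodes autocast_enabled = True so the short warm-up loop that precedes CUDA graph capture always runs under autocast, regardless of which precision branch ran earlier in main. Below is a minimal, self-contained sketch of that pattern only; it is not the NeMo script itself, and the torch.nn.Linear model and random batch are hypothetical stand-ins for the Stable Diffusion UNet and its training batch.

import torch

# Temp fix mirrored from the patch: always run the warm-up iterations under autocast.
autocast_enabled = True

# Hypothetical stand-ins for the real model and data (assumption, not NeMo code).
device = "cuda" if torch.cuda.is_available() else "cpu"
autocast_dtype = torch.float16 if device == "cuda" else torch.bfloat16
model = torch.nn.Linear(16, 16).to(device)
batch = torch.randn(4, 16, device=device)

# Warm-up loop analogous to the `for _ in range(5):` block in sd_train.py:
# forward passes run in reduced precision inside the autocast context,
# backward runs outside it, as is standard for torch.autocast.
for _ in range(5):
    model.zero_grad(set_to_none=True)
    with torch.autocast(device_type=device, enabled=autocast_enabled, dtype=autocast_dtype):
        loss = model(batch).float().sum()
    loss.backward()

With enabled=False the autocast context is a no-op, so forcing the flag to True is the quick way to guarantee the warm-up runs in reduced precision; a later change could instead derive the flag from trainer.precision.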