From 2890b3338f18c972246b26487d0d4a18795248fd Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com>
Date: Wed, 10 Apr 2024 20:17:11 -0700
Subject: [PATCH] Akoumparouli/fix sd train (#8876)

* hardcode autocast

Signed-off-by: Alexandros Koumparoulis

* uncomment sd_train

Signed-off-by: Alexandros Koumparoulis

---------

Signed-off-by: Alexandros Koumparoulis
---
 Jenkinsfile                                        | 86 +++++++++----------
 .../stable_diffusion/sd_train.py                   |  3 +-
 2 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 431bc24907ed..6471fa3d011f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -208,48 +208,48 @@ pipeline {
         sh "rm -rf /home/TestData/multimodal/stable_diffusion_train"
       }
     }
-    //stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') {
-    //  when {
-    //    anyOf {
-    //      branch 'main'
-    //      changeRequest target: 'main'
-    //    }
-    //  }
-    //  failFast true
-    //  steps {
-    //    sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs"
-    //    sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \
-    //        trainer.precision=16 \
-    //        trainer.num_nodes=1 \
-    //        trainer.devices=1 \
-    //        ++exp_manager.max_time_per_run=00:00:03:00 \
-    //        exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \
-    //        trainer.max_steps=20 \
-    //        model.micro_batch_size=1 \
-    //        model.global_batch_size=1 \
-    //        model.data.synthetic_data=True \
-    //        model.first_stage_key=images_moments \
-    //        model.cond_stage_key=clip_encoded \
-    //        model.optim.name=megatron_fused_adam \
-    //        +model.optim.capturable=True \
-    //        exp_manager.ema.enable=False \
-    //        model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \
-    //        ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \
-    //        ++model.cond_stage_config.max_length=77 \
-    //        model.inductor=False \
-    //        ~model.cond_stage_config.restore_from_path \
-    //        ~model.cond_stage_config.freeze \
-    //        ~model.cond_stage_config.layer \
-    //        model.first_stage_config.from_pretrained=null \
-    //        model.ddp_overlap=False \
-    //        model.capture_cudagraph_iters=15 \
-    //        model.unet_config.use_flash_attention=False \
-    //        model.unet_config.attention_resolutions=[1] \
-    //        model.unet_config.channel_mult=[1] \
-    //        "
-    //    sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs"
-    //  }
-    //}
+    stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') {
+      when {
+        anyOf {
+          branch 'main'
+          changeRequest target: 'main'
+        }
+      }
+      failFast true
+      steps {
+        sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs"
+        sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \
+            trainer.precision=16 \
+            trainer.num_nodes=1 \
+            trainer.devices=1 \
+            ++exp_manager.max_time_per_run=00:00:03:00 \
+            exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \
+            trainer.max_steps=20 \
+            model.micro_batch_size=1 \
+            model.global_batch_size=1 \
+            model.data.synthetic_data=True \
+            model.first_stage_key=images_moments \
+            model.cond_stage_key=clip_encoded \
+            model.optim.name=megatron_fused_adam \
+            +model.optim.capturable=True \
+            exp_manager.ema.enable=False \
+            model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \
+            ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \
+            ++model.cond_stage_config.max_length=77 \
+            model.inductor=False \
+            ~model.cond_stage_config.restore_from_path \
+            ~model.cond_stage_config.freeze \
+            ~model.cond_stage_config.layer \
+            model.first_stage_config.from_pretrained=null \
+            model.ddp_overlap=False \
+            model.capture_cudagraph_iters=15 \
+            model.unet_config.use_flash_attention=False \
+            model.unet_config.attention_resolutions=[1] \
+            model.unet_config.channel_mult=[1] \
+            "
+        sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs"
+      }
+    }
     // stage('L2: Multimodal ControlNet Train') {
     //   when {
     //     anyOf {
@@ -5849,4 +5849,4 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
     cleanWs()
   }
 }
-}
\ No newline at end of file
+}
diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_train.py b/examples/multimodal/text_to_image/stable_diffusion/sd_train.py
index 968d9bec2884..b10eda550e9a 100644
--- a/examples/multimodal/text_to_image/stable_diffusion/sd_train.py
+++ b/examples/multimodal/text_to_image/stable_diffusion/sd_train.py
@@ -83,7 +83,8 @@ def main(cfg) -> None:
     else:
         autocast_enabled = True
         dgrad_dtype = torch.float16
-
+    # akoumparouli: temp fix.
+    autocast_enabled = True
     model = model.cuda()
     for _ in range(5):
         with torch.autocast(device_type="cuda", enabled=autocast_enabled, dtype=torch.float16):
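
Note on the sd_train.py hunk: the patch hardcodes autocast_enabled = True so the short warm-up loop that precedes CUDA graph capture always runs under autocast, regardless of which precision branch ran earlier in main. Below is a minimal, self-contained sketch of that pattern only; it is not the NeMo script itself, and the torch.nn.Linear model and random batch are hypothetical stand-ins for the Stable Diffusion UNet and its training batch.

import torch

# Temp fix mirrored from the patch: always run the warm-up iterations under autocast.
autocast_enabled = True

# Hypothetical stand-ins for the real model and data (assumption, not NeMo code).
device = "cuda" if torch.cuda.is_available() else "cpu"
autocast_dtype = torch.float16 if device == "cuda" else torch.bfloat16
model = torch.nn.Linear(16, 16).to(device)
batch = torch.randn(4, 16, device=device)

# Warm-up loop analogous to the `for _ in range(5):` block in sd_train.py:
# forward passes run in reduced precision inside the autocast context,
# backward runs outside it, as is standard for torch.autocast.
for _ in range(5):
    model.zero_grad(set_to_none=True)
    with torch.autocast(device_type=device, enabled=autocast_enabled, dtype=autocast_dtype):
        loss = model(batch).float().sum()
    loss.backward()

With enabled=False the autocast context is a no-op, so forcing the flag to True is the quick way to guarantee the warm-up runs in reduced precision; a later change could instead derive the flag from trainer.precision.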