Skip to content

Commit

Permalink
Akoumparouli/fix sd train (NVIDIA#8876)
Browse files Browse the repository at this point in the history
* hardcode autocast

Signed-off-by: Alexandros Koumparoulis <[email protected]>

* uncomment sd_train

Signed-off-by: Alexandros Koumparoulis <[email protected]>

---------

Signed-off-by: Alexandros Koumparoulis <[email protected]>
  • Loading branch information
akoumpa committed Apr 11, 2024
1 parent 1809b61 commit 2890b33
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 44 deletions.
86 changes: 43 additions & 43 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -208,48 +208,48 @@ pipeline {
sh "rm -rf /home/TestData/multimodal/stable_diffusion_train"
}
}
//stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') {
// when {
// anyOf {
// branch 'main'
// changeRequest target: 'main'
// }
// }
// failFast true
// steps {
// sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs"
// sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \
// trainer.precision=16 \
// trainer.num_nodes=1 \
// trainer.devices=1 \
// ++exp_manager.max_time_per_run=00:00:03:00 \
// exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \
// trainer.max_steps=20 \
// model.micro_batch_size=1 \
// model.global_batch_size=1 \
// model.data.synthetic_data=True \
// model.first_stage_key=images_moments \
// model.cond_stage_key=clip_encoded \
// model.optim.name=megatron_fused_adam \
// +model.optim.capturable=True \
// exp_manager.ema.enable=False \
// model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \
// ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \
// ++model.cond_stage_config.max_length=77 \
// model.inductor=False \
// ~model.cond_stage_config.restore_from_path \
// ~model.cond_stage_config.freeze \
// ~model.cond_stage_config.layer \
// model.first_stage_config.from_pretrained=null \
// model.ddp_overlap=False \
// model.capture_cudagraph_iters=15 \
// model.unet_config.use_flash_attention=False \
// model.unet_config.attention_resolutions=[1] \
// model.unet_config.channel_mult=[1] \
// "
// sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs"
// }
//}
// L2 CI test: short Stable Diffusion training run with CUDA-graph capture enabled,
// driven entirely by synthetic data (no dataset download needed).
stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') {
  when {
    anyOf {
      branch 'main'
      changeRequest target: 'main'
    }
  }
  failFast true
  steps {
    // Pre-clean any leftovers from a previous aborted run.
    sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs"
    sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \
      trainer.precision=16 \
      trainer.num_nodes=1 \
      trainer.devices=1 \
      ++exp_manager.max_time_per_run=00:00:03:00 \
      exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs \
      trainer.max_steps=20 \
      model.micro_batch_size=1 \
      model.global_batch_size=1 \
      model.data.synthetic_data=True \
      model.first_stage_key=images_moments \
      model.cond_stage_key=clip_encoded \
      model.optim.name=megatron_fused_adam \
      +model.optim.capturable=True \
      exp_manager.ema.enable=False \
      model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \
      ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \
      ++model.cond_stage_config.max_length=77 \
      model.inductor=False \
      ~model.cond_stage_config.restore_from_path \
      ~model.cond_stage_config.freeze \
      ~model.cond_stage_config.layer \
      model.first_stage_config.from_pretrained=null \
      model.ddp_overlap=False \
      model.capture_cudagraph_iters=15 \
      model.unet_config.use_flash_attention=False \
      model.unet_config.attention_resolutions=[1] \
      model.unet_config.channel_mult=[1] \
      "
    // Post-clean: exp_dir above must match this path exactly, otherwise the
    // experiment directory leaks onto the CI node. (Previously exp_dir was
    // the singular '..._cuda_graph' and was never removed.)
    sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs"
  }
}
// stage('L2: Multimodal ControlNet Train') {
// when {
// anyOf {
Expand Down Expand Up @@ -5849,4 +5849,4 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
cleanWs()
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def main(cfg) -> None:
else:
autocast_enabled = True
dgrad_dtype = torch.float16

# akoumparouli: temp fix.
autocast_enabled = True
model = model.cuda()
for _ in range(5):
with torch.autocast(device_type="cuda", enabled=autocast_enabled, dtype=torch.float16):
Expand Down

0 comments on commit 2890b33

Please sign in to comment.